## First approach using Word2vec for text embedding

In [2]:
import pandas as pd
import numpy as np

import gensim

### Para cuentas de gasto

In [3]:
#imortación de datos
df_cg = pd.read_excel('/home/jovyan/work/cuentas_gasto.xlsx')
df_cg.iloc[0:10]

Unnamed: 0,id,tipo_proyecto_id,cog,d_cog
0,1,1,1000,SERVICIOS PERSONALES
1,2,1,1100,REMUNERACIONES AL PERSONAL DE CARÁCTER PERMANENTE
2,3,1,113,Sueldos base al personal permanente
3,4,1,1131,Sueldos base al personal académico permanente
4,5,1,1132,Sueldos base al personal administrativo perman...
5,6,1,1133,"Sueldos base al personal mandos medios, interm..."
6,7,1,1200,REMUNERACIONES AL PERSONAL DE CARÁCTER TRANSIT...
7,8,1,121,Honorarios asimilables a salarios
8,9,1,1211,Honorarios asimilables a salarios personal aca...
9,10,1,1212,Honorarios asimilables a salarios personal adm...


In [4]:
text_processed = df_cg.d_cog.apply(lambda text:gensim.utils.simple_preprocess(text, min_len=4))
text_processed 

0                                [servicios, personales]
1       [remuneraciones, personal, carácter, permanente]
2                  [sueldos, base, personal, permanente]
3       [sueldos, base, personal, académico, permanente]
4      [sueldos, base, personal, administrativo, perm...
                             ...                        
585                                 [reactivos, insumos]
586                                [cuotas, inscripcion]
587                   [apoyo, para, material, didáctico]
588                                      [instalaciones]
589                              [póliza, mantenimiento]
Name: d_cog, Length: 590, dtype: object

In [5]:
model = gensim.models.Word2Vec(
    window=4,
    min_count=2,
    workers=6
)

model.build_vocab(text_processed, progress_per=500)

In [6]:
model.train(text_processed, total_examples=model.corpus_count, epochs=model.epochs)

(6588, 10485)

In [7]:
model.wv.most_similar('instalaciones')

[('servidores', 0.23951469361782074),
 ('gestión', 0.2288135141134262),
 ('comerciales', 0.22705486416816711),
 ('otras', 0.21289587020874023),
 ('estímulos', 0.20880089700222015),
 ('inversión', 0.2027464359998703),
 ('amortización', 0.20271936058998108),
 ('software', 0.19569683074951172),
 ('grupal', 0.19569313526153564),
 ('documentos', 0.19457469880580902)]

In [8]:
model.wv['mantenimiento']


array([ 2.6529317e-03,  1.2513287e-03, -2.6260323e-03,  9.5740268e-03,
        2.8884802e-03,  3.6415316e-03, -8.8850257e-04,  1.6141307e-03,
        6.2962114e-03, -1.1722341e-03,  3.0337060e-03, -1.1721196e-03,
        5.1175430e-03,  3.5746652e-03,  2.6243811e-03, -5.6694294e-03,
       -4.6312604e-03,  4.1155694e-03, -6.0021342e-03, -1.3410200e-03,
       -6.7453872e-04,  3.6976226e-03, -8.3091343e-03,  8.8881934e-03,
       -1.7052402e-03, -5.2224211e-03,  3.7781917e-03, -2.0591558e-03,
       -8.4968889e-03, -4.4628489e-03,  1.5634452e-05, -9.0866862e-03,
        6.0733740e-04,  2.0376486e-03, -3.5804505e-03,  3.0369349e-03,
        2.5630118e-03,  9.8499153e-03,  1.1554293e-03,  1.3725266e-03,
        2.9228365e-03, -2.3324785e-03, -8.9300843e-03,  8.0213370e-03,
       -1.3864191e-03, -9.7420840e-03, -6.6698748e-03, -4.1755894e-03,
        4.4125076e-03,  5.2853385e-03,  6.4409249e-03, -7.1796612e-03,
        6.9301651e-04, -2.8689953e-03, -5.5660675e-03,  7.7184485e-03,
      

In [9]:
model.wv.similarity('personal', 'grupal')

0.053354543


### Para articulos

In [10]:
df_art = pd.read_csv('/home/jovyan/work/articulos.csv')
df_art.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97369 entries, 0 to 97368
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             97369 non-null  int64  
 1   origen_id      97369 non-null  int64  
 2   origen_type    96939 non-null  object 
 3   articulo       97369 non-null  object 
 4   cantidad       97369 non-null  float64
 5   impuesto       97369 non-null  int64  
 6   costo          97369 non-null  float64
 7   descuento      97369 non-null  float64
 8   monto          97369 non-null  float64
 9   oc_id          97369 non-null  int64  
 10  unidad         97220 non-null  object 
 11  inventariable  97369 non-null  int64  
 12  no_cotizado    97369 non-null  int64  
 13  deleted_at     97369 non-null  object 
dtypes: float64(4), int64(6), object(4)
memory usage: 10.4+ MB


In [11]:
text_processed_art = df_art.articulo.apply(lambda text:gensim.utils.simple_preprocess(text, min_len=4))
text_processed_art 

0                                               [abrasivo]
1                         [taquete, expansor, tipo, broca]
2                              [tornillos, rosca, corrida]
3        [taquete, expansor, tipo, cuerda, diámetro, ro...
4                                             [pija, lath]
                               ...                        
97364      [taza, kabbi, cerámica, impresa, tinta, diseño]
97365    [sutura, sequilon, nylon, monofilamento, sutur...
97366    [racks, para, carga, pesada, largo, ancho, alt...
97367    [servicio, mantenimiento, general, limpieza, p...
97368    [racks, para, carga, pesada, largo, ancho, alt...
Name: articulo, Length: 97369, dtype: object

In [12]:
art_model = gensim.models.Word2Vec(
    window=10,
    min_count=2,
    workers=6
)

art_model.build_vocab(text_processed_art, progress_per=1000)

In [13]:
art_model.train(text_processed_art, total_examples=art_model.corpus_count, epochs=art_model.epochs)

(3122401, 3343470)

In [14]:
art_model.wv.most_similar('servicio')


[('reparación', 0.9396077990531921),
 ('difractometro', 0.9177221655845642),
 ('arcos', 0.9127128720283508),
 ('remplazo', 0.9083824157714844),
 ('correctivos', 0.9073820114135742),
 ('póliza', 0.9050681591033936),
 ('mantenimiento', 0.9037003517150879),
 ('incluyen', 0.9001067876815796),
 ('refacciones', 0.8953447341918945),
 ('liners', 0.8947370648384094)]

## New try

In [28]:
import pandas as pd
import torch 
from torch.utils.data import Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

df_articles = pd.read_csv('/home/jovyan/work/articulos_updated.csv')
df_articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   articulo  10000 non-null  object
 1   cog_id    10000 non-null  int64 
 2   d_cog     10000 non-null  object
dtypes: int64(1), object(2)
memory usage: 234.5+ KB


In [49]:
df_articles["cog_id"] = df_articles["cog_id"].astype('category').cat.codes
df_articles.iloc[:]

Unnamed: 0,articulo,cog_id,d_cog
0,\t\r\nISBN 9780201626742\tTITULO The Early Uni...,5,Libros
1,\r\n\r\n\r\nPLUMA AZUL MOD. B2P Gel Roller 0.7,0,"Materiales, útiles y equipos menores de oficina"
2,\r\n\r\nPLUMA ROJA MOD. B2P Gel Roller 0.7,0,"Materiales, útiles y equipos menores de oficina"
3,"\r\n2-HYDROXYETHYL METHACRYLATE, CONTAINS <& C...",21,Otros productos químicos
4,\r\nALDRICH(R) ROUND-BOTTOM STORAGE FLASK W& C...,21,Otros productos químicos
...,...,...,...
9995,YODURO POTASIO A.S.C. 500 gr (FERMONT),21,Otros productos químicos
9996,ZAPAPICO ZP5MX\t,27,Herramientas menores
9997,ZAPOS AZUL COFLEX PARA WC\r\n,14,Otros materiales y artículos de construcción y...
9998,ZINC Zinc chloride anhydrous 500 gr. no. catal...,15,Productos químicos básicos


## DestilBERT

In [50]:
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df_articles["articulo"], df_articles["cog_id"], test_size=0.2, random_state=42
)

print("training dfs")
print(train_texts.info())
print(train_labels.info())

print("test dfs")
print(test_texts.info())
print(test_labels.info())

training dfs
<class 'pandas.core.series.Series'>
Index: 8000 entries, 9254 to 7270
Series name: articulo
Non-Null Count  Dtype 
--------------  ----- 
8000 non-null   object
dtypes: object(1)
memory usage: 125.0+ KB
None
<class 'pandas.core.series.Series'>
Index: 8000 entries, 9254 to 7270
Series name: cog_id
Non-Null Count  Dtype
--------------  -----
8000 non-null   int8 
dtypes: int8(1)
memory usage: 70.3 KB
None
test dfs
<class 'pandas.core.series.Series'>
Index: 2000 entries, 6252 to 6929
Series name: articulo
Non-Null Count  Dtype 
--------------  ----- 
2000 non-null   object
dtypes: object(1)
memory usage: 31.2+ KB
None
<class 'pandas.core.series.Series'>
Index: 2000 entries, 6252 to 6929
Series name: cog_id
Non-Null Count  Dtype
--------------  -----
2000 non-null   int8 
dtypes: int8(1)
memory usage: 17.6 KB
None


In [52]:
# Get number of unique classes
num_classes = df_articles["cog_id"].nunique()

num_classes

97

In [23]:
# Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=1e-5,
    eval_steps=100,
    logging_dir="./logs",
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_classes)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [56]:
# Load DistilBERT Tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Custom Dataset Class
class ArticleDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.texts = texts.tolist()
        self.labels = labels.tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Convert Data to PyTorch Dataset
train_dataset = ArticleDataset(train_texts, train_labels, tokenizer)
test_dataset = ArticleDataset(test_texts, test_labels, tokenizer)

train_dataset[:]

{'input_ids': tensor([[  101,  4309,  2099,  ...,     0,     0,     0],
         [  101,  5033, 12928,  ...,     0,     0,     0],
         [  101, 11122, 10875,  ...,     0,     0,     0],
         ...,
         [  101,  5622, 12618,  ...,     0,     0,     0],
         [  101, 21451, 16786,  ...,     0,     0,     0],
         [  101, 22088,  2050,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([ 1, 12,  2,  ...,  5, 31,  0])}

In [57]:
print("Unique labels:", train_labels.unique())
print("Max label:", train_labels.max())
print("Min label:", train_labels.min())


df_articles["cog_id"].nunique()


Unique labels: [ 1 12  2 15 19  0 59 95 14 73 21 30 60 72 29  6  5 96 85 82 57 79 31 34
 87 70 56 27 24 61 22 38 28 18 42 62 47 67 64 69 23 54 40  7 32 80 41 52
 46 75 74  9 20 44 58 25 63 90 11 86 39 65 13 94 89 88 55 66 91 37 76 78
 33 45 10 92 36 71 26 84  3 93 50 35 16 77 53  4  8 51 48 43 49 17 81 83
 68]
Max label: 96
Min label: 0


97

In [58]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [None]:
trainer.train()

Step,Training Loss
500,3.045
1000,1.5309
1500,1.1111
2000,0.9688
