# Objective
- O objetivo é treinar um modelo utilizando a descrição do produto para prever o setor e o território
- Vamos utilizar Word2Vec para trasformar a descrição em vetores (embeddings)
- Vamos utilizar um modelo XG boost para treinar

# Data
- Dados fornecidos pelo banco de Portugal

# Imports

In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from feature_engine.encoding import RareLabelEncoder
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc
import gradio as gr
import pickle

# Reading Data

In [33]:
df = pd.read_excel('../../data/01.Dataset FI_06032024.xlsx', sheet_name=2)

# Small Preprocessing

Mapeando o código ao nome da descrição

In [34]:
MAP_TipoInformacao = {"A": "ativo", "P": "passivo"}
MAP_TipoInstrumento = {"F21": "Numerário", "F22": "Depósitos transferíveis", "F29": "Outros depósitos", "F3_P": "Títulos de dívida", "F4": "Empréstimos", "F511": "Ações cotadas", "F512": "Ações não cotadas", "F519": "Outras participações", "F521": "Unidades de Participação emitidas por FMM", "F522": "Unidades de Participação emitidas por FI, excluindo FMM", "F71": "Derivados financeiros"}
MAP_MaturidadeOriginal = {"01": "A vista", "10": "Ate 1 ano", "06": "De 1 a 2 anos", "07": "De 2 a 5 anos", "08": "A mais de 5 anos", "_Z": "Não aplicável"}

In [35]:
df.TipoInformacao = df.TipoInformacao.map(MAP_TipoInformacao)
df.TipoInstrumento = df.TipoInstrumento.map(MAP_TipoInstrumento)
df.MaturidadeOriginal = df.MaturidadeOriginal.map(MAP_MaturidadeOriginal)

In [36]:
df.drop(["CodEntidadeRef", "CodEntidadeCon"], axis=1, inplace=True)

In [37]:
df.head(3)

Unnamed: 0,TipoInformacao,TipoInstrumento,DescricaoInstrumento,MaturidadeOriginal,SetorInstitucionalCon,TerritorioCon
0,ativo,Depósitos transferíveis,BST Futuros 2.90%,A vista,S122,PRT
1,ativo,Depósitos transferíveis,BPI EUR 0.00%,A vista,S122,PRT
2,ativo,Depósitos transferíveis,BST Futuros 2.65%,A vista,S122,PRT


# Feature Engineering

In [38]:
df_clean = df.copy()

### Encoding Rare Labels

### Label Enconder

Criando funções para fazer o enconding dos targets

In [39]:
def encode_target(label, category_mapping):
  # Check if label is unseen (not in the dictionary)
  if label not in category_mapping:
    # Assign next available integer as seen in training data
    new_value = len(category_mapping)
    category_mapping[label] = new_value
  
  return category_mapping[label]

In [40]:
def map_numbers_to_categories(numbers, category_mapping):
    """Maps numbers back to their corresponding category names using a provided mapping dictionary.

    Args:
        numbers: A list or array containing the numerical representations of categories.
        category_mapping: A dictionary mapping category names (keys) to their numerical representations (values).

    Returns:
        A list containing the corresponding category names for the input numbers.
    """

    category_names = [category_mapping.get(number, None) for number in numbers]
    return category_names

In [41]:
def return_map(df_clean):

    territory_map = {}
    sector_map = {}

    # Iterate through each row (assuming TerritorioCon and encoded_label_territorio are in the same order)
    for territorio, encoded_label in zip(df_clean["TerritorioCon"], df_clean["encoded_label_territorio"]):
      # Add the mapping to the dictionary if the TerritorioCon is not already present
      if territorio not in territory_map:
        territory_map[territorio] = encoded_label
    
    for sector, encoded_label_sector in zip(df_clean["SetorInstitucionalCon"], df_clean["encoded_label_setor"]):
      # Add the mapping to the dictionary if the TerritorioCon is not already present
      if sector not in sector_map:
        sector_map[sector] = encoded_label_sector

    return territory_map, sector_map

Aplicando funções para mapear os targets a códigos pra podermos treinar o modelo

In [42]:
# Get unique categories from 'TerritorioCon' column
unique_categories_ter = df_clean['TerritorioCon'].unique()
category_mapping_ter = dict(zip(unique_categories_ter, range(len(unique_categories_ter))))
inverted_mapping_ter = {value: key for key, value in category_mapping_ter.items()}

unique_categories_sec = df_clean["SetorInstitucionalCon"].unique()
category_mapping_sec = dict(zip(unique_categories_sec, range(len(unique_categories_sec))))
inverted_mapping_sec = {value: key for key, value in category_mapping_sec.items()}

df_clean["encoded_label_territorio"] = df_clean["TerritorioCon"].apply(encode_target, args=[category_mapping_ter])
df_clean["encoded_label_setor"] = df_clean['SetorInstitucionalCon'].apply(encode_target, args=[category_mapping_sec])

In [43]:
df_clean.head(3)

Unnamed: 0,TipoInformacao,TipoInstrumento,DescricaoInstrumento,MaturidadeOriginal,SetorInstitucionalCon,TerritorioCon,encoded_label_territorio,encoded_label_setor
0,ativo,Depósitos transferíveis,BST Futuros 2.90%,A vista,S122,PRT,0,0
1,ativo,Depósitos transferíveis,BPI EUR 0.00%,A vista,S122,PRT,0,0
2,ativo,Depósitos transferíveis,BST Futuros 2.65%,A vista,S122,PRT,0,0


### Processing Description Column

Vamos agora:
1. Utilizar a função simple_preprocess para aplicarmos tecnicas de Text Mining para limpar a descrição
2. Treinando o modelo Word2Vec para processar coluna com a descrição para vetores (Embeddings)

In [44]:
df_clean['tokenized_Descricao_text'] = df_clean['DescricaoInstrumento'].apply(lambda x: simple_preprocess(x))
word2vec_model = Word2Vec(sentences=df_clean['tokenized_Descricao_text'], vector_size=100, window=5, min_count=1, workers=4)

In [45]:
df_clean

Unnamed: 0,TipoInformacao,TipoInstrumento,DescricaoInstrumento,MaturidadeOriginal,SetorInstitucionalCon,TerritorioCon,encoded_label_territorio,encoded_label_setor,tokenized_Descricao_text
0,ativo,Depósitos transferíveis,BST Futuros 2.90%,A vista,S122,PRT,0,0,"[bst, futuros]"
1,ativo,Depósitos transferíveis,BPI EUR 0.00%,A vista,S122,PRT,0,0,"[bpi, eur]"
2,ativo,Depósitos transferíveis,BST Futuros 2.65%,A vista,S122,PRT,0,0,"[bst, futuros]"
3,ativo,Depósitos transferíveis,BST EUR 3.15%,A vista,S122,PRT,0,0,"[bst, eur]"
4,ativo,Depósitos transferíveis,BST EUR 2.65%,A vista,S122,PRT,0,0,"[bst, eur]"
...,...,...,...,...,...,...,...,...,...
42402,passivo,Empréstimos,"Empréstimo Vic Management 0,01%",Não aplicável,S11,PRT,0,2,"[empréstimo, vic, management]"
42403,passivo,Títulos de dívida,Outros Passivos,De 2 a 5 anos,S122,BEL,10,0,"[outros, passivos]"
42404,passivo,Empréstimos,"Empréstimo Vic One Pest Sup 0,01%",Não aplicável,S11,PRT,0,2,"[empréstimo, vic, one, pest, sup]"
42405,passivo,Empréstimos,"Empréstimo Vic Management 0,01%",Não aplicável,S11,PRT,0,2,"[empréstimo, vic, management]"


In [46]:
def compute_avg_embedding(tokens, unknown_embedding=[0]*word2vec_model.vector_size):
    embeddings = [word2vec_model.wv[token] for token in tokens if token in word2vec_model.wv]
    if embeddings:  # Embeddings found
        return np.array(embeddings).mean(axis=0)  # Return average embedding as a NumPy array
    else:  # No embeddings found
        return np.array(unknown_embedding)

In [47]:
df_clean['avg_embedding'] = df_clean['tokenized_Descricao_text'].apply(compute_avg_embedding)
X = df_clean['avg_embedding'].apply(pd.Series).to_numpy()
y1 = df_clean['encoded_label_territorio']
y2 = df_clean['encoded_label_setor']
embed_data = pd.DataFrame(X)

In [48]:
df_clean.head(3)

Unnamed: 0,TipoInformacao,TipoInstrumento,DescricaoInstrumento,MaturidadeOriginal,SetorInstitucionalCon,TerritorioCon,encoded_label_territorio,encoded_label_setor,tokenized_Descricao_text,avg_embedding
0,ativo,Depósitos transferíveis,BST Futuros 2.90%,A vista,S122,PRT,0,0,"[bst, futuros]","[0.16835219, 0.2564944, 0.17026035, 0.07354168..."
1,ativo,Depósitos transferíveis,BPI EUR 0.00%,A vista,S122,PRT,0,0,"[bpi, eur]","[-0.33069888, 0.53979915, -0.01237967, -0.0759..."
2,ativo,Depósitos transferíveis,BST Futuros 2.65%,A vista,S122,PRT,0,0,"[bst, futuros]","[0.16835219, 0.2564944, 0.17026035, 0.07354168..."


### Train Test Split

Separando os dados em treino e test. Vamos ter dois targets. Um para o setor e outro para o território

In [49]:
# Stratified split with 'TerritorioCon' as the stratification factor
#df_train, df_test = train_test_split(embed_data, test_size=0.2, random_state=42)
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X, y1, test_size=0.1, random_state=41)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X, y2, test_size=0.1, random_state=41)

# Train 

## Train the Model Territorio

In [50]:
clf_ter = XGBClassifier(random_state=42, max_depth=4)
clf_ter.fit(X_train_1, y_train_1, early_stopping_rounds=10, 
        eval_set=[(X_test_1, y_test_1)])



[0]	validation_0-mlogloss:2.16960
[1]	validation_0-mlogloss:4.89598
[2]	validation_0-mlogloss:2.11399
[3]	validation_0-mlogloss:1.97298
[4]	validation_0-mlogloss:1.86315
[5]	validation_0-mlogloss:1.76872
[6]	validation_0-mlogloss:1.72518
[7]	validation_0-mlogloss:1.70945
[8]	validation_0-mlogloss:1.65475
[9]	validation_0-mlogloss:1.65110
[10]	validation_0-mlogloss:1.50772
[11]	validation_0-mlogloss:1.45096
[12]	validation_0-mlogloss:1.43801
[13]	validation_0-mlogloss:1.44121
[14]	validation_0-mlogloss:1.41552
[15]	validation_0-mlogloss:1.33467
[16]	validation_0-mlogloss:1.30881
[17]	validation_0-mlogloss:1.26058
[18]	validation_0-mlogloss:1.25432
[19]	validation_0-mlogloss:1.31544
[20]	validation_0-mlogloss:1.23050
[21]	validation_0-mlogloss:1.20639
[22]	validation_0-mlogloss:1.18532
[23]	validation_0-mlogloss:1.16648
[24]	validation_0-mlogloss:1.13963
[25]	validation_0-mlogloss:1.10693
[26]	validation_0-mlogloss:1.04904
[27]	validation_0-mlogloss:0.99640
[28]	validation_0-mlogloss:0.9

## Evaluate 

In [51]:
y_pred_test = clf_ter.predict(X_test_1)
y_pred_train = clf_ter.predict(X_train_1)

In [52]:
pred_test_str = map_numbers_to_categories(y_pred_test, inverted_mapping_ter)
pred_train_str = map_numbers_to_categories(y_pred_train, inverted_mapping_ter)

In [53]:
# Evaluate accuracy
accuracy_test = accuracy_score(y_test_1.to_list(), y_pred_test)
accuracy_train = accuracy_score(y_train_1.to_list(), y_pred_train)
print("Accuracy Test:", accuracy_test)
print("Accuracy Train:", accuracy_train)

Accuracy Test: 0.8908276349917472
Accuracy Train: 0.9267672797778127


In [54]:
print("Classification Report:")
print(classification_report(y_test_1.to_list(), y_pred_test))

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.95      0.91      1277
           1       0.88      0.91      0.89       533
           2       0.90      0.82      0.86       231
           3       0.87      0.76      0.81       283
           4       0.89      0.86      0.87       260
           5       0.87      0.86      0.86       339
           6       0.92      0.88      0.90       171
           7       0.96      0.96      0.96        23
           8       0.96      0.85      0.90        27
           9       0.88      0.93      0.90        56
          10       1.00      0.91      0.95        32
          11       0.90      0.86      0.88       326
          12       0.89      0.91      0.90       201
          13       0.96      0.94      0.95       161
          14       0.95      0.95      0.95        37
          15       0.98      0.92      0.95        53
          16       1.00      0.85      0.92        13
    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Train Model Sector

In [55]:
clf_sec = XGBClassifier(random_state=42)
clf_sec.fit(X_train_2, y_train_2, early_stopping_rounds=10, 
        eval_set=[(X_test_2, y_test_2)])



[0]	validation_0-mlogloss:1.54527
[1]	validation_0-mlogloss:1.27229
[2]	validation_0-mlogloss:1.08452
[3]	validation_0-mlogloss:0.95495
[4]	validation_0-mlogloss:0.85527
[5]	validation_0-mlogloss:0.77815
[6]	validation_0-mlogloss:0.71438
[7]	validation_0-mlogloss:0.66678
[8]	validation_0-mlogloss:0.62487
[9]	validation_0-mlogloss:0.59196
[10]	validation_0-mlogloss:0.56465
[11]	validation_0-mlogloss:0.54287
[12]	validation_0-mlogloss:0.51913
[13]	validation_0-mlogloss:0.50235
[14]	validation_0-mlogloss:0.48463
[15]	validation_0-mlogloss:0.46672
[16]	validation_0-mlogloss:0.45122
[17]	validation_0-mlogloss:0.43783
[18]	validation_0-mlogloss:0.42691
[19]	validation_0-mlogloss:0.41645
[20]	validation_0-mlogloss:0.40615
[21]	validation_0-mlogloss:0.39824
[22]	validation_0-mlogloss:0.39045
[23]	validation_0-mlogloss:0.38362
[24]	validation_0-mlogloss:0.37253
[25]	validation_0-mlogloss:0.36586
[26]	validation_0-mlogloss:0.36061
[27]	validation_0-mlogloss:0.35347
[28]	validation_0-mlogloss:0.3

In [56]:
y_pred_test = clf_sec.predict(X_test_2)
y_pred_train = clf_sec.predict(X_train_2)

In [57]:
pred_test_str = map_numbers_to_categories(y_pred_test, inverted_mapping_sec)
pred_train_str = map_numbers_to_categories(y_pred_train, inverted_mapping_sec)

In [58]:
# Evaluate accuracy
accuracy_test = accuracy_score(y_test_2.to_list(), y_pred_test)
accuracy_train = accuracy_score(y_train_2.to_list(), y_pred_train)
print("Accuracy Test:", accuracy_test)
print("Accuracy Train:", accuracy_train)

Accuracy Test: 0.9318556944116954
Accuracy Train: 0.9611958287481004


In [59]:
print("Classification Report:")
print(classification_report(y_test_2.to_list(), y_pred_test))

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1248
           1       0.84      0.71      0.77       133
           2       0.93      0.97      0.95      1522
           3       0.80      0.71      0.76       298
           4       0.95      0.99      0.97       332
           5       0.80      0.75      0.77       138
           6       1.00      1.00      1.00         2
           7       0.79      0.75      0.77        20
           8       0.98      0.97      0.98       533
           9       1.00      1.00      1.00         5
          10       0.75      0.75      0.75         4
          11       0.00      0.00      0.00         1
          13       1.00      0.60      0.75         5

    accuracy                           0.93      4241
   macro avg       0.83      0.78      0.80      4241
weighted avg       0.93      0.93      0.93      4241



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Adding More Features

In [60]:
new_features = ["TipoInformacao", "TipoInstrumento", "MaturidadeOriginal"]
df_to_get_dummies = df_clean[new_features]
dummies = pd.get_dummies(df_to_get_dummies)

new_df_to_train = pd.concat([embed_data, dummies], axis=1)

In [61]:
# Stratified split with 'TerritorioCon' as the stratification factor
#df_train, df_test = train_test_split(embed_data, test_size=0.2, random_state=42)
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(new_df_to_train, y1, test_size=0.1, random_state=41)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(new_df_to_train, y2, test_size=0.1, random_state=41)

# Train 

## Train the Model Territorio

In [62]:
clf_ter = XGBClassifier(random_state=42, max_depth=4)
clf_ter.fit(X_train_1, y_train_1, early_stopping_rounds=10, 
        eval_set=[(X_test_1, y_test_1)])



[0]	validation_0-mlogloss:2.05335
[1]	validation_0-mlogloss:4.17862
[2]	validation_0-mlogloss:1.92321
[3]	validation_0-mlogloss:1.78229
[4]	validation_0-mlogloss:1.78718
[5]	validation_0-mlogloss:1.61107
[6]	validation_0-mlogloss:1.54960
[7]	validation_0-mlogloss:1.44530
[8]	validation_0-mlogloss:1.39426
[9]	validation_0-mlogloss:1.36035
[10]	validation_0-mlogloss:1.32658
[11]	validation_0-mlogloss:1.30044
[12]	validation_0-mlogloss:1.28253
[13]	validation_0-mlogloss:1.27907
[14]	validation_0-mlogloss:1.24767
[15]	validation_0-mlogloss:1.22779
[16]	validation_0-mlogloss:1.21329
[17]	validation_0-mlogloss:1.21376
[18]	validation_0-mlogloss:1.22120
[19]	validation_0-mlogloss:1.21425
[20]	validation_0-mlogloss:1.20189
[21]	validation_0-mlogloss:1.18621
[22]	validation_0-mlogloss:1.17919
[23]	validation_0-mlogloss:1.15385
[24]	validation_0-mlogloss:1.14404
[25]	validation_0-mlogloss:1.13033
[26]	validation_0-mlogloss:1.11902
[27]	validation_0-mlogloss:1.11353
[28]	validation_0-mlogloss:1.1

## Evaluate 

In [63]:
y_pred_test = clf_ter.predict(X_test_1)
y_pred_train = clf_ter.predict(X_train_1)

In [64]:
pred_test_str = map_numbers_to_categories(y_pred_test, inverted_mapping_ter)
pred_train_str = map_numbers_to_categories(y_pred_train, inverted_mapping_ter)

In [65]:
# Evaluate accuracy
accuracy_test = accuracy_score(y_test_1.to_list(), y_pred_test)
accuracy_train = accuracy_score(y_train_1.to_list(), y_pred_train)
print("Accuracy Test:", accuracy_test)
print("Accuracy Train:", accuracy_train)

Accuracy Test: 0.8922423956614006
Accuracy Train: 0.9303306607975685


In [66]:
print("Classification Report:")
print(classification_report(y_test_1.to_list(), y_pred_test))

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.92      0.92      1277
           1       0.89      0.93      0.91       533
           2       0.87      0.84      0.85       231
           3       0.70      0.78      0.74       283
           4       0.90      0.86      0.88       260
           5       0.87      0.87      0.87       339
           6       0.89      0.87      0.88       171
           7       0.96      0.96      0.96        23
           8       1.00      0.85      0.92        27
           9       0.87      0.95      0.91        56
          10       1.00      0.91      0.95        32
          11       0.86      0.88      0.87       326
          12       0.90      0.91      0.90       201
          13       0.97      0.97      0.97       161
          14       1.00      0.95      0.97        37
          15       1.00      0.94      0.97        53
          16       1.00      0.85      0.92        13
    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Train Model Sector

In [67]:
clf_sec = XGBClassifier(random_state=42)
clf_sec.fit(X_train_2, y_train_2, early_stopping_rounds=10, 
        eval_set=[(X_test_2, y_test_2)])



[0]	validation_0-mlogloss:1.48332
[1]	validation_0-mlogloss:1.18085
[2]	validation_0-mlogloss:0.99302
[3]	validation_0-mlogloss:0.85817
[4]	validation_0-mlogloss:0.76103
[5]	validation_0-mlogloss:0.68454
[6]	validation_0-mlogloss:0.62241
[7]	validation_0-mlogloss:0.57389
[8]	validation_0-mlogloss:0.53241
[9]	validation_0-mlogloss:0.50109
[10]	validation_0-mlogloss:0.47669
[11]	validation_0-mlogloss:0.45674
[12]	validation_0-mlogloss:0.43698
[13]	validation_0-mlogloss:0.42201
[14]	validation_0-mlogloss:0.40803
[15]	validation_0-mlogloss:0.39380
[16]	validation_0-mlogloss:0.38097
[17]	validation_0-mlogloss:0.36940
[18]	validation_0-mlogloss:0.35914
[19]	validation_0-mlogloss:0.35139
[20]	validation_0-mlogloss:0.34465
[21]	validation_0-mlogloss:0.33679
[22]	validation_0-mlogloss:0.32559
[23]	validation_0-mlogloss:0.31869
[24]	validation_0-mlogloss:0.31226
[25]	validation_0-mlogloss:0.30598
[26]	validation_0-mlogloss:0.30258
[27]	validation_0-mlogloss:0.29619
[28]	validation_0-mlogloss:0.2

In [68]:
y_pred_test = clf_sec.predict(X_test_2)
y_pred_train = clf_sec.predict(X_train_2)

In [69]:
pred_test_str = map_numbers_to_categories(y_pred_test, inverted_mapping_sec)
pred_train_str = map_numbers_to_categories(y_pred_train, inverted_mapping_sec)

In [70]:
# Evaluate accuracy
accuracy_test = accuracy_score(y_test_2.to_list(), y_pred_test)
accuracy_train = accuracy_score(y_train_2.to_list(), y_pred_train)
print("Accuracy Test:", accuracy_test)
print("Accuracy Train:", accuracy_train)

Accuracy Test: 0.9450601273284602
Accuracy Train: 0.9670911282293141


In [71]:
print("Classification Report:")
print(classification_report(y_test_2.to_list(), y_pred_test))

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.96      1248
           1       0.87      0.73      0.79       133
           2       0.94      0.98      0.96      1522
           3       0.83      0.72      0.77       298
           4       0.98      0.99      0.99       332
           5       0.82      0.78      0.80       138
           6       1.00      1.00      1.00         2
           7       0.79      0.75      0.77        20
           8       1.00      0.99      1.00       533
           9       1.00      1.00      1.00         5
          10       1.00      1.00      1.00         4
          11       0.00      0.00      0.00         1
          13       1.00      0.80      0.89         5

    accuracy                           0.95      4241
   macro avg       0.86      0.82      0.84      4241
weighted avg       0.94      0.95      0.94      4241



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Training with the whole data

Vamos treinar agora o modelo com os dados todos. São exatamente os mesmos passos aplicados aos dados todos

In [18]:
df = pd.read_excel('../../data/01.Dataset FI_06032024.xlsx', sheet_name=2)

In [19]:
df_clean = df.copy("../data_missing/sampled_indices.csv")

In [20]:
data_eval = df_clean.sample(1000, random_state=42)
not_sampled_mask = ~df_clean.index.isin(data_eval)  # Create a boolean mask for non-sampled rows
df_clean = df_clean[not_sampled_mask]


data_pred = df_clean.sample(1000, random_state=42)
not_sampled_mask = ~df_clean.index.isin(data_pred)
df_clean = df_clean[not_sampled_mask]

In [21]:
df_clean.to_csv("../data_train/data_train.csv")
data_eval.to_csv("../data_train/data_eval.csv")
data_pred.to_csv("../data_missing/data_pred.csv")

In [22]:
# Get unique categories from 'TerritorioCon' column
unique_categories_ter = df_clean['TerritorioCon'].unique()
category_mapping_ter = dict(zip(unique_categories_ter, range(len(unique_categories_ter))))
inverted_mapping_ter = {value: key for key, value in category_mapping_ter.items()}

unique_categories_sec = df_clean["SetorInstitucionalCon"].unique()
category_mapping_sec = dict(zip(unique_categories_sec, range(len(unique_categories_sec))))
inverted_mapping_sec = {value: key for key, value in category_mapping_sec.items()}

df_clean["encoded_label_territorio"] = df_clean["TerritorioCon"].apply(encode_target, args=[category_mapping_ter])
df_clean["encoded_label_setor"] = df_clean['SetorInstitucionalCon'].apply(encode_target, args=[category_mapping_sec])

In [23]:
df_clean['tokenized_Descricao_text'] = df_clean['DescricaoInstrumento'].apply(lambda x: simple_preprocess(x))
word2vec_model = Word2Vec(sentences=df_clean['tokenized_Descricao_text'], vector_size=100, window=5, min_count=1, workers=4)

In [24]:
df_clean['tokenized_Descricao_text'] = df_clean['DescricaoInstrumento'].apply(lambda x: simple_preprocess(x))
word2vec_model = Word2Vec(sentences=df_clean['tokenized_Descricao_text'], vector_size=100, window=5, min_count=1, workers=4)

df_clean['avg_embedding'] = df_clean['tokenized_Descricao_text'].apply(compute_avg_embedding)
X = df_clean['avg_embedding'].apply(pd.Series).to_numpy()
y1 = df_clean['encoded_label_territorio']
y2 = df_clean['encoded_label_setor']
embed_data = pd.DataFrame(X)

In [25]:
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X, y1, test_size=0.2, random_state=42)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X, y2, test_size=0.2, random_state=42)

In [26]:
clf_ter = XGBClassifier(random_state=42, max_depth=5)
clf_ter.fit(X, y1)

In [27]:
clf_sec = XGBClassifier(random_state=42, max_depth=5)
clf_sec.fit(X, y2)

Função que permite usar o modelo e retornar as previsões para novas entradas

In [28]:
def return_embeedings(string, word2vec_model,simple_preprocess, clf_t, clf_s, inverted_mapping_t, inverted_mapping_s):
    case = pd.DataFrame({'DescricaoInstrumento': [string]})
    case['tokenized_Descricao_text'] = case['DescricaoInstrumento'].apply(lambda x: simple_preprocess(x))
    case['avg_embedding'] = case['tokenized_Descricao_text'].apply(compute_avg_embedding)
    
    X = case['avg_embedding'].apply(pd.Series).to_numpy()
    
    prediction_t = clf_t.predict(X)
    str_pred_t = map_numbers_to_categories(prediction_t, inverted_mapping_t)
    
    prediction_s = clf_s.predict(X)
    str_pred_s = map_numbers_to_categories(prediction_s, inverted_mapping_s)
    
    return str_pred_t, str_pred_s

In [29]:
str_to_try = 'HEATHROW FUNDING LTD 1.50% 12/10/2027'
str_pred_t, str_pred_s = return_embeedings(str_to_try, word2vec_model, simple_preprocess, clf_ter, clf_sec, inverted_mapping_ter, inverted_mapping_sec)

In [30]:
all_str_ter = []
all_str_sec = []

for i in range(len(sampled_indices)):

    string = sampled_indices.DescricaoInstrumento.iloc[i]
    str_pred_t, str_pred_s = return_embeedings(string, word2vec_model, simple_preprocess, clf_ter, clf_sec, inverted_mapping_ter, inverted_mapping_sec)
    
    all_str_ter.append(str_pred_t[0])
    all_str_sec.append(str_pred_s[0])
    
sampled_indices["sec_pred"] = all_str_sec  
sampled_indices["ter_pred"] = all_str_ter

NameError: name 'sampled_indices' is not defined

## Gradio App

Criando um aplicativo gradio para demostrar o trabalho

In [31]:
def return_embeedings_(string):
    case = pd.DataFrame({'DescricaoInstrumento': [string]})
    case['tokenized_Descricao_text'] = case['DescricaoInstrumento'].apply(lambda x: simple_preprocess(x))
    case['avg_embedding'] = case['tokenized_Descricao_text'].apply(compute_avg_embedding)

    X = case['avg_embedding'].apply(pd.Series).to_numpy()

    try:
        prediction_t = clf_ter.predict(X)
        str_pred_t = map_numbers_to_categories(prediction_t, inverted_mapping_ter)

        prediction_s = clf_sec.predict(X)
        str_pred_s = map_numbers_to_categories(prediction_s, inverted_mapping_sec)

        return str(str_pred_t[0]), str(str_pred_s[0])
           
    except Exception as e:
        return "An error occurred: " + str(e)

    
examples = [
    ["HEATHROW FUNDING LTD 1.50% 12/10/2027"],
    ["FORTUM 1.625% A:27/02/2026"],
    ["BAC FLOAT 25/4/24"],
    ["DP DP 4M 0% 16/12/21CGD 0.00% 2020-12-16"],
]

# Create the Gradio app
iface = gr.Interface(
    # Argument 1: function (Required)
    fn=return_embeedings_,  # The function you want to expose as an interface

    # Argument 2: input components (Required)
    inputs=[gr.Textbox(lines=5, placeholder="Enter a text description")],  # Defines user input

    # Argument 3: examples (Optional)
    examples=examples,  # List of example text descriptions (or tuples with predictions)

    # Argument 4: output components (Required)
    outputs=[
        gr.Textbox(label="Prediction Territory"),
        gr.Textbox(label="Prediction Sector"),
    ],  # Defines how to display the output
)

# Launch the app
iface.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




## Saving and reading models

In [236]:
with open('model_data.pkl', 'wb') as f:
    pickle.dump(inverted_mapping_ter, f)
    pickle.dump(inverted_mapping_sec, f)
    pickle.dump(word2vec_model, f)
    pickle.dump(clf_sec, f)
    pickle.dump(clf_ter, f)

In [238]:
with open('model_data.pkl', 'rb') as f:
    # Load the data objects in the same order they were saved
    inverted_mapping_ter = pickle.load(f)
    inverted_mapping_sec = pickle.load(f)
    word2vec_model = pickle.load(f)
    clf_sec = pickle.load(f)
    clf_ter = pickle.load(f)