# Objective
- O objetivo é treinar um modelo utilizando a descrição do produto para prever o setor e o território
- Vamos utilizar Word2Vec para trasformar a descrição em vetores (embeddings)
- Vamos utilizar um modelo XG boost para treinar

# Data
- Dados fornecidos pelo banco de Portugal

# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from feature_engine.encoding import RareLabelEncoder
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc
import gradio as gr
import pickle

# Reading Data

In [2]:
df = pd.read_excel('../../data/01.Dataset FI_06032024.xlsx', sheet_name=2)

# Small Preprocessing

Mapeando o código ao nome da descrição

In [3]:
MAP_TipoInformacao = {"A": "ativo", "P": "passivo"}
MAP_TipoInstrumento = {"F21": "Numerário", "F22": "Depósitos transferíveis", "F29": "Outros depósitos", "F3_P": "Títulos de dívida", "F4": "Empréstimos", "F511": "Ações cotadas", "F512": "Ações não cotadas", "F519": "Outras participações", "F521": "Unidades de Participação emitidas por FMM", "F522": "Unidades de Participação emitidas por FI, excluindo FMM", "F71": "Derivados financeiros"}
MAP_MaturidadeOriginal = {"01": "A vista", "10": "Ate 1 ano", "06": "De 1 a 2 anos", "07": "De 2 a 5 anos", "08": "A mais de 5 anos", "_Z": "Não aplicável"}

In [4]:
df.TipoInformacao = df.TipoInformacao.map(MAP_TipoInformacao)
df.TipoInstrumento = df.TipoInstrumento.map(MAP_TipoInstrumento)
df.MaturidadeOriginal = df.MaturidadeOriginal.map(MAP_MaturidadeOriginal)

In [5]:
df.drop(["CodEntidadeRef", "CodEntidadeCon"], axis=1, inplace=True)

In [6]:
df.head(3)

Unnamed: 0,TipoInformacao,TipoInstrumento,DescricaoInstrumento,MaturidadeOriginal,SetorInstitucionalCon,TerritorioCon
0,ativo,Depósitos transferíveis,BST Futuros 2.90%,A vista,S122,PRT
1,ativo,Depósitos transferíveis,BPI EUR 0.00%,A vista,S122,PRT
2,ativo,Depósitos transferíveis,BST Futuros 2.65%,A vista,S122,PRT


# Feature Engineering

In [7]:
df_clean = df.copy()

### Encoding Rare Labels

### Label Enconder

Criando funções para fazer o enconding dos targets

In [8]:
def encode_target(label, category_mapping):
  # Check if label is unseen (not in the dictionary)
  if label not in category_mapping:
    # Assign next available integer as seen in training data
    new_value = len(category_mapping)
    category_mapping[label] = new_value
  
  return category_mapping[label]

In [9]:
def map_numbers_to_categories(numbers, category_mapping):
    """Maps numbers back to their corresponding category names using a provided mapping dictionary.

    Args:
        numbers: A list or array containing the numerical representations of categories.
        category_mapping: A dictionary mapping category names (keys) to their numerical representations (values).

    Returns:
        A list containing the corresponding category names for the input numbers.
    """

    category_names = [category_mapping.get(number, None) for number in numbers]
    return category_names

In [10]:
def return_map(df_clean):

    territory_map = {}
    sector_map = {}

    # Iterate through each row (assuming TerritorioCon and encoded_label_territorio are in the same order)
    for territorio, encoded_label in zip(df_clean["TerritorioCon"], df_clean["encoded_label_territorio"]):
      # Add the mapping to the dictionary if the TerritorioCon is not already present
      if territorio not in territory_map:
        territory_map[territorio] = encoded_label
    
    for sector, encoded_label_sector in zip(df_clean["SetorInstitucionalCon"], df_clean["encoded_label_setor"]):
      # Add the mapping to the dictionary if the TerritorioCon is not already present
      if sector not in sector_map:
        sector_map[sector] = encoded_label_sector

    return territory_map, sector_map

Aplicando funções para mapear os targets a códigos pra podermos treinar o modelo

In [11]:
# Get unique categories from 'TerritorioCon' column
unique_categories_ter = df_clean['TerritorioCon'].unique()
category_mapping_ter = dict(zip(unique_categories_ter, range(len(unique_categories_ter))))
inverted_mapping_ter = {value: key for key, value in category_mapping_ter.items()}

unique_categories_sec = df_clean["SetorInstitucionalCon"].unique()
category_mapping_sec = dict(zip(unique_categories_sec, range(len(unique_categories_sec))))
inverted_mapping_sec = {value: key for key, value in category_mapping_sec.items()}

df_clean["encoded_label_territorio"] = df_clean["TerritorioCon"].apply(encode_target, args=[category_mapping_ter])
df_clean["encoded_label_setor"] = df_clean['SetorInstitucionalCon'].apply(encode_target, args=[category_mapping_sec])

In [12]:
df_clean.head(3)

Unnamed: 0,TipoInformacao,TipoInstrumento,DescricaoInstrumento,MaturidadeOriginal,SetorInstitucionalCon,TerritorioCon,encoded_label_territorio,encoded_label_setor
0,ativo,Depósitos transferíveis,BST Futuros 2.90%,A vista,S122,PRT,0,0
1,ativo,Depósitos transferíveis,BPI EUR 0.00%,A vista,S122,PRT,0,0
2,ativo,Depósitos transferíveis,BST Futuros 2.65%,A vista,S122,PRT,0,0


### Processing Description Column

Vamos agora:
1. Utilizar a função simple_preprocess para aplicarmos tecnicas de Text Mining para limpar a descrição
2. Treinando o modelo Word2Vec para processar coluna com a descrição para vetores (Embeddings)

In [13]:
help(simple_preprocess)

Help on function simple_preprocess in module gensim.utils:

simple_preprocess(doc, deacc=False, min_len=2, max_len=15)
    Convert a document into a list of lowercase tokens, ignoring tokens that are too short or too long.
    
    Uses :func:`~gensim.utils.tokenize` internally.
    
    Parameters
    ----------
    doc : str
        Input document.
    deacc : bool, optional
        Remove accent marks from tokens using :func:`~gensim.utils.deaccent`?
    min_len : int, optional
        Minimum length of token (inclusive). Shorter tokens are discarded.
    max_len : int, optional
        Maximum length of token in result (inclusive). Longer tokens are discarded.
    
    Returns
    -------
    list of str
        Tokens extracted from `doc`.



In [14]:
df_clean['tokenized_Descricao_text'] = df_clean['DescricaoInstrumento'].apply(lambda x: simple_preprocess(x))
word2vec_model = Word2Vec(sentences=df_clean['tokenized_Descricao_text'], vector_size=100, window=5, min_count=1, workers=4)

In [15]:
df_clean

Unnamed: 0,TipoInformacao,TipoInstrumento,DescricaoInstrumento,MaturidadeOriginal,SetorInstitucionalCon,TerritorioCon,encoded_label_territorio,encoded_label_setor,tokenized_Descricao_text
0,ativo,Depósitos transferíveis,BST Futuros 2.90%,A vista,S122,PRT,0,0,"[bst, futuros]"
1,ativo,Depósitos transferíveis,BPI EUR 0.00%,A vista,S122,PRT,0,0,"[bpi, eur]"
2,ativo,Depósitos transferíveis,BST Futuros 2.65%,A vista,S122,PRT,0,0,"[bst, futuros]"
3,ativo,Depósitos transferíveis,BST EUR 3.15%,A vista,S122,PRT,0,0,"[bst, eur]"
4,ativo,Depósitos transferíveis,BST EUR 2.65%,A vista,S122,PRT,0,0,"[bst, eur]"
...,...,...,...,...,...,...,...,...,...
42402,passivo,Empréstimos,"Empréstimo Vic Management 0,01%",Não aplicável,S11,PRT,0,2,"[empréstimo, vic, management]"
42403,passivo,Títulos de dívida,Outros Passivos,De 2 a 5 anos,S122,BEL,10,0,"[outros, passivos]"
42404,passivo,Empréstimos,"Empréstimo Vic One Pest Sup 0,01%",Não aplicável,S11,PRT,0,2,"[empréstimo, vic, one, pest, sup]"
42405,passivo,Empréstimos,"Empréstimo Vic Management 0,01%",Não aplicável,S11,PRT,0,2,"[empréstimo, vic, management]"


In [16]:
def compute_avg_embedding(tokens, unknown_embedding=[0]*word2vec_model.vector_size):
    embeddings = [word2vec_model.wv[token] for token in tokens if token in word2vec_model.wv]
    if embeddings:  # Embeddings found
        return np.array(embeddings).mean(axis=0)  # Return average embedding as a NumPy array
    else:  # No embeddings found
        return np.array(unknown_embedding)

In [17]:
df_clean['avg_embedding'] = df_clean['tokenized_Descricao_text'].apply(compute_avg_embedding)
X = df_clean['avg_embedding'].apply(pd.Series).to_numpy()
y1 = df_clean['encoded_label_territorio']
y2 = df_clean['encoded_label_setor']
embed_data = pd.DataFrame(X)

In [18]:
df_clean.head(3)

Unnamed: 0,TipoInformacao,TipoInstrumento,DescricaoInstrumento,MaturidadeOriginal,SetorInstitucionalCon,TerritorioCon,encoded_label_territorio,encoded_label_setor,tokenized_Descricao_text,avg_embedding
0,ativo,Depósitos transferíveis,BST Futuros 2.90%,A vista,S122,PRT,0,0,"[bst, futuros]","[0.1498547, 0.20878483, 0.21798834, 0.18397766..."
1,ativo,Depósitos transferíveis,BPI EUR 0.00%,A vista,S122,PRT,0,0,"[bpi, eur]","[-0.37996352, 0.5689551, 0.065465994, 0.037380..."
2,ativo,Depósitos transferíveis,BST Futuros 2.65%,A vista,S122,PRT,0,0,"[bst, futuros]","[0.1498547, 0.20878483, 0.21798834, 0.18397766..."


### Train Test Split

Separando os dados em treino e test. Vamos ter dois targets. Um para o setor e outro para o território

In [18]:
# Stratified split with 'TerritorioCon' as the stratification factor
#df_train, df_test = train_test_split(embed_data, test_size=0.2, random_state=42)
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X, y1, test_size=0.1, random_state=41)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X, y2, test_size=0.1, random_state=41)

# Train 

## Train the Model Territorio

In [82]:
clf_ter = XGBClassifier(random_state=42, max_depth=4)
clf_ter.fit(X_train_1, y_train_1, early_stopping_rounds=10, 
        eval_set=[(X_test_1, y_test_1)])



[0]	validation_0-mlogloss:2.08674
[1]	validation_0-mlogloss:3.26949
[2]	validation_0-mlogloss:1.97326
[3]	validation_0-mlogloss:1.73965
[4]	validation_0-mlogloss:1.57890
[5]	validation_0-mlogloss:1.45490
[6]	validation_0-mlogloss:1.34948
[7]	validation_0-mlogloss:1.26921
[8]	validation_0-mlogloss:1.19779
[9]	validation_0-mlogloss:1.14557
[10]	validation_0-mlogloss:1.09836
[11]	validation_0-mlogloss:1.06429
[12]	validation_0-mlogloss:1.02978
[13]	validation_0-mlogloss:0.99897
[14]	validation_0-mlogloss:0.97667
[15]	validation_0-mlogloss:0.95338
[16]	validation_0-mlogloss:0.92684
[17]	validation_0-mlogloss:0.89420
[18]	validation_0-mlogloss:0.86506
[19]	validation_0-mlogloss:0.84374
[20]	validation_0-mlogloss:0.81388
[21]	validation_0-mlogloss:0.78930
[22]	validation_0-mlogloss:0.77298
[23]	validation_0-mlogloss:0.75789
[24]	validation_0-mlogloss:0.74367
[25]	validation_0-mlogloss:0.73120
[26]	validation_0-mlogloss:0.72085
[27]	validation_0-mlogloss:0.70990
[28]	validation_0-mlogloss:0.6

## Evaluate 

In [83]:
y_pred_test = clf_ter.predict(X_test_1)
y_pred_train = clf_ter.predict(X_train_1)

In [84]:
pred_test_str = map_numbers_to_categories(y_pred_test, inverted_mapping_ter)
pred_train_str = map_numbers_to_categories(y_pred_train, inverted_mapping_ter)

In [85]:
# Evaluate accuracy
accuracy_test = accuracy_score(y_test_1.to_list(), y_pred_test)
accuracy_train = accuracy_score(y_train_1.to_list(), y_pred_train)
print("Accuracy Test:", accuracy_test)
print("Accuracy Train:", accuracy_train)

Accuracy Test: 0.9250176845083706
Accuracy Train: 0.9586542996384216


In [86]:
inverted_mapping_ter_eval = {0: 'PRT',
 1: 'USA',
 2: 'NLD',
 3: 'ESP',
 4: 'DEU',
 5: 'LUX',
 6: 'GBR',
 7: 'CAN',
 8: 'FIN',
 9: 'CHE',
 10: 'BEL',
 11: 'FRA',
 12: 'IRL',
 13: 'ITA',
 14: 'DNK',
 15: 'SWE',
 16: 'AUS',
 17: 'ISL',
 18: '4D',
 19: 'JPN',
 20: '4S',
 21: 'AUT',
 22: 'JEY',
 23: 'NOR',
 24: 'IMN',
 25: 'KOR',
 26: 'CZE',
 27: 'CYM',
 28: 'HUN',
 29: 'POL',
 30: '7M',
 31: 'NZL',
 32: '4C',
 33: 'MEX',
 36: 'GRC',
 38: 'HKG',
 39: 'IND',
 41: 'ISR',
 43: 'IDN',
 45: 'ROU',
 46: 'SVN',
 49: 'EGY',
 52: 'BRA',
 55: 'ZAF',
 59: 'MAR',
 61: '1E',
 62: 'BMU',
 63: 'SVK',
 64: 'LIE',
 66: 'LBR',
 67: 'CUW',
 69: '5D',
 75: 'HRV',
 81: '5F',
 82: 'ARG',
}

report = classification_report(y_test_1.to_list(), y_pred_test, output_dict=True)
df_report = pd.DataFrame(report).transpose()

lenght = len(df_report) - 2 

df_report_subset = df_report.iloc[:lenght]
df_report_subset.index = df_report_subset.index = inverted_mapping_ter_eval.values()
df_report_subset.reset_index(drop=False, inplace=True)
df_combined = pd.concat([df_report_subset, df_report.iloc[lenght-1:]])
df_combined

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,index,precision,recall,f1-score,support
0,PRT,0.944402,0.971026,0.957529,1277.0
1,USA,0.888889,0.945591,0.916364,533.0
2,NLD,0.874459,0.874459,0.874459,231.0
3,ESP,0.921933,0.876325,0.898551,283.0
4,DEU,0.923387,0.880769,0.901575,260.0
5,LUX,0.92284,0.882006,0.901961,339.0
6,GBR,0.909091,0.877193,0.892857,171.0
7,CAN,1.0,0.956522,0.977778,23.0
8,FIN,1.0,0.851852,0.92,27.0
9,CHE,0.854839,0.946429,0.898305,56.0


In [23]:
print("Classification Report:")
print(classification_report(y_test_1.to_list(), y_pred_test))

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.92      0.90      1277
           1       0.84      0.92      0.88       533
           2       0.88      0.83      0.85       231
           3       0.81      0.75      0.78       283
           4       0.90      0.85      0.87       260
           5       0.89      0.87      0.88       339
           6       0.87      0.88      0.88       171
           7       1.00      0.96      0.98        23
           8       0.96      0.85      0.90        27
           9       0.91      0.89      0.90        56
          10       1.00      0.91      0.95        32
          11       0.91      0.84      0.87       326
          12       0.90      0.90      0.90       201
          13       0.97      0.97      0.97       161
          14       1.00      0.86      0.93        37
          15       0.98      0.92      0.95        53
          16       1.00      0.85      0.92        13
    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Train Model Sector

In [87]:
clf_sec = XGBClassifier(random_state=42)
clf_sec.fit(X_train_2, y_train_2, early_stopping_rounds=10, 
        eval_set=[(X_test_2, y_test_2)])



[0]	validation_0-mlogloss:1.43437
[1]	validation_0-mlogloss:1.15196
[2]	validation_0-mlogloss:0.97022
[3]	validation_0-mlogloss:0.84379
[4]	validation_0-mlogloss:0.74967
[5]	validation_0-mlogloss:0.66872
[6]	validation_0-mlogloss:0.61003
[7]	validation_0-mlogloss:0.56571
[8]	validation_0-mlogloss:0.53247
[9]	validation_0-mlogloss:0.50431
[10]	validation_0-mlogloss:0.47527
[11]	validation_0-mlogloss:0.45277
[12]	validation_0-mlogloss:0.43440
[13]	validation_0-mlogloss:0.41798
[14]	validation_0-mlogloss:0.40610
[15]	validation_0-mlogloss:0.39121
[16]	validation_0-mlogloss:0.37845
[17]	validation_0-mlogloss:0.36800
[18]	validation_0-mlogloss:0.35908
[19]	validation_0-mlogloss:0.34939
[20]	validation_0-mlogloss:0.34076
[21]	validation_0-mlogloss:0.33403
[22]	validation_0-mlogloss:0.32478
[23]	validation_0-mlogloss:0.31599
[24]	validation_0-mlogloss:0.31005
[25]	validation_0-mlogloss:0.30261
[26]	validation_0-mlogloss:0.29446
[27]	validation_0-mlogloss:0.28954
[28]	validation_0-mlogloss:0.2

In [88]:
y_pred_test = clf_sec.predict(X_test_2)
y_pred_train = clf_sec.predict(X_train_2)

In [89]:
pred_test_str = map_numbers_to_categories(y_pred_test, inverted_mapping_sec)
pred_train_str = map_numbers_to_categories(y_pred_train, inverted_mapping_sec)

In [90]:
# Evaluate accuracy
accuracy_test = accuracy_score(y_test_2.to_list(), y_pred_test)
accuracy_train = accuracy_score(y_train_2.to_list(), y_pred_train)
print("Accuracy Test:", accuracy_test)
print("Accuracy Train:", accuracy_train)

Accuracy Test: 0.9431737797689225
Accuracy Train: 0.9669339202431484


In [91]:
inverted_mapping_sec_eval = {0: 'S122',
 1: 'S126',
 2: 'S11',
 3: 'S127',
 4: 'S1311',
 5: 'S125',
 6: 'S1312',
 7: 'S128',
 8: 'S124',
 9: 'S123',
 10: 'S121',
 11: 'S1313',
 13: 'S14'}

report = classification_report(y_test_2.to_list(), y_pred_test, output_dict=True)
df_report = pd.DataFrame(report).transpose()

df_report_subset = df_report.iloc[:13]
df_report_subset.index = df_report_subset.index = inverted_mapping_sec_eval.values()
df_report_subset.reset_index(drop=False, inplace=True)
df_combined = pd.concat([df_report_subset, df_report.iloc[13:]])
df_combined

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,index,precision,recall,f1-score,support
0,S122,0.960191,0.966346,0.963259,1248.0
1,S126,0.839286,0.706767,0.767347,133.0
2,S11,0.943671,0.979632,0.961315,1522.0
3,S127,0.812734,0.728188,0.768142,298.0
4,S1311,0.979228,0.993976,0.986547,332.0
5,S125,0.81746,0.746377,0.780303,138.0
6,S1312,1.0,1.0,1.0,2.0
7,S128,0.789474,0.75,0.769231,20.0
8,S124,1.0,0.992495,0.996234,533.0
9,S123,1.0,1.0,1.0,5.0


In [28]:
print("Classification Report:")
print(classification_report(y_test_2.to_list(), y_pred_test))

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1248
           1       0.85      0.71      0.78       133
           2       0.93      0.97      0.95      1522
           3       0.85      0.72      0.78       298
           4       0.95      0.99      0.97       332
           5       0.80      0.77      0.78       138
           6       0.67      1.00      0.80         2
           7       0.79      0.75      0.77        20
           8       0.98      0.97      0.97       533
           9       1.00      1.00      1.00         5
          10       0.75      0.75      0.75         4
          11       0.00      0.00      0.00         1
          13       1.00      0.60      0.75         5

    accuracy                           0.93      4241
   macro avg       0.81      0.78      0.79      4241
weighted avg       0.93      0.93      0.93      4241



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Adding More Features

In [29]:
new_features = ["TipoInformacao", "TipoInstrumento", "MaturidadeOriginal"]
df_to_get_dummies = df_clean[new_features]
dummies = pd.get_dummies(df_to_get_dummies)

new_df_to_train = pd.concat([embed_data, dummies], axis=1)

In [30]:
# Stratified split with 'TerritorioCon' as the stratification factor
#df_train, df_test = train_test_split(embed_data, test_size=0.2, random_state=42)
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(new_df_to_train, y1, test_size=0.1, random_state=41)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(new_df_to_train, y2, test_size=0.1, random_state=41)

# Train 

## Train the Model Territorio

In [31]:
clf_ter = XGBClassifier(random_state=42, max_depth=4)
clf_ter.fit(X_train_1, y_train_1, early_stopping_rounds=10, 
        eval_set=[(X_test_1, y_test_1)])



[0]	validation_0-mlogloss:2.08674
[1]	validation_0-mlogloss:3.26949
[2]	validation_0-mlogloss:1.97326
[3]	validation_0-mlogloss:1.73965
[4]	validation_0-mlogloss:1.57890
[5]	validation_0-mlogloss:1.45490
[6]	validation_0-mlogloss:1.34948
[7]	validation_0-mlogloss:1.26921
[8]	validation_0-mlogloss:1.19779
[9]	validation_0-mlogloss:1.14557
[10]	validation_0-mlogloss:1.09836
[11]	validation_0-mlogloss:1.06429
[12]	validation_0-mlogloss:1.02978
[13]	validation_0-mlogloss:0.99897
[14]	validation_0-mlogloss:0.97667
[15]	validation_0-mlogloss:0.95338
[16]	validation_0-mlogloss:0.92684
[17]	validation_0-mlogloss:0.89420
[18]	validation_0-mlogloss:0.86506
[19]	validation_0-mlogloss:0.84374
[20]	validation_0-mlogloss:0.81388
[21]	validation_0-mlogloss:0.78930
[22]	validation_0-mlogloss:0.77298
[23]	validation_0-mlogloss:0.75789
[24]	validation_0-mlogloss:0.74367
[25]	validation_0-mlogloss:0.73120
[26]	validation_0-mlogloss:0.72085
[27]	validation_0-mlogloss:0.70990
[28]	validation_0-mlogloss:0.6

## Evaluate 

In [32]:
y_pred_test = clf_ter.predict(X_test_1)
y_pred_train = clf_ter.predict(X_train_1)

pred_test_str = map_numbers_to_categories(y_pred_test, inverted_mapping_ter)
pred_train_str = map_numbers_to_categories(y_pred_train, inverted_mapping_ter)

In [34]:
# Evaluate accuracy
accuracy_test = accuracy_score(y_test_1.to_list(), y_pred_test)
accuracy_train = accuracy_score(y_train_1.to_list(), y_pred_train)
print("Accuracy Test:", accuracy_test)
print("Accuracy Train:", accuracy_train)

Accuracy Test: 0.9250176845083706
Accuracy Train: 0.9586542996384216


In [81]:
inverted_mapping_ter_eval = {0: 'PRT',
 1: 'USA',
 2: 'NLD',
 3: 'ESP',
 4: 'DEU',
 5: 'LUX',
 6: 'GBR',
 7: 'CAN',
 8: 'FIN',
 9: 'CHE',
 10: 'BEL',
 11: 'FRA',
 12: 'IRL',
 13: 'ITA',
 14: 'DNK',
 15: 'SWE',
 16: 'AUS',
 17: 'ISL',
 18: '4D',
 19: 'JPN',
 20: '4S',
 21: 'AUT',
 22: 'JEY',
 23: 'NOR',
 24: 'IMN',
 25: 'KOR',
 26: 'CZE',
 27: 'CYM',
 28: 'HUN',
 29: 'POL',
 30: '7M',
 31: 'NZL',
 32: '4C',
 33: 'MEX',
 36: 'GRC',
 38: 'HKG',
 39: 'IND',
 41: 'ISR',
 43: 'IDN',
 45: 'ROU',
 46: 'SVN',
 49: 'EGY',
 52: 'BRA',
 55: 'ZAF',
 59: 'MAR',
 61: '1E',
 62: 'BMU',
 63: 'SVK',
 64: 'LIE',
 66: 'LBR',
 67: 'CUW',
 69: '5D',
 75: 'HRV',
 81: '5F',
 82: 'ARG',
}

report = classification_report(y_test_1.to_list(), y_pred_test, output_dict=True)
df_report = pd.DataFrame(report).transpose()

lenght = len(df_report) - 2 

df_report_subset = df_report.iloc[:lenght]
df_report_subset.index = df_report_subset.index = inverted_mapping_ter_eval.values()
df_report_subset.reset_index(drop=False, inplace=True)
df_combined = pd.concat([df_report_subset, df_report.iloc[lenght-1:]])
df_combined

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,index,precision,recall,f1-score,support
0,PRT,0.574045,0.564605,0.569285,1277.0
1,USA,0.3125,0.065666,0.108527,533.0
2,NLD,0.031013,0.212121,0.054114,231.0
3,ESP,0.011236,0.010601,0.010909,283.0
4,DEU,0.181009,0.234615,0.204355,260.0
5,LUX,0.039683,0.014749,0.021505,339.0
6,GBR,0.0,0.0,0.0,171.0
7,CAN,0.0,0.0,0.0,23.0
8,FIN,0.0,0.0,0.0,27.0
9,CHE,0.0,0.0,0.0,56.0


In [35]:
print("Classification Report:")
print(classification_report(y_test_1.to_list(), y_pred_test))

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.97      0.96      1277
           1       0.89      0.95      0.92       533
           2       0.87      0.87      0.87       231
           3       0.92      0.88      0.90       283
           4       0.92      0.88      0.90       260
           5       0.92      0.88      0.90       339
           6       0.91      0.88      0.89       171
           7       1.00      0.96      0.98        23
           8       1.00      0.85      0.92        27
           9       0.85      0.95      0.90        56
          10       1.00      0.91      0.95        32
          11       0.91      0.92      0.91       326
          12       0.88      0.92      0.90       201
          13       0.99      0.99      0.99       161
          14       0.97      0.97      0.97        37
          15       0.98      0.94      0.96        53
          16       1.00      0.85      0.92        13
    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Train Model Sector

In [36]:
clf_sec = XGBClassifier(random_state=42)
clf_sec.fit(X_train_2, y_train_2, early_stopping_rounds=10, 
        eval_set=[(X_test_2, y_test_2)])



[0]	validation_0-mlogloss:1.43437
[1]	validation_0-mlogloss:1.15196
[2]	validation_0-mlogloss:0.97022
[3]	validation_0-mlogloss:0.84379
[4]	validation_0-mlogloss:0.74967
[5]	validation_0-mlogloss:0.66872
[6]	validation_0-mlogloss:0.61003
[7]	validation_0-mlogloss:0.56571
[8]	validation_0-mlogloss:0.53247
[9]	validation_0-mlogloss:0.50431
[10]	validation_0-mlogloss:0.47527
[11]	validation_0-mlogloss:0.45277
[12]	validation_0-mlogloss:0.43440
[13]	validation_0-mlogloss:0.41798
[14]	validation_0-mlogloss:0.40610
[15]	validation_0-mlogloss:0.39121
[16]	validation_0-mlogloss:0.37845
[17]	validation_0-mlogloss:0.36800
[18]	validation_0-mlogloss:0.35908
[19]	validation_0-mlogloss:0.34939
[20]	validation_0-mlogloss:0.34076
[21]	validation_0-mlogloss:0.33403
[22]	validation_0-mlogloss:0.32478
[23]	validation_0-mlogloss:0.31599
[24]	validation_0-mlogloss:0.31005
[25]	validation_0-mlogloss:0.30261
[26]	validation_0-mlogloss:0.29446
[27]	validation_0-mlogloss:0.28954
[28]	validation_0-mlogloss:0.2

In [37]:
y_pred_test = clf_sec.predict(X_test_2)
y_pred_train = clf_sec.predict(X_train_2)

In [38]:
pred_test_str = map_numbers_to_categories(y_pred_test, inverted_mapping_sec)
pred_train_str = map_numbers_to_categories(y_pred_train, inverted_mapping_sec)

In [39]:
# Evaluate accuracy
accuracy_test = accuracy_score(y_test_2.to_list(), y_pred_test)
accuracy_train = accuracy_score(y_train_2.to_list(), y_pred_train)
print("Accuracy Test:", accuracy_test)
print("Accuracy Train:", accuracy_train)

Accuracy Test: 0.9431737797689225
Accuracy Train: 0.9669339202431484


In [75]:
inverted_mapping_sec_eval = {0: 'S122',
 1: 'S126',
 2: 'S11',
 3: 'S127',
 4: 'S1311',
 5: 'S125',
 6: 'S1312',
 7: 'S128',
 8: 'S124',
 9: 'S123',
 10: 'S121',
 11: 'S1313',
 13: 'S14'}

report = classification_report(y_test_2.to_list(), y_pred_test, output_dict=True)
df_report = pd.DataFrame(report).transpose()

df_report_subset = df_report.iloc[:13]
df_report_subset.index = df_report_subset.index = inverted_mapping_sec_eval.values()
df_report_subset.reset_index(drop=False, inplace=True)
df_combined = pd.concat([df_report_subset, df_report.iloc[13:]])
df_combined

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,index,precision,recall,f1-score,support
0,S122,0.960191,0.966346,0.963259,1248.0
1,S126,0.839286,0.706767,0.767347,133.0
2,S11,0.943671,0.979632,0.961315,1522.0
3,S127,0.812734,0.728188,0.768142,298.0
4,S1311,0.979228,0.993976,0.986547,332.0
5,S125,0.81746,0.746377,0.780303,138.0
6,S1312,1.0,1.0,1.0,2.0
7,S128,0.789474,0.75,0.769231,20.0
8,S124,1.0,0.992495,0.996234,533.0
9,S123,1.0,1.0,1.0,5.0


In [40]:
print("Classification Report:")
print(classification_report(y_test_2.to_list(), y_pred_test))

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.96      1248
           1       0.84      0.71      0.77       133
           2       0.94      0.98      0.96      1522
           3       0.81      0.73      0.77       298
           4       0.98      0.99      0.99       332
           5       0.82      0.75      0.78       138
           6       1.00      1.00      1.00         2
           7       0.79      0.75      0.77        20
           8       1.00      0.99      1.00       533
           9       1.00      1.00      1.00         5
          10       1.00      1.00      1.00         4
          11       0.00      0.00      0.00         1
          13       1.00      0.80      0.89         5

    accuracy                           0.94      4241
   macro avg       0.86      0.82      0.84      4241
weighted avg       0.94      0.94      0.94      4241



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Analysing results that failed

- HRV
- 'IRN'
- '5D'
- 

In [131]:
df_clean[df_clean.TerritorioCon == 'GBR']

Unnamed: 0,CodEntidadeRef,TipoInformacao,TipoInstrumento,DescricaoInstrumento,MaturidadeOriginal,CodEntidadeCon,SetorInstitucionalCon,TerritorioCon,encoded_label_territorio,encoded_label_setor,tokenized_Descricao_text,avg_embedding
35,0011,A,F3_P,NWG Float 13/01/26,07,2138005O9XJIJN4JPN90,S122,GBR,6,0,"[nwg, float]","[0.106123015, 0.049006544, 0.039887793, -0.105..."
40,0011,A,F3_P,IMBLN 1.125 14/8/23,07,2138008L3B3MCG1DFS50,S125,GBR,6,5,[imbln],"[0.0007223332, 0.004438957, 0.007668053, 0.007..."
76,0011,A,F3_P,HSBC Float 24/09/26,08,F0HUI1NY1AZMJMD8LP67,S127,GBR,6,3,"[hsbc, float]","[0.18909998, 0.07456243, 0.009489806, -0.13687..."
81,0011,A,F3_P,"BATSL 2,375 19/01/23",08,21380041YBGOQDFAC823,S125,GBR,6,5,[batsl],"[0.006692107, 0.0085775135, -0.008735614, -0.0..."
82,0011,A,F3_P,BATSLN .875 13/10/23,08,21380041YBGOQDFAC823,S127,GBR,6,3,[batsln],"[0.0070364177, 0.004681741, 0.004979764, -0.00..."
...,...,...,...,...,...,...,...,...,...,...,...,...
41378,1503,P,F4,GBR11200007-22 Catherine 2% 20201215 20,07,_X,S122,GBR,6,0,"[gbr, catherine]","[0.018044705, 0.04555394, -0.014981359, -0.014..."
41379,1503,P,F4,GBR11200007-13 Catherine 2% 20190701 20,07,_X,S122,GBR,6,0,"[gbr, catherine]","[0.018044705, 0.04555394, -0.014981359, -0.014..."
41394,1503,P,F4,GBR11200005-30 Patrick 2% 20211224 2022,10,_X,S122,GBR,6,0,"[gbr, patrick]","[0.023524022, 0.050343737, -0.016882353, -0.01..."
41395,1503,P,F4,GBR11200006-30 Patrick 2% 20211224 2022,06,_X,S122,GBR,6,0,"[gbr, patrick]","[0.023524022, 0.050343737, -0.016882353, -0.01..."


In [130]:
#return_embeedings('AFDB 5.75 Perp')
return_embeedings('Carlyle Europ PatIII', word2vec_model, simple_preprocess, clf_ter, clf_sec, inverted_mapping_ter, inverted_mapping_sec)

(['USA'], ['S124'])

# Training with the whole data

Vamos treinar agora o modelo com os dados todos. São exatamente os mesmos passos aplicados aos dados todos

In [19]:
df = pd.read_excel('../../data/01.Dataset FI_06032024.xlsx', sheet_name=2)

In [20]:
df_clean = df.copy("../data_missing/sampled_indices.csv")

In [21]:
data_eval = df_clean.sample(1000, random_state=42)
not_sampled_mask = ~df_clean.index.isin(data_eval)  # Create a boolean mask for non-sampled rows
df_clean = df_clean[not_sampled_mask]


data_pred = df_clean.sample(1000, random_state=42)
not_sampled_mask = ~df_clean.index.isin(data_pred)
df_clean = df_clean[not_sampled_mask]

In [22]:
df_clean.to_csv("../data_train/data_train.csv")
data_eval.to_csv("../data_train/data_eval.csv")
data_pred.to_csv("../data_missing/data_pred.csv")

In [23]:
# Get unique categories from 'TerritorioCon' column
unique_categories_ter = df_clean['TerritorioCon'].unique()
category_mapping_ter = dict(zip(unique_categories_ter, range(len(unique_categories_ter))))
inverted_mapping_ter = {value: key for key, value in category_mapping_ter.items()}

unique_categories_sec = df_clean["SetorInstitucionalCon"].unique()
category_mapping_sec = dict(zip(unique_categories_sec, range(len(unique_categories_sec))))
inverted_mapping_sec = {value: key for key, value in category_mapping_sec.items()}

df_clean["encoded_label_territorio"] = df_clean["TerritorioCon"].apply(encode_target, args=[category_mapping_ter])
df_clean["encoded_label_setor"] = df_clean['SetorInstitucionalCon'].apply(encode_target, args=[category_mapping_sec])

In [24]:
df_clean['tokenized_Descricao_text'] = df_clean['DescricaoInstrumento'].apply(lambda x: simple_preprocess(x))
word2vec_model = Word2Vec(sentences=df_clean['tokenized_Descricao_text'], vector_size=100, window=5, min_count=1, workers=4)

In [25]:
df_clean['tokenized_Descricao_text'] = df_clean['DescricaoInstrumento'].apply(lambda x: simple_preprocess(x))
word2vec_model = Word2Vec(sentences=df_clean['tokenized_Descricao_text'], vector_size=100, window=5, min_count=1, workers=4)

df_clean['avg_embedding'] = df_clean['tokenized_Descricao_text'].apply(compute_avg_embedding)
X = df_clean['avg_embedding'].apply(pd.Series).to_numpy()
y1 = df_clean['encoded_label_territorio']
y2 = df_clean['encoded_label_setor']
embed_data = pd.DataFrame(X)

In [43]:
df_clean

Unnamed: 0,CodEntidadeRef,TipoInformacao,TipoInstrumento,DescricaoInstrumento,MaturidadeOriginal,CodEntidadeCon,SetorInstitucionalCon,TerritorioCon,encoded_label_territorio,encoded_label_setor,tokenized_Descricao_text,avg_embedding
0,0011,A,F22,BST Futuros 2.90%,01,549300URJH9VSI58CS32,S122,PRT,0,0,"[bst, futuros]","[0.16896594, 0.21082236, 0.2087562, 0.1661322,..."
1,0011,A,F22,BPI EUR 0.00%,01,3DM5DPGI3W6OU6GJ4N92,S122,PRT,0,0,"[bpi, eur]","[-0.36473963, 0.572219, 0.08754694, 0.00742135..."
2,0011,A,F22,BST Futuros 2.65%,01,549300URJH9VSI58CS32,S122,PRT,0,0,"[bst, futuros]","[0.16896594, 0.21082236, 0.2087562, 0.1661322,..."
3,0011,A,F22,BST EUR 3.15%,01,549300URJH9VSI58CS32,S122,PRT,0,0,"[bst, eur]","[-0.054259613, 0.39159432, 0.23001918, 0.19084..."
4,0011,A,F22,BST EUR 2.65%,01,549300URJH9VSI58CS32,S122,PRT,0,0,"[bst, eur]","[-0.054259613, 0.39159432, 0.23001918, 0.19084..."
...,...,...,...,...,...,...,...,...,...,...,...,...
42402,2039,P,F4,"Empréstimo Vic Management 0,01%",_Z,514925507,S11,PRT,0,2,"[empréstimo, vic, management]","[0.00058524497, 0.177311, -0.040029775, -0.009..."
42403,2040,P,F3_P,Outros Passivos,07,549300OZ46BRLZ8Y6F65,S122,BEL,10,0,"[outros, passivos]","[0.0058196564, 0.025449172, -0.01679747, -0.00..."
42404,2040,P,F4,"Empréstimo Vic One Pest Sup 0,01%",_Z,514925507,S11,PRT,0,2,"[empréstimo, vic, one, pest, sup]","[0.010988126, 0.13505054, -0.03513687, -0.0063..."
42405,2040,P,F4,"Empréstimo Vic Management 0,01%",_Z,514925507,S11,PRT,0,2,"[empréstimo, vic, management]","[0.00058524497, 0.177311, -0.040029775, -0.009..."


In [26]:
new_features = ["TipoInformacao", "TipoInstrumento", "MaturidadeOriginal"]
df_to_get_dummies = df_clean[new_features]
dummies = pd.get_dummies(df_to_get_dummies)

new_df_to_train = pd.concat([embed_data, dummies], axis=1)

In [27]:
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X, y1, test_size=0.2, random_state=42)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X, y2, test_size=0.2, random_state=42)

In [28]:
clf_ter = XGBClassifier(random_state=42, max_depth=5)
clf_ter.fit(X, y1)

In [29]:
clf_sec = XGBClassifier(random_state=42, max_depth=5)
clf_sec.fit(X, y2)

In [30]:
y_pred_test = clf_ter.predict(X)
pred_test_str = map_numbers_to_categories(y_pred_test, inverted_mapping_ter)

# Evaluate accuracy
accuracy_test = accuracy_score(y1.to_list(), y_pred_test)
print("Accuracy Ter:", accuracy_test)

Accuracy Ter: 0.96856651024595


In [31]:
print("Classification Report:")
print(classification_report(y1.to_list(), y_pred_test))

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     13009
           1       0.96      0.97      0.96      5416
           2       0.95      0.91      0.93      2275
           3       0.98      0.92      0.95      2870
           4       0.95      0.95      0.95      2521
           5       0.98      0.96      0.97      3344
           6       0.95      0.93      0.94      1671
           7       1.00      0.96      0.98       263
           8       1.00      0.98      0.99       217
           9       0.93      1.00      0.96       551
          10       1.00      1.00      1.00       333
          11       0.98      0.98      0.98      3195
          12       0.99      0.97      0.98      2044
          13       0.99      0.99      0.99      1730
          14       0.98      0.99      0.99       359
          15       0.99      0.99      0.99       400
          16       1.00      0.92      0.96       136
    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
y_pred_test = clf_sec.predict(X)
pred_test_str = map_numbers_to_categories(y_pred_test, inverted_mapping_sec)

# Evaluate accuracy
accuracy_test = accuracy_score(y2.to_list(), y_pred_test)
print("Accuracy Sec:", accuracy_test)

Accuracy Sec: 0.9508335887943028


In [33]:
print("Classification Report:")
print(classification_report(y2.to_list(), y_pred_test))

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97     12695
           1       0.88      0.76      0.81      1337
           2       0.95      0.98      0.97     15151
           3       0.86      0.75      0.80      2857
           4       0.95      1.00      0.98      3303
           5       0.85      0.79      0.82      1400
           6       1.00      1.00      1.00        60
           7       0.86      0.94      0.89       264
           8       0.99      0.99      0.99      5171
           9       0.91      1.00      0.95        63
          10       0.89      0.91      0.90        45
          11       1.00      1.00      1.00         7
          12       1.00      1.00      1.00         2
          13       0.88      0.96      0.92        52

    accuracy                           0.95     42407
   macro avg       0.93      0.93      0.93     42407
weighted avg       0.95      0.95      0.95     42407



In [34]:
y_pred_test = clf_ter.predict_proba(X)

In [35]:
y_pred_test = clf_sec.predict_proba(X)

In [36]:
y_pred_test.max(axis=1).min()

0.25756827

In [37]:
clf_ter.predict_proba(X[:1]).max()

0.9999988

In [38]:
!ls ../final_data

df_final.csv


In [95]:
pred_with_prob = pd.read_csv("../final_data/df_final.csv")

pred_with_prob[pred_with_prob["ter_probabilidade"] < 0.5]

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,CodEntidadeRef,TipoInformacao,TipoInstrumento,DescricaoInstrumento,MaturidadeOriginal,CodEntidadeCon,SetorInstitucionalCon,TerritorioCon,sec_pred,ter_pred,sec_probabilidade,ter_probabilidade
18,18,12378,605,A,F3_P,"CS 2,125% 13/10/26",07,ANGGYXNX0JLX3X63JN86,S126,CHE,S122,CHE,0.671,0.439
22,22,17713,1021,A,F3_P,CS 1% 06/2027,08,549300506SI9CRFV9Z86,S122,CHE,S122,CHE,0.671,0.439
42,42,4506,153,A,F3_P,DTRGR Float 06/2023,06,724500G15MVL6UCWXF95,S125,NLD,S127,NLD,0.868,0.287
55,55,9273,440,A,F511,LA FRANCAISE DJXSAEM,_Z,969500R4CLSQFTYYI535,S11,FRA,S124,FRA,0.315,0.472
65,65,7475,276,A,F3_P,BUNDESSCHATZANWEIS 2.2% 12/12/2024,07,529900AQBND3S6YJLY83,S1311,DEU,S1311,USA,0.495,0.434
83,83,27792,1460,A,F511,AXA-UAP,_Z,F5WCUMTUM4RKZ1MAIE39,S126,FRA,S128,FRA,0.868,0.435
104,104,13415,647,A,F3_P,HEIBOS 3.248 PERP,08,549300TJR3PR8EXILG79,S11,SWE,S11,SWE,0.861,0.467
137,137,42325,1994,P,F4,Empréstimo - PNI PORTUGAL,07,980662796,S127,LUX,S122,LUX,0.949,0.454
161,161,41628,1620,P,F4,"Emprestimo Joao F Marques, S.A. N/Remun.",07,_X,S122,PRT,S11,PRT,0.789,0.478
190,190,37478,1775,A,F22,Fornecedores,01,_X,S122,PRT,S122,PRT,0.978,0.483


## Mais Features no dataset final e avaliar performance

In [41]:
df_clean = df.copy("../data_missing/sampled_indices.csv")

In [42]:
# Get unique categories from 'TerritorioCon' column
unique_categories_ter = df_clean['TerritorioCon'].unique()
category_mapping_ter = dict(zip(unique_categories_ter, range(len(unique_categories_ter))))
inverted_mapping_ter = {value: key for key, value in category_mapping_ter.items()}

unique_categories_sec = df_clean["SetorInstitucionalCon"].unique()
category_mapping_sec = dict(zip(unique_categories_sec, range(len(unique_categories_sec))))
inverted_mapping_sec = {value: key for key, value in category_mapping_sec.items()}

df_clean["encoded_label_territorio"] = df_clean["TerritorioCon"].apply(encode_target, args=[category_mapping_ter])
df_clean["encoded_label_setor"] = df_clean['SetorInstitucionalCon'].apply(encode_target, args=[category_mapping_sec])

In [43]:
df_clean['tokenized_Descricao_text'] = df_clean['DescricaoInstrumento'].apply(lambda x: simple_preprocess(x))
word2vec_model = Word2Vec(sentences=df_clean['tokenized_Descricao_text'], vector_size=100, window=5, min_count=1, workers=4)

In [48]:
df_clean['avg_embedding'] = df_clean['tokenized_Descricao_text'].apply(compute_avg_embedding)
X = df_clean['avg_embedding'].apply(pd.Series).to_numpy()
y1 = df_clean['encoded_label_territorio']
y2 = df_clean['encoded_label_setor']
embed_data = pd.DataFrame(X)

new_features = ["TipoInformacao", "TipoInstrumento", "MaturidadeOriginal"]
df_to_get_dummies = df_clean[new_features]
dummies = pd.get_dummies(df_to_get_dummies)

new_df_to_train = pd.concat([embed_data, dummies], axis=1)

In [46]:
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(new_df_to_train, y1, test_size=0.2, random_state=42)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(new_df_to_train, y2, test_size=0.2, random_state=42)

In [50]:
clf_ter = XGBClassifier(random_state=42, max_depth=5)
clf_ter.fit(new_df_to_train, y1)

In [51]:
clf_sec = XGBClassifier(random_state=42, max_depth=5)
clf_sec.fit(new_df_to_train, y2)

In [52]:
y_pred_test = clf_ter.predict(new_df_to_train)
pred_test_str = map_numbers_to_categories(y_pred_test, inverted_mapping_ter)

# Evaluate accuracy
accuracy_test = accuracy_score(y1.to_list(), y_pred_test)
print("Accuracy Ter:", accuracy_test)

Accuracy Ter: 0.9794845190652487


In [53]:
y_pred_test = clf_sec.predict(new_df_to_train)
pred_test_str = map_numbers_to_categories(y_pred_test, inverted_mapping_sec)

# Evaluate accuracy
accuracy_test = accuracy_score(y2.to_list(), y_pred_test)
print("Accuracy Sec:", accuracy_test)

Accuracy Sec: 0.9589454571179287


## Fazer
- Adicionar mais features no dataset final e avaliar 
- Adicionar predict probability
- Rever diretório para onde data deve está 
- Rever o Read ME
- Analisar casos que falharam
- Organizar Notebook 
- Fazer fix da ci/cd
- Fazer o push da imagem do ci/cd
- colacar dados na s3 e criar logica para acessa dados da s3
- Fazer o push do modelo para um bucket e acessar modelo de lá para utilizar
- Criar a app no ECS 
- Escrever relarório -> 
    - Treinamos com mais features e melhorou
    - Mas não melhorou
    - Dizer que separamos em treino e teste mas no final treinamos com os dados todos
    - Sugestão de como se pode melhorar
    - Explicar que treinamos dois modelos
- Questions
    - How predict proba works
    - How the word2vec works 
    - How simple process works
    - How decision trees works
    - How XG Boost Works
    

Função que permite usar o modelo e retornar as previsões para novas entradas

In [39]:
def return_embeedings(string, word2vec_model, simple_preprocess, clf_t, clf_s, inverted_mapping_t, inverted_mapping_s):
    case = pd.DataFrame({'DescricaoInstrumento': [string]})
    case['tokenized_Descricao_text'] = case['DescricaoInstrumento'].apply(lambda x: simple_preprocess(x))
    case['avg_embedding'] = case['tokenized_Descricao_text'].apply(compute_avg_embedding)
    
    X = case['avg_embedding'].apply(pd.Series).to_numpy()
    
    prediction_t = clf_t.predict(X)
    str_pred_t = map_numbers_to_categories(prediction_t, inverted_mapping_t)
    
    prediction_s = clf_s.predict(X)
    str_pred_s = map_numbers_to_categories(prediction_s, inverted_mapping_s)
    
    prediction_s_prob = clf_sec.predict_proba(X).max()
    prediction_t_prob = clf_ter.predict_proba(X).max()
    
    return str_pred_t, str_pred_s #, prediction_t_prob, prediction_s_prob

In [100]:
str_to_try = 'HEATHROW FUNDING LTD 1.50% 12/10/2027'
str_pred_t, str_pred_s, prediction_s_prob, prediction_t_prob = return_embeedings(str_to_try, word2vec_model, simple_preprocess, clf_ter, clf_sec, inverted_mapping_ter, inverted_mapping_sec)

In [101]:
return_embeedings(str_to_try, word2vec_model, simple_preprocess, clf_ter, clf_sec, inverted_mapping_ter, inverted_mapping_sec)

(['JEY'], ['S125'], 0.98384386, 0.8323205)

In [None]:
all_str_ter = []
all_str_sec = []

for i in range(len(sampled_indices)):

    string = sampled_indices.DescricaoInstrumento.iloc[i]
    str_pred_t, str_pred_s = return_embeedings(string, word2vec_model, simple_preprocess, clf_ter, clf_sec, inverted_mapping_ter, inverted_mapping_sec)
    
    all_str_ter.append(str_pred_t[0])
    all_str_sec.append(str_pred_s[0])
    
sampled_indices["sec_pred"] = all_str_sec  
sampled_indices["ter_pred"] = all_str_ter

## Gradio App

Criando um aplicativo gradio para demostrar o trabalho

In [41]:
def return_embeedings_(string):
    case = pd.DataFrame({'DescricaoInstrumento': [string]})
    case['tokenized_Descricao_text'] = case['DescricaoInstrumento'].apply(lambda x: simple_preprocess(x))
    case['avg_embedding'] = case['tokenized_Descricao_text'].apply(compute_avg_embedding)

    X = case['avg_embedding'].apply(pd.Series).to_numpy()

    try:
        prediction_t = clf_ter.predict(X)
        str_pred_t = map_numbers_to_categories(prediction_t, inverted_mapping_ter)

        prediction_s = clf_sec.predict(X)
        str_pred_s = map_numbers_to_categories(prediction_s, inverted_mapping_sec)
        prediction_s_prob = clf_sec.predict_proba(X).max()
        prediction_t_prob = clf_ter.predict_proba(X).max()

        return str(str_pred_t[0]), str(str_pred_s[0]) #, prediction_t_prob, prediction_s_prob
           
    except Exception as e:
        return "An error occurred: " + str(e)

In [42]:
examples = [
    ["HEATHROW FUNDING LTD 1.50% 12/10/2027"],
    ["FORTUM 1.625% A:27/02/2026"],
    ["BAC FLOAT 25/4/24"],
    ["DP DP 4M 0% 16/12/21CGD 0.00% 2020-12-16"],
]

# Create the Gradio app
iface = gr.Interface(
    # Argument 1: function (Required)
    fn=return_embeedings_,  # The function you want to expose as an interface

    # Argument 2: input components (Required)
    inputs=[gr.Textbox(lines=5, placeholder="Enter a text description")],  # Defines user input

    # Argument 3: examples (Optional)
    examples=examples,  # List of example text descriptions (or tuples with predictions)

    # Argument 4: output components (Required)
    outputs=[
        gr.Textbox(label="Prediction Territory"),
        gr.Textbox(label="Prediction Sector"),
        #gr.Textbox(label="Prediction Probability Territory"),
        #gr.Textbox(label="Prediction Probability Sector"),
    ],  # Defines how to display the output
)

# Launch the app
iface.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




## Saving and reading models

In [236]:
with open('model_data.pkl', 'wb') as f:
    pickle.dump(inverted_mapping_ter, f)
    pickle.dump(inverted_mapping_sec, f)
    pickle.dump(word2vec_model, f)
    pickle.dump(clf_sec, f)
    pickle.dump(clf_ter, f)

In [238]:
with open('model_data.pkl', 'rb') as f:
    # Load the data objects in the same order they were saved
    inverted_mapping_ter = pickle.load(f)
    inverted_mapping_sec = pickle.load(f)
    word2vec_model = pickle.load(f)
    clf_sec = pickle.load(f)
    clf_ter = pickle.load(f)