# Objective
- O objetivo é treinar um modelo utilizando a descrição do produto para prever o setor e o território
- Vamos utilizar Word2Vec para trasformar a descrição em vetores (embeddings)
- Vamos utilizar um modelo XG boost para treinar

# Data
- Dados fornecidos pelo banco de Portugal

# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from feature_engine.encoding import RareLabelEncoder
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import gradio as gr
import pickle

# Reading Data

In [2]:
df = pd.read_excel('.../.../data/01.Dataset FI_06032024.xlsx', sheet_name=2)

# Small Preprocessing

Mapeando o código ao nome da descrição

In [3]:
MAP_TipoInformacao = {"A": "ativo", "P": "passivo"}
MAP_TipoInstrumento = {"F21": "Numerário", "F22": "Depósitos transferíveis", "F29": "Outros depósitos", "F3_P": "Títulos de dívida", "F4": "Empréstimos", "F511": "Ações cotadas", "F512": "Ações não cotadas", "F519": "Outras participações", "F521": "Unidades de Participação emitidas por FMM", "F522": "Unidades de Participação emitidas por FI, excluindo FMM", "F71": "Derivados financeiros"}
MAP_MaturidadeOriginal = {"01": "A vista", "10": "Ate 1 ano", "06": "De 1 a 2 anos", "07": "De 2 a 5 anos", "08": "A mais de 5 anos", "_Z": "Não aplicável"}

In [4]:
df.TipoInformacao = df.TipoInformacao.map(MAP_TipoInformacao)
df.TipoInstrumento = df.TipoInstrumento.map(MAP_TipoInstrumento)
df.MaturidadeOriginal = df.MaturidadeOriginal.map(MAP_MaturidadeOriginal)

In [5]:
df.drop(["CodEntidadeRef", "CodEntidadeCon"], axis=1, inplace=True)

In [6]:
df.head(3)

Unnamed: 0,TipoInformacao,TipoInstrumento,DescricaoInstrumento,MaturidadeOriginal,SetorInstitucionalCon,TerritorioCon
0,ativo,Depósitos transferíveis,BST Futuros 2.90%,A vista,S122,PRT
1,ativo,Depósitos transferíveis,BPI EUR 0.00%,A vista,S122,PRT
2,ativo,Depósitos transferíveis,BST Futuros 2.65%,A vista,S122,PRT


# Feature Engineering

In [7]:
df_clean = df.copy()

### Encoding Rare Labels

### Label Enconder

Criando funções para fazer o enconding dos targets

In [8]:
def encode_target(label, category_mapping):
  # Check if label is unseen (not in the dictionary)
  if label not in category_mapping:
    # Assign next available integer as seen in training data
    new_value = len(category_mapping)
    category_mapping[label] = new_value
  
  return category_mapping[label]

In [9]:
def map_numbers_to_categories(numbers, category_mapping):
    """Maps numbers back to their corresponding category names using a provided mapping dictionary.

    Args:
        numbers: A list or array containing the numerical representations of categories.
        category_mapping: A dictionary mapping category names (keys) to their numerical representations (values).

    Returns:
        A list containing the corresponding category names for the input numbers.
    """

    category_names = [category_mapping.get(number, None) for number in numbers]
    return category_names

In [10]:
def return_map(df_clean):

    territory_map = {}
    sector_map = {}

    # Iterate through each row (assuming TerritorioCon and encoded_label_territorio are in the same order)
    for territorio, encoded_label in zip(df_clean["TerritorioCon"], df_clean["encoded_label_territorio"]):
      # Add the mapping to the dictionary if the TerritorioCon is not already present
      if territorio not in territory_map:
        territory_map[territorio] = encoded_label
    
    for sector, encoded_label_sector in zip(df_clean["SetorInstitucionalCon"], df_clean["encoded_label_setor"]):
      # Add the mapping to the dictionary if the TerritorioCon is not already present
      if sector not in sector_map:
        sector_map[sector] = encoded_label_sector

    return territory_map, sector_map

Aplicando funções para mapear os targets a códigos pra podermos treinar o modelo

In [11]:
# Get unique categories from 'TerritorioCon' column
unique_categories_ter = df_clean['TerritorioCon'].unique()
category_mapping_ter = dict(zip(unique_categories_ter, range(len(unique_categories_ter))))
inverted_mapping_ter = {value: key for key, value in category_mapping_ter.items()}

unique_categories_sec = df_clean["SetorInstitucionalCon"].unique()
category_mapping_sec = dict(zip(unique_categories_sec, range(len(unique_categories_sec))))
inverted_mapping_sec = {value: key for key, value in category_mapping_sec.items()}

df_clean["encoded_label_territorio"] = df_clean["TerritorioCon"].apply(encode_target, args=[category_mapping_ter])
df_clean["encoded_label_setor"] = df_clean['SetorInstitucionalCon'].apply(encode_target, args=[category_mapping_sec])

In [12]:
df_clean.head(3)

Unnamed: 0,TipoInformacao,TipoInstrumento,DescricaoInstrumento,MaturidadeOriginal,SetorInstitucionalCon,TerritorioCon,encoded_label_territorio,encoded_label_setor
0,ativo,Depósitos transferíveis,BST Futuros 2.90%,A vista,S122,PRT,0,0
1,ativo,Depósitos transferíveis,BPI EUR 0.00%,A vista,S122,PRT,0,0
2,ativo,Depósitos transferíveis,BST Futuros 2.65%,A vista,S122,PRT,0,0


### Processing Description Column

Vamos agora:
1. Utilizar a função simple_preprocess para aplicarmos tecnicas de Text Mining para limpar a descrição
2. Treinando o modelo Word2Vec para processar coluna com a descrição para vetores (Embeddings)

In [13]:
df_clean['tokenized_Descricao_text'] = df_clean['DescricaoInstrumento'].apply(lambda x: simple_preprocess(x))
word2vec_model = Word2Vec(sentences=df_clean['tokenized_Descricao_text'], vector_size=100, window=5, min_count=1, workers=4)

In [14]:
df_clean

Unnamed: 0,TipoInformacao,TipoInstrumento,DescricaoInstrumento,MaturidadeOriginal,SetorInstitucionalCon,TerritorioCon,encoded_label_territorio,encoded_label_setor,tokenized_Descricao_text
0,ativo,Depósitos transferíveis,BST Futuros 2.90%,A vista,S122,PRT,0,0,"[bst, futuros]"
1,ativo,Depósitos transferíveis,BPI EUR 0.00%,A vista,S122,PRT,0,0,"[bpi, eur]"
2,ativo,Depósitos transferíveis,BST Futuros 2.65%,A vista,S122,PRT,0,0,"[bst, futuros]"
3,ativo,Depósitos transferíveis,BST EUR 3.15%,A vista,S122,PRT,0,0,"[bst, eur]"
4,ativo,Depósitos transferíveis,BST EUR 2.65%,A vista,S122,PRT,0,0,"[bst, eur]"
...,...,...,...,...,...,...,...,...,...
42402,passivo,Empréstimos,"Empréstimo Vic Management 0,01%",Não aplicável,S11,PRT,0,2,"[empréstimo, vic, management]"
42403,passivo,Títulos de dívida,Outros Passivos,De 2 a 5 anos,S122,BEL,10,0,"[outros, passivos]"
42404,passivo,Empréstimos,"Empréstimo Vic One Pest Sup 0,01%",Não aplicável,S11,PRT,0,2,"[empréstimo, vic, one, pest, sup]"
42405,passivo,Empréstimos,"Empréstimo Vic Management 0,01%",Não aplicável,S11,PRT,0,2,"[empréstimo, vic, management]"


In [15]:
def compute_avg_embedding(tokens, unknown_embedding=[0]*word2vec_model.vector_size):
    embeddings = [word2vec_model.wv[token] for token in tokens if token in word2vec_model.wv]
    if embeddings:  # Embeddings found
        return np.array(embeddings).mean(axis=0)  # Return average embedding as a NumPy array
    else:  # No embeddings found
        return np.array(unknown_embedding)

In [16]:
df_clean['avg_embedding'] = df_clean['tokenized_Descricao_text'].apply(compute_avg_embedding)
X = df_clean['avg_embedding'].apply(pd.Series).to_numpy()
y1 = df_clean['encoded_label_territorio']
y2 = df_clean['encoded_label_setor']
embed_data = pd.DataFrame(X)

In [17]:
df_clean.head(3)

Unnamed: 0,TipoInformacao,TipoInstrumento,DescricaoInstrumento,MaturidadeOriginal,SetorInstitucionalCon,TerritorioCon,encoded_label_territorio,encoded_label_setor,tokenized_Descricao_text,avg_embedding
0,ativo,Depósitos transferíveis,BST Futuros 2.90%,A vista,S122,PRT,0,0,"[bst, futuros]","[0.1639617, 0.24717936, 0.20080124, 0.06980523..."
1,ativo,Depósitos transferíveis,BPI EUR 0.00%,A vista,S122,PRT,0,0,"[bpi, eur]","[-0.29855, 0.55359936, -0.02400001, -0.1402287..."
2,ativo,Depósitos transferíveis,BST Futuros 2.65%,A vista,S122,PRT,0,0,"[bst, futuros]","[0.1639617, 0.24717936, 0.20080124, 0.06980523..."


### Train Test Split

Separando os dados em treino e test. Vamos ter dois targets. Um para o setor e outro para o território

In [18]:
# Stratified split with 'TerritorioCon' as the stratification factor
#df_train, df_test = train_test_split(embed_data, test_size=0.2, random_state=42)
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X, y1, test_size=0.2, random_state=42)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X, y2, test_size=0.2, random_state=42)

# Train 

## Train the Model Territorio

In [20]:
clf_ter = XGBClassifier(random_state=42, max_depth=4)
clf_ter.fit(X_train_1, y_train_1, early_stopping_rounds=10, 
        eval_set=[(X_test_1, y_test_1)])

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81], got [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 43 45 46 47 48 49
 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73
 74 75 76 77 78 79 80 81 82 83]

## Evaluate 

In [32]:
y_pred_test = clf_ter.predict(X_test_1)
y_pred_train = clf_ter.predict(X_train_1)

NotFittedError: need to call fit or load_model beforehand

In [179]:
pred_test_str = map_numbers_to_categories(y_pred_test, inverted_mapping_ter)
pred_train_str = map_numbers_to_categories(y_pred_train, inverted_mapping_ter)

In [180]:
# Evaluate accuracy
accuracy_test = accuracy_score(y_test_1.to_list(), y_pred_test)
accuracy_train = accuracy_score(y_train_1.to_list(), y_pred_train)
print("Accuracy Test:", accuracy_test)
print("Accuracy Train:", accuracy_train)

Accuracy Test: 0.9145248762084414
Accuracy Train: 0.9526308032424465


## Train Model Sector

In [181]:
clf_sec = XGBClassifier(random_state=42)
clf_sec.fit(X_train_2, y_train_2, early_stopping_rounds=10, 
        eval_set=[(X_test_2, y_test_2)])



[0]	validation_0-mlogloss:1.50263
[1]	validation_0-mlogloss:1.23458
[2]	validation_0-mlogloss:1.05783
[3]	validation_0-mlogloss:0.93583
[4]	validation_0-mlogloss:0.84430
[5]	validation_0-mlogloss:0.77642
[6]	validation_0-mlogloss:0.71586
[7]	validation_0-mlogloss:0.67003
[8]	validation_0-mlogloss:0.63406
[9]	validation_0-mlogloss:0.60550
[10]	validation_0-mlogloss:0.58064
[11]	validation_0-mlogloss:0.55888
[12]	validation_0-mlogloss:0.53581
[13]	validation_0-mlogloss:0.51696
[14]	validation_0-mlogloss:0.50353
[15]	validation_0-mlogloss:0.48938
[16]	validation_0-mlogloss:0.47473
[17]	validation_0-mlogloss:0.46345
[18]	validation_0-mlogloss:0.45012
[19]	validation_0-mlogloss:0.44078
[20]	validation_0-mlogloss:0.43278
[21]	validation_0-mlogloss:0.42497
[22]	validation_0-mlogloss:0.41673
[23]	validation_0-mlogloss:0.40798
[24]	validation_0-mlogloss:0.40134
[25]	validation_0-mlogloss:0.39490
[26]	validation_0-mlogloss:0.38824
[27]	validation_0-mlogloss:0.38157
[28]	validation_0-mlogloss:0.3

In [182]:
y_pred_test = clf_sec.predict(X_test_2)
y_pred_train = clf_sec.predict(X_train_2)

In [183]:
pred_test_str = map_numbers_to_categories(y_pred_test, inverted_mapping_sec)
pred_train_str = map_numbers_to_categories(y_pred_train, inverted_mapping_sec)

In [184]:
# Evaluate accuracy
accuracy_test = accuracy_score(y_test_2.to_list(), y_pred_test)
accuracy_train = accuracy_score(y_train_2.to_list(), y_pred_train)
print("Accuracy Test:", accuracy_test)
print("Accuracy Train:", accuracy_train)

Accuracy Test: 0.9278472058476774
Accuracy Train: 0.9620928518791452


# Training with the whole data

Vamos treinar agora o modelo com os dados todos. São exatamente os mesmos passos aplicados aos dados todos

In [18]:
df = pd.read_excel('../../data/01.Dataset FI_06032024.xlsx', sheet_name=2)

In [19]:
df_clean = df.copy("../data_missing/sampled_indices.csv")

In [20]:
data_eval = df_clean.sample(1000, random_state=42)
not_sampled_mask = ~df_clean.index.isin(data_eval)  # Create a boolean mask for non-sampled rows
df_clean = df_clean[not_sampled_mask]


data_pred = df_clean.sample(1000, random_state=42)
not_sampled_mask = ~df_clean.index.isin(data_pred)
df_clean = df_clean[not_sampled_mask]

In [21]:
df_clean.to_csv("../data_train/data_train.csv")
data_eval.to_csv("../data_train/data_eval.csv")
data_pred.to_csv("../data_missing/data_pred.csv")

In [22]:
# Get unique categories from 'TerritorioCon' column
unique_categories_ter = df_clean['TerritorioCon'].unique()
category_mapping_ter = dict(zip(unique_categories_ter, range(len(unique_categories_ter))))
inverted_mapping_ter = {value: key for key, value in category_mapping_ter.items()}

unique_categories_sec = df_clean["SetorInstitucionalCon"].unique()
category_mapping_sec = dict(zip(unique_categories_sec, range(len(unique_categories_sec))))
inverted_mapping_sec = {value: key for key, value in category_mapping_sec.items()}

df_clean["encoded_label_territorio"] = df_clean["TerritorioCon"].apply(encode_target, args=[category_mapping_ter])
df_clean["encoded_label_setor"] = df_clean['SetorInstitucionalCon'].apply(encode_target, args=[category_mapping_sec])

In [23]:
df_clean['tokenized_Descricao_text'] = df_clean['DescricaoInstrumento'].apply(lambda x: simple_preprocess(x))
word2vec_model = Word2Vec(sentences=df_clean['tokenized_Descricao_text'], vector_size=100, window=5, min_count=1, workers=4)

In [24]:
df_clean['tokenized_Descricao_text'] = df_clean['DescricaoInstrumento'].apply(lambda x: simple_preprocess(x))
word2vec_model = Word2Vec(sentences=df_clean['tokenized_Descricao_text'], vector_size=100, window=5, min_count=1, workers=4)

df_clean['avg_embedding'] = df_clean['tokenized_Descricao_text'].apply(compute_avg_embedding)
X = df_clean['avg_embedding'].apply(pd.Series).to_numpy()
y1 = df_clean['encoded_label_territorio']
y2 = df_clean['encoded_label_setor']
embed_data = pd.DataFrame(X)

In [25]:
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X, y1, test_size=0.2, random_state=42)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X, y2, test_size=0.2, random_state=42)

In [26]:
clf_ter = XGBClassifier(random_state=42, max_depth=5)
clf_ter.fit(X, y1)

In [27]:
clf_sec = XGBClassifier(random_state=42, max_depth=5)
clf_sec.fit(X, y2)

Função que permite usar o modelo e retornar as previsões para novas entradas

In [28]:
def return_embeedings(string, word2vec_model,simple_preprocess, clf_t, clf_s, inverted_mapping_t, inverted_mapping_s):
    case = pd.DataFrame({'DescricaoInstrumento': [string]})
    case['tokenized_Descricao_text'] = case['DescricaoInstrumento'].apply(lambda x: simple_preprocess(x))
    case['avg_embedding'] = case['tokenized_Descricao_text'].apply(compute_avg_embedding)
    
    X = case['avg_embedding'].apply(pd.Series).to_numpy()
    
    prediction_t = clf_t.predict(X)
    str_pred_t = map_numbers_to_categories(prediction_t, inverted_mapping_t)
    
    prediction_s = clf_s.predict(X)
    str_pred_s = map_numbers_to_categories(prediction_s, inverted_mapping_s)
    
    return str_pred_t, str_pred_s

In [29]:
str_to_try = 'HEATHROW FUNDING LTD 1.50% 12/10/2027'
str_pred_t, str_pred_s = return_embeedings(str_to_try, word2vec_model, simple_preprocess, clf_ter, clf_sec, inverted_mapping_ter, inverted_mapping_sec)

In [30]:
all_str_ter = []
all_str_sec = []

for i in range(len(sampled_indices)):

    string = sampled_indices.DescricaoInstrumento.iloc[i]
    str_pred_t, str_pred_s = return_embeedings(string, word2vec_model, simple_preprocess, clf_ter, clf_sec, inverted_mapping_ter, inverted_mapping_sec)
    
    all_str_ter.append(str_pred_t[0])
    all_str_sec.append(str_pred_s[0])
    
sampled_indices["sec_pred"] = all_str_sec  
sampled_indices["ter_pred"] = all_str_ter

NameError: name 'sampled_indices' is not defined

## Gradio App

Criando um aplicativo gradio para demostrar o trabalho

In [31]:
def return_embeedings_(string):
    case = pd.DataFrame({'DescricaoInstrumento': [string]})
    case['tokenized_Descricao_text'] = case['DescricaoInstrumento'].apply(lambda x: simple_preprocess(x))
    case['avg_embedding'] = case['tokenized_Descricao_text'].apply(compute_avg_embedding)

    X = case['avg_embedding'].apply(pd.Series).to_numpy()

    try:
        prediction_t = clf_ter.predict(X)
        str_pred_t = map_numbers_to_categories(prediction_t, inverted_mapping_ter)

        prediction_s = clf_sec.predict(X)
        str_pred_s = map_numbers_to_categories(prediction_s, inverted_mapping_sec)

        return str(str_pred_t[0]), str(str_pred_s[0])
           
    except Exception as e:
        return "An error occurred: " + str(e)

    
examples = [
    ["HEATHROW FUNDING LTD 1.50% 12/10/2027"],
    ["FORTUM 1.625% A:27/02/2026"],
    ["BAC FLOAT 25/4/24"],
    ["DP DP 4M 0% 16/12/21CGD 0.00% 2020-12-16"],
]

# Create the Gradio app
iface = gr.Interface(
    # Argument 1: function (Required)
    fn=return_embeedings_,  # The function you want to expose as an interface

    # Argument 2: input components (Required)
    inputs=[gr.Textbox(lines=5, placeholder="Enter a text description")],  # Defines user input

    # Argument 3: examples (Optional)
    examples=examples,  # List of example text descriptions (or tuples with predictions)

    # Argument 4: output components (Required)
    outputs=[
        gr.Textbox(label="Prediction Territory"),
        gr.Textbox(label="Prediction Sector"),
    ],  # Defines how to display the output
)

# Launch the app
iface.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




## Saving and reading models

In [236]:
with open('model_data.pkl', 'wb') as f:
    pickle.dump(inverted_mapping_ter, f)
    pickle.dump(inverted_mapping_sec, f)
    pickle.dump(word2vec_model, f)
    pickle.dump(clf_sec, f)
    pickle.dump(clf_ter, f)

In [238]:
with open('model_data.pkl', 'rb') as f:
    # Load the data objects in the same order they were saved
    inverted_mapping_ter = pickle.load(f)
    inverted_mapping_sec = pickle.load(f)
    word2vec_model = pickle.load(f)
    clf_sec = pickle.load(f)
    clf_ter = pickle.load(f)

In [19]:
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

In [22]:
import lazypredict
print(lazypredict.__file__)


/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/lazypredict/__init__.py


# Para Território

## Com todos os modelos do LazyPredict (demora bastante tempo a correr)

In [19]:
from lazypredict.Supervised import LazyClassifier

# Para o alvo y1 (Território)
clf1 = LazyClassifier(verbose=1, ignore_warnings=True, custom_metric=None)
models1, predictions1 = clf1.fit(X_train_1, X_test_1, y_train_1, y_test_1)

# Imprimindo os resultados
print(models1)

  3%|█▌                                          | 1/29 [00:19<09:11, 19.69s/it]

{'Model': 'AdaBoostClassifier', 'Accuracy': 0.35251120018863474, 'Balanced Accuracy': 0.0268234764111602, 'ROC AUC': None, 'F1 Score': 0.278617708871989, 'Time taken': 19.6911518573761}


  7%|███                                         | 2/29 [00:56<13:22, 29.71s/it]

{'Model': 'BaggingClassifier', 'Accuracy': 0.9300872435746286, 'Balanced Accuracy': 0.7523550069446296, 'ROC AUC': None, 'F1 Score': 0.9290613345004053, 'Time taken': 36.72753620147705}


 10%|████▌                                       | 3/29 [00:56<07:02, 16.26s/it]

{'Model': 'BernoulliNB', 'Accuracy': 0.26786135345437395, 'Balanced Accuracy': 0.1632958569299563, 'ROC AUC': None, 'F1 Score': 0.2961910188486273, 'Time taken': 0.24762296676635742}


 14%|█████                                | 4/29 [3:27:01<33:58:04, 4891.36s/it]

{'Model': 'CalibratedClassifierCV', 'Accuracy': 0.6504362178731431, 'Balanced Accuracy': 0.27948088256122094, 'ROC AUC': None, 'F1 Score': 0.6372969996468181, 'Time taken': 12364.893654823303}


 21%|███████▋                             | 6/29 [3:27:10<13:12:25, 2067.19s/it]

{'Model': 'DecisionTreeClassifier', 'Accuracy': 0.9198302287196416, 'Balanced Accuracy': 0.7508325865428557, 'ROC AUC': None, 'F1 Score': 0.9196343802470783, 'Time taken': 8.780245065689087}


 24%|█████████▏                            | 7/29 [3:27:10<8:30:11, 1391.45s/it]

{'Model': 'DummyClassifier', 'Accuracy': 0.3042914406979486, 'Balanced Accuracy': 0.015151515151515152, 'ROC AUC': None, 'F1 Score': 0.1419825017520393, 'Time taken': 0.21392297744750977}


 28%|██████████▊                            | 8/29 [3:27:11<5:32:00, 948.59s/it]

{'Model': 'ExtraTreeClassifier', 'Accuracy': 0.9197123319971705, 'Balanced Accuracy': 0.7571853349040921, 'ROC AUC': None, 'F1 Score': 0.9196216113412397, 'Time taken': 0.3332328796386719}


 31%|████████████                           | 9/29 [3:27:20<3:38:21, 655.08s/it]

{'Model': 'ExtraTreesClassifier', 'Accuracy': 0.9360999764206555, 'Balanced Accuracy': 0.751976042509009, 'ROC AUC': None, 'F1 Score': 0.9349282259345835, 'Time taken': 9.701366901397705}


 34%|█████████████                         | 10/29 [3:27:21<2:23:28, 453.07s/it]

{'Model': 'GaussianNB', 'Accuracy': 0.26302758783305824, 'Balanced Accuracy': 0.28543423520665606, 'ROC AUC': None, 'F1 Score': 0.3202062826651322, 'Time taken': 0.7526810169219971}


 38%|██████████████▍                       | 11/29 [3:27:23<1:34:29, 314.96s/it]

{'Model': 'KNeighborsClassifier', 'Accuracy': 0.8591134166470172, 'Balanced Accuracy': 0.5441268485449108, 'ROC AUC': None, 'F1 Score': 0.8556784711607648, 'Time taken': 1.8072340488433838}


 41%|███████████████▋                      | 12/29 [3:29:22<1:12:20, 255.31s/it]

{'Model': 'LabelPropagation', 'Accuracy': 0.9047394482433389, 'Balanced Accuracy': 0.6386554358425286, 'ROC AUC': None, 'F1 Score': 0.9037869956419379, 'Time taken': 118.85804510116577}


 45%|█████████████████                     | 13/29 [3:34:06<1:10:26, 264.13s/it]

{'Model': 'LabelSpreading', 'Accuracy': 0.906036312190521, 'Balanced Accuracy': 0.6502793067853108, 'ROC AUC': None, 'F1 Score': 0.9049220416326311, 'Time taken': 284.3925271034241}


 48%|███████████████████▎                    | 14/29 [3:34:07<46:10, 184.71s/it]

{'Model': 'LinearDiscriminantAnalysis', 'Accuracy': 0.5419712331997171, 'Balanced Accuracy': 0.47867486894766953, 'ROC AUC': None, 'F1 Score': 0.5726339624415133, 'Time taken': 1.2031760215759277}


 52%|████████████████████▋                   | 15/29 [3:40:50<58:25, 250.37s/it]

{'Model': 'LinearSVC', 'Accuracy': 0.6460740391417119, 'Balanced Accuracy': 0.31778464895825526, 'ROC AUC': None, 'F1 Score': 0.631302411283932, 'Time taken': 402.5159397125244}


 55%|██████████████████████                  | 16/29 [3:41:03<38:47, 179.06s/it]

{'Model': 'LogisticRegression', 'Accuracy': 0.5892478189106343, 'Balanced Accuracy': 0.1239867737896965, 'ROC AUC': None, 'F1 Score': 0.5677802029993607, 'Time taken': 13.458243131637573}


 59%|███████████████████████▍                | 17/29 [3:41:04<25:04, 125.36s/it]

{'Model': 'NearestCentroid', 'Accuracy': 0.21869842018391888, 'Balanced Accuracy': 0.2419855895392723, 'ROC AUC': None, 'F1 Score': 0.2619928611389457, 'Time taken': 0.4790041446685791}


 66%|██████████████████████████▊              | 19/29 [3:41:21<11:05, 66.54s/it]

{'Model': 'PassiveAggressiveClassifier', 'Accuracy': 0.44376326338127803, 'Balanced Accuracy': 0.18159320149061609, 'ROC AUC': None, 'F1 Score': 0.4766871299894172, 'Time taken': 17.062029123306274}


 69%|████████████████████████████▎            | 20/29 [3:41:35<07:35, 50.62s/it]

{'Model': 'Perceptron', 'Accuracy': 0.46604574392831877, 'Balanced Accuracy': 0.1133539281606637, 'ROC AUC': None, 'F1 Score': 0.5006629131098871, 'Time taken': 13.495278120040894}


 76%|███████████████████████████████          | 22/29 [3:42:32<04:54, 42.02s/it]

{'Model': 'RandomForestClassifier', 'Accuracy': 0.9357462862532422, 'Balanced Accuracy': 0.7525711398100707, 'ROC AUC': None, 'F1 Score': 0.9345114676733026, 'Time taken': 57.017979860305786}


 79%|████████████████████████████████▌        | 23/29 [3:42:33<02:57, 29.52s/it]

{'Model': 'RidgeClassifier', 'Accuracy': 0.5892478189106343, 'Balanced Accuracy': 0.10394018777611945, 'ROC AUC': None, 'F1 Score': 0.5543152857099285, 'Time taken': 0.3618648052215576}


 83%|█████████████████████████████████▉       | 24/29 [3:42:34<01:44, 20.93s/it]

{'Model': 'RidgeClassifierCV', 'Accuracy': 0.5885404385758076, 'Balanced Accuracy': 0.10388092334171413, 'ROC AUC': None, 'F1 Score': 0.5538148499612631, 'Time taken': 0.8667972087860107}


 86%|███████████████████████████████████▎     | 25/29 [3:42:59<01:29, 22.35s/it]

{'Model': 'SGDClassifier', 'Accuracy': 0.5833529827870785, 'Balanced Accuracy': 0.10730906575140614, 'ROC AUC': None, 'F1 Score': 0.5509461984186109, 'Time taken': 25.670183181762695}


 97%|███████████████████████████████████████▌ | 28/29 [3:46:40<00:44, 44.05s/it]

{'Model': 'SVC', 'Accuracy': 0.5931384107521811, 'Balanced Accuracy': 0.12014100141387896, 'ROC AUC': None, 'F1 Score': 0.5814103398371849, 'Time taken': 220.3044719696045}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019607 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25500
[LightGBM] [Info] Number of data points in the train set: 33925, number of used features: 100
[LightGBM] [Info] Start training from score -1.179658
[LightGBM] [Info] Start training from score -2.059740
[LightGBM] [Info] Start training from score -2.940820
[LightGBM] [Info] Start training from score -2.692984
[LightGBM] [Info] Start training from score -2.811693
[LightGBM] [Info] Start training from score -2.537962
[LightGBM] [Info] Start training from score -3.226272
[LightGBM] [Info] Start training from score -5.065931
[LightGBM] [Info] Start training from score -5.179634
[LightGBM] [Info] Start training from score -4.354





















































































100%|████████████████████████████████████████| 29/29 [3:47:03<00:00, 469.78s/it]

{'Model': 'LGBMClassifier', 'Accuracy': 0.2043150200424428, 'Balanced Accuracy': 0.03439448876024127, 'ROC AUC': None, 'F1 Score': 0.2361021541416898, 'Time taken': 23.30639886856079}
                             Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                        
ExtraTreeClassifier              0.92               0.76    None      0.92   
RandomForestClassifier           0.94               0.75    None      0.93   
BaggingClassifier                0.93               0.75    None      0.93   
ExtraTreesClassifier             0.94               0.75    None      0.93   
DecisionTreeClassifier           0.92               0.75    None      0.92   
LabelSpreading                   0.91               0.65    None      0.90   
LabelPropagation                 0.90               0.64    None      0.90   
KNeighborsClassifier             0.86               0.54    None      0.86   
LinearDiscriminantAnalysis       0.5




# Para Setor

## Com todos os modelos do LazyPredict (demora bastante tempo a correr)

In [20]:
# Para o alvo y2 (Setor)
clf2 = LazyClassifier(verbose=1, ignore_warnings=True, custom_metric=None)
models2, predictions2 = clf2.fit(X_train_2, X_test_2, y_train_2, y_test_2)

# Imprimindo os resultados
print(models2)

  3%|█▌                                          | 1/29 [00:19<09:04, 19.45s/it]

{'Model': 'AdaBoostClassifier', 'Accuracy': 0.4939872671539731, 'Balanced Accuracy': 0.1956349858504998, 'ROC AUC': None, 'F1 Score': 0.44737609211056467, 'Time taken': 19.44986629486084}


  7%|███                                         | 2/29 [00:56<13:32, 30.08s/it]

{'Model': 'BaggingClassifier', 'Accuracy': 0.9245460976184862, 'Balanced Accuracy': 0.8100128572097406, 'ROC AUC': None, 'F1 Score': 0.9227623147619941, 'Time taken': 37.52697134017944}


 10%|████▌                                       | 3/29 [00:57<07:08, 16.48s/it]

{'Model': 'BernoulliNB', 'Accuracy': 0.35769865597736383, 'Balanced Accuracy': 0.3040823430038554, 'ROC AUC': None, 'F1 Score': 0.3624993235805775, 'Time taken': 0.29103803634643555}


 17%|███████                                  | 5/29 [10:43<1:01:44, 154.37s/it]

{'Model': 'CalibratedClassifierCV', 'Accuracy': 0.7403914171186041, 'Balanced Accuracy': 0.3357533221802938, 'ROC AUC': None, 'F1 Score': 0.7150090180670796, 'Time taken': 586.0332260131836}


 21%|████████▉                                  | 6/29 [10:48<39:46, 103.76s/it]

{'Model': 'DecisionTreeClassifier', 'Accuracy': 0.9145248762084414, 'Balanced Accuracy': 0.8108589493994859, 'ROC AUC': None, 'F1 Score': 0.9131291339641875, 'Time taken': 5.52090311050415}


 24%|██████████▌                                 | 7/29 [10:49<25:38, 69.91s/it]

{'Model': 'DummyClassifier', 'Accuracy': 0.3604102805941995, 'Balanced Accuracy': 0.07692307692307693, 'ROC AUC': None, 'F1 Score': 0.19096528776782526, 'Time taken': 0.22387099266052246}


 28%|████████████▏                               | 8/29 [10:49<16:42, 47.75s/it]

{'Model': 'ExtraTreeClassifier', 'Accuracy': 0.9099269040320679, 'Balanced Accuracy': 0.8099062024747035, 'ROC AUC': None, 'F1 Score': 0.9090126509533644, 'Time taken': 0.285053014755249}


 31%|█████████████▋                              | 9/29 [10:57<11:44, 35.21s/it]

{'Model': 'ExtraTreesClassifier', 'Accuracy': 0.9309125206319264, 'Balanced Accuracy': 0.8152248372437197, 'ROC AUC': None, 'F1 Score': 0.9286543942751116, 'Time taken': 7.631222248077393}


 34%|██████████████▊                            | 10/29 [10:57<07:44, 24.45s/it]

{'Model': 'GaussianNB', 'Accuracy': 0.3114831407686866, 'Balanced Accuracy': 0.3748595574627573, 'ROC AUC': None, 'F1 Score': 0.3880595480560701, 'Time taken': 0.3516547679901123}


 38%|████████████████▎                          | 11/29 [10:58<05:11, 17.30s/it]

{'Model': 'KNeighborsClassifier', 'Accuracy': 0.8852864890356048, 'Balanced Accuracy': 0.6915662001468559, 'ROC AUC': None, 'F1 Score': 0.8815981010892494, 'Time taken': 1.1008329391479492}


 41%|█████████████████▊                         | 12/29 [11:53<08:06, 28.63s/it]

{'Model': 'LabelPropagation', 'Accuracy': 0.9096911105871257, 'Balanced Accuracy': 0.6803556800244892, 'ROC AUC': None, 'F1 Score': 0.9066188352618834, 'Time taken': 54.52848196029663}


 45%|███████████████████▎                       | 13/29 [13:44<14:17, 53.59s/it]

{'Model': 'LabelSpreading', 'Accuracy': 0.9085121433624145, 'Balanced Accuracy': 0.6697302777671165, 'ROC AUC': None, 'F1 Score': 0.9051376591066889, 'Time taken': 111.028400182724}


 48%|████████████████████▊                      | 14/29 [13:45<09:26, 37.76s/it]

{'Model': 'LinearDiscriminantAnalysis', 'Accuracy': 0.6948832822447536, 'Balanced Accuracy': 0.47966031431771344, 'ROC AUC': None, 'F1 Score': 0.687893899214805, 'Time taken': 1.1657559871673584}


 52%|██████████████████████▏                    | 15/29 [15:44<14:31, 62.27s/it]

{'Model': 'LinearSVC', 'Accuracy': 0.7394482433388352, 'Balanced Accuracy': 0.3134675407177593, 'ROC AUC': None, 'F1 Score': 0.7077844952582105, 'Time taken': 119.0770959854126}


 59%|█████████████████████████▏                 | 17/29 [15:46<06:10, 30.89s/it]

{'Model': 'LogisticRegression', 'Accuracy': 0.7329639236029238, 'Balanced Accuracy': 0.3124216686552606, 'ROC AUC': None, 'F1 Score': 0.706161946315509, 'Time taken': 1.877218246459961}
{'Model': 'NearestCentroid', 'Accuracy': 0.3255128507427493, 'Balanced Accuracy': 0.24763516063047936, 'ROC AUC': None, 'F1 Score': 0.40277429284677135, 'Time taken': 0.17740082740783691}


 66%|████████████████████████████▏              | 19/29 [15:48<02:37, 15.78s/it]

{'Model': 'PassiveAggressiveClassifier', 'Accuracy': 0.6355812308417826, 'Balanced Accuracy': 0.2927460408181386, 'ROC AUC': None, 'F1 Score': 0.6387528347201098, 'Time taken': 2.1202077865600586}


 69%|█████████████████████████████▋             | 20/29 [15:50<01:45, 11.69s/it]

{'Model': 'Perceptron', 'Accuracy': 0.6671775524640415, 'Balanced Accuracy': 0.2834377468211233, 'ROC AUC': None, 'F1 Score': 0.6518414715128109, 'Time taken': 2.1467409133911133}


 72%|███████████████████████████████▏           | 21/29 [15:51<01:06,  8.33s/it]

{'Model': 'QuadraticDiscriminantAnalysis', 'Accuracy': 0.0, 'Balanced Accuracy': 0.0, 'ROC AUC': None, 'F1 Score': 0.0, 'Time taken': 0.4846200942993164}


 76%|████████████████████████████████▌          | 22/29 [16:15<01:30, 12.93s/it]

{'Model': 'RandomForestClassifier', 'Accuracy': 0.9296156566847442, 'Balanced Accuracy': 0.8166392941385817, 'ROC AUC': None, 'F1 Score': 0.927704092171685, 'Time taken': 23.647711038589478}


 79%|██████████████████████████████████         | 23/29 [16:15<00:54,  9.11s/it]

{'Model': 'RidgeClassifier', 'Accuracy': 0.7136288611176609, 'Balanced Accuracy': 0.2548716510830189, 'ROC AUC': None, 'F1 Score': 0.6798305286774137, 'Time taken': 0.21512293815612793}


 83%|███████████████████████████████████▌       | 24/29 [16:15<00:32,  6.53s/it]

{'Model': 'RidgeClassifierCV', 'Accuracy': 0.7181089365715633, 'Balanced Accuracy': 0.26004326309764675, 'ROC AUC': None, 'F1 Score': 0.6848631945388212, 'Time taken': 0.5064058303833008}


 86%|█████████████████████████████████████      | 25/29 [16:20<00:24,  6.10s/it]

{'Model': 'SGDClassifier', 'Accuracy': 0.726008016977128, 'Balanced Accuracy': 0.30228719902406676, 'ROC AUC': None, 'F1 Score': 0.699049788664409, 'Time taken': 5.1030120849609375}


 90%|██████████████████████████████████████▌    | 26/29 [18:23<02:03, 41.16s/it]

{'Model': 'SVC', 'Accuracy': 0.71881631690639, 'Balanced Accuracy': 0.2880630993043255, 'ROC AUC': None, 'F1 Score': 0.6849309469762966, 'Time taken': 122.9482741355896}


 97%|█████████████████████████████████████████▌ | 28/29 [18:35<00:24, 24.97s/it]

{'Model': 'XGBClassifier', 'Accuracy': 0.9265503419004951, 'Balanced Accuracy': 0.8252409054623389, 'ROC AUC': None, 'F1 Score': 0.9244908232917787, 'Time taken': 12.186024904251099}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009880 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25500
[LightGBM] [Info] Number of data points in the train set: 33925, number of used features: 100
[LightGBM] [Info] Start training from score -1.202942
[LightGBM] [Info] Start training from score -3.473459
[LightGBM] [Info] Start training from score -1.031443
[LightGBM] [Info] Start training from score -2.698662
[LightGBM] [Info] Start training from score -2.548461
[LightGBM] [Info] Start training from score -3.408149
[LightGBM] [Info] Start training from score -6.560706
[LightGBM] [Info] Start training from score -5.042836
[LightGBM] [Info] Start training from score -2.103698
[LightGBM] [Info] Start training from sc





100%|███████████████████████████████████████████| 29/29 [18:44<00:00, 38.77s/it]

{'Model': 'LGBMClassifier', 'Accuracy': 0.671775524640415, 'Balanced Accuracy': 0.36613165208573667, 'ROC AUC': None, 'F1 Score': 0.6661687874954584, 'Time taken': 8.32017993927002}
                               Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                          
XGBClassifier                      0.93               0.83    None      0.92   
RandomForestClassifier             0.93               0.82    None      0.93   
ExtraTreesClassifier               0.93               0.82    None      0.93   
DecisionTreeClassifier             0.91               0.81    None      0.91   
BaggingClassifier                  0.92               0.81    None      0.92   
ExtraTreeClassifier                0.91               0.81    None      0.91   
KNeighborsClassifier               0.89               0.69    None      0.88   
LabelPropagation                   0.91               0.68    None      0.91   
LabelSpreading    


