In [9]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, make_scorer, precision_score
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import itertools
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.metrics import classification_report, precision_recall_fscore_support

In [3]:
data = pd.read_csv("Dados\data.csv")
df = data[["idade","genero","estado_civil", "int_renda", "bloco"]]

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1581 entries, 0 to 1580
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   idade         1581 non-null   int64 
 1   genero        1581 non-null   object
 2   estado_civil  1581 non-null   object
 3   int_renda     1581 non-null   int64 
 4   bloco         1581 non-null   object
dtypes: int64(2), object(3)
memory usage: 61.9+ KB


In [5]:
df.head()

Unnamed: 0,idade,genero,estado_civil,int_renda,bloco
0,25,Feminino,Solteiro (a),3,Tchanzinho Zona Norte
1,32,Feminino,Solteiro (a),5,Tchanzinho Zona Norte
2,18,Feminino,Solteiro (a),6,Tchanzinho Zona Norte
3,40,Feminino,Casado (a),7,Tchanzinho Zona Norte
4,47,Masculino,Solteiro (a),6,Tchanzinho Zona Norte


## Clustering - retirei

# ML

In [6]:
#preparar dados para treinamento

#definir target
features = df.drop('bloco', axis=1)
target = df['bloco']

## Dummy model

In [7]:
#dummy_target
# Calculate the normalized frequency of each value in bloco
freq = df['bloco'].value_counts(normalize=True)

# Create a list of indices with the same length as bloco
indices = np.arange(len(df))

# Create a Series with random choices based on probabilities
np.random.seed(12345)
bloco_chosen = pd.Series(np.random.choice(freq.index, size=len(df), p=freq.values), index=indices)

#comparar chances
print('Probabilidades aleatórias:')
print(bloco_chosen.value_counts(normalize=True))
print()
print('Distribuição dos blocos:')
print(df['bloco'].value_counts(normalize=True))

Probabilidades aleatórias:
Baianas Ozadas                 0.339658
MONOBLOCO                      0.167615
Entao Brilha                   0.093612
Outros                         0.084756
Bloco Despedida de Carnaval    0.071474
Juventude Bronzeada            0.057559
Quando come se lambuza         0.055028
Bloco Angola Janga             0.050601
Tchanzinho Zona Norte          0.031626
Bloco Havayanas Usadas         0.030361
Batiza                         0.017710
dtype: float64

Distribuição dos blocos:
Baianas Ozadas                 0.331436
MONOBLOCO                      0.169513
Outros                         0.093612
Entao Brilha                   0.091714
Bloco Despedida de Carnaval    0.072739
Juventude Bronzeada            0.060089
Quando come se lambuza         0.053763
Bloco Angola Janga             0.038583
Bloco Havayanas Usadas         0.035421
Tchanzinho Zona Norte          0.030361
Batiza                         0.022770
Name: bloco, dtype: float64


Chances proximas mas diferentes.

In [8]:
#calcular acurácia do modelo aleatório
# calculate accuracy
accuracy = (bloco_chosen == df['bloco']).mean()

# print accuracy
print("Random accuracy: {:.3f}".format(accuracy))

Random accuracy: 0.168


A chance de acertar no bloco aleatoriamente para cada folião é de 0.17 .

Se quisessemos acertar com alta probabilidade, poderíamos colocar todos os valores como iguais ao bloco mais frequente (Baianas Ozadas). Nesse case teríamos um probabilidade **High Chance de 0.33** .

## Logistic Regression

In [13]:

#transformar colunas com OHE
features_ohe = pd.get_dummies(features, drop_first=True)

iterations = 5

#fazer scaling
num_cols = ['idade']
scaler = StandardScaler()
features_ohe[num_cols] = scaler.fit_transform(features_ohe[num_cols])

# definir o tamanho do bloco se houver apenas três deles
sample_size = int(len(features_ohe)/iterations)

scores = []
train_scores = []
macro_f1_scores = []

for i in range(0, len(features_ohe), sample_size):
    #condição paera quando o último bloco fica para lá da length do df    
    if i+sample_size >= len(features_ohe):
        valid_indexes = list(range(i , len(features_ohe)))
#        train_indexes = list(range(0,i)) + list(range(len(features_ohe), len(features_ohe)))
        train_indexes = list(range(0,i)) + list(range(i+sample_size, len(features_ohe)))
    else:
        valid_indexes = list(range(i , i+sample_size))
        train_indexes = list(range(0,i)) + list(range(i+sample_size, len(features_ohe)))
    
	# Dividir as características das variáveis e o objetivo em amostras
    features_train = features_ohe.iloc[train_indexes]
    target_train = target.iloc[train_indexes]
    features_valid = features_ohe.iloc[valid_indexes]
    target_valid = target.iloc[valid_indexes]
    
    #treinar e avaliar a qualidade do modelo (logistic regression para multiclasses)
    model = LogisticRegression(multi_class='multinomial', solver='newton-cg', class_weight='balanced', random_state=42)
    model = model.fit(features_train, target_train)
    score = model.score(features_valid,target_valid)
    train_score = model.score(features_train,target_train)
    
    scores.append(score)
    train_scores.append(train_score)
    
    target_predicted = model.predict(features_valid)
    report = classification_report(target_valid, target_predicted, output_dict=True)
    macro_f1 = precision_recall_fscore_support(target_valid, target_predicted)[2].mean()
    macro_f1_scores.append(macro_f1)

final_score = sum(scores) / len(scores)
final_train_score = sum(train_scores) / len(train_scores)
macro_f1_score = sum(macro_f1_scores) / len(macro_f1_scores)

print('Acurácia da Regressão Logística:', final_score)
print('Acurácia do treino:', final_train_score)
print('Macro F1 score:', macro_f1_score )

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Acurácia da Regressão Logística: 0.2510548523206751
Acurácia do treino: 0.12744700722136057
Macro F1 score: 0.2026979211242579


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Decision Tree

In [14]:
#transformar colunas com OHE
features_ohe = pd.get_dummies(features, drop_first=True)

#fazer scaling
num_cols = ['idade']
scaler = StandardScaler()
features_ohe[num_cols] = scaler.fit_transform(features_ohe[num_cols])

# definir o tamanho do bloco se houver apenas três deles
sample_size = int(len(features_ohe)/5)

# Define the hyperparameter grid
param_grid = {'max_depth': range(1, 11),
              'min_samples_split': range(2, 11),
              'min_samples_leaf': range(1, 6)}

# Create a decision tree classifier object
dt = DecisionTreeClassifier(random_state=42)

# Use grid search to find the best hyperparameters
grid_search = GridSearchCV(dt, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(features_ohe, target)

# Print the best score and the corresponding training accuracy score
best_score = grid_search.best_score_
best_train_score = grid_search.best_estimator_.score(features_ohe, target)
print('Best accuracy score:', best_score)
print('Corresponding training accuracy score:', best_train_score)

# Print the hyperparameters that gave the best score
best_params = grid_search.best_params_
print('Best hyperparameters:', best_params)

Best accuracy score: 0.32827536636984384
Corresponding training accuracy score: 0.33206831119544594
Best hyperparameters: {'max_depth': 1, 'min_samples_leaf': 1, 'min_samples_split': 2}


Acurácias:
- Random: 0.16
- High chance: 0.33
- LogReg: 0.25 (underfitted)
- DecisionTree: 0.33 (good fit)
(class_weight='balanced' piora o modelo de DecisionTree)

## RandomForestClassifier

In [15]:
# Transformar colunas com OHE
features_ohe = pd.get_dummies(features, drop_first=True)

# Fazer scaling
num_cols = ['idade']
scaler = StandardScaler()
features_ohe[num_cols] = scaler.fit_transform(features_ohe[num_cols])

# Split data
X_train, X_valid, y_train, y_valid = train_test_split(features_ohe, target, test_size=0.2, random_state=42)

# Definir hiperparametros a otimizar
n_estimators = [2,3,4,5,6,7,8,9]
max_depth = [3, 5, 7, 9, 13, None]
min_samples_split = [2, 3, 4, 6]
min_samples_leaf = [5, 6, 7, 8, 9]
class_weight = [True, False]

# Inicializar variaveis
best_score = 0
best_params = None

# Iterar as combinações de hiperparametros e treinar modelo
for n in n_estimators:
    for d in max_depth:
        for s in min_samples_split:
            for l in min_samples_leaf:
                for cw in class_weight: 
                    if cw:
                        cw_dict = "balanced"
                    else:
                        cw_dict = None
                    model = RandomForestClassifier(n_estimators=n, max_depth=d, min_samples_split=s, min_samples_leaf=l, class_weight=cw_dict, random_state=42)
                    model.fit(X_train, y_train)
                    score = model.score(X_valid, y_valid)
                    train_score = model.score(X_train, y_train)
                    print("n_estimators:", n, ", max_depth:", d, ", min_samples_split:", s, ", min_samples_leaf:", l, ", class_weight:", cw_dict, ", accuracy:", score, ", train accuracy:", train_score)
                    if score > best_score:
                        best_score = score
                        best_params = {"n_estimators": n, "max_depth": d, "min_samples_split": s, "min_samples_leaf": l, "class_weight": cw_dict}

# modelo final com melhores hiperparametros
model = RandomForestClassifier(n_estimators=best_params["n_estimators"], max_depth=best_params["max_depth"], min_samples_split=best_params["min_samples_split"], min_samples_leaf=best_params["min_samples_leaf"], class_weight=best_params["class_weight"], random_state=42)
model.fit(X_train, y_train)
final_score = model.score(X_valid, y_valid)
final_train_score = model.score(X_train, y_train)

print('Best Hyperparameters:', best_params)
print('Acurácia do Random Forest:', final_score)
print('Acurácia do treino:', final_train_score)


n_estimators: 2 , max_depth: 3 , min_samples_split: 2 , min_samples_leaf: 5 , class_weight: balanced , accuracy: 0.12618296529968454 , train accuracy: 0.11471518987341772
n_estimators: 2 , max_depth: 3 , min_samples_split: 2 , min_samples_leaf: 5 , class_weight: None , accuracy: 0.3470031545741325 , train accuracy: 0.32674050632911394
n_estimators: 2 , max_depth: 3 , min_samples_split: 2 , min_samples_leaf: 6 , class_weight: balanced , accuracy: 0.12618296529968454 , train accuracy: 0.11471518987341772
n_estimators: 2 , max_depth: 3 , min_samples_split: 2 , min_samples_leaf: 6 , class_weight: None , accuracy: 0.3470031545741325 , train accuracy: 0.32674050632911394
n_estimators: 2 , max_depth: 3 , min_samples_split: 2 , min_samples_leaf: 7 , class_weight: balanced , accuracy: 0.12618296529968454 , train accuracy: 0.11471518987341772
n_estimators: 2 , max_depth: 3 , min_samples_split: 2 , min_samples_leaf: 7 , class_weight: None , accuracy: 0.3470031545741325 , train accuracy: 0.3267405

n_estimators: 2 , max_depth: 5 , min_samples_split: 3 , min_samples_leaf: 8 , class_weight: balanced , accuracy: 0.11356466876971609 , train accuracy: 0.10680379746835443
n_estimators: 2 , max_depth: 5 , min_samples_split: 3 , min_samples_leaf: 8 , class_weight: None , accuracy: 0.3438485804416404 , train accuracy: 0.33069620253164556
n_estimators: 2 , max_depth: 5 , min_samples_split: 3 , min_samples_leaf: 9 , class_weight: balanced , accuracy: 0.11356466876971609 , train accuracy: 0.10680379746835443
n_estimators: 2 , max_depth: 5 , min_samples_split: 3 , min_samples_leaf: 9 , class_weight: None , accuracy: 0.3438485804416404 , train accuracy: 0.33069620253164556
n_estimators: 2 , max_depth: 5 , min_samples_split: 4 , min_samples_leaf: 5 , class_weight: balanced , accuracy: 0.12302839116719243 , train accuracy: 0.13132911392405064
n_estimators: 2 , max_depth: 5 , min_samples_split: 4 , min_samples_leaf: 5 , class_weight: None , accuracy: 0.3470031545741325 , train accuracy: 0.3370253

n_estimators: 2 , max_depth: 7 , min_samples_split: 6 , min_samples_leaf: 5 , class_weight: None , accuracy: 0.31545741324921134 , train accuracy: 0.34572784810126583
n_estimators: 2 , max_depth: 7 , min_samples_split: 6 , min_samples_leaf: 6 , class_weight: balanced , accuracy: 0.12302839116719243 , train accuracy: 0.16693037974683544
n_estimators: 2 , max_depth: 7 , min_samples_split: 6 , min_samples_leaf: 6 , class_weight: None , accuracy: 0.3217665615141956 , train accuracy: 0.34810126582278483
n_estimators: 2 , max_depth: 7 , min_samples_split: 6 , min_samples_leaf: 7 , class_weight: balanced , accuracy: 0.10725552050473186 , train accuracy: 0.1685126582278481
n_estimators: 2 , max_depth: 7 , min_samples_split: 6 , min_samples_leaf: 7 , class_weight: None , accuracy: 0.35331230283911674 , train accuracy: 0.3425632911392405
n_estimators: 2 , max_depth: 7 , min_samples_split: 6 , min_samples_leaf: 8 , class_weight: balanced , accuracy: 0.11987381703470032 , train accuracy: 0.1210443

n_estimators: 2 , max_depth: 13 , min_samples_split: 3 , min_samples_leaf: 8 , class_weight: balanced , accuracy: 0.11041009463722397 , train accuracy: 0.11708860759493671
n_estimators: 2 , max_depth: 13 , min_samples_split: 3 , min_samples_leaf: 8 , class_weight: None , accuracy: 0.3249211356466877 , train accuracy: 0.33544303797468356
n_estimators: 2 , max_depth: 13 , min_samples_split: 3 , min_samples_leaf: 9 , class_weight: balanced , accuracy: 0.10725552050473186 , train accuracy: 0.11787974683544304
n_estimators: 2 , max_depth: 13 , min_samples_split: 3 , min_samples_leaf: 9 , class_weight: None , accuracy: 0.31545741324921134 , train accuracy: 0.33781645569620256
n_estimators: 2 , max_depth: 13 , min_samples_split: 4 , min_samples_leaf: 5 , class_weight: balanced , accuracy: 0.12933753943217666 , train accuracy: 0.2468354430379747
n_estimators: 2 , max_depth: 13 , min_samples_split: 4 , min_samples_leaf: 5 , class_weight: None , accuracy: 0.2586750788643533 , train accuracy: 0.3

n_estimators: 2 , max_depth: None , min_samples_split: 6 , min_samples_leaf: 5 , class_weight: None , accuracy: 0.26813880126182965 , train accuracy: 0.37816455696202533
n_estimators: 2 , max_depth: None , min_samples_split: 6 , min_samples_leaf: 6 , class_weight: balanced , accuracy: 0.11356466876971609 , train accuracy: 0.17958860759493672
n_estimators: 2 , max_depth: None , min_samples_split: 6 , min_samples_leaf: 6 , class_weight: None , accuracy: 0.26498422712933756 , train accuracy: 0.370253164556962
n_estimators: 2 , max_depth: None , min_samples_split: 6 , min_samples_leaf: 7 , class_weight: balanced , accuracy: 0.12302839116719243 , train accuracy: 0.19936708860759494
n_estimators: 2 , max_depth: None , min_samples_split: 6 , min_samples_leaf: 7 , class_weight: None , accuracy: 0.29652996845425866 , train accuracy: 0.35205696202531644
n_estimators: 2 , max_depth: None , min_samples_split: 6 , min_samples_leaf: 8 , class_weight: balanced , accuracy: 0.11041009463722397 , train 

n_estimators: 3 , max_depth: 5 , min_samples_split: 2 , min_samples_leaf: 5 , class_weight: balanced , accuracy: 0.11041009463722397 , train accuracy: 0.12895569620253164
n_estimators: 3 , max_depth: 5 , min_samples_split: 2 , min_samples_leaf: 5 , class_weight: None , accuracy: 0.3501577287066246 , train accuracy: 0.34414556962025317
n_estimators: 3 , max_depth: 5 , min_samples_split: 2 , min_samples_leaf: 6 , class_weight: balanced , accuracy: 0.12302839116719243 , train accuracy: 0.14003164556962025
n_estimators: 3 , max_depth: 5 , min_samples_split: 2 , min_samples_leaf: 6 , class_weight: None , accuracy: 0.35962145110410093 , train accuracy: 0.34968354430379744
n_estimators: 3 , max_depth: 5 , min_samples_split: 2 , min_samples_leaf: 7 , class_weight: balanced , accuracy: 0.11356466876971609 , train accuracy: 0.14319620253164558
n_estimators: 3 , max_depth: 5 , min_samples_split: 2 , min_samples_leaf: 7 , class_weight: None , accuracy: 0.3501577287066246 , train accuracy: 0.333860

n_estimators: 3 , max_depth: 7 , min_samples_split: 4 , min_samples_leaf: 6 , class_weight: balanced , accuracy: 0.12618296529968454 , train accuracy: 0.1906645569620253
n_estimators: 3 , max_depth: 7 , min_samples_split: 4 , min_samples_leaf: 6 , class_weight: None , accuracy: 0.31545741324921134 , train accuracy: 0.36313291139240506
n_estimators: 3 , max_depth: 7 , min_samples_split: 4 , min_samples_leaf: 7 , class_weight: balanced , accuracy: 0.09779179810725552 , train accuracy: 0.16534810126582278
n_estimators: 3 , max_depth: 7 , min_samples_split: 4 , min_samples_leaf: 7 , class_weight: None , accuracy: 0.35646687697160884 , train accuracy: 0.34177215189873417
n_estimators: 3 , max_depth: 7 , min_samples_split: 4 , min_samples_leaf: 8 , class_weight: balanced , accuracy: 0.10725552050473186 , train accuracy: 0.14003164556962025
n_estimators: 3 , max_depth: 7 , min_samples_split: 4 , min_samples_leaf: 8 , class_weight: None , accuracy: 0.3312302839116719 , train accuracy: 0.347310

n_estimators: 3 , max_depth: 9 , min_samples_split: 6 , min_samples_leaf: 9 , class_weight: None , accuracy: 0.3217665615141956 , train accuracy: 0.3449367088607595
n_estimators: 3 , max_depth: 13 , min_samples_split: 2 , min_samples_leaf: 5 , class_weight: balanced , accuracy: 0.138801261829653 , train accuracy: 0.2634493670886076
n_estimators: 3 , max_depth: 13 , min_samples_split: 2 , min_samples_leaf: 5 , class_weight: None , accuracy: 0.28391167192429023 , train accuracy: 0.3837025316455696
n_estimators: 3 , max_depth: 13 , min_samples_split: 2 , min_samples_leaf: 6 , class_weight: balanced , accuracy: 0.11041009463722397 , train accuracy: 0.2064873417721519
n_estimators: 3 , max_depth: 13 , min_samples_split: 2 , min_samples_leaf: 6 , class_weight: None , accuracy: 0.2555205047318612 , train accuracy: 0.3837025316455696
n_estimators: 3 , max_depth: 13 , min_samples_split: 2 , min_samples_leaf: 7 , class_weight: balanced , accuracy: 0.11987381703470032 , train accuracy: 0.21598101

n_estimators: 3 , max_depth: None , min_samples_split: 3 , min_samples_leaf: 8 , class_weight: None , accuracy: 0.3438485804416404 , train accuracy: 0.3401898734177215
n_estimators: 3 , max_depth: None , min_samples_split: 3 , min_samples_leaf: 9 , class_weight: balanced , accuracy: 0.11356466876971609 , train accuracy: 0.17246835443037975
n_estimators: 3 , max_depth: None , min_samples_split: 3 , min_samples_leaf: 9 , class_weight: None , accuracy: 0.31230283911671924 , train accuracy: 0.3528481012658228
n_estimators: 3 , max_depth: None , min_samples_split: 4 , min_samples_leaf: 5 , class_weight: balanced , accuracy: 0.11356466876971609 , train accuracy: 0.2175632911392405
n_estimators: 3 , max_depth: None , min_samples_split: 4 , min_samples_leaf: 5 , class_weight: None , accuracy: 0.3028391167192429 , train accuracy: 0.3884493670886076
n_estimators: 3 , max_depth: None , min_samples_split: 4 , min_samples_leaf: 6 , class_weight: balanced , accuracy: 0.12302839116719243 , train accu

n_estimators: 4 , max_depth: 3 , min_samples_split: 6 , min_samples_leaf: 7 , class_weight: None , accuracy: 0.3438485804416404 , train accuracy: 0.32832278481012656
n_estimators: 4 , max_depth: 3 , min_samples_split: 6 , min_samples_leaf: 8 , class_weight: balanced , accuracy: 0.12933753943217666 , train accuracy: 0.11550632911392406
n_estimators: 4 , max_depth: 3 , min_samples_split: 6 , min_samples_leaf: 8 , class_weight: None , accuracy: 0.3438485804416404 , train accuracy: 0.32832278481012656
n_estimators: 4 , max_depth: 3 , min_samples_split: 6 , min_samples_leaf: 9 , class_weight: balanced , accuracy: 0.12933753943217666 , train accuracy: 0.11550632911392406
n_estimators: 4 , max_depth: 3 , min_samples_split: 6 , min_samples_leaf: 9 , class_weight: None , accuracy: 0.3438485804416404 , train accuracy: 0.32832278481012656
n_estimators: 4 , max_depth: 5 , min_samples_split: 2 , min_samples_leaf: 5 , class_weight: balanced , accuracy: 0.1167192429022082 , train accuracy: 0.15427215

n_estimators: 4 , max_depth: 7 , min_samples_split: 3 , min_samples_leaf: 8 , class_weight: balanced , accuracy: 0.11356466876971609 , train accuracy: 0.1518987341772152
n_estimators: 4 , max_depth: 7 , min_samples_split: 3 , min_samples_leaf: 8 , class_weight: None , accuracy: 0.3312302839116719 , train accuracy: 0.3512658227848101
n_estimators: 4 , max_depth: 7 , min_samples_split: 3 , min_samples_leaf: 9 , class_weight: balanced , accuracy: 0.12933753943217666 , train accuracy: 0.15427215189873417
n_estimators: 4 , max_depth: 7 , min_samples_split: 3 , min_samples_leaf: 9 , class_weight: None , accuracy: 0.334384858044164 , train accuracy: 0.35443037974683544
n_estimators: 4 , max_depth: 7 , min_samples_split: 4 , min_samples_leaf: 5 , class_weight: balanced , accuracy: 0.10725552050473186 , train accuracy: 0.185126582278481
n_estimators: 4 , max_depth: 7 , min_samples_split: 4 , min_samples_leaf: 5 , class_weight: None , accuracy: 0.3186119873817035 , train accuracy: 0.363132911392

n_estimators: 4 , max_depth: 9 , min_samples_split: 4 , min_samples_leaf: 7 , class_weight: None , accuracy: 0.334384858044164 , train accuracy: 0.375
n_estimators: 4 , max_depth: 9 , min_samples_split: 4 , min_samples_leaf: 8 , class_weight: balanced , accuracy: 0.12302839116719243 , train accuracy: 0.15822784810126583
n_estimators: 4 , max_depth: 9 , min_samples_split: 4 , min_samples_leaf: 8 , class_weight: None , accuracy: 0.334384858044164 , train accuracy: 0.35680379746835444
n_estimators: 4 , max_depth: 9 , min_samples_split: 4 , min_samples_leaf: 9 , class_weight: balanced , accuracy: 0.12618296529968454 , train accuracy: 0.16930379746835442
n_estimators: 4 , max_depth: 9 , min_samples_split: 4 , min_samples_leaf: 9 , class_weight: None , accuracy: 0.31545741324921134 , train accuracy: 0.3599683544303797
n_estimators: 4 , max_depth: 9 , min_samples_split: 6 , min_samples_leaf: 5 , class_weight: balanced , accuracy: 0.0914826498422713 , train accuracy: 0.2310126582278481
n_estim

n_estimators: 4 , max_depth: None , min_samples_split: 2 , min_samples_leaf: 7 , class_weight: balanced , accuracy: 0.11041009463722397 , train accuracy: 0.2199367088607595
n_estimators: 4 , max_depth: None , min_samples_split: 2 , min_samples_leaf: 7 , class_weight: None , accuracy: 0.2996845425867508 , train accuracy: 0.37341772151898733
n_estimators: 4 , max_depth: None , min_samples_split: 2 , min_samples_leaf: 8 , class_weight: balanced , accuracy: 0.09779179810725552 , train accuracy: 0.16693037974683544
n_estimators: 4 , max_depth: None , min_samples_split: 2 , min_samples_leaf: 8 , class_weight: None , accuracy: 0.3217665615141956 , train accuracy: 0.3528481012658228
n_estimators: 4 , max_depth: None , min_samples_split: 2 , min_samples_leaf: 9 , class_weight: balanced , accuracy: 0.10725552050473186 , train accuracy: 0.17009493670886075
n_estimators: 4 , max_depth: None , min_samples_split: 2 , min_samples_leaf: 9 , class_weight: None , accuracy: 0.3217665615141956 , train acc

n_estimators: 5 , max_depth: 3 , min_samples_split: 3 , min_samples_leaf: 8 , class_weight: balanced , accuracy: 0.11356466876971609 , train accuracy: 0.10838607594936708
n_estimators: 5 , max_depth: 3 , min_samples_split: 3 , min_samples_leaf: 8 , class_weight: None , accuracy: 0.3438485804416404 , train accuracy: 0.32832278481012656
n_estimators: 5 , max_depth: 3 , min_samples_split: 3 , min_samples_leaf: 9 , class_weight: balanced , accuracy: 0.11356466876971609 , train accuracy: 0.10838607594936708
n_estimators: 5 , max_depth: 3 , min_samples_split: 3 , min_samples_leaf: 9 , class_weight: None , accuracy: 0.3438485804416404 , train accuracy: 0.32832278481012656
n_estimators: 5 , max_depth: 3 , min_samples_split: 4 , min_samples_leaf: 5 , class_weight: balanced , accuracy: 0.0914826498422713 , train accuracy: 0.10284810126582279
n_estimators: 5 , max_depth: 3 , min_samples_split: 4 , min_samples_leaf: 5 , class_weight: None , accuracy: 0.3438485804416404 , train accuracy: 0.32832278

n_estimators: 5 , max_depth: 7 , min_samples_split: 2 , min_samples_leaf: 5 , class_weight: balanced , accuracy: 0.09779179810725552 , train accuracy: 0.18275316455696203
n_estimators: 5 , max_depth: 7 , min_samples_split: 2 , min_samples_leaf: 5 , class_weight: None , accuracy: 0.3470031545741325 , train accuracy: 0.3615506329113924
n_estimators: 5 , max_depth: 7 , min_samples_split: 2 , min_samples_leaf: 6 , class_weight: balanced , accuracy: 0.10094637223974763 , train accuracy: 0.18670886075949367
n_estimators: 5 , max_depth: 7 , min_samples_split: 2 , min_samples_leaf: 6 , class_weight: None , accuracy: 0.334384858044164 , train accuracy: 0.3623417721518987
n_estimators: 5 , max_depth: 7 , min_samples_split: 2 , min_samples_leaf: 7 , class_weight: balanced , accuracy: 0.10094637223974763 , train accuracy: 0.18037974683544303
n_estimators: 5 , max_depth: 7 , min_samples_split: 2 , min_samples_leaf: 7 , class_weight: None , accuracy: 0.35331230283911674 , train accuracy: 0.357594936

n_estimators: 5 , max_depth: 9 , min_samples_split: 3 , min_samples_leaf: 5 , class_weight: balanced , accuracy: 0.08201892744479496 , train accuracy: 0.23338607594936708
n_estimators: 5 , max_depth: 9 , min_samples_split: 3 , min_samples_leaf: 5 , class_weight: None , accuracy: 0.3028391167192429 , train accuracy: 0.37341772151898733
n_estimators: 5 , max_depth: 9 , min_samples_split: 3 , min_samples_leaf: 6 , class_weight: balanced , accuracy: 0.10410094637223975 , train accuracy: 0.2120253164556962
n_estimators: 5 , max_depth: 9 , min_samples_split: 3 , min_samples_leaf: 6 , class_weight: None , accuracy: 0.3186119873817035 , train accuracy: 0.37579113924050633
n_estimators: 5 , max_depth: 9 , min_samples_split: 3 , min_samples_leaf: 7 , class_weight: balanced , accuracy: 0.13249211356466878 , train accuracy: 0.20806962025316456
n_estimators: 5 , max_depth: 9 , min_samples_split: 3 , min_samples_leaf: 7 , class_weight: None , accuracy: 0.3312302839116719 , train accuracy: 0.37420886

n_estimators: 5 , max_depth: 13 , min_samples_split: 4 , min_samples_leaf: 5 , class_weight: balanced , accuracy: 0.12302839116719243 , train accuracy: 0.28401898734177217
n_estimators: 5 , max_depth: 13 , min_samples_split: 4 , min_samples_leaf: 5 , class_weight: None , accuracy: 0.29652996845425866 , train accuracy: 0.3995253164556962
n_estimators: 5 , max_depth: 13 , min_samples_split: 4 , min_samples_leaf: 6 , class_weight: balanced , accuracy: 0.11041009463722397 , train accuracy: 0.23022151898734178
n_estimators: 5 , max_depth: 13 , min_samples_split: 4 , min_samples_leaf: 6 , class_weight: None , accuracy: 0.305993690851735 , train accuracy: 0.39319620253164556
n_estimators: 5 , max_depth: 13 , min_samples_split: 4 , min_samples_leaf: 7 , class_weight: balanced , accuracy: 0.1167192429022082 , train accuracy: 0.22468354430379747
n_estimators: 5 , max_depth: 13 , min_samples_split: 4 , min_samples_leaf: 7 , class_weight: None , accuracy: 0.3280757097791798 , train accuracy: 0.364

n_estimators: 5 , max_depth: None , min_samples_split: 6 , min_samples_leaf: 5 , class_weight: balanced , accuracy: 0.11356466876971609 , train accuracy: 0.23813291139240506
n_estimators: 5 , max_depth: None , min_samples_split: 6 , min_samples_leaf: 5 , class_weight: None , accuracy: 0.29337539432176657 , train accuracy: 0.39556962025316456
n_estimators: 5 , max_depth: None , min_samples_split: 6 , min_samples_leaf: 6 , class_weight: balanced , accuracy: 0.11356466876971609 , train accuracy: 0.22626582278481014
n_estimators: 5 , max_depth: None , min_samples_split: 6 , min_samples_leaf: 6 , class_weight: None , accuracy: 0.28391167192429023 , train accuracy: 0.39794303797468356
n_estimators: 5 , max_depth: None , min_samples_split: 6 , min_samples_leaf: 7 , class_weight: balanced , accuracy: 0.12302839116719243 , train accuracy: 0.21914556962025317
n_estimators: 5 , max_depth: None , min_samples_split: 6 , min_samples_leaf: 7 , class_weight: None , accuracy: 0.31230283911671924 , trai

n_estimators: 6 , max_depth: 5 , min_samples_split: 3 , min_samples_leaf: 6 , class_weight: balanced , accuracy: 0.11356466876971609 , train accuracy: 0.1518987341772152
n_estimators: 6 , max_depth: 5 , min_samples_split: 3 , min_samples_leaf: 6 , class_weight: None , accuracy: 0.3438485804416404 , train accuracy: 0.3425632911392405
n_estimators: 6 , max_depth: 5 , min_samples_split: 3 , min_samples_leaf: 7 , class_weight: balanced , accuracy: 0.10725552050473186 , train accuracy: 0.1463607594936709
n_estimators: 6 , max_depth: 5 , min_samples_split: 3 , min_samples_leaf: 7 , class_weight: None , accuracy: 0.334384858044164 , train accuracy: 0.34810126582278483
n_estimators: 6 , max_depth: 5 , min_samples_split: 3 , min_samples_leaf: 8 , class_weight: balanced , accuracy: 0.10725552050473186 , train accuracy: 0.13449367088607594
n_estimators: 6 , max_depth: 5 , min_samples_split: 3 , min_samples_leaf: 8 , class_weight: None , accuracy: 0.33753943217665616 , train accuracy: 0.3409810126

n_estimators: 6 , max_depth: 7 , min_samples_split: 6 , min_samples_leaf: 6 , class_weight: None , accuracy: 0.3249211356466877 , train accuracy: 0.3662974683544304
n_estimators: 6 , max_depth: 7 , min_samples_split: 6 , min_samples_leaf: 7 , class_weight: balanced , accuracy: 0.11356466876971609 , train accuracy: 0.19224683544303797
n_estimators: 6 , max_depth: 7 , min_samples_split: 6 , min_samples_leaf: 7 , class_weight: None , accuracy: 0.35331230283911674 , train accuracy: 0.35680379746835444
n_estimators: 6 , max_depth: 7 , min_samples_split: 6 , min_samples_leaf: 8 , class_weight: balanced , accuracy: 0.10410094637223975 , train accuracy: 0.16376582278481014
n_estimators: 6 , max_depth: 7 , min_samples_split: 6 , min_samples_leaf: 8 , class_weight: None , accuracy: 0.33753943217665616 , train accuracy: 0.3623417721518987
n_estimators: 6 , max_depth: 7 , min_samples_split: 6 , min_samples_leaf: 9 , class_weight: balanced , accuracy: 0.1167192429022082 , train accuracy: 0.16218354

n_estimators: 6 , max_depth: 13 , min_samples_split: 3 , min_samples_leaf: 6 , class_weight: balanced , accuracy: 0.11356466876971609 , train accuracy: 0.2412974683544304
n_estimators: 6 , max_depth: 13 , min_samples_split: 3 , min_samples_leaf: 6 , class_weight: None , accuracy: 0.3312302839116719 , train accuracy: 0.3963607594936709
n_estimators: 6 , max_depth: 13 , min_samples_split: 3 , min_samples_leaf: 7 , class_weight: balanced , accuracy: 0.12302839116719243 , train accuracy: 0.23338607594936708
n_estimators: 6 , max_depth: 13 , min_samples_split: 3 , min_samples_leaf: 7 , class_weight: None , accuracy: 0.3438485804416404 , train accuracy: 0.377373417721519
n_estimators: 6 , max_depth: 13 , min_samples_split: 3 , min_samples_leaf: 8 , class_weight: balanced , accuracy: 0.10410094637223975 , train accuracy: 0.1875
n_estimators: 6 , max_depth: 13 , min_samples_split: 3 , min_samples_leaf: 8 , class_weight: None , accuracy: 0.30914826498422715 , train accuracy: 0.37183544303797467

n_estimators: 6 , max_depth: None , min_samples_split: 6 , min_samples_leaf: 6 , class_weight: balanced , accuracy: 0.1167192429022082 , train accuracy: 0.24287974683544303
n_estimators: 6 , max_depth: None , min_samples_split: 6 , min_samples_leaf: 6 , class_weight: None , accuracy: 0.3186119873817035 , train accuracy: 0.39794303797468356
n_estimators: 6 , max_depth: None , min_samples_split: 6 , min_samples_leaf: 7 , class_weight: balanced , accuracy: 0.13564668769716087 , train accuracy: 0.2468354430379747
n_estimators: 6 , max_depth: None , min_samples_split: 6 , min_samples_leaf: 7 , class_weight: None , accuracy: 0.3249211356466877 , train accuracy: 0.37183544303797467
n_estimators: 6 , max_depth: None , min_samples_split: 6 , min_samples_leaf: 8 , class_weight: balanced , accuracy: 0.10410094637223975 , train accuracy: 0.1875
n_estimators: 6 , max_depth: None , min_samples_split: 6 , min_samples_leaf: 8 , class_weight: None , accuracy: 0.3249211356466877 , train accuracy: 0.3591

n_estimators: 7 , max_depth: 5 , min_samples_split: 3 , min_samples_leaf: 5 , class_weight: None , accuracy: 0.35331230283911674 , train accuracy: 0.34414556962025317
n_estimators: 7 , max_depth: 5 , min_samples_split: 3 , min_samples_leaf: 6 , class_weight: balanced , accuracy: 0.11356466876971609 , train accuracy: 0.14794303797468356
n_estimators: 7 , max_depth: 5 , min_samples_split: 3 , min_samples_leaf: 6 , class_weight: None , accuracy: 0.3438485804416404 , train accuracy: 0.34098101265822783
n_estimators: 7 , max_depth: 5 , min_samples_split: 3 , min_samples_leaf: 7 , class_weight: balanced , accuracy: 0.10725552050473186 , train accuracy: 0.14319620253164558
n_estimators: 7 , max_depth: 5 , min_samples_split: 3 , min_samples_leaf: 7 , class_weight: None , accuracy: 0.3312302839116719 , train accuracy: 0.34968354430379744
n_estimators: 7 , max_depth: 5 , min_samples_split: 3 , min_samples_leaf: 8 , class_weight: balanced , accuracy: 0.10094637223974763 , train accuracy: 0.135284

n_estimators: 7 , max_depth: 7 , min_samples_split: 4 , min_samples_leaf: 9 , class_weight: None , accuracy: 0.3501577287066246 , train accuracy: 0.35205696202531644
n_estimators: 7 , max_depth: 7 , min_samples_split: 6 , min_samples_leaf: 5 , class_weight: balanced , accuracy: 0.10725552050473186 , train accuracy: 0.19778481012658228
n_estimators: 7 , max_depth: 7 , min_samples_split: 6 , min_samples_leaf: 5 , class_weight: None , accuracy: 0.334384858044164 , train accuracy: 0.3583860759493671
n_estimators: 7 , max_depth: 7 , min_samples_split: 6 , min_samples_leaf: 6 , class_weight: balanced , accuracy: 0.10725552050473186 , train accuracy: 0.1906645569620253
n_estimators: 7 , max_depth: 7 , min_samples_split: 6 , min_samples_leaf: 6 , class_weight: None , accuracy: 0.3312302839116719 , train accuracy: 0.3575949367088608
n_estimators: 7 , max_depth: 7 , min_samples_split: 6 , min_samples_leaf: 7 , class_weight: balanced , accuracy: 0.11987381703470032 , train accuracy: 0.18196202531

n_estimators: 7 , max_depth: 13 , min_samples_split: 2 , min_samples_leaf: 6 , class_weight: None , accuracy: 0.31230283911671924 , train accuracy: 0.39082278481012656
n_estimators: 7 , max_depth: 13 , min_samples_split: 2 , min_samples_leaf: 7 , class_weight: balanced , accuracy: 0.13249211356466878 , train accuracy: 0.23971518987341772
n_estimators: 7 , max_depth: 13 , min_samples_split: 2 , min_samples_leaf: 7 , class_weight: None , accuracy: 0.334384858044164 , train accuracy: 0.3813291139240506
n_estimators: 7 , max_depth: 13 , min_samples_split: 2 , min_samples_leaf: 8 , class_weight: balanced , accuracy: 0.09779179810725552 , train accuracy: 0.19382911392405064
n_estimators: 7 , max_depth: 13 , min_samples_split: 2 , min_samples_leaf: 8 , class_weight: None , accuracy: 0.305993690851735 , train accuracy: 0.3694620253164557
n_estimators: 7 , max_depth: 13 , min_samples_split: 2 , min_samples_leaf: 9 , class_weight: balanced , accuracy: 0.13249211356466878 , train accuracy: 0.1930

n_estimators: 7 , max_depth: None , min_samples_split: 3 , min_samples_leaf: 8 , class_weight: balanced , accuracy: 0.09779179810725552 , train accuracy: 0.19382911392405064
n_estimators: 7 , max_depth: None , min_samples_split: 3 , min_samples_leaf: 8 , class_weight: None , accuracy: 0.29652996845425866 , train accuracy: 0.36313291139240506
n_estimators: 7 , max_depth: None , min_samples_split: 3 , min_samples_leaf: 9 , class_weight: balanced , accuracy: 0.12618296529968454 , train accuracy: 0.19699367088607594
n_estimators: 7 , max_depth: None , min_samples_split: 3 , min_samples_leaf: 9 , class_weight: None , accuracy: 0.31545741324921134 , train accuracy: 0.35680379746835444
n_estimators: 7 , max_depth: None , min_samples_split: 4 , min_samples_leaf: 5 , class_weight: balanced , accuracy: 0.11356466876971609 , train accuracy: 0.26424050632911394
n_estimators: 7 , max_depth: None , min_samples_split: 4 , min_samples_leaf: 5 , class_weight: None , accuracy: 0.31545741324921134 , trai

n_estimators: 8 , max_depth: 3 , min_samples_split: 4 , min_samples_leaf: 8 , class_weight: None , accuracy: 0.3438485804416404 , train accuracy: 0.32832278481012656
n_estimators: 8 , max_depth: 3 , min_samples_split: 4 , min_samples_leaf: 9 , class_weight: balanced , accuracy: 0.11987381703470032 , train accuracy: 0.14082278481012658
n_estimators: 8 , max_depth: 3 , min_samples_split: 4 , min_samples_leaf: 9 , class_weight: None , accuracy: 0.3438485804416404 , train accuracy: 0.32832278481012656
n_estimators: 8 , max_depth: 3 , min_samples_split: 6 , min_samples_leaf: 5 , class_weight: balanced , accuracy: 0.11041009463722397 , train accuracy: 0.12341772151898735
n_estimators: 8 , max_depth: 3 , min_samples_split: 6 , min_samples_leaf: 5 , class_weight: None , accuracy: 0.3438485804416404 , train accuracy: 0.32832278481012656
n_estimators: 8 , max_depth: 3 , min_samples_split: 6 , min_samples_leaf: 6 , class_weight: balanced , accuracy: 0.10725552050473186 , train accuracy: 0.1226265

n_estimators: 8 , max_depth: 5 , min_samples_split: 6 , min_samples_leaf: 9 , class_weight: None , accuracy: 0.3438485804416404 , train accuracy: 0.33781645569620256
n_estimators: 8 , max_depth: 7 , min_samples_split: 2 , min_samples_leaf: 5 , class_weight: balanced , accuracy: 0.11356466876971609 , train accuracy: 0.20094936708860758
n_estimators: 8 , max_depth: 7 , min_samples_split: 2 , min_samples_leaf: 5 , class_weight: None , accuracy: 0.3249211356466877 , train accuracy: 0.3599683544303797
n_estimators: 8 , max_depth: 7 , min_samples_split: 2 , min_samples_leaf: 6 , class_weight: balanced , accuracy: 0.10410094637223975 , train accuracy: 0.18829113924050633
n_estimators: 8 , max_depth: 7 , min_samples_split: 2 , min_samples_leaf: 6 , class_weight: None , accuracy: 0.3438485804416404 , train accuracy: 0.35680379746835444
n_estimators: 8 , max_depth: 7 , min_samples_split: 2 , min_samples_leaf: 7 , class_weight: balanced , accuracy: 0.11987381703470032 , train accuracy: 0.18037974

n_estimators: 8 , max_depth: 9 , min_samples_split: 3 , min_samples_leaf: 6 , class_weight: balanced , accuracy: 0.11041009463722397 , train accuracy: 0.22705696202531644
n_estimators: 8 , max_depth: 9 , min_samples_split: 3 , min_samples_leaf: 6 , class_weight: None , accuracy: 0.3028391167192429 , train accuracy: 0.37579113924050633
n_estimators: 8 , max_depth: 9 , min_samples_split: 3 , min_samples_leaf: 7 , class_weight: balanced , accuracy: 0.13564668769716087 , train accuracy: 0.22072784810126583
n_estimators: 8 , max_depth: 9 , min_samples_split: 3 , min_samples_leaf: 7 , class_weight: None , accuracy: 0.3438485804416404 , train accuracy: 0.36550632911392406
n_estimators: 8 , max_depth: 9 , min_samples_split: 3 , min_samples_leaf: 8 , class_weight: balanced , accuracy: 0.11356466876971609 , train accuracy: 0.19145569620253164
n_estimators: 8 , max_depth: 9 , min_samples_split: 3 , min_samples_leaf: 8 , class_weight: None , accuracy: 0.31230283911671924 , train accuracy: 0.356803

n_estimators: 8 , max_depth: 13 , min_samples_split: 4 , min_samples_leaf: 5 , class_weight: None , accuracy: 0.31230283911671924 , train accuracy: 0.3963607594936709
n_estimators: 8 , max_depth: 13 , min_samples_split: 4 , min_samples_leaf: 6 , class_weight: balanced , accuracy: 0.12933753943217666 , train accuracy: 0.24604430379746836
n_estimators: 8 , max_depth: 13 , min_samples_split: 4 , min_samples_leaf: 6 , class_weight: None , accuracy: 0.3249211356466877 , train accuracy: 0.3963607594936709
n_estimators: 8 , max_depth: 13 , min_samples_split: 4 , min_samples_leaf: 7 , class_weight: balanced , accuracy: 0.13249211356466878 , train accuracy: 0.24841772151898733
n_estimators: 8 , max_depth: 13 , min_samples_split: 4 , min_samples_leaf: 7 , class_weight: None , accuracy: 0.34069400630914826 , train accuracy: 0.37579113924050633
n_estimators: 8 , max_depth: 13 , min_samples_split: 4 , min_samples_leaf: 8 , class_weight: balanced , accuracy: 0.0946372239747634 , train accuracy: 0.19

n_estimators: 8 , max_depth: None , min_samples_split: 6 , min_samples_leaf: 7 , class_weight: balanced , accuracy: 0.12618296529968454 , train accuracy: 0.24287974683544303
n_estimators: 8 , max_depth: None , min_samples_split: 6 , min_samples_leaf: 7 , class_weight: None , accuracy: 0.334384858044164 , train accuracy: 0.379746835443038
n_estimators: 8 , max_depth: None , min_samples_split: 6 , min_samples_leaf: 8 , class_weight: balanced , accuracy: 0.10410094637223975 , train accuracy: 0.20490506329113925
n_estimators: 8 , max_depth: None , min_samples_split: 6 , min_samples_leaf: 8 , class_weight: None , accuracy: 0.305993690851735 , train accuracy: 0.36075949367088606
n_estimators: 8 , max_depth: None , min_samples_split: 6 , min_samples_leaf: 9 , class_weight: balanced , accuracy: 0.12618296529968454 , train accuracy: 0.20015822784810128
n_estimators: 8 , max_depth: None , min_samples_split: 6 , min_samples_leaf: 9 , class_weight: None , accuracy: 0.305993690851735 , train accura

n_estimators: 9 , max_depth: 5 , min_samples_split: 2 , min_samples_leaf: 6 , class_weight: None , accuracy: 0.3470031545741325 , train accuracy: 0.34098101265822783
n_estimators: 9 , max_depth: 5 , min_samples_split: 2 , min_samples_leaf: 7 , class_weight: balanced , accuracy: 0.10094637223974763 , train accuracy: 0.14556962025316456
n_estimators: 9 , max_depth: 5 , min_samples_split: 2 , min_samples_leaf: 7 , class_weight: None , accuracy: 0.334384858044164 , train accuracy: 0.34651898734177217
n_estimators: 9 , max_depth: 5 , min_samples_split: 2 , min_samples_leaf: 8 , class_weight: balanced , accuracy: 0.10094637223974763 , train accuracy: 0.13765822784810128
n_estimators: 9 , max_depth: 5 , min_samples_split: 2 , min_samples_leaf: 8 , class_weight: None , accuracy: 0.33753943217665616 , train accuracy: 0.33860759493670883
n_estimators: 9 , max_depth: 5 , min_samples_split: 2 , min_samples_leaf: 9 , class_weight: balanced , accuracy: 0.10725552050473186 , train accuracy: 0.1408227

n_estimators: 9 , max_depth: 7 , min_samples_split: 4 , min_samples_leaf: 6 , class_weight: balanced , accuracy: 0.10725552050473186 , train accuracy: 0.1906645569620253
n_estimators: 9 , max_depth: 7 , min_samples_split: 4 , min_samples_leaf: 6 , class_weight: None , accuracy: 0.3280757097791798 , train accuracy: 0.3639240506329114
n_estimators: 9 , max_depth: 7 , min_samples_split: 4 , min_samples_leaf: 7 , class_weight: balanced , accuracy: 0.11041009463722397 , train accuracy: 0.18117088607594936
n_estimators: 9 , max_depth: 7 , min_samples_split: 4 , min_samples_leaf: 7 , class_weight: None , accuracy: 0.3438485804416404 , train accuracy: 0.3560126582278481
n_estimators: 9 , max_depth: 7 , min_samples_split: 4 , min_samples_leaf: 8 , class_weight: balanced , accuracy: 0.10725552050473186 , train accuracy: 0.16218354430379747
n_estimators: 9 , max_depth: 7 , min_samples_split: 4 , min_samples_leaf: 8 , class_weight: None , accuracy: 0.3249211356466877 , train accuracy: 0.3520569620

n_estimators: 9 , max_depth: 9 , min_samples_split: 6 , min_samples_leaf: 8 , class_weight: None , accuracy: 0.3186119873817035 , train accuracy: 0.3615506329113924
n_estimators: 9 , max_depth: 9 , min_samples_split: 6 , min_samples_leaf: 9 , class_weight: balanced , accuracy: 0.12933753943217666 , train accuracy: 0.19541139240506328
n_estimators: 9 , max_depth: 9 , min_samples_split: 6 , min_samples_leaf: 9 , class_weight: None , accuracy: 0.334384858044164 , train accuracy: 0.34889240506329117
n_estimators: 9 , max_depth: 13 , min_samples_split: 2 , min_samples_leaf: 5 , class_weight: balanced , accuracy: 0.13564668769716087 , train accuracy: 0.29193037974683544
n_estimators: 9 , max_depth: 13 , min_samples_split: 2 , min_samples_leaf: 5 , class_weight: None , accuracy: 0.3217665615141956 , train accuracy: 0.3995253164556962
n_estimators: 9 , max_depth: 13 , min_samples_split: 2 , min_samples_leaf: 6 , class_weight: balanced , accuracy: 0.11987381703470032 , train accuracy: 0.25
n_es

n_estimators: 9 , max_depth: None , min_samples_split: 3 , min_samples_leaf: 5 , class_weight: None , accuracy: 0.3217665615141956 , train accuracy: 0.3963607594936709
n_estimators: 9 , max_depth: None , min_samples_split: 3 , min_samples_leaf: 6 , class_weight: balanced , accuracy: 0.10410094637223975 , train accuracy: 0.26819620253164556
n_estimators: 9 , max_depth: None , min_samples_split: 3 , min_samples_leaf: 6 , class_weight: None , accuracy: 0.3312302839116719 , train accuracy: 0.39556962025316456
n_estimators: 9 , max_depth: None , min_samples_split: 3 , min_samples_leaf: 7 , class_weight: balanced , accuracy: 0.13249211356466878 , train accuracy: 0.247626582278481
n_estimators: 9 , max_depth: None , min_samples_split: 3 , min_samples_leaf: 7 , class_weight: None , accuracy: 0.334384858044164 , train accuracy: 0.37895569620253167
n_estimators: 9 , max_depth: None , min_samples_split: 3 , min_samples_leaf: 8 , class_weight: balanced , accuracy: 0.11041009463722397 , train accur

Acurácias:
- Random: 0.16
- High chance: 0.33
- LogReg: 0.25 (underfitted)
- DecisionTree: 0.33 (good fit)
- RandomForest: 0.37 (bom fit)

(hiperparametros de random forest foram sendo fine tuned a partir de um range mais largo; class_weight tambem piora o modelo)

## Gradient Descent (LightGBM)

In [16]:
#lightGBM com cross validation,e otimização de hiperparametros (COM balamceamento de classes)

# Transform categorical columns to type 'category'
df["genero"] = df["genero"].astype("category")
df["estado_civil"] = df["estado_civil"].astype("category")

# Scale 'idade'
scaler = StandardScaler()
df["idade"] = scaler.fit_transform(df[["idade"]])

# Split into features and target
features = df.drop('bloco', axis=1)
target = df['bloco']

# Define hyperparameters to optimize
n_estimators = [50, 100, 200]
learning_rate = [0.01, 0.05, 0.1]
max_depth = [3, 5, 7]
num_leaves = [31, 63, 127]

# Initialize variables for best accuracy and hyperparameters
best_accuracy = 0
best_hyperparameters = {}

# Define cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Loop through hyperparameters and perform cross-validation
for n in n_estimators:
    for lr in learning_rate:
        for md in max_depth:
            for nl in num_leaves:
                # Define model with current hyperparameters
                lgbm = lgb.LGBMClassifier(n_estimators=n, learning_rate=lr, max_depth=md, num_leaves=nl, is_unbalance=True)
                # Balance automatically imbalanced classes

                # Initialize lists to store accuracy scores for each fold
                valid_scores = []
                train_scores = []

                # Loop through each fold
                for train_index, valid_index in cv.split(features, target):
                    # Split data into train and validation sets
                    lgbm_train_features = features.iloc[train_index]
                    lgbm_valid_features = features.iloc[valid_index]
                    train_target = target.iloc[train_index]
                    valid_target = target.iloc[valid_index]

                    # Fit the model to the training set
                    lgbm.fit(lgbm_train_features, train_target)

                    # Make predictions on the validation and training sets
                    lgbm_valid_pred = lgbm.predict(lgbm_valid_features)
                    lgbm_train_pred = lgbm.predict(lgbm_train_features)

                    # Calculate accuracy score for the validation and training sets
                    valid_acc = accuracy_score(valid_target, lgbm_valid_pred)
                    train_acc = accuracy_score(train_target, lgbm_train_pred)

                    # Append scores to lists
                    valid_scores.append(valid_acc)
                    train_scores.append(train_acc)

                # Calculate mean accuracy scores across all folds
                mean_valid_score = np.mean(valid_scores)
                mean_train_score = np.mean(train_scores)

                # Check if this set of hyperparameters produced the best validation accuracy so far
                if mean_valid_score > best_accuracy:
                    best_accuracy = mean_valid_score
                    best_hyperparameters = {'n_estimators': n, 'learning_rate': lr, 'max_depth': md, 'num_leaves': nl}
                    best_train_accuracy = mean_train_score

# Print results
print("Melhores hiperparametros:", best_hyperparameters)
print("Acurácia média do treino:", best_train_accuracy)
print("Melhor acurácia média na validação:", best_accuracy)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["genero"] = df["genero"].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["estado_civil"] = df["estado_civil"].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["idade"] = scaler.fit_transform(df[["idade"]])


Melhores hiperparametros: {'n_estimators': 50, 'learning_rate': 0.01, 'max_depth': 7, 'num_leaves': 31}
Acurácia média do treino: 0.34092297493370693
Melhor acurácia média na validação: 0.3326997564189594


In [None]:
#lightGBM com cross validation,e otimização de hiperparametros (SEM balanceamento de classes)

# Transform categorical columns to type 'category'
df["genero"] = df["genero"].astype("category")
df["estado_civil"] = df["estado_civil"].astype("category")

# Scale 'idade'
scaler = StandardScaler()
df["idade"] = scaler.fit_transform(df[["idade"]])

# Split into features and target
features = df.drop('bloco', axis=1)
target = df['bloco']


In [17]:
#lightGBM com cross validation,e otimização de hiperparametros (SEM balanceamento de classes)

# Transform categorical columns to type 'category'
df["genero"] = df["genero"].astype("category")
df["estado_civil"] = df["estado_civil"].astype("category")

# Scale 'idade'
scaler = StandardScaler()
df["idade"] = scaler.fit_transform(df[["idade"]])

# Split into features and target
features = df.drop('bloco', axis=1)
target = df['bloco']

# Define hyperparameters to optimize
n_estimators = [50, 100, 200]
learning_rate = [0.01, 0.05, 0.1]
max_depth = [3, 5, 7]
num_leaves = [31, 63, 127]

# Initialize variables for best accuracy and hyperparameters
best_accuracy = 0
best_hyperparameters = {}

# Define cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Loop through hyperparameters and perform cross-validation
for n in n_estimators:
    for lr in learning_rate:
        for md in max_depth:
            for nl in num_leaves:
                # Define model with current hyperparameters
                lgbm = lgb.LGBMClassifier(n_estimators=n, learning_rate=lr, max_depth=md, num_leaves=nl)
                # Balance automatically imbalanced classes

                # Initialize lists to store accuracy scores for each fold
                valid_scores = []
                train_scores = []

                # Loop through each fold
                for train_index, valid_index in cv.split(features, target):
                    # Split data into train and validation sets
                    lgbm_train_features = features.iloc[train_index]
                    lgbm_valid_features = features.iloc[valid_index]
                    train_target = target.iloc[train_index]
                    valid_target = target.iloc[valid_index]

                    # Fit the model to the training set
                    lgbm.fit(lgbm_train_features, train_target)

                    # Make predictions on the validation and training sets
                    lgbm_valid_pred = lgbm.predict(lgbm_valid_features)
                    lgbm_train_pred = lgbm.predict(lgbm_train_features)

                    # Calculate accuracy score for the validation and training sets
                    valid_acc = accuracy_score(valid_target, lgbm_valid_pred)
                    train_acc = accuracy_score(train_target, lgbm_train_pred)

                    # Append scores to lists
                    valid_scores.append(valid_acc)
                    train_scores.append(train_acc)

                # Calculate mean accuracy scores across all folds
                mean_valid_score = np.mean(valid_scores)
                mean_train_score = np.mean(train_scores)

                # Check if this set of hyperparameters produced the best validation accuracy so far
                if mean_valid_score > best_accuracy:
                    best_accuracy = mean_valid_score
                    best_hyperparameters = {'n_estimators': n, 'learning_rate': lr, 'max_depth': md, 'num_leaves': nl}
                    best_train_accuracy = mean_train_score

# Print results
print("Melhores hiperparametros:", best_hyperparameters)
print("Acurácia média do treino:", best_train_accuracy)
print("Melhor acurácia média na validação:", best_accuracy)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["genero"] = df["genero"].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["estado_civil"] = df["estado_civil"].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["idade"] = scaler.fit_transform(df[["idade"]])


Melhores hiperparametros: {'n_estimators': 50, 'learning_rate': 0.01, 'max_depth': 7, 'num_leaves': 31}
Acurácia média do treino: 0.34092297493370693
Melhor acurácia média na validação: 0.3326997564189594


Acurácias:
- Random: 0.16
- High chance: 0.33
- LogReg: 0.25 (underfitted)
- DecisionTree: 0.33 (good fit)
- RandomForest: 0.37 (bom fit)
- LightGBM: 0.34 (bom fit) (igual com e sem balanceamento de classes)


Melhor modelo dos estudadosfoi **RandomForest** com as seguintes caracteristicas:

{'n_estimators': 5, 'max_depth': 5, 'min_samples_split': 2, 'min_samples_leaf': 6, 'class_weight': None}

Vamos se não está a escolher demasiadamente o bloco mais frequente.

## Mais métricas para os melhores modelos

In [22]:
#Random forest

#definir target
features = df.drop('bloco', axis=1)
target = df['bloco']

# Transformar colunas com OHE
features_ohe = pd.get_dummies(features, drop_first=True)

# Fazer scaling
num_cols = ['idade']
scaler = StandardScaler()
features_ohe[num_cols] = scaler.fit_transform(features_ohe[num_cols])

# Split data
X_train, X_valid, y_train, y_valid = train_test_split(features_ohe, target, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=5, max_depth=5, min_samples_split=2, min_samples_leaf=6, random_state=42)
model.fit(X_train, y_train)
prediction = model.predict(X_valid)
accuracy = model.score(X_valid, y_valid)
train_accuracy = model.score(X_train, y_train)

#print acurácia
print('Acurácia do Random Forest:', final_score)
print('Acurácia do treino:', final_train_score)

Acurácia do Random Forest: 0.3659305993690852
Acurácia do treino: 0.33939873417721517


In [24]:
#Tabela resumo de medidas
target_names = df['bloco'].unique().tolist()
print(classification_report(y_valid, prediction, target_names=target_names))

                             precision    recall  f1-score   support

      Tchanzinho Zona Norte       0.36      0.92      0.52       109
               Entao Brilha       0.00      0.00      0.00         9
     Quando come se lambuza       0.60      0.46      0.52        13
                     Outros       0.00      0.00      0.00        14
                     Batiza       0.00      0.00      0.00         9
         Bloco Angola Janga       0.00      0.00      0.00        29
             Baianas Ozadas       0.00      0.00      0.00        21
     Bloco Havayanas Usadas       0.32      0.20      0.24        51
        Juventude Bronzeada       0.00      0.00      0.00        38
                  MONOBLOCO       0.00      0.00      0.00        17
Bloco Despedida de Carnaval       0.00      0.00      0.00         7

                   accuracy                           0.37       317
                  macro avg       0.12      0.14      0.12       317
               weighted avg     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
#classification report de Random Forest sem balanceamento 
report = classification_report(y_valid, prediction, output_dict=True)
macro_f1 = precision_recall_fscore_support(y_valid, prediction)[2].mean()

# print the macro avg f1-score
print("Macro avg f1-score:", macro_f1)

Macro avg f1-score: 0.11682928081269929


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Uma tendência para o modelo não escolher blocos sub-representados com **macro avg F1 de 0.117** contra 0.24 de weighted avg F1.

In [28]:
#Random forest, weighted

#definir target
features = df.drop('bloco', axis=1)
target = df['bloco']

# Transformar colunas com OHE
features_ohe = pd.get_dummies(features, drop_first=True)

# Fazer scaling
num_cols = ['idade']
scaler = StandardScaler()
features_ohe[num_cols] = scaler.fit_transform(features_ohe[num_cols])

# Split data
X_train, X_valid, y_train, y_valid = train_test_split(features_ohe, target, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=5, max_depth=5, min_samples_split=2, min_samples_leaf=6, class_weight='balanced', random_state=42)
model.fit(X_train, y_train)
prediction = model.predict(X_valid)
accuracy = model.score(X_valid, y_valid)
train_accuracy = model.score(X_train, y_train)

#print acurácia
print('Acurácia do Random Forest:', final_score)
print('Acurácia do treino:', final_train_score)

Acurácia do Random Forest: 0.3659305993690852
Acurácia do treino: 0.33939873417721517


In [29]:
#Tabela resumo de medidas
target_names = df['bloco'].unique().tolist()
print(classification_report(y_valid, prediction, target_names=target_names))

                             precision    recall  f1-score   support

      Tchanzinho Zona Norte       0.00      0.00      0.00       109
               Entao Brilha       0.07      0.22      0.10         9
     Quando come se lambuza       0.18      0.69      0.29        13
                     Outros       0.06      0.07      0.07        14
                     Batiza       0.03      0.22      0.06         9
         Bloco Angola Janga       0.11      0.03      0.05        29
             Baianas Ozadas       0.14      0.19      0.16        21
     Bloco Havayanas Usadas       0.20      0.02      0.04        51
        Juventude Bronzeada       0.12      0.03      0.04        38
                  MONOBLOCO       0.17      0.47      0.25        17
Bloco Despedida de Carnaval       0.01      0.14      0.03         7

                   accuracy                           0.09       317
                  macro avg       0.10      0.19      0.10       317
               weighted avg     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
#classification report de Random Forest sem balanceamento 
report = classification_report(y_valid, prediction, output_dict=True)
macro_f1 = precision_recall_fscore_support(y_valid, prediction)[2].mean()

# print the macro avg f1-score
print("Macro avg f1-score:", macro_f1)

Macro avg f1-score: 0.09927947717634039


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Random forest weighted pior que unweighted.

In [34]:
#lightGBM com cross validation,e otimização de hiperparametros (SEM balanceamento de classes)

# Transform categorical columns to type 'category'
df["genero"] = df["genero"].astype("category")
df["estado_civil"] = df["estado_civil"].astype("category")

# Scale 'idade'
scaler = StandardScaler()
df["idade"] = scaler.fit_transform(df[["idade"]])

# Split into features and target
features = df.drop('bloco', axis=1)
target = df['bloco']

# Define cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define model with current hyperparameters
lgbm = lgb.LGBMClassifier(n_estimators=50, learning_rate=0.01, max_depth=7, num_leaves=31)
# Balance automatically imbalanced classes

# Initialize lists to store accuracy scores for each fold
valid_scores = []
train_scores = []
macro_f1_scores = []

# Loop through each fold
for train_index, valid_index in cv.split(features, target):
    # Split data into train and validation sets
    lgbm_train_features = features.iloc[train_index]
    lgbm_valid_features = features.iloc[valid_index]
    train_target = target.iloc[train_index]
    valid_target = target.iloc[valid_index]

    # Fit the model to the training set
    lgbm.fit(lgbm_train_features, train_target)

    # Make predictions on the validation and training sets
    lgbm_valid_pred = lgbm.predict(lgbm_valid_features)
    lgbm_train_pred = lgbm.predict(lgbm_train_features)

    # Calculate accuracy score for the validation and training sets
    valid_acc = accuracy_score(valid_target, lgbm_valid_pred)
    train_acc = accuracy_score(train_target, lgbm_train_pred)
    #classification report de Random Forest sem balanceamento 
    report = classification_report(valid_target, lgbm_valid_pred, output_dict=True)
    macro_f1 = precision_recall_fscore_support(valid_target, lgbm_valid_pred)[2].mean()

    # Append scores to lists
    valid_scores.append(valid_acc)
    train_scores.append(train_acc)
    macro_f1_scores.append(macro_f1)

# Calculate mean accuracy scores across all folds
mean_valid_score = np.mean(valid_scores)
mean_train_score = np.mean(train_scores)
mean_macro_f1 = np.mean(macro_f1_scores)

# Print results
print("Acurácia média do treino:", best_train_accuracy)
print("Melhor acurácia média na validação:", best_accuracy)
print("Macro F1 score:", mean_macro_f1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["genero"] = df["genero"].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["estado_civil"] = df["estado_civil"].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["idade"] = scaler.fit_transform(df[["idade"]])
  _warn_prf(average, modifier, msg_start, len(res

Acurácia média do treino: 0.34092297493370693
Melhor acurácia média na validação: 0.3326997564189594
Macro F1 score: 0.05075800402296554


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [35]:
#lightGBM com cross validation,e otimização de hiperparametros (COM balanceamento de classes)

# Transform categorical columns to type 'category'
df["genero"] = df["genero"].astype("category")
df["estado_civil"] = df["estado_civil"].astype("category")

# Scale 'idade'
scaler = StandardScaler()
df["idade"] = scaler.fit_transform(df[["idade"]])

# Split into features and target
features = df.drop('bloco', axis=1)
target = df['bloco']

# Define cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define model with current hyperparameters
lgbm = lgb.LGBMClassifier(n_estimators=50, learning_rate=0.01, max_depth=7, num_leaves=31, is_unbalance=True)
# Balance automatically imbalanced classes

# Initialize lists to store accuracy scores for each fold
valid_scores = []
train_scores = []
macro_f1_scores = []

# Loop through each fold
for train_index, valid_index in cv.split(features, target):
    # Split data into train and validation sets
    lgbm_train_features = features.iloc[train_index]
    lgbm_valid_features = features.iloc[valid_index]
    train_target = target.iloc[train_index]
    valid_target = target.iloc[valid_index]

    # Fit the model to the training set
    lgbm.fit(lgbm_train_features, train_target)

    # Make predictions on the validation and training sets
    lgbm_valid_pred = lgbm.predict(lgbm_valid_features)
    lgbm_train_pred = lgbm.predict(lgbm_train_features)

    # Calculate accuracy score for the validation and training sets
    valid_acc = accuracy_score(valid_target, lgbm_valid_pred)
    train_acc = accuracy_score(train_target, lgbm_train_pred)
    #classification report de Random Forest sem balanceamento 
    report = classification_report(valid_target, lgbm_valid_pred, output_dict=True)
    macro_f1 = precision_recall_fscore_support(valid_target, lgbm_valid_pred)[2].mean()

    # Append scores to lists
    valid_scores.append(valid_acc)
    train_scores.append(train_acc)
    macro_f1_scores.append(macro_f1)

# Calculate mean accuracy scores across all folds
mean_valid_score = np.mean(valid_scores)
mean_train_score = np.mean(train_scores)
mean_macro_f1 = np.mean(macro_f1_scores)

# Print results
print("Acurácia média do treino:", best_train_accuracy)
print("Melhor acurácia média na validação:", best_accuracy)
print("Macro F1 score:", mean_macro_f1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["genero"] = df["genero"].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["estado_civil"] = df["estado_civil"].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["idade"] = scaler.fit_transform(df[["idade"]])
  _warn_prf(average, modifier, msg_start, len(res

Acurácia média do treino: 0.34092297493370693
Melhor acurácia média na validação: 0.3326997564189594
Macro F1 score: 0.05075800402296554


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Macro F1 scores de LightGBM baixos. Melhor usar a Random Forest.

Mas a CatBoost usada pelo Matheus, apesar de acurácia mais baixa, tem um F1 macro melhor, o que faz diminuir as chances de escolher o bloco mais comum de forma desproporcional.