In [2]:
# data manipulation
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# training
from sklearn.model_selection import train_test_split, cross_val_predict, RandomizedSearchCV, StratifiedKFold

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# models
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# metrics
from sklearn.metrics import recall_score, precision_score, f1_score, make_scorer

%matplotlib inline

## Lendo o dataset

In [4]:
column_names = []
with open('data/spambase.names', 'r') as f:
    for line in f:

        if line.strip().endswith('continuous.'):
            column_names.append(line[:line.index(':')])

    f.close()

column_names.append('spam')

In [6]:
df = pd.read_csv('data/spambase.data', names=column_names)
df.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
word_freq_make,0.0,0.21,0.06,0.0,0.0,0.0,0.0,0.0,0.15,0.06
word_freq_address,0.64,0.28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.12
word_freq_all,0.64,0.5,0.71,0.0,0.0,0.0,0.0,0.0,0.46,0.77
word_freq_3d,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
word_freq_our,0.32,0.14,1.23,0.63,0.63,1.85,1.92,1.88,0.61,0.19
word_freq_over,0.0,0.28,0.19,0.0,0.0,0.0,0.0,0.0,0.0,0.32
word_freq_remove,0.0,0.21,0.19,0.31,0.31,0.0,0.0,0.0,0.3,0.38
word_freq_internet,0.0,0.07,0.12,0.63,0.63,1.85,0.0,1.88,0.0,0.0
word_freq_order,0.0,0.0,0.64,0.31,0.31,0.0,0.0,0.0,0.92,0.06
word_freq_mail,0.0,0.94,0.25,0.63,0.63,0.0,0.64,0.0,0.76,0.0


In [7]:
# sem dados ausentes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4601 entries, 0 to 4600
Data columns (total 58 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   word_freq_make              4601 non-null   float64
 1   word_freq_address           4601 non-null   float64
 2   word_freq_all               4601 non-null   float64
 3   word_freq_3d                4601 non-null   float64
 4   word_freq_our               4601 non-null   float64
 5   word_freq_over              4601 non-null   float64
 6   word_freq_remove            4601 non-null   float64
 7   word_freq_internet          4601 non-null   float64
 8   word_freq_order             4601 non-null   float64
 9   word_freq_mail              4601 non-null   float64
 10  word_freq_receive           4601 non-null   float64
 11  word_freq_will              4601 non-null   float64
 12  word_freq_people            4601 non-null   float64
 13  word_freq_report            4601 

In [8]:
# separando em conjunto de treino e de teste
X = df.drop(columns=['spam'])
y = df['spam']

# divide o dataset em treino e teste, com stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3220, 57), (1381, 57), (3220,), (1381,))

In [9]:
# usando StratifiedKFold para garantir que seja feita uma amostragem estratificada durante o hypertuning
stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [10]:
# usando f1 como métrica para o randomized search cross validation
score = make_scorer(f1_score)

In [11]:
# dataset com scaling
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Treinamento dos modelos
Testaremos treinar com e sem usar scaling, para comparar os resultados

### Árvore de decisão


#### Sem Scaling

In [None]:
# Utilizando os hiperparâmetros padrões do scikit-learn
dt_clf = DecisionTreeClassifier(random_state=42)
preds = cross_val_predict(dt_clf, X_train, y_train, cv=5)

print(f"Precision: {precision_score(y_train, preds)}")
print(f"Recall: {recall_score(y_train, preds)}")
print(f"F1-score: {f1_score(y_train, preds)}")

Precision: 0.8701700154559505
Recall: 0.8873128447596532
F1-score: 0.8786578228638314


Tuning de hiperparâmetros

In [None]:
dt_grid = {
    'max_depth': [2, 4, 6, 8, 10, None],
    'min_samples_split': [2, 4, 6, 8],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, "sqrt", "log2"]
}

rs_dt = RandomizedSearchCV(
    DecisionTreeClassifier(random_state=42),
    dt_grid,
    random_state=42,
    n_iter=100,
    scoring=score,
    cv=stratified_cv,
    n_jobs=-1,
    verbose=2
)

rs_dt.fit(X_train, y_train)
print(f"Melhores hiperparametros: {rs_dt.best_params_}")
print(f"Melhor F1 score: {rs_dt.best_score_}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Melhores hiperparametros: {'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None, 'max_depth': 10}
Melhor F1 score: 0.8897705246431116


#### Com scaling

In [None]:
dt_clf = DecisionTreeClassifier(random_state=42)
preds = cross_val_predict(dt_clf, X_train_scaled, y_train, cv=5)

print(f"Precision: {precision_score(y_train, preds)}")
print(f"Recall: {recall_score(y_train, preds)}")
print(f"F1-score: {f1_score(y_train, preds)}")

Precision: 0.8688271604938271
Recall: 0.8873128447596532
F1-score: 0.877972709551657


In [None]:
dt_grid = {
    'max_depth': [2, 4, 6, 8, 10, None],
    'min_samples_split': [2, 4, 6, 8],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, "sqrt", "log2"]
}

rs_dt = RandomizedSearchCV(
    DecisionTreeClassifier(random_state=42),
    dt_grid,
    random_state=42,
    n_iter=100,
    scoring=score,
    cv=stratified_cv,
    n_jobs=-1,
    verbose=2
)

rs_dt.fit(X_train_scaled, y_train)
print(f"Melhores hiperparametros: {rs_dt.best_params_}")
print(f"Melhor F1 score: {rs_dt.best_score_}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Melhores hiperparametros: {'min_samples_split': 8, 'min_samples_leaf': 1, 'max_features': None, 'max_depth': 8}
Melhor F1 score: 0.8894693778747339


No final das contas temos que o dataset sem scaling ficou com o f1 score levemente melhor que o dataset com scaling (88.97% contra 88.94%, no conjunto de validação do cross validation). A melhor combinação de hiperparâmetros ficou como {'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None, 'max_depth': 10}.

### Bayesiano ingênuo

### Regressão Logística

#### Definindo o modelo de regressão logística

In [12]:
# Criando o modelo de regressão logística
log_reg = LogisticRegression(random_state=42, max_iter=2000)

#### Sem Scaling

In [14]:
# Treinamento sem scaling, utilizando validação cruzada
preds = cross_val_predict(log_reg, X_train, y_train, cv=5)

# Avaliando as métricas de desempenho
print(f"Precision: {precision_score(y_train, preds)}")
print(f"Recall: {recall_score(y_train, preds)}")
print(f"F1-score: {f1_score(y_train, preds)}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Precision: 0.9131832797427653
Recall: 0.8951930654058313
F1-score: 0.9040986868284918


Tuning de hiperparâmetros

In [16]:
# Definição da grade de hiperparâmetros corrigida
param_grid = {
    'solver': ['liblinear', 'saga'],  # 'liblinear' suporta apenas 'l1' e 'l2'; 'saga' suporta 'elasticnet' também
    'penalty': ['l1', 'l2'],  # Removendo 'none' pois causa erro
    'C': np.logspace(-2, 1, 5)  # [0.01, 0.1, 1, 10, 100] Valores de regularização
}

# Criando o modelo base
log_reg = LogisticRegression(max_iter=7000)  # Aumentando max_iter para evitar erro de convergência

# RandomizedSearchCV corrigido
random_search = RandomizedSearchCV(
    estimator=log_reg,
    param_distributions=param_grid,
    n_iter=6,
    cv=5,
    scoring='f1',
    random_state=42,
    n_jobs=-1
)

# Ajustando o modelo
random_search.fit(X_train, y_train)

# Exibindo os melhores hiperparâmetros
print("Melhores hiperparâmetros:", random_search.best_params_)
print("Melhor F1 score:", random_search.best_score_)

Melhores hiperparâmetros: {'solver': 'liblinear', 'penalty': 'l1', 'C': 0.31622776601683794}
Melhor F1 score: 0.9000972808921993


#### Com scaling

In [17]:
# Usando o MinMaxScaler para escalonar os dados
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Treinando o modelo com validação cruzada nos dados escalonados
preds_scaled = cross_val_predict(log_reg, X_train_scaled, y_train, cv=5)

# Avaliando as métricas de desempenho nos dados escalonados
print(f"Precision (com scaling): {precision_score(y_train, preds_scaled)}")
print(f"Recall (com scaling): {recall_score(y_train, preds_scaled)}")
print(f"F1-score (com scaling): {f1_score(y_train, preds_scaled)}")


Precision (com scaling): 0.9000900090009001
Recall (com scaling): 0.7880220646178093
F1-score (com scaling): 0.8403361344537815


Tuning de Hiperparâmetros (com scaling)

In [18]:
# Usando os mesmos hiperparâmetros, mas agora para os dados escalonados
rs_log_reg_scaled = RandomizedSearchCV(
    log_reg,
    param_grid,
    random_state=42,
    n_iter=20,
    scoring='f1',
    cv=stratified_cv,
    n_jobs=-1,
    verbose=2
)

# Treinando o modelo com os melhores hiperparâmetros para dados escalonados
rs_log_reg_scaled.fit(X_train_scaled, y_train)

# Exibindo os melhores hiperparâmetros
print(f"Melhores hiperparâmetros (com scaling): {rs_log_reg_scaled.best_params_}")
print(f"Melhor F1 score (com scaling): {rs_log_reg_scaled.best_score_}")


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Melhores hiperparâmetros (com scaling): {'solver': 'saga', 'penalty': 'l1', 'C': 10.0}
Melhor F1 score (com scaling): 0.9037401630812191


### K-Vizinhos