In [22]:
# data manipulation
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# training
from sklearn.model_selection import train_test_split, cross_val_predict, RandomizedSearchCV, StratifiedKFold

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# models
from sklearn.tree import DecisionTreeClassifier

# metrics
from sklearn.metrics import recall_score, precision_score, f1_score, make_scorer

%matplotlib inline

## Lendo o dataset

In [2]:
column_names = []
with open('data/spambase.names', 'r') as f:
    for line in f:

        if line.strip().endswith('continuous.'):
            column_names.append(line[:line.index(':')])
    
    f.close()

column_names.append('spam')

In [4]:
df = pd.read_csv('data/spambase.data', names=column_names)
df.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
word_freq_make,0.0,0.21,0.06,0.0,0.0,0.0,0.0,0.0,0.15,0.06
word_freq_address,0.64,0.28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.12
word_freq_all,0.64,0.5,0.71,0.0,0.0,0.0,0.0,0.0,0.46,0.77
word_freq_3d,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
word_freq_our,0.32,0.14,1.23,0.63,0.63,1.85,1.92,1.88,0.61,0.19
word_freq_over,0.0,0.28,0.19,0.0,0.0,0.0,0.0,0.0,0.0,0.32
word_freq_remove,0.0,0.21,0.19,0.31,0.31,0.0,0.0,0.0,0.3,0.38
word_freq_internet,0.0,0.07,0.12,0.63,0.63,1.85,0.0,1.88,0.0,0.0
word_freq_order,0.0,0.0,0.64,0.31,0.31,0.0,0.0,0.0,0.92,0.06
word_freq_mail,0.0,0.94,0.25,0.63,0.63,0.0,0.64,0.0,0.76,0.0


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4601 entries, 0 to 4600
Data columns (total 58 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   word_freq_make              4601 non-null   float64
 1   word_freq_address           4601 non-null   float64
 2   word_freq_all               4601 non-null   float64
 3   word_freq_3d                4601 non-null   float64
 4   word_freq_our               4601 non-null   float64
 5   word_freq_over              4601 non-null   float64
 6   word_freq_remove            4601 non-null   float64
 7   word_freq_internet          4601 non-null   float64
 8   word_freq_order             4601 non-null   float64
 9   word_freq_mail              4601 non-null   float64
 10  word_freq_receive           4601 non-null   float64
 11  word_freq_will              4601 non-null   float64
 12  word_freq_people            4601 non-null   float64
 13  word_freq_report            4601 

In [9]:
# separando em conjunto de treino e de teste
X = df.drop(columns=['spam'])
y = df['spam']

# divide o dataset em treino e teste, com stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3220, 57), (1381, 57), (3220,), (1381,))

In [17]:
stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [19]:
score = make_scorer(f1_score)

In [24]:
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Treinamento dos modelos
Testaremos treinar com e sem usar scaling, para comparar os resultados

### Árvore de decisão


#### Sem Scaling

In [13]:
# Utilizando os hiperparâmetros padrões do scikit-learn
dt_clf = DecisionTreeClassifier(random_state=42)
preds = cross_val_predict(dt_clf, X_train, y_train, cv=5)

print(f"Precision: {precision_score(y_train, preds)}")
print(f"Recall: {recall_score(y_train, preds)}")
print(f"F1-score: {f1_score(y_train, preds)}")

Precision: 0.8701700154559505
Recall: 0.8873128447596532
F1-score: 0.8786578228638314


Tuning de hiperparâmetros

In [20]:
dt_grid = {
    'max_depth': [2, 4, 6, 8, 10, None],
    'min_samples_split': [2, 4, 6, 8],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, "sqrt", "log2"]
}

rs_dt = RandomizedSearchCV(
    DecisionTreeClassifier(random_state=42), 
    dt_grid, 
    random_state=42, 
    n_iter=100, 
    scoring=score,
    cv=stratified_cv,
    n_jobs=-1,
    verbose=2
)

rs_dt.fit(X_train, y_train)
print(f"Melhores hiperparametros: {rs_dt.best_params_}")
print(f"Melhor F1 score: {rs_dt.best_score_}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=4; total time=   0.0s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=4; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=4; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=4; total time=   0.0s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=4; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=4; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=4; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=4; total time=   0.0s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=4; total time=   0.0s
[

#### Com scaling

In [25]:
dt_clf = DecisionTreeClassifier(random_state=42)
preds = cross_val_predict(dt_clf, X_train_scaled, y_train, cv=5)

print(f"Precision: {precision_score(y_train, preds)}")
print(f"Recall: {recall_score(y_train, preds)}")
print(f"F1-score: {f1_score(y_train, preds)}")

Precision: 0.8688271604938271
Recall: 0.8873128447596532
F1-score: 0.877972709551657


In [26]:
dt_grid = {
    'max_depth': [2, 4, 6, 8, 10, None],
    'min_samples_split': [2, 4, 6, 8],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, "sqrt", "log2"]
}

rs_dt = RandomizedSearchCV(
    DecisionTreeClassifier(random_state=42), 
    dt_grid, 
    random_state=42, 
    n_iter=100, 
    scoring=score,
    cv=stratified_cv,
    n_jobs=-1,
    verbose=2
)

rs_dt.fit(X_train_scaled, y_train)
print(f"Melhores hiperparametros: {rs_dt.best_params_}")
print(f"Melhor F1 score: {rs_dt.best_score_}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=4; total time=   0.0s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=4; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=4; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=4; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=4; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=4; total time=   0.0s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=4; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=4; total time=   0.0s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=4; total time=   0.0s
[

### Bayesiano ingênuo

### Regressão Logística

### K-Vizinhos