# Spambase

## Modelling

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from pmlb import fetch_data

In [None]:
# Column names for the data frame, in positional order: 48 `word_freq_*`
# columns, 6 `char_freq_*` columns, 3 `capital_run_length_*` columns and,
# last, the `target` column.
_WORD_TOKENS = (
    "make address all 3d our over remove internet order mail receive will "
    "people report addresses free business email you credit your font 000 "
    "money hp hpl george 650 lab labs telnet 857 data 415 85 technology "
    "1999 parts pm direct cs meeting original project re edu table conference"
).split()
_CHAR_TOKENS = ";([!$#"
_RUN_LENGTH_STATS = ("average", "longest", "total")

columns = (
    [f"word_freq_{token}" for token in _WORD_TOKENS]
    + [f"char_freq_{token}" for token in _CHAR_TOKENS]
    + [f"capital_run_length_{stat}" for stat in _RUN_LENGTH_STATS]
    + ["target"]
)

In [None]:
# Load the 'spambase' dataset through pmlb's fetch_data, then replace its
# column labels with the names listed in `columns` (assigned positionally, so
# `columns` must have exactly one entry per column of the returned frame).
df = fetch_data('spambase')
df.columns = columns

In [None]:
# Fixed seed so the train / eval / test partition is the same on every run.
# Without it each execution reshuffles the rows, so scores from different
# runs of the notebook are not comparable.
RANDOM_STATE = 42

# Two-stage split: hold out 25% of all rows as the test split, then take 25%
# of the remaining rows as the evaluation split; what is left is for training.
df_train_split, df_test_split = train_test_split(
    df, test_size=0.25, random_state=RANDOM_STATE
)
df_train_split, df_eval_split = train_test_split(
    df_train_split, test_size=0.25, random_state=RANDOM_STATE
)

# Work on independent copies of the split frames.
df_train = df_train_split.copy()
df_eval = df_eval_split.copy()

In [None]:
# Preview the first rows of the training frame (bare expression, so the
# notebook displays it as the cell output).
df_train.head()

In [None]:
# Training inputs: every column except the label; training labels: the
# 'target' column as a NumPy array.
X_train = df_train.drop('target', axis=1)
y_train = df_train['target'].to_numpy()

In [None]:
# Evaluation inputs: every column except the label; evaluation labels: the
# 'target' column as a NumPy array.
X_eval = df_eval.drop('target', axis=1)
y_eval = df_eval['target'].to_numpy()

In [None]:
# Hyper-parameter grid for the random forest. Two options for each of five
# parameters and a single option for the other two: 32 combinations in total.
param_grid = {
    'n_estimators': [100],
    'criterion': ['entropy'],
    'bootstrap': [True, False],
    'max_features': ['sqrt', 1.0],
    'max_depth': [6, 12],
    'min_samples_split': [2, 8],
    'class_weight': ['balanced', None]
}

# Exhaustive search over the grid, ranking candidates by F1 score and using
# all available cores (n_jobs=-1). The forest is seeded so that repeated runs
# on the same data produce the same best_score_ / best_params_ /
# best_estimator_; an unseeded forest makes those vary from run to run.
cv = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='f1',
    n_jobs=-1,
)

cv.fit(X_train, y_train)

In [None]:
# Best cross-validated score found by the grid search (scored with 'f1').
cv.best_score_

In [None]:
# Parameter combination from `param_grid` that achieved the best score.
cv.best_params_

In [None]:
# Keep the estimator built with the best parameter combination; `predict`
# below reads this module-level `clf`.
clf = cv.best_estimator_

## Partial Dependence

In [None]:
from pdpexplorer import partial_dependence, PDPExplorerWidget

In [None]:
def predict(X):
    """Return column 1 of ``clf.predict_proba(X)`` for every row of ``X``.

    Reads the module-level ``clf``. Column 1 is presumably the probability of
    the positive (label 1) class -- confirm against ``clf.classes_``.
    """
    class_probabilities = clf.predict_proba(X)
    return class_probabilities[:, 1]

In [None]:
# Rebind `df` to the training features only. From here on `df` no longer
# refers to the full dataset loaded earlier and has no 'target' column.
df = X_train

In [None]:
# Names of all columns in `df`, as a plain Python list.
features = df.columns.tolist()

In [None]:
# Random sample of 500 rows from `df`.
# NOTE(review): `subset` is not referenced by any later cell visible here;
# the partial-dependence call and the widget are both given the full `df`.
subset = df.sample(n=500)

In [None]:
# Compute partial-dependence data for every column in `features`, using
# `predict` as the model function and `df` as the data, with 4 parallel jobs.
pd_data = partial_dependence(
    df=df,
    features=features,
    predict=predict,
    resolution=10,
    n_jobs=4,
)

In [None]:
# Interactive explorer widget over the precomputed partial-dependence data,
# given the same `predict` function and `df` as the computation.
w = PDPExplorerWidget(
    pd_data=pd_data,
    df=df,
    predict=predict,
    height=650,
)

# Bare expression as the last statement so the notebook renders the widget.
w