# Spambase

This notebook uses the [Spambase Data Set](https://archive.ics.uci.edu/ml/datasets/Spambase) from the UCI Machine Learning Repository.

## Modelling

In [None]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [None]:
# Column names for the Spambase table, in the order listed here: 48
# word-frequency features, 6 character-frequency features, 3
# capital-run-length statistics, and finally the "label" column.
_word_tokens = (
    "make address all 3d our over remove internet order mail receive will "
    "people report addresses free business email you credit your font 000 "
    "money hp hpl george 650 lab labs telnet 857 data 415 85 technology "
    "1999 parts pm direct cs meeting original project re edu table conference"
).split()
_char_tokens = [";", "(", "[", "!", "$", "#"]
_run_length_stats = ["average", "longest", "total"]

columns = [
    *(f"word_freq_{token}" for token in _word_tokens),
    *(f"char_freq_{token}" for token in _char_tokens),
    *(f"capital_run_length_{stat}" for stat in _run_length_stats),
    "label",
]

In [None]:
# Location of the raw Spambase data file on the UCI archive server.
url = (
    "https://archive.ics.uci.edu"
    "/ml/machine-learning-databases/spambase/spambase.data"
)

In [None]:
# Load the comma-separated data straight from `url`. `header=None` spells out
# that every row is data and that the column names come from `columns`.
df = pd.read_csv(url, sep=",", header=None, names=columns)

In [None]:
# Fix the seed so the three-way split -- and therefore every accuracy figure
# and partial-dependence result derived from it -- is reproducible across runs.
RANDOM_STATE = 0

# First hold out 25% of all rows as the test split, then carve 25% of the
# remainder off as the evaluation split (train ~56%, eval ~19%, test 25%).
df_train_split, df_test_split = train_test_split(
    df, test_size=0.25, random_state=RANDOM_STATE
)
df_train_split, df_eval_split = train_test_split(
    df_train_split, test_size=0.25, random_state=RANDOM_STATE
)

# Independent copies, so later changes to df_train / df_eval cannot affect
# the frames returned by the split.
df_train = df_train_split.copy()
df_eval = df_eval_split.copy()

In [None]:
# Preview the first five training rows as a quick sanity check.
df_train.head(n=5)

In [None]:
# Training split: pull out the target vector, then keep every remaining
# column as the feature matrix.
y_train = df_train["label"].values
X_train = df_train.drop("label", axis=1)

In [None]:
# Evaluation split, separated the same way as the training split:
# "label" becomes the target array and all other columns are features.
y_eval = df_eval["label"].values
X_eval = df_eval.drop("label", axis=1)

In [None]:
# Chain scaling and the SVM in one pipeline so the scaler is fitted on the
# training data and re-applied automatically whenever `clf` predicts.
feature_scaler = StandardScaler()
svm_classifier = SVC(gamma="auto")
clf = make_pipeline(feature_scaler, svm_classifier)
clf.fit(X_train, y_train)

In [None]:
# Score the fitted pipeline on the evaluation split.
# NOTE(review): despite its name, `train_preds` holds predictions for X_eval.
eval_output = clf.predict(X_eval)
train_preds = np.where(eval_output > 0.5, 1, 0)  # threshold to 0/1 labels
is_correct = train_preds == y_eval
# Accuracy: fraction of evaluation rows whose prediction matches the label.
is_correct.sum() / len(df_eval)

## Partial Dependence

In [None]:
import pdpexplorer
from pdpexplorer.pdp import partial_dependence, plot

In [None]:
# Rebind `df` to the training features: from here on it no longer refers to
# the full table loaded by read_csv. This is the frame handed to the
# partial-dependence calls that follow.
df = X_train

In [None]:
# Every column of `df` is used as a one-way partial-dependence feature.
features = df.columns.tolist()

In [None]:
# Precompute partial-dependence data for the fitted pipeline: one-way results
# for every feature, no two-way pairs, and no one-hot encoded groups.
# NOTE(review): n_instances / resolution presumably set the number of sampled
# rows and the number of grid points per feature -- confirm against the
# pdpexplorer documentation.
pd_data = partial_dependence(
    df=df,
    predict=clf.predict,
    feature_to_one_hot={},
    one_way_features=features,
    two_way_feature_pairs=[],
    resolution=40,
    n_instances=500,
    n_jobs=4,
)

In [None]:
# Build the explorer widget from the precomputed partial-dependence data,
# using the same prediction function and feature frame that produced it.
w = pdpexplorer.PDPExplorerWidget(
    df=df,
    predict=clf.predict,
    pd_data=pd_data,
    feature_to_one_hot={},
    height=600,
    n_jobs=4,
)

# Leave the widget as the cell's last expression so the notebook renders it.
w