# Spambase

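This notebook trains a support vector machine spam classifier on the [Spambase Data Set](https://archive.ics.uci.edu/ml/datasets/Spambase) and then explores the model's partial dependence with `pdpexplorer`.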

## Modelling

In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
# Column names applied to the Spambase CSV when it is read: the word-frequency
# features, the character-frequency features, three capital-run-length
# statistics and, last, the class label.
columns = [
    *(
        f"word_freq_{word}"
        for word in (
            "make address all 3d our over remove internet order mail receive will "
            "people report addresses free business email you credit your font 000 "
            "money hp hpl george 650 lab labs telnet 857 data 415 85 technology "
            "1999 parts pm direct cs meeting original project re edu table conference"
        ).split()
    ),
    *(f"char_freq_{char}" for char in ";([!$#"),
    "capital_run_length_average",
    "capital_run_length_longest",
    "capital_run_length_total",
    "label",
]

In [3]:
# Location of the raw Spambase data file on the UCI archive.
url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "spambase/spambase.data"
)

In [4]:
# Read the comma-separated data from `url`; the file's rows are all treated as
# data and the columns are labelled with the names in `columns`.
df = pd.read_csv(url, header=None, names=columns)

In [5]:
# Fixed seed so the train/eval/test partitions, and every metric and plot
# derived from them, are identical on each Restart & Run All.
SPLIT_SEED = 42

# First set aside 25% of the rows as a test split, then divide the remaining
# rows again into training (75%) and evaluation (25%) subsets.
df_train_split, df_test_split = train_test_split(
    df, test_size=0.25, random_state=SPLIT_SEED
)
df_train_split, df_eval_split = train_test_split(
    df_train_split, test_size=0.25, random_state=SPLIT_SEED
)

# Work on independent copies of the split frames.
df_train = df_train_split.copy()
df_eval = df_eval_split.copy()

In [6]:
# Preview the first five rows of the training split.
df_train.head(n=5)

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,label
4084,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.537,0.0,1.075,0.0,0.0,1.2,3,12,0
3074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.714,4,12,0
865,0.0,0.37,1.11,0.0,0.37,0.0,0.0,0.0,0.0,0.74,...,0.0,0.292,0.0,0.878,0.175,0.058,5.985,58,425,1
2948,0.0,0.0,0.24,0.0,0.0,0.0,0.0,0.0,0.0,0.48,...,0.0,0.034,0.0,0.0,0.0,0.0,3.202,87,285,0
3157,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.536,0.0,0.0,0.0,0.268,2.529,11,43,0


In [7]:
# Separate the training split into the target vector (the 'label' column as a
# NumPy array) and the feature frame (every other column).
y_train = df_train['label'].to_numpy()
X_train = df_train.drop('label', axis=1)

In [8]:
# Separate the evaluation split into the target vector (the 'label' column as a
# NumPy array) and the feature frame (every other column).
y_eval = df_eval['label'].to_numpy()
X_eval = df_eval.drop('label', axis=1)

In [9]:
# Standardise the features and fit an SVM classifier on the training split.
# `fit` returns the fitted pipeline itself, so it can be bound directly to `clf`.
clf = make_pipeline(StandardScaler(), SVC(gamma='auto')).fit(X_train, y_train)
clf

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(gamma='auto'))])

In [10]:
# `clf.predict` already returns the predicted class labels, so no extra
# thresholding is required. These are predictions for the evaluation split,
# hence the name.
eval_preds = clf.predict(X_eval)

# Accuracy on the evaluation split: the fraction of predictions equal to the
# true labels.
(eval_preds == y_eval).mean()

0.9200463499420626

## Partial Dependence

In [11]:
import pdpexplorer
from pdpexplorer.pdp import partial_dependence, plot

In [12]:
# Use the training features as the data for the partial-dependence analysis.
# NOTE(review): this rebinds `df`, which until now held the full dataset
# returned by `pd.read_csv` (including the 'label' column). Re-running the
# earlier split cell after this one would therefore split the label-free
# training features instead of the raw data — consider a distinct name.
df = X_train

In [13]:
# Names of all columns in `df`, as a plain Python list.
features = df.columns.tolist()

In [14]:
# Compute partial-dependence data for the fitted pipeline's `predict` function
# over every column listed in `features`. Only one-way features are requested;
# the list of two-way feature pairs is empty.
# NOTE(review): `n_instances`, `resolution` and `n_jobs` presumably set the
# number of sampled rows, the number of grid points per feature and the number
# of parallel workers — confirm against the pdpexplorer documentation.
pd_data = partial_dependence(
    predict=clf.predict,
    df=df,
    one_way_features=features,
    two_way_feature_pairs=[],
    n_instances=500,
    resolution=40,
    n_jobs=4,
)

In [15]:
# Build the PDP Explorer widget from the same prediction function and data
# frame, passing in the `pd_data` computed beforehand.
# NOTE(review): `height` is presumably the widget height in pixels — confirm
# against the pdpexplorer documentation.
w = pdpexplorer.PDPExplorerWidget(
    predict=clf.predict,
    df=df,
    pd_data=pd_data,
    n_jobs=4,
    height=600
)

# Leaving the widget as the cell's last expression displays it in the notebook.
w

PDPExplorerWidget(features=['capital_run_length_average', 'capital_run_length_longest', 'capital_run_length_to…