In [1]:
%matplotlib inline

from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import matplotlib.pylab as plt
from dmba import plotDecisionTree, classificationSummary, regressionSummary


In [2]:
spam_df = pd.read_csv('Spambase.csv')

X = spam_df.drop(columns=['Spam'])
y = spam_df['Spam']
X.rename(columns = {'C[':'CSquare'}, inplace = True) 
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.3, random_state=1)



## Decession Tree

In [3]:
smallClassTree = DecisionTreeClassifier(max_depth=30, min_samples_split=20, min_impurity_decrease=0.01)
smallClassTree.fit(train_X, train_y)

plotDecisionTree(smallClassTree, feature_names=train_X.columns)

InvocationException: GraphViz's executables not found

In [None]:
classificationSummary(train_y, smallClassTree.predict(train_X))
classificationSummary(valid_y, smallClassTree.predict(valid_X))

Confusion Matrix (Accuracy 0.9003)

       Prediction
Actual    0    1
     0 1847  100
     1  221 1052
Confusion Matrix (Accuracy 0.8914)

       Prediction
Actual   0   1
     0 802  39
     1 111 429


## Random Forest

In [None]:
rf = RandomForestClassifier(n_estimators=100,max_depth=6)
rf.fit(train_X, train_y)
classificationSummary(train_y, rf.predict(train_X))
classificationSummary(valid_y, rf.predict(valid_X))

Confusion Matrix (Accuracy 0.9422)

       Prediction
Actual    0    1
     0 1903   44
     1  142 1131
Confusion Matrix (Accuracy 0.9334)

       Prediction
Actual   0   1
     0 825  16
     1  76 464


## AdaBoost

In [None]:
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                         n_estimators=100)

bdt.fit(train_X, train_y)
classificationSummary(train_y, bdt.predict(train_X))
classificationSummary(valid_y, bdt.predict(valid_X))


Confusion Matrix (Accuracy 0.9602)

       Prediction
Actual    0    1
     0 1888   59
     1   69 1204
Confusion Matrix (Accuracy 0.9471)

       Prediction
Actual   0   1
     0 811  30
     1  43 497


## XGBoost

In [None]:
# pip install xgboost

# XGBoost is not from sklearn, so the features nameing convention is differenct. 
# in our dataset we have a feature name is "C[". XGBoost do not aloow that. 
# We need to rename the feature.


from xgboost import XGBClassifier
model = XGBClassifier()
model.fit(train_X, train_y)
classificationSummary(train_y, model.predict(train_X))
classificationSummary(valid_y, model.predict(valid_X))

Confusion Matrix (Accuracy 0.9984)

       Prediction
Actual    0    1
     0 1946    1
     1    4 1269
Confusion Matrix (Accuracy 0.9566)

       Prediction
Actual   0   1
     0 815  26
     1  34 506


In [None]:
# the above code seems overfitting. let's control it. 
#by increase the value of gamma
# you may further expriment decreast the learning rate etc. OR just do a grid search.

model = XGBClassifier(gamma=0.3)
model.fit(train_X, train_y)
classificationSummary(train_y, model.predict(train_X))
classificationSummary(valid_y, model.predict(valid_X))

Confusion Matrix (Accuracy 0.9981)

       Prediction
Actual    0    1
     0 1946    1
     1    5 1268
Confusion Matrix (Accuracy 0.9623)

       Prediction
Actual   0   1
     0 820  21
     1  31 509


## Gradient Boost

In [None]:
boost = GradientBoostingClassifier()
boost.fit(train_X, train_y)
classificationSummary(train_y, boost.predict(train_X))
classificationSummary(valid_y, boost.predict(valid_X))

Confusion Matrix (Accuracy 0.9627)

       Prediction
Actual    0    1
     0 1896   51
     1   69 1204
Confusion Matrix (Accuracy 0.9522)

       Prediction
Actual   0   1
     0 820  21
     1  45 495


## knn

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

grid_params ={'n_neighbors':[3,5,7,9],
              'weights':['uniform','distance']}

gs = GridSearchCV(KNeighborsClassifier(),grid_params,cv=5,n_jobs=-1,scoring='accuracy')
gs.fit(train_X,train_y)
print(gs.best_params_)
bestKnn= gs.best_estimator_

classificationSummary(train_y, bestKnn.predict(train_X))
classificationSummary(valid_y, bestKnn.predict(valid_X))

{'n_neighbors': 9, 'weights': 'distance'}
Confusion Matrix (Accuracy 0.9997)

       Prediction
Actual    0    1
     0 1947    0
     1    1 1272
Confusion Matrix (Accuracy 0.8168)

       Prediction
Actual   0   1
     0 721 120
     1 133 407


## It seems overfitting is a big issue here. 