# Import Libraries

In [1]:
import pandas as pd

In [2]:
names = pd.read_csv('spambase.names',skiprows=32,sep=':/s+',engine='python', names=['attr',''])
names=names['attr']
names=list(names)
names.append('spam_type')
names

['word_freq_make:         continuous.',
 'word_freq_address:      continuous.',
 'word_freq_all:          continuous.',
 'word_freq_3d:           continuous.',
 'word_freq_our:          continuous.',
 'word_freq_over:         continuous.',
 'word_freq_remove:       continuous.',
 'word_freq_internet:     continuous.',
 'word_freq_order:        continuous.',
 'word_freq_mail:         continuous.',
 'word_freq_receive:      continuous.',
 'word_freq_will:         continuous.',
 'word_freq_people:       continuous.',
 'word_freq_report:       continuous.',
 'word_freq_addresses:    continuous.',
 'word_freq_free:         continuous.',
 'word_freq_business:     continuous.',
 'word_freq_email:        continuous.',
 'word_freq_you:          continuous.',
 'word_freq_credit:       continuous.',
 'word_freq_your:         continuous.',
 'word_freq_font:         continuous.',
 'word_freq_000:          continuous.',
 'word_freq_money:        continuous.',
 'word_freq_hp:           continuous.',


## Importing Dataset using Pandas Library

In [3]:
df = pd.read_csv('spambase.csv',names=names)

In [4]:
df.head()

Unnamed: 0,word_freq_make: continuous.,word_freq_address: continuous.,word_freq_all: continuous.,word_freq_3d: continuous.,word_freq_our: continuous.,word_freq_over: continuous.,word_freq_remove: continuous.,word_freq_internet: continuous.,word_freq_order: continuous.,word_freq_mail: continuous.,...,char_freq_;: continuous.,char_freq_(: continuous.,char_freq_[: continuous.,char_freq_!: continuous.,char_freq_$: continuous.,char_freq_#: continuous.,capital_run_length_average: continuous.,capital_run_length_longest: continuous.,capital_run_length_total: continuous.,spam_type
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


## Setting feature matrix and values to be predicted

In [5]:
X = df.iloc[:, :-1].values
y = df.iloc[:,-1].values

# Cleaning DataSet

#### Checking whether there are null values

In [6]:
df.isnull().sum()

word_freq_make:         continuous.        0
word_freq_address:      continuous.        0
word_freq_all:          continuous.        0
word_freq_3d:           continuous.        0
word_freq_our:          continuous.        0
word_freq_over:         continuous.        0
word_freq_remove:       continuous.        0
word_freq_internet:     continuous.        0
word_freq_order:        continuous.        0
word_freq_mail:         continuous.        0
word_freq_receive:      continuous.        0
word_freq_will:         continuous.        0
word_freq_people:       continuous.        0
word_freq_report:       continuous.        0
word_freq_addresses:    continuous.        0
word_freq_free:         continuous.        0
word_freq_business:     continuous.        0
word_freq_email:        continuous.        0
word_freq_you:          continuous.        0
word_freq_credit:       continuous.        0
word_freq_your:         continuous.        0
word_freq_font:         continuous.        0
word_freq_

#### Checking for number of Duplicates and then dropping them

In [7]:
print(df.duplicated().sum())
df.drop_duplicates(inplace=True)

391


# Doing train_test_spliting

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 5)

# Training the model

In [10]:
from sklearn.tree import DecisionTreeClassifier

### Searching most accurate value for max depth hyperparameter

In [11]:
from sklearn.model_selection import GridSearchCV

# Define the values for max depth to be tested on
param_grid = {'max_depth': range(1,20)}

# Create a decision tree classifier
clf = DecisionTreeClassifier(random_state=5,criterion='entropy')

# Perform a grid search over the parameter grid using 5-fold cross-validation
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Print the accuracy scores for each max depth
results = grid_search.cv_results_
for i in range(len(results['params'])):
    print("Max depth:",results['params'][i]['max_depth'],"    ", "Mean Accuracy :", results['mean_test_score'][i])

# Print the best max depth value and its corresponding cross-validation score
print("Best max depth value: ", grid_search.best_params_['max_depth'])
print("Best cross-validation score: ", grid_search.best_score_)


Max depth: 1      Mean Accuracy : 0.7942934782608696
Max depth: 2      Mean Accuracy : 0.8646739130434783
Max depth: 3      Mean Accuracy : 0.8695652173913043
Max depth: 4      Mean Accuracy : 0.8967391304347826
Max depth: 5      Mean Accuracy : 0.9008152173913043
Max depth: 6      Mean Accuracy : 0.9119565217391304
Max depth: 7      Mean Accuracy : 0.9130434782608695
Max depth: 8      Mean Accuracy : 0.9119565217391304
Max depth: 9      Mean Accuracy : 0.9165760869565218
Max depth: 10      Mean Accuracy : 0.9165760869565218
Max depth: 11      Mean Accuracy : 0.9209239130434783
Max depth: 12      Mean Accuracy : 0.9168478260869565
Max depth: 13      Mean Accuracy : 0.9176630434782608
Max depth: 14      Mean Accuracy : 0.916304347826087
Max depth: 15      Mean Accuracy : 0.913586956521739
Max depth: 16      Mean Accuracy : 0.913586956521739
Max depth: 17      Mean Accuracy : 0.9127717391304347
Max depth: 18      Mean Accuracy : 0.9127717391304347
Max depth: 19      Mean Accuracy : 0.914

In [27]:
clf = DecisionTreeClassifier(random_state=5,criterion='entropy',max_depth=11)
clf.fit(X_train,y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=11, random_state=5)


# Checking accuracy of testing dataset

In [13]:
from sklearn.metrics import accuracy_score

In [14]:
predictions_test=clf.predict(X_test)

accuracy_score(y_test, predictions_test)

0.9315960912052117

# Checking accuracy of training dataset

In [15]:
predictions_train = clf.predict(X_train)

accuracy_score(y_train,predictions_train)

0.9627717391304348

# Evaluating our test dataset

In [16]:
from sklearn.metrics import classification_report,confusion_matrix

#### Confusion Matrix

In [17]:
cm = confusion_matrix(y_test, predictions_test)
print("Confusion Matrix for Testing Data:")
print(cm)

Confusion Matrix for Testing Data:
[[544  31]
 [ 32 314]]


#### Accuracy Score

In [26]:
from sklearn.metrics import accuracy_score

print("Accuracy :",accuracy_score(y_test, predictions_test))

Accuracy : 0.9315960912052117


#### F1 Score

In [23]:
from sklearn.metrics import f1_score

print("F1 Score :",f1_score(y_test, predictions_test))

F1 Score : 0.9088277858176556


#### Recall Score

In [24]:
from sklearn.metrics import recall_score

print("Recall Score :",recall_score(y_test, predictions_test))

Recall Score : 0.9075144508670521


#### Precision Score

In [25]:
from sklearn.metrics import precision_score

print("Precision Score :",precision_score(y_test, predictions_test))

Precision Score : 0.9101449275362319


#### Classification Report

In [18]:
print(classification_report(y_test,predictions_test))

              precision    recall  f1-score   support

           0       0.94      0.95      0.95       575
           1       0.91      0.91      0.91       346

    accuracy                           0.93       921
   macro avg       0.93      0.93      0.93       921
weighted avg       0.93      0.93      0.93       921



# Evaluating our training dataset

#### Confusion Matrix

In [19]:
cm_train = confusion_matrix(y_train, predictions_train)
print("Confusion Matrix for Training Data:")
print(cm_train)

Confusion Matrix for Training Data:
[[2163   50]
 [  87 1380]]


#### Accuracy

In [20]:
accuracy = accuracy_score(y_train, predictions_train)
accuracy

0.9627717391304348

#### Classification Report

In [21]:
print(classification_report(y_train,predictions_train))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      2213
           1       0.97      0.94      0.95      1467

    accuracy                           0.96      3680
   macro avg       0.96      0.96      0.96      3680
weighted avg       0.96      0.96      0.96      3680

