# Import Libraries

In [50]:
import pandas as pd

In [54]:
names = pd.read_csv('spambase.names',skiprows=32,sep=':/s+',engine='python', names=['attr',''])
names=names['attr']
names=list(names)
names.append('spam_type')
names

['word_freq_make:         continuous.',
 'word_freq_address:      continuous.',
 'word_freq_all:          continuous.',
 'word_freq_3d:           continuous.',
 'word_freq_our:          continuous.',
 'word_freq_over:         continuous.',
 'word_freq_remove:       continuous.',
 'word_freq_internet:     continuous.',
 'word_freq_order:        continuous.',
 'word_freq_mail:         continuous.',
 'word_freq_receive:      continuous.',
 'word_freq_will:         continuous.',
 'word_freq_people:       continuous.',
 'word_freq_report:       continuous.',
 'word_freq_addresses:    continuous.',
 'word_freq_free:         continuous.',
 'word_freq_business:     continuous.',
 'word_freq_email:        continuous.',
 'word_freq_you:          continuous.',
 'word_freq_credit:       continuous.',
 'word_freq_your:         continuous.',
 'word_freq_font:         continuous.',
 'word_freq_000:          continuous.',
 'word_freq_money:        continuous.',
 'word_freq_hp:           continuous.',


## Importing Dataset using Pandas Library

In [55]:
df = pd.read_csv('spambase.csv',names=names)

In [56]:
df.head()

Unnamed: 0,word_freq_make: continuous.,word_freq_address: continuous.,word_freq_all: continuous.,word_freq_3d: continuous.,word_freq_our: continuous.,word_freq_over: continuous.,word_freq_remove: continuous.,word_freq_internet: continuous.,word_freq_order: continuous.,word_freq_mail: continuous.,...,char_freq_;: continuous.,char_freq_(: continuous.,char_freq_[: continuous.,char_freq_!: continuous.,char_freq_$: continuous.,char_freq_#: continuous.,capital_run_length_average: continuous.,capital_run_length_longest: continuous.,capital_run_length_total: continuous.,spam_type
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


## Setting feature matrix and values to be predicted

In [4]:
X = df.iloc[:, :-1].values
y = df.iloc[:,-1].values

# Cleaning DataSet

#### Checking whether there are null values

In [5]:
df.isnull().sum()

0         0
0.64      0
0.64.1    0
0.1       0
0.32      0
0.2       0
0.3       0
0.4       0
0.5       0
0.6       0
0.7       0
0.64.2    0
0.8       0
0.9       0
0.10      0
0.32.1    0
0.11      0
1.29      0
1.93      0
0.12      0
0.96      0
0.13      0
0.14      0
0.15      0
0.16      0
0.17      0
0.18      0
0.19      0
0.20      0
0.21      0
0.22      0
0.23      0
0.24      0
0.25      0
0.26      0
0.27      0
0.28      0
0.29      0
0.30      0
0.31      0
0.33      0
0.34      0
0.35      0
0.36      0
0.37      0
0.38      0
0.39      0
0.40      0
0.41      0
0.42      0
0.43      0
0.778     0
0.44      0
0.45      0
3.756     0
61        0
278       0
1         0
dtype: int64

#### Checking for number of Duplicates and then dropping them

In [6]:
print(df.duplicated().sum())
df.drop_duplicates(inplace=True)

391


# Doing train_test_spliting

In [7]:
from sklearn.model_selection import train_test_split

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 5)

# Training the model

In [9]:
from sklearn.tree import DecisionTreeClassifier

### Searching most accurate value for max depth hyperparameter

In [126]:
from sklearn.model_selection import GridSearchCV

# Define the values for max depth to be tested on
param_grid = {'max_depth': range(1,20)}

# Create a decision tree classifier
clf = DecisionTreeClassifier(random_state=5,criterion='entropy')

# Perform a grid search over the parameter grid using 5-fold cross-validation
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Print the accuracy scores for each max depth
results = grid_search.cv_results_
for i in range(len(results['params'])):
    print("Max depth:",results['params'][i]['max_depth'],"    ", "Mean Accuracy :", results['mean_test_score'][i])

# Print the best max depth value and its corresponding cross-validation score
print("Best max depth value: ", grid_search.best_params_['max_depth'])
print("Best cross-validation score: ", grid_search.best_score_)


Max depth: 1      Mean Accuracy : 0.7934782608695652
Max depth: 2      Mean Accuracy : 0.8684782608695653
Max depth: 3      Mean Accuracy : 0.8695652173913043
Max depth: 4      Mean Accuracy : 0.9002717391304348
Max depth: 5      Mean Accuracy : 0.9111413043478261
Max depth: 6      Mean Accuracy : 0.9059782608695652
Max depth: 7      Mean Accuracy : 0.9160326086956522
Max depth: 8      Mean Accuracy : 0.9149456521739129
Max depth: 9      Mean Accuracy : 0.9176630434782608
Max depth: 10      Mean Accuracy : 0.9222826086956522
Max depth: 11      Mean Accuracy : 0.9190217391304347
Max depth: 12      Mean Accuracy : 0.9168478260869566
Max depth: 13      Mean Accuracy : 0.9152173913043479
Max depth: 14      Mean Accuracy : 0.9184782608695652
Max depth: 15      Mean Accuracy : 0.9195652173913043
Max depth: 16      Mean Accuracy : 0.9176630434782609
Max depth: 17      Mean Accuracy : 0.9149456521739131
Max depth: 18      Mean Accuracy : 0.9171195652173912
Max depth: 19      Mean Accuracy : 0.

In [128]:
clf = DecisionTreeClassifier(random_state=5,criterion='entropy',max_depth=10)
clf.fit(X_train,y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=10, random_state=5)


# Checking accuracy of testing dataset

In [129]:
from sklearn.metrics import accuracy_score

In [130]:
predictions_test=clf.predict(X_test)

accuracy_score(y_test, predictions_test)

0.9206521739130434

# Checking accuracy of training dataset

In [79]:
predictions_train = clf.predict(X_train)

accuracy_score(y_train,predictions_train)

0.9586956521739131

# Evaluating our test dataset

In [61]:
from sklearn.metrics import classification_report,confusion_matrix

#### Classification Report

In [62]:
print(classification_report(y_test,predictions_test))

              precision    recall  f1-score   support

           0       0.88      0.96      0.92       557
           1       0.93      0.81      0.86       363

    accuracy                           0.90       920
   macro avg       0.91      0.88      0.89       920
weighted avg       0.90      0.90      0.90       920



#### Confusion Matrix

In [80]:
cm = confusion_matrix(y_test, predictions_test)
print("Confusion Matrix for Testing Data:")
print(cm)

Confusion Matrix for Testing Data:
[[531  26]
 [ 47 316]]


# Evaluating our training dataset

#### Confusion Matrix

In [132]:
cm_train = confusion_matrix(y_train, predictions_train)
print("Confusion Matrix for Training Data:")
print(cm_train)

Confusion Matrix for Training Data:
[[2192   39]
 [ 113 1336]]


#### Accuracy

In [135]:
accuracy = accuracy_score(y_train, predictions_train)
accuracy

0.9586956521739131

#### Classification Report

In [133]:
print(classification_report(y_train,predictions_train))

              precision    recall  f1-score   support

           0       0.95      0.98      0.97      2231
           1       0.97      0.92      0.95      1449

    accuracy                           0.96      3680
   macro avg       0.96      0.95      0.96      3680
weighted avg       0.96      0.96      0.96      3680

