# Descision tree model



In [3]:
#Loading the libraries needed
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Loading the dataset
data = pd.read_csv('/content/drive/MyDrive/spambase.csv')

# Splitting dataset into features and target variable
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Splitting the dataset into training and testing sets. Test size =0.2, random state =42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating a Decision Tree classifier object
tree = DecisionTreeClassifier()

# Training the Decision Tree classifier
tree.fit(X_train, y_train)

# Predicting the classes of testing data
y_pred = tree.predict(X_test)

# Evaluating the performance of the Decision Tree classifier
accuracy = tree.score(X_test, y_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Accuracy =", accuracy)

[[502  28]
 [ 44 346]]
              precision    recall  f1-score   support

           0       0.92      0.95      0.93       530
           1       0.93      0.89      0.91       390

    accuracy                           0.92       920
   macro avg       0.92      0.92      0.92       920
weighted avg       0.92      0.92      0.92       920

Accuracy = 0.9217391304347826


#K - Nearest Neighbour model

In [4]:
#Loading the libraries needed
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Loading the spambase dataset
data = pd.read_csv("/content/drive/MyDrive/spambase.csv")

# Split data into features and target variables
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Splitting the dataset into training and testing sets. test size =0.2, random_state =42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creating a KNN classifier object
k = 5
knn = KNeighborsClassifier(n_neighbors=k)

# Training the KNN classifier
knn.fit(X_train, y_train)

# Predicting the classes of testing data
y_pred = knn.predict(X_test)

# Evaluating the performance of the KNN classifier
accuracy = knn.score(X_test, y_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Accuracy =", accuracy)

[[516  14]
 [ 82 308]]
              precision    recall  f1-score   support

           0       0.86      0.97      0.91       530
           1       0.96      0.79      0.87       390

    accuracy                           0.90       920
   macro avg       0.91      0.88      0.89       920
weighted avg       0.90      0.90      0.89       920

Accuracy = 0.8956521739130435


### Changes have been made to the model to improve accuracy:

1. Applied feature selection using SelectKBest and f_classif.
2. Applied feature scaling using StandardScaler.
3. Changed the value of k to 10.
4. Used the distance weights and Manhattan metric.

In [None]:
#Loading the libraries needed
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

# Loading the spambase dataset
data = pd.read_csv("/content/drive/MyDrive/spambase.csv")

# Split data into features and target variables
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Feature Selection
selector = SelectKBest(score_func=f_classif, k=20)
X_new = selector.fit_transform(X, y)

# Scaling the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_new)

# Splitting the dataset into training and testing sets. test size =0.2, random_state =42
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Creating a KNN classifier object
k = 10
knn = KNeighborsClassifier(n_neighbors=k, weights='distance', metric='manhattan')

# Training the KNN classifier
knn.fit(X_train, y_train)

# Predicting the classes of testing data
y_pred = knn.predict(X_test)

# Evaluating the performance of the KNN classifier
accuracy = knn.score(X_test, y_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Accuracy =", accuracy)


##Futher changes being applied to the KNN Model

We can futher improve the accuracy if the KNN model by tuning it's hyper parameters. The following changes have been done to improve it's accuracy.

  *   Optimize the k value
  *   Optimizing the weight parameter
  *   optimizing the distance metric



In [6]:
#Loading the libraries needed
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

# Loading the spambase dataset
data = pd.read_csv("/content/drive/MyDrive/spambase.csv")

# Split data into features and target variables
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Feature Selection
selector = SelectKBest(score_func=f_classif, k=20)
X_new = selector.fit_transform(X, y)

# Scaling the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_new)

# Splitting the dataset into training and testing sets. test size =0.2, random_state =42
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Creating a KNN classifier object
k = 10
knn = KNeighborsClassifier(n_neighbors=k, weights='distance', metric='manhattan')

# Training the KNN classifier
knn.fit(X_train, y_train)

# Predicting the classes of testing data
y_pred = knn.predict(X_test)

# Evaluating the performance of the KNN classifier
accuracy = knn.score(X_test, y_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Accuracy =", accuracy)

# Tuning the hyperparameters
# In order to improve the performance of the KNN classifier, we can tune its hyperparameters.

# 1. Optimize the value of k
k_values = list(range(1, 31))
accuracies = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k, weights='distance', metric='manhattan')
    knn.fit(X_train, y_train)
    accuracy = knn.score(X_test, y_test)
    accuracies.append(accuracy)

best_k = k_values[accuracies.index(max(accuracies))]
print("The best value of k is", best_k)

# 2. Optimize the weight parameter
weights = ['uniform', 'distance']
accuracies = []
for weight in weights:
    knn = KNeighborsClassifier(n_neighbors=best_k, weights=weight, metric='manhattan')
    knn.fit(X_train, y_train)
    accuracy = knn.score(X_test, y_test)
    accuracies.append(accuracy)

best_weight = weights[accuracies.index(max(accuracies))]
print("The best weight parameter is", best_weight)

# 3. Optimize the distance metric
metrics = ['euclidean', 'manhattan', 'minkowski']
accuracies = []
for metric in metrics:
    knn = KNeighborsClassifier(n_neighbors=best_k, weights=best_weight, metric=metric)
    knn.fit(X_train, y_train)
    accuracy = knn.score(X_test, y_test)
    accuracies.append(accuracy)

best_metric = metrics[accuracies.index(max(accuracies))]
print("The best distance metric is", best_metric)

# Final evaluation of the KNN classifier
knn = KNeighborsClassifier(n_neighbors=best_k, weights=best_weight, metric=best_metric)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy = knn.score(X_test, y_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Accuracy =", accuracy)


[[516  14]
 [ 82 308]]
              precision    recall  f1-score   support

           0       0.86      0.97      0.91       530
           1       0.96      0.79      0.87       390

    accuracy                           0.90       920
   macro avg       0.91      0.88      0.89       920
weighted avg       0.90      0.90      0.89       920

Accuracy = 0.8956521739130435
The best value of k is 4
The best weight parameter is distance
The best distance metric is euclidean
[[508  22]
 [ 65 325]]
              precision    recall  f1-score   support

           0       0.89      0.96      0.92       530
           1       0.94      0.83      0.88       390

    accuracy                           0.91       920
   macro avg       0.91      0.90      0.90       920
weighted avg       0.91      0.91      0.90       920

Accuracy = 0.9054347826086957
