# In [1]:
# Name: Calum Garrigan, Student Number: 201379070

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the dataset and print its structure.
data = pd.read_csv("dataset_assignment1.csv")
print("Data Information:")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
data.info()

# Print the number of samples for each class in the dataset.
print("\nNumber Of Samples For Each Class In The Dataset:")
print(data['class'].value_counts())

# Plot a histogram for every numeric column of the DataFrame.
data.hist(figsize=(10, 10))
plt.show()

# Print a statistical description of the columns, one table per class label.
for cls in data['class'].unique():
    print(f'Statistical Description For Class {cls}:')
    print(data[data['class'] == cls].describe())

# Split the data into training and testing sets (80% vs. 20%).
# NOTE(review): the positional split below assumes the label column 'class'
# is the LAST column of the CSV -- confirm against the dataset.
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values
# random_state is fixed so the split (and every score below) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def print_results(y_test, y_pred, model_name):
    """Print evaluation metrics for a model and plot its confusion matrix.

    Prints the accuracy score, the classification report and the confusion
    matrix, then shows the confusion matrix as an annotated heatmap.

    Args:
        y_test: True labels.
        y_pred: Labels predicted by the model, aligned with ``y_test``.
        model_name: Name used as a prefix in the printed output and in the
            plot title.
    """
    # Compute the confusion matrix once and reuse it for both the printed
    # output and the heatmap.
    cm = confusion_matrix(y_test, y_pred)

    print(f"{model_name} Accuracy Score: {accuracy_score(y_test, y_pred)}")
    print(f"{model_name} Classification Report:\n{classification_report(y_test, y_pred)}")
    print(f"{model_name} Confusion Matrix:\n{cm}")

    # fmt='d' renders the cell counts as integers rather than in scientific notation.
    sns.heatmap(cm, annot=True, cmap='Reds', fmt='d')
    plt.title(f'Confusion Matrix For {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

# KNN Classification
def run_knn(X_train, y_train, X_test, y_test, k_values=range(1, 30, 2), cv=5):
    """Select K for a KNN classifier by cross-validation and evaluate it.

    For every candidate K the mean cross-validated accuracy is computed on the
    training data. The K with the highest mean is then used to fit a final
    classifier on the full training set, which is evaluated on the test set.
    The cross-validation curve and the test-set results are printed and
    plotted.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels.
        k_values: Candidate neighbour counts to try. Defaults to the odd
            values from 1 to 29.
        cv: Number of cross-validation folds. Defaults to 5.

    Raises:
        ValueError: If ``k_values`` is empty (raised by ``max``).
    """
    # Materialise so any iterable can be indexed and plotted.
    k_values = list(k_values)

    # Mean cross-validated accuracy on the training data for each candidate K.
    cv_scores = [
        cross_val_score(
            KNeighborsClassifier(n_neighbors=k),
            X_train,
            y_train,
            cv=cv,
            scoring='accuracy',
        ).mean()
        for k in k_values
    ]

    # list.index returns the first match, so ties are resolved in favour of
    # the earliest (smallest, for the default grid) K.
    optimal_k = k_values[cv_scores.index(max(cv_scores))]

    # Cross Validation Values and Visualisation
    print(f"\nCross Validation Values For KNN = {cv_scores}\n")
    plt.plot(k_values, cv_scores)
    plt.title('Cross-Validation Scores For Different Values Of K')
    plt.xlabel('Number Of Neighbors (K)')
    plt.ylabel('Mean Cross-Validation Score')
    plt.show()
    print(f"\nKNN Optimal K Value: {optimal_k}")

    # Refit on the whole training set with the selected K and score the
    # held-out test set.
    knn = KNeighborsClassifier(n_neighbors=optimal_k)
    knn.fit(X_train, y_train)
    y_pred_knn = knn.predict(X_test)

    # Report through the shared helper so KNN output matches the format used
    # for any other model reported with print_results.
    print()
    print_results(y_test, y_pred_knn, "KNN")

# Run KNN
run_knn(X_train, y_train, X_test, y_test)

# Naive Bayes Classification
# Fit on the training split and evaluate once on the held-out test split.
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)
print("\nNaive Bayes Classification Results:")
print("Accuracy Score: ", accuracy_score(y_test, y_pred_nb))
print("Classification Report: \n", classification_report(y_test, y_pred_nb))
print("Confusion Matrix: \n", confusion_matrix(y_test, y_pred_nb))

# K-Fold Cross Validation For Naive Bayes
print("\nK-Fold Cross Validation For Naive Bayes:")

# Cross-validate on the TRAINING split, as run_knn does: the test split is
# reserved for the single final evaluation above, and folding it would both
# reuse held-out data and train on only a fraction of the 20% test sample.
nb_scores = cross_val_score(nb, X_train, y_train, cv=5, scoring='accuracy')
nb_mean_score = nb_scores.mean()
nb_std_score = nb_scores.std()

# Display Cross Validation Scores and Visualization
print("Cross Validation Scores: ", nb_scores)
# Derive the fold axis from the scores so it stays in sync with the cv setting.
plt.plot(range(1, len(nb_scores) + 1), nb_scores)
plt.title('Cross-Validation Scores For Naive Bayes')
plt.xlabel('Fold')
plt.ylabel('Accuracy score')
plt.show()

print("Mean accuracy score: ", nb_mean_score)
print("Standard Deviation Of Accuracy Scores: ", nb_std_score)

# Classification Report After Cross Validation
# cross_val_predict yields one out-of-fold prediction per training sample,
# so the report and matrix below are compared against y_train.
print("Classification Report After Cross-Validation: ")
y_pred_nb_cv = cross_val_predict(nb, X_train, y_train, cv=5)
print(classification_report(y_train, y_pred_nb_cv))

# Confusion Matrix Visualization (out-of-fold predictions)
cm = confusion_matrix(y_train, y_pred_nb_cv)
sns.heatmap(cm, annot=True, cmap='Reds', fmt='d')
plt.title('Confusion Matrix For Naive Bayes')
plt.xlabel('Predicted ')
plt.ylabel('True ')
plt.show()

# Logistic Regression Classification
# Fit on the training split and evaluate once on the held-out test split.
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

y_pred_lr = log_reg.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
report_lr = classification_report(y_test, y_pred_lr)
matrix_lr = confusion_matrix(y_test, y_pred_lr)

print(f"\nLogistic Regression Accuracy Score: {accuracy_lr}")
print(f"Logistic Regression Classification Report:\n{report_lr}")
print(f"Logistic Regression Confusion Matrix:\n{matrix_lr}")

# K-Fold Cross-Validation For Logistic Regression
print("\nAfter Cross Validation for Logistic Regression\n")
# Cross-validate on the TRAINING split, as run_knn does: the test split is
# reserved for the single final evaluation above, and folding it would both
# reuse held-out data and train on only a fraction of the 20% test sample.
log_reg_scores = cross_val_score(log_reg, X_train, y_train, cv=5, scoring='accuracy')
log_reg_mean_score = np.mean(log_reg_scores)
log_reg_std_score = np.std(log_reg_scores)

# Cross Validation Values and Visualisation
print(f"Cross-Validation Values: {log_reg_scores}\n")
# Derive the fold axis from the scores so it stays in sync with the cv setting.
plt.plot(range(1, len(log_reg_scores) + 1), log_reg_scores)
plt.title('Cross-Validation Scores For Logistic Regression')
plt.xlabel('Fold')
plt.ylabel('Accuracy score')
plt.show()

print(f"Logistic Regression Mean Accuracy Score: {log_reg_mean_score}")
print(f"Logistic Regression Standard Deviation Of Accuracy Scores: {log_reg_std_score}\n")
# cross_val_predict yields one out-of-fold prediction per training sample,
# so the report and matrix below are compared against y_train.
y_pred_log_reg_cv = cross_val_predict(log_reg, X_train, y_train, cv=5)
report_log_reg_cv = classification_report(y_train, y_pred_log_reg_cv)

print(f"Logistic Regression Classification Report After Cross-Validation:\n{report_log_reg_cv}")

# Confusion Matrix Visualization (out-of-fold predictions)
confusion_matrix_lr = confusion_matrix(y_train, y_pred_log_reg_cv)
sns.heatmap(confusion_matrix_lr, annot=True, cmap='Reds', fmt='d')
plt.title('Confusion Matrix For Logistic Regression')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# KNN's comparable score is the best mean 5-fold cross-validation accuracy
# over the same K grid run_knn searches in this script (odd K from 1 to 29).
# run_knn does not return its scores, so they are recomputed here; taking the
# value from log_reg_scores would plot a logistic-regression fold score under
# the "KNN" label.
knn_mean_score = max(
    cross_val_score(
        KNeighborsClassifier(n_neighbors=k),
        X_train,
        y_train,
        cv=5,
        scoring='accuracy',
    ).mean()
    for k in range(1, 30, 2)
)
nb_mean_score = nb_scores.mean()
log_reg_mean_score = log_reg_scores.mean()

# Create Bar Chart To Compare Mean Accuracy Scores Across the Three Classification Models
models = ['KNN', 'Naive Bayes', 'Logistic Regression']
scores = [knn_mean_score, nb_mean_score, log_reg_mean_score]

# Zoom the y-axis to [0.9, 1] when every score fits in it; otherwise lower
# the floor so that a bar below 0.9 is not clipped out of view.
lowest = min(scores)
y_min = 0.9 if lowest > 0.9 else max(0.0, lowest - 0.05)

fig, ax = plt.subplots()
ax.bar(models, scores)
ax.set(title='Mean Accuracy Scores For Different Models',
       xlabel='Model',
       ylabel='Mean Accuracy Score',
       ylim=(y_min, 1))
plt.show()


# Cell output: ModuleNotFoundError: No module named 'matplotlib'