# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Helper function for plotting confusion matrix
def plot_confusion_matrix(y, y_predict):
    """Draw the confusion matrix of ``y`` versus ``y_predict`` as a heatmap.

    Args:
        y: True labels.
        y_predict: Predicted labels, in the same order as ``y``.

    The axes are labelled with the two fixed class names
    'did not land' / 'landed', and the figure is shown with ``plt.show()``.
    Nothing is returned.
    """
    from sklearn.metrics import confusion_matrix

    cm = confusion_matrix(y, y_predict)
    ax = plt.subplot()
    # fmt='d' prints the cell counts as plain integers; seaborn's default
    # '.2g' format would render a count such as 100 as '1e+02'.
    sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap='Blues')
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('Confusion Matrix')
    ax.xaxis.set_ticklabels(['did not land', 'landed'])
    ax.yaxis.set_ticklabels(['did not land', 'landed'])
    plt.show()

# 1. Load Data
# Features come from dataset_part_3, the target from the 'Class' column of
# dataset_part_2.
# NOTE(review): this assumes both CSVs list the same rows in the same order.
features = pd.read_csv('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DS0321EN-SkillsNetwork/datasets/dataset_part_3.csv')
Y = pd.read_csv('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DS0321EN-SkillsNetwork/datasets/dataset_part_2.csv')['Class'].to_numpy()

# 2. Preprocessing & Splitting
# Split BEFORE scaling: the scaler must learn its mean/variance from the
# training rows only, otherwise test-set statistics leak into the features
# the models are trained on and the test accuracy is optimistically biased.
X_train, X_test, Y_train, Y_test = train_test_split(features, Y, test_size=0.2, random_state=2)

transform = preprocessing.StandardScaler()
X_train = transform.fit_transform(X_train)
X_test = transform.transform(X_test)

# Keep ``X`` available as the fully scaled feature matrix (a NumPy array, as
# before), now standardised with the training-set statistics.
X = transform.transform(features)

# 3. Logistic Regression
# Grid-search the regularisation constant C for an L2-penalised model fitted
# with the lbfgs solver, using 10-fold cross-validation on the training split.
parameters = dict(C=[0.01, 0.1, 1], penalty=['l2'], solver=['lbfgs'])
lr = LogisticRegression()
logreg_cv = GridSearchCV(estimator=lr, param_grid=parameters, cv=10)
logreg_cv.fit(X_train, Y_train)

# Accuracy of the refitted best estimator on the held-out test split.
lr_score = logreg_cv.score(X_test, Y_test)

# 4. SVM
# Grid-search kernel, C and gamma with 10-fold cross-validation.
# Each kernel is listed exactly once: a repeated entry makes GridSearchCV
# refit and rescore identical candidates without changing the outcome.
parameters = {
    'kernel': ('linear', 'rbf', 'poly', 'sigmoid'),
    'C': np.logspace(-3, 3, 5),      # 5 values, log-spaced from 1e-3 to 1e3
    'gamma': np.logspace(-3, 3, 5),  # same range for the kernel coefficient
}
svm = SVC()
svm_cv = GridSearchCV(svm, parameters, cv=10)
svm_cv.fit(X_train, Y_train)

# Accuracy of the refitted best estimator on the held-out test split.
svm_score = svm_cv.score(X_test, Y_test)

# 5. Decision Tree
# Grid-search the tree's structural hyper-parameters with 10-fold
# cross-validation.
parameters = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [2 * n for n in range(1, 10)],  # 2, 4, ..., 18
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3, where it
    # makes every candidate using it fail to fit. For classifiers it was an
    # alias of 'sqrt', so listing 'sqrt' alone keeps the same search space.
    'max_features': ['sqrt'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
}
# NOTE(review): no random_state is set on the estimator, so the selected
# tree and its scores can differ from run to run.
tree = DecisionTreeClassifier()
tree_cv = GridSearchCV(tree, parameters, cv=10)
tree_cv.fit(X_train, Y_train)

# Accuracy of the refitted best estimator on the held-out test split.
tree_score = tree_cv.score(X_test, Y_test)

# 6. KNN
# Grid-search the neighbour count (1 through 10), the neighbour-lookup
# algorithm and the Minkowski power p, using 10-fold cross-validation.
parameters = dict(
    n_neighbors=list(range(1, 11)),
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    p=[1, 2],
)
KNN = KNeighborsClassifier()
knn_cv = GridSearchCV(estimator=KNN, param_grid=parameters, cv=10)
knn_cv.fit(X_train, Y_train)

# Accuracy of the refitted best estimator on the held-out test split.
knn_score = knn_cv.score(X_test, Y_test)

# 7. Results Comparison
# For each model: [mean cross-validated accuracy of the best grid candidate,
# accuracy of the refitted best estimator on the held-out test split].
scores = {
    'Logistic Regression': [logreg_cv.best_score_, lr_score],
    'SVM': [svm_cv.best_score_, svm_score],
    'Decision Tree': [tree_cv.best_score_, tree_score],
    'KNN': [knn_cv.best_score_, knn_score],
}

# GridSearchCV.best_score_ is the mean score over the validation folds, not
# the accuracy on the training data, so the column is labelled 'CV Accuracy'.
df_scores = pd.DataFrame.from_dict(scores, orient='index', columns=['CV Accuracy', 'Test Accuracy'])
print(df_scores)

# Visualize Comparison
# One bar per algorithm showing its accuracy on the held-out test split.
comparison_ax = df_scores['Test Accuracy'].plot.bar(figsize=(10, 6), color='teal')
comparison_ax.set_xlabel('Algorithm')
comparison_ax.set_ylabel('Accuracy')
comparison_ax.set_title('Comparison of Machine Learning Model Accuracies')
plt.show()

# Plot Confusion Matrix for the best model (using LogReg as example)
# Predictions come from the grid search's refitted best logistic regression.
yhat = logreg_cv.predict(X_test)
plot_confusion_matrix(y=Y_test, y_predict=yhat)