In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

def _stratified_split(frame, seed):
    """Split the (x1, x2) features of ``frame`` against ``y`` into train/test parts.

    A fifth of the rows goes to the test side and the split is stratified on
    ``y``. Returns ``train_test_split``'s list:
    ``[x_train, x_test, y_train, y_test]``.
    """
    labels = frame['y']
    return train_test_split(
        frame[['x1', 'x2']],
        labels,
        test_size=0.2,
        random_state=seed,
        stratify=labels,
    )


# All three dataset versions are read before any of them is split.
data_v0, data_v1, data_v2 = map(
    pd.read_csv,
    ('Clusters-4-v0.csv', 'Clusters-4-v1.csv', 'Clusters-4-v2.csv'),
)

# Each version keeps its own fixed seed.
x_train_v0, x_test_v0, y_train_v0, y_test_v0 = _stratified_split(data_v0, seed=62)
x_train_v1, x_test_v1, y_train_v1, y_test_v1 = _stratified_split(data_v1, seed=82)
x_train_v2, x_test_v2, y_train_v2, y_test_v2 = _stratified_split(data_v2, seed=92)

In [None]:
import matplotlib.pyplot as plt

def plot_relation(df):
    """Draw a scatter plot of ``x1`` against ``x2`` with one series per class in ``y``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``'x1'``, ``'x2'`` and ``'y'``.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # Fixed palette for the labels 1-4. Any other label gets ``None`` from
    # ``.get`` and is drawn with matplotlib's default colour cycle instead of
    # raising KeyError.
    colors = {1: 'red', 2: 'blue', 3: 'green', 4: 'orange'}

    plt.figure(figsize=(7,5))

    # Sorted so the legend order is stable rather than tied to row order.
    for cls in sorted(df['y'].unique()):
        subset = df[df['y'] == cls]
        plt.scatter(subset['x1'], subset['x2'], c=colors.get(cls), label=f'Class {cls}', alpha=0.7, edgecolors='k')

    plt.xlabel('x1')
    plt.ylabel('x2')
    plt.title('Scatter plot of x1 vs x2 by class')
    plt.legend()
    plt.grid(True)
    plt.show()


In [None]:
# One class-coloured scatter figure per dataset version, in v0, v1, v2 order.
for dataset in (data_v0, data_v1, data_v2):
    plot_relation(dataset)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import statsmodels.api as sm

# Shared seed for every stochastic estimator below, so that re-running the
# notebook from a fresh kernel reproduces the same metric tables.
MODEL_SEED = 42

# Candidate classifiers, keyed by the row label used in the metric tables.
# SVC only consumes random_state because probability=True is set.
algorithms = {
    'Logistic Regression': LogisticRegression(max_iter=5000),
    'Logistic Regression + Poly(2)': Pipeline([('poly', PolynomialFeatures(degree=2)),('logreg', LogisticRegression(max_iter=10000))]),
    'SVC (linear)': SVC(kernel='linear', probability=True, random_state=MODEL_SEED),
    'SVC (rbf)': SVC(kernel='rbf', probability=True, random_state=MODEL_SEED),
    'NeuralNet (5)': MLPClassifier(hidden_layer_sizes=(5,), max_iter=5000, random_state=MODEL_SEED),
    'NeuralNet (5,5)': MLPClassifier(hidden_layer_sizes=(5,5), max_iter=5000, random_state=MODEL_SEED),
    'NeuralNet (5,5,5)': MLPClassifier(hidden_layer_sizes=(5,5,5), max_iter=5000, random_state=MODEL_SEED),
    'NeuralNet (10)': MLPClassifier(hidden_layer_sizes=(10,), max_iter=5000, random_state=MODEL_SEED),
}

# Random-forest grid: max_depth 2..5 crossed with min_samples_leaf 1..5.
for depth in range(2, 6):
    for leaf in range(1, 6):
        name = f'RandomForest d{depth}_l{leaf}'
        algorithms[name] = RandomForestClassifier(max_depth=depth, min_samples_leaf=leaf, random_state=MODEL_SEED)


# Empty result tables, one train/test pair per dataset version. Rows and
# columns are added later through ``.at`` as each algorithm is scored.
metric_table_train_v0, metric_table_test_v0 = pd.DataFrame(), pd.DataFrame()
metric_table_train_v1, metric_table_test_v1 = pd.DataFrame(), pd.DataFrame()
metric_table_train_v2, metric_table_test_v2 = pd.DataFrame(), pd.DataFrame()

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
import numpy as np

def helperFunction(classes, statistic, metric_table_train, metric_table_test, algorithm_name, data1, data2):
    """Write one per-class statistic into the train and test metric tables.

    For the class at position ``k`` of ``classes``, the cell
    ``[algorithm_name, f'{statistic}_{class}']`` receives ``data1[k]`` in
    ``metric_table_train`` and ``data2[k]`` in ``metric_table_test``.
    Both tables are modified in place; nothing is returned.
    """
    for position, label in enumerate(classes):
        column = f'{statistic}_{label}'
        metric_table_train.at[algorithm_name, column] = data1[position]
        metric_table_test.at[algorithm_name, column] = data2[position]

def _predicted_scores(algorithm, X, y_pred, classes):
    """Return per-class scores for ``X``, one column per entry of ``classes``.

    Uses ``predict_proba`` when the estimator exposes it; otherwise falls back
    to a one-hot encoding of the hard predictions ``y_pred``.
    """
    if hasattr(algorithm, "predict_proba"):
        return algorithm.predict_proba(X)
    # Estimators without predict_proba (for example an SVC built without
    # probability=True) would break the AUC computation, so their hard
    # predictions are one-hot encoded in the column order of ``classes``.
    class_to_index = {c: i for i, c in enumerate(classes)}
    hard_idx = np.array([class_to_index[label] for label in y_pred])
    return np.eye(len(classes))[hard_idx]


def _split_metrics(classes, y_true, y_pred, y_score):
    """Compute all metrics of one split as an ordered ``{column name: value}`` dict.

    The insertion order (Accuracy, then for each of Precision/Recall/F1/AUC the
    weighted average followed by the per-class values) is the column order of
    the metric tables.
    """
    metrics = {'Accuracy': accuracy_score(y_true, y_pred)}
    for statistic, scorer in (("Precision", precision_score), ("Recall", recall_score), ("F1", f1_score)):
        metrics[f'{statistic}_avg'] = scorer(y_true, y_pred, average="weighted", zero_division=0)
        per_class = scorer(y_true, y_pred, average=None, labels=classes, zero_division=0)
        for c, value in zip(classes, per_class):
            metrics[f'{statistic}_{c}'] = value

    metrics['AUC_avg'] = roc_auc_score(y_true, y_score, average="weighted", multi_class='ovr')
    per_class_auc = roc_auc_score(y_true, y_score, average=None, multi_class='ovr')
    for c, value in zip(classes, per_class_auc):
        metrics[f'AUC_{c}'] = value
    return metrics


def run_classification_metrics_plot(X_train, X_test, y_train, y_test, algorithms, metric_table_train, metric_table_test, version):
    """Fit every estimator, record train/test metrics and write both tables to CSV.

    Despite the name, no figure is produced; only the metric tables are filled.

    Parameters
    ----------
    X_train, X_test
        Feature sets passed to ``fit`` / ``predict``.
    y_train, y_test
        Target labels; ``y_train`` must support ``.unique()``.
    algorithms : dict
        Maps a display name to an estimator. Each estimator is fitted in
        place, so after the call it holds the fit for this dataset (the
        plotting function called afterwards relies on that).
    metric_table_train, metric_table_test : pandas.DataFrame
        Tables filled in place, one row per algorithm name.
    version
        Suffix used in the output names ``metrics_train_v{version}.csv`` and
        ``metrics_test_v{version}.csv``.

    Returns
    -------
    None
    """
    classes = sorted(y_train.unique())

    for algorithm_name, algorithm in algorithms.items():
        algorithm.fit(X_train, y_train)

        y_train_pred = algorithm.predict(X_train)
        y_test_pred = algorithm.predict(X_test)

        y_train_proba = _predicted_scores(algorithm, X_train, y_train_pred, classes)
        y_test_proba = _predicted_scores(algorithm, X_test, y_test_pred, classes)

        # Both splits are scored before anything is written, so a failing
        # metric leaves neither table with a partial row for this algorithm.
        train_metrics = _split_metrics(classes, y_train, y_train_pred, y_train_proba)
        test_metrics = _split_metrics(classes, y_test, y_test_pred, y_test_proba)

        for column, value in train_metrics.items():
            metric_table_train.at[algorithm_name, column] = value
        for column, value in test_metrics.items():
            metric_table_test.at[algorithm_name, column] = value

    metric_table_train.to_csv(f"metrics_train_v{version}.csv")
    metric_table_test.to_csv(f"metrics_test_v{version}.csv")

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
import numpy as np
import pandas as pd

def _draw_decision_regions(ax, algorithm, X, y, feature_names, title):
    """Shade the predicted class regions over the range of ``X`` and overlay its points."""
    h = 0.08  # step of the prediction mesh
    x_min, x_max = X.iloc[:, 0].min() - 1, X.iloc[:, 0].max() + 1
    y_min, y_max = X.iloc[:, 1].min() - 1, X.iloc[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # The mesh is wrapped in a DataFrame carrying the training column names;
    # presumably this keeps the estimator from complaining about missing
    # feature names (to be confirmed).
    mesh_points = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=feature_names)
    Z = algorithm.predict(mesh_points).reshape(xx.shape)
    ax.contourf(xx, yy, Z, alpha=0.3)
    ax.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y, edgecolor="k", cmap=plt.cm.coolwarm, alpha=0.8)
    ax.set_title(title)


def _draw_roc_curves(ax, algorithm, X, y_bin, classes, title):
    """Plot one one-vs-rest ROC curve per class, plus the chance diagonal."""
    # predict_proba is evaluated once per split instead of once per class.
    scores = algorithm.predict_proba(X)
    for i, cls in enumerate(classes):
        fpr, tpr, _ = roc_curve(y_bin[:, i], scores[:, i])
        roc_auc = auc(fpr, tpr)
        ax.plot(fpr, tpr, label=f"Class {cls} (AUC={roc_auc:.2f})")
    ax.plot([0, 1], [0, 1], "k--")
    ax.set_title(title)
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.legend(loc="lower right")


def plot_model_results_row(X_train, X_test, y_train, y_test, algorithms):
    """Show one four-panel figure per estimator: decision regions and ROC curves.

    Panels, left to right: decision regions over the train split, decision
    regions over the test split, ROC curves on train, ROC curves on test.
    The decision-region panels are only drawn when ``X_train`` has exactly
    two columns; otherwise a placeholder text is shown.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature sets; accessed through ``.iloc`` and ``.columns``.
    y_train, y_test
        Target labels of the two splits.
    algorithms : dict
        Maps a display name to an estimator. The estimators are NOT fitted
        here, so they must already be fitted, and each must provide
        ``predict_proba``.

    Returns
    -------
    None
        Each figure is rendered with ``plt.show()``.
    """
    classes = np.unique(np.concatenate([y_train, y_test]))
    y_train_bin = label_binarize(y_train, classes=classes)
    y_test_bin = label_binarize(y_test, classes=classes)

    feature_names = X_train.columns
    # Both boundary panels are gated on the training feature count.
    two_features = X_train.shape[1] == 2

    for algorithm_name, algorithm in algorithms.items():
        fig, axes = plt.subplots(1, 4, figsize=(22, 5))
        fig.suptitle(algorithm_name, fontsize=14, fontweight="bold")

        boundary_panels = (
            (axes[0], X_train, y_train, "Decision Boundary (Train)"),
            (axes[1], X_test, y_test, "Decision Boundary (Test)"),
        )
        for ax, X, y, title in boundary_panels:
            if two_features:
                _draw_decision_regions(ax, algorithm, X, y, feature_names, title)
            else:
                ax.text(0.5, 0.5, "Decision boundary", ha='center')

        _draw_roc_curves(axes[2], algorithm, X_train, y_train_bin, classes, "ROC Curve (Train)")
        _draw_roc_curves(axes[3], algorithm, X_test, y_test_bin, classes, "ROC Curve (Test)")

        plt.tight_layout()
        plt.show()


In [None]:
# Fit every model on the v0 training split, fill the v0 metric tables and
# write metrics_train_v0.csv / metrics_test_v0.csv.
print("-----------------------------------------", "Metrics for DataSet-v0", sep="\n")
run_classification_metrics_plot(
    X_train=x_train_v0,
    X_test=x_test_v0,
    y_train=y_train_v0,
    y_test=y_test_v0,
    algorithms=algorithms,
    metric_table_train=metric_table_train_v0,
    metric_table_test=metric_table_test_v0,
    version=0,
)

In [None]:
# The plotting function does not fit anything: run the v0 metrics cell first
# so the shared estimators are fitted on the v0 training split.
plot_model_results_row(
    X_train=x_train_v0,
    X_test=x_test_v0,
    y_train=y_train_v0,
    y_test=y_test_v0,
    algorithms=algorithms,
)

In [None]:
# Refit every model on the v1 training split, fill the v1 metric tables and
# write metrics_train_v1.csv / metrics_test_v1.csv.
print("-----------------------------------------", "Metrics for DataSet-v1", sep="\n")
run_classification_metrics_plot(
    X_train=x_train_v1,
    X_test=x_test_v1,
    y_train=y_train_v1,
    y_test=y_test_v1,
    algorithms=algorithms,
    metric_table_train=metric_table_train_v1,
    metric_table_test=metric_table_test_v1,
    version=1,
)

In [None]:
# The plotting function does not fit anything: run the v1 metrics cell first
# so the shared estimators are fitted on the v1 training split.
plot_model_results_row(
    X_train=x_train_v1,
    X_test=x_test_v1,
    y_train=y_train_v1,
    y_test=y_test_v1,
    algorithms=algorithms,
)

In [None]:
# Refit every model on the v2 training split, fill the v2 metric tables and
# write metrics_train_v2.csv / metrics_test_v2.csv.
print("-----------------------------------------", "Metrics for DataSet-v2", sep="\n")
run_classification_metrics_plot(
    X_train=x_train_v2,
    X_test=x_test_v2,
    y_train=y_train_v2,
    y_test=y_test_v2,
    algorithms=algorithms,
    metric_table_train=metric_table_train_v2,
    metric_table_test=metric_table_test_v2,
    version=2,
)

In [None]:
# The plotting function does not fit anything: run the v2 metrics cell first
# so the shared estimators are fitted on the v2 training split.
plot_model_results_row(
    X_train=x_train_v2,
    X_test=x_test_v2,
    y_train=y_train_v2,
    y_test=y_test_v2,
    algorithms=algorithms,
)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import scipy.cluster.hierarchy as sch

def run_clustering(X, dataset_name, n_clusters):
    """Cluster ``X`` with KMeans and three hierarchical linkages, score and plot them.

    Parameters
    ----------
    X : pandas.DataFrame or array
        Samples to cluster. A DataFrame is converted with ``to_numpy()``; the
        first two columns are used for the KMeans scatter plot.
    dataset_name : str
        Label inserted into the figure titles.
    n_clusters : int
        Number of clusters requested from every algorithm.

    Returns
    -------
    pandas.DataFrame
        One row per algorithm with the columns ``Algorithm``, ``Silhouette``,
        ``Calinski-Harabasz`` and ``Davies-Bouldin``. The table is also printed.
    """
    if isinstance(X, pd.DataFrame):
        X = X.to_numpy()

    algorithms_clustering = {"KMeans": KMeans(n_clusters, random_state=42)}
    linkages = ["complete", "average", "single"]
    for link in linkages:
        algorithms_clustering[f"Hierarchical-{link}"] = AgglomerativeClustering(n_clusters, linkage=link)

    results = []
    labels_by_algorithm = {}
    for name, algo in algorithms_clustering.items():
        labels = algo.fit_predict(X)
        labels_by_algorithm[name] = labels
        results.append({
            "Algorithm": name,
            "Silhouette": silhouette_score(X, labels),
            "Calinski-Harabasz": calinski_harabasz_score(X, labels),
            "Davies-Bouldin": davies_bouldin_score(X, labels)
        })

    metric_table = pd.DataFrame(results)
    print(metric_table)

    # Reuse the labels from the scoring loop instead of fitting KMeans a
    # second time; with the fixed random_state a refit gives the same labels.
    kmeans_labels = labels_by_algorithm["KMeans"]
    plt.figure(figsize=(5,5))
    plt.scatter(X[:,0], X[:,1], c=kmeans_labels, cmap='viridis', s=50, edgecolor='k')
    plt.title(f"KMeans Clusters ({dataset_name})")
    plt.xlabel("x1")
    plt.ylabel("x2")
    plt.show()

    # One dendrogram per linkage. scipy builds the full merge tree here,
    # independently of n_clusters.
    for link in linkages:
        plt.figure(figsize=(8,5))
        sch.dendrogram(sch.linkage(X, method=link))
        plt.title(f"Hierarchical Clustering Dendrogram ({link} linkage) - {dataset_name}")
        plt.xlabel("Sample index")
        plt.ylabel("Distance")
        plt.show()

    return metric_table


In [None]:
# v0 with 4 clusters. run_clustering returns a fresh table, so no empty
# placeholder DataFrame is assigned first.
metric_table_cluster_v0 = run_clustering(X=data_v0[['x1', 'x2']], dataset_name="DataSet:v0", n_clusters=4)

In [None]:
# v0 with 3 clusters. run_clustering returns a fresh table, so no empty
# placeholder DataFrame is assigned first.
metric_table_cluster_v0 = run_clustering(X=data_v0[['x1', 'x2']], dataset_name="DataSet:v0", n_clusters=3)

In [None]:
# v0 with 5 clusters. run_clustering returns a fresh table, so no empty
# placeholder DataFrame is assigned first.
metric_table_cluster_v0 = run_clustering(X=data_v0[['x1', 'x2']], dataset_name="DataSet:v0", n_clusters=5)

In [None]:
# v1 with 4 clusters. run_clustering returns a fresh table, so no empty
# placeholder DataFrame is assigned first.
metric_table_cluster_v1 = run_clustering(X=data_v1[['x1', 'x2']], dataset_name="DataSet:v1", n_clusters=4)

In [None]:
# v1 with 3 clusters. run_clustering returns a fresh table, so no empty
# placeholder DataFrame is assigned first.
metric_table_cluster_v1 = run_clustering(X=data_v1[['x1', 'x2']], dataset_name="DataSet:v1", n_clusters=3)

In [None]:
# v1 with 5 clusters. run_clustering returns a fresh table, so no empty
# placeholder DataFrame is assigned first.
metric_table_cluster_v1 = run_clustering(X=data_v1[['x1', 'x2']], dataset_name="DataSet:v1", n_clusters=5)

In [None]:
# v2 with 4 clusters. run_clustering returns a fresh table, so no empty
# placeholder DataFrame is assigned first.
metric_table_cluster_v2 = run_clustering(X=data_v2[['x1', 'x2']], dataset_name="DataSet:v2", n_clusters=4)

In [None]:
# v2 with 3 clusters. run_clustering returns a fresh table, so no empty
# placeholder DataFrame is assigned first.
metric_table_cluster_v2 = run_clustering(X=data_v2[['x1', 'x2']], dataset_name="DataSet:v2", n_clusters=3)

In [None]:
# v2 with 2 clusters. run_clustering returns a fresh table, so no empty
# placeholder DataFrame is assigned first.
# NOTE(review): the v0 and v1 sweeps use 4, 3 and 5 clusters; confirm that 2
# (rather than 5) is intended for v2.
metric_table_cluster_v2 = run_clustering(X=data_v2[['x1', 'x2']], dataset_name="DataSet:v2", n_clusters=2)