In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, ConfusionMatrixDisplay, make_scorer
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
import sys
import time
from sklearn.model_selection import GridSearchCV

In [8]:
def cross_validate_model(model, X, y, cv, bivariate, printer = True):
    """Perform cross-validation and print metrics if asked"""
    if bivariate == False:
        scoring = {
            'accuracy': 'accuracy',
            'precision': make_scorer(precision_score, average='weighted', zero_division=1),
            'recall': make_scorer(recall_score, average='weighted', zero_division=1),
            'f1': make_scorer(f1_score, average='weighted', zero_division=1)
        }
        start = time.time()
        results = cross_validate(model, X, y, cv=cv, scoring=scoring)
        end = time.time()
        if printer == True:
            print(f"Accuracy: {results['test_accuracy'].mean():.4f}")
            print(f"Precision: {results['test_precision'].mean():.4f}")
            print(f"Recall: {results['test_recall'].mean():.4f}")
            print(f"F1 Score: {results['test_f1'].mean():.4f}")
            print("Prediction time: ", end-start)
        return results
    else:
        scoring = ['accuracy', 'precision', 'recall', 'f1']
        start = time.time()
        cv_results = cross_validate(model, X, y, cv=cv, scoring=scoring)
        end = time.time()
        if printer == True:
            print(f"Accuracy: {cv_results['test_accuracy'].mean():.4f}")
            print(f"Precision: {cv_results['test_precision'].mean():.4f}")
            print(f"Recall: {cv_results['test_recall'].mean():.4f}")
            print(f"F1 Score: {cv_results['test_f1'].mean():.4f}")
            print("Time:", end - start)
        return cv_results

def scale_data(X_train, X_test):
    """Scale only numerical features using StandardScaler."""
    numeric_cols = X_train.select_dtypes(include=['number']).columns  
    scaler = StandardScaler()
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()
    X_train_scaled[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
    X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])
    return X_train_scaled, X_test_scaled

def get_dummies_all(X_train, X_test):
    """
    Converts all categorical variables in a DataFrame into dummy (one-hot encoded) variables.
    """
    categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns  # Select categorical columns
    X_train_encoded = pd.get_dummies(X_train, columns=categorical_cols, drop_first=False)  # One-hot encode them
    categorical_cols = X_test.select_dtypes(include=['object', 'category']).columns  # Select categorical columns
    X_test_encoded = pd.get_dummies(X_test, columns=categorical_cols, drop_first=False)  # One-hot encode them
    return X_train_encoded, X_test_encoded

def oversampling(X, y):
    ros = RandomOverSampler(random_state=42)
    X_train_resampled, y_train_resampled = ros.fit_resample(X, y)
    print(f"Original class distribution: {y.value_counts()}")
    print(f"Resampled class distribution: {pd.Series(y_train_resampled).value_counts()}")
    return X_train_resampled, y_train_resampled


def find_optimal_k(min, max, X, y, cv, bivariate, save_path=None):
    """Find the optimal k and plot accuracy against k values"""
    k_values = range(min, max)
    accuracy_scores = []
    start_k = time.time()
    for k in k_values:
        knn = KNeighborsClassifier(n_neighbors=k)
        cv_results = cross_validate_model(knn, X, y, cv=cv, bivariate=bivariate, printer=False)
        accuracy_scores.append(cv_results['test_accuracy'].mean())
    end_k = time.time()
    optimal_k = k_values[np.argmax(accuracy_scores)]
    print("Optimal k:", optimal_k)
    print("Time: ", {end_k - start_k})
    plt.figure(figsize=(12, 6))
    plt.plot(k_values, accuracy_scores, marker='o')
    plt.title('Accuracy vs. Number of Neighbors (k)')
    plt.xlabel('Number of Neighbors (k)')
    plt.ylabel('Accuracy')
    plt.grid(True)
    if save_path:
        plt.savefig(save_path, bbox_inches="tight")
    plt.close()
    return optimal_k


def hyperparameterTuning(min, max, X, y, cv, bivariate):
    param_grid = {
    'n_neighbors': range(min, max),
    'weights': ['uniform', 'distance'],
    'metric': ['minkowski', 'euclidean', 'manhattan', 'cosine'],
    }

    if bivariate == True:
        scoring = {
        'accuracy': 'accuracy',
        'precision_weighted': make_scorer(precision_score, average='macro', zero_division=1),
        'recall_weighted': make_scorer(recall_score, average='macro', zero_division=1),
        'f1_weighted': make_scorer(f1_score, average='macro', zero_division=1)
        }

        start = time.time()
        knn = KNeighborsClassifier()
        grid_search = GridSearchCV(knn, param_grid, cv=cv, scoring=scoring, refit='accuracy', n_jobs=-1)
        grid_search.fit(X, y)
        print(f"Best parameters: {grid_search.best_params_}")
        print(f"Best cross-validation accuracy: {grid_search.cv_results_['mean_test_accuracy'][grid_search.best_index_]:.4f}")
        print(f"Best cross-validation precision: {grid_search.cv_results_['mean_test_precision_weighted'][grid_search.best_index_]:.4f}")
        print(f"Best cross-validation recall: {grid_search.cv_results_['mean_test_recall_weighted'][grid_search.best_index_]:.4f}")
        print(f"Best cross-validation F1 Score: {grid_search.cv_results_['mean_test_f1_weighted'][grid_search.best_index_]:.4f}")
        print("Time for Hypertuning: ", time.time()-start)
    else:
        scoring = {
        'accuracy': 'accuracy',
        'precision_weighted': make_scorer(precision_score, average='weighted', zero_division=1),
        'recall_weighted': make_scorer(recall_score, average='weighted', zero_division=1),
        'f1_weighted': make_scorer(f1_score, average='weighted', zero_division=1)
        }
        start = time.time()
        knn = KNeighborsClassifier()
        grid_search = GridSearchCV(knn, param_grid, cv=cv, scoring=scoring, refit='accuracy', n_jobs=-1)
        grid_search.fit(X, y)
        print(f"Best parameters: {grid_search.best_params_}")
        print(f"Best cross-validation accuracy: {grid_search.cv_results_['mean_test_accuracy'][grid_search.best_index_]:.4f}")
        print(f"Best cross-validation precision: {grid_search.cv_results_['mean_test_precision_weighted'][grid_search.best_index_]:.4f}")
        print(f"Best cross-validation recall: {grid_search.cv_results_['mean_test_recall_weighted'][grid_search.best_index_]:.4f}")
        print(f"Best cross-validation F1 Score: {grid_search.cv_results_['mean_test_f1_weighted'][grid_search.best_index_]:.4f}")
        print("Time for Hypertuning: ", time.time()-start)

    


In [10]:
print("----------------------------------------------------------------------------------------------")
print("------------------------------------ AIDS ----------------------------------")
print("----------------------------------------------------------------------------------------------")

AIDS_train = pd.read_csv("C:/Users/ameli/OneDrive/Studium/TU Wien/WS2024/ML/Exercise 3/Datasets/AIDS_train.csv")
AIDS_test = pd.read_csv("C:/Users/ameli/OneDrive/Studium/TU Wien/WS2024/ML/Exercise 3/Datasets/AIDS_test.csv")



X_train = AIDS_train.drop('infected', axis=1); y_train = AIDS_train['infected']
X_test = AIDS_test.drop('infected', axis=1); y_test = AIDS_test['infected']
X_train, X_test = get_dummies_all(X_train, X_test)
X_train, y_train = oversampling(X_train, y_train)

print("----------------------------------------------------------------------------------------------")
print("AIDS - With Scaling:")
knn = KNeighborsClassifier(n_neighbors=3)
cross_validate_model(knn, X_train, y_train, cv=3, bivariate = True)

print("----------------------------------------------------------------------------------------------")
print("Hyperparameter Tuning:")
optimal_params = hyperparameterTuning(1, 25, X_train, y_train, cv=3, bivariate = True)

----------------------------------------------------------------------------------------------
------------------------------------ AIDS ----------------------------------
----------------------------------------------------------------------------------------------
Original class distribution: infected
0    1258
1     393
Name: count, dtype: int64
Resampled class distribution: infected
0    1258
1    1258
Name: count, dtype: int64
----------------------------------------------------------------------------------------------
AIDS - With Scaling:
Accuracy: 0.7401
Precision: 0.7000
Recall: 0.8402
F1 Score: 0.7637
Time: 0.320570707321167
----------------------------------------------------------------------------------------------
Hyperparameter Tuning:
Best parameters: {'metric': 'manhattan', 'n_neighbors': 1, 'weights': 'uniform'}
Best cross-validation accuracy: 0.8617
Best cross-validation precision: 0.8835
Best cross-validation recall: 0.8617
Best cross-validation F1 Score: 0.8596
Tim

In [72]:
print("----------------------------------------------------------------------------------------------")
print("------------------------------------ BONE MARROW TRANSPLANT ----------------------------------")
print("----------------------------------------------------------------------------------------------")

BMT_train = pd.read_csv("C:/Users/ameli/OneDrive/Studium/TU Wien/WS2024/ML/Exercise 3/Datasets/BMT_train.csv")
#BMT_train['survival_status'] = BMT_train['survival_status'].astype('category')
BMT_test = pd.read_csv("C:/Users/ameli/OneDrive/Studium/TU Wien/WS2024/ML/Exercise 3/Datasets/BMT_test.csv")
#BMT_test['survival_status'] = BMT_test['survival_status'].astype('category')

X_train, X_test = scale_data(BMT_train, BMT_test)
X_train = BMT_train.drop('survival_status', axis=1); y_train = BMT_train['survival_status']
X_test = BMT_test.drop('survival_status', axis=1); y_test = BMT_test['survival_status']
X_train, X_test = get_dummies_all(X_train, X_test)
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

print("----------------------------------------------------------------------------------------------")
print("BMT - With Scaling:")
knn = KNeighborsClassifier(n_neighbors=3)
cross_validate_model(knn, X_train, y_train, cv=3, bivariate = True)

print("----------------------------------------------------------------------------------------------")
print("Finding optimal k:")
optimal_k = find_optimal_k(1, 25, X_train, y_train, cv=3, bivariate = True, save_path=None)
print(f"BONE MARROW TRANSPLANT - With optimal k={optimal_k} (Cross-validation):")
k_knn = KNeighborsClassifier(n_neighbors=optimal_k)
cross_validate_model(k_knn, X_train, y_train, cv=3, bivariate = True)

print("----------------------------------------------------------------------------------------------")
print("Hyperparameter Tuning:")
optimal_params = hyperparameterTuning(1, 25, X_train, y_train, cv=3, bivariate = True)


----------------------------------------------------------------------------------------------
------------------------------------ BONE MARROW TRANSPLANT ----------------------------------
----------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------
BMT - With Scaling:
Accuracy: 0.6400
Precision: 0.6403
Recall: 0.5972
F1 Score: 0.6141
Time: 0.07099580764770508
----------------------------------------------------------------------------------------------
Finding optimal k:
Optimal k: 9
Time:  {1.3392689228057861}
BONE MARROW TRANSPLANT - With optimal k=9 (Cross-validation):
Accuracy: 0.7000
Precision: 0.7413
Recall: 0.5694
F1 Score: 0.6389
Time: 0.051992177963256836
----------------------------------------------------------------------------------------------
Hyperparameter Tuning:
Best parameters: {'metric': 'cosine', 'n_neighbors': 10, 'weights': 'distance'

In [None]:
print("----------------------------------------------------------------------------------------------")
print("------------------------------------- CONTRACEPTIVE METHOD -----------------------------------")
print("----------------------------------------------------------------------------------------------")

# Load CMC dataset
CMC_train = pd.read_csv("C:/Users/ameli/OneDrive/Studium/TU Wien/WS2024/ML/Exercise 3/Datasets/CMC_train.csv")
CMC_test = pd.read_csv("C:/Users/ameli/OneDrive/Studium/TU Wien/WS2024/ML/Exercise 3/Datasets/CMC_test.csv")

# Scale numerical features
X_train, X_test = scale_data(CMC_train, CMC_test)

# Define target variable
X_train = CMC_train.drop('contraceptive_method', axis=1)
y_train = CMC_train['contraceptive_method']
X_test = CMC_test.drop('contraceptive_method', axis=1)
y_test = CMC_test['contraceptive_method']

# One-hot encode categorical variables
X_train, X_test = get_dummies_all(X_train, X_test)

# Align columns
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

print("----------------------------------------------------------------------------------------------")
print("CMC - With Scaling:")
knn = KNeighborsClassifier(n_neighbors=3)
cross_validate_model(knn, X_train, y_train, cv=3, bivariate=False)

print("----------------------------------------------------------------------------------------------")
print("Finding optimal k:")
optimal_k = find_optimal_k(1, 25, X_train, y_train, cv=3, bivariate=False, save_path=None)
print(f"CONTRACEPTIVE METHOD CHOICE - With optimal k={optimal_k} (Cross-validation):")
k_knn = KNeighborsClassifier(n_neighbors=optimal_k)
cross_validate_model(k_knn, X_train, y_train, cv=3, bivariate=False)

print("----------------------------------------------------------------------------------------------")
print("Hyperparameter Tuning:")
optimal_params = hyperparameterTuning(1, 25, X_train, y_train, cv=3, bivariate=False)

----------------------------------------------------------------------------------------------
CMC - With Scaling:
Accuracy: 0.4860
Precision: 0.4773
Recall: 0.4860
F1 Score: 0.4774
Prediction time:  0.08962225914001465
----------------------------------------------------------------------------------------------
Finding optimal k:
Optimal k: 21
Time:  {1.9524872303009033}
CONTRACEPTIVE METHOD CHOICE - With optimal k=21 (Cross-validation):
Accuracy: 0.5522
Precision: 0.5537
Recall: 0.5522
F1 Score: 0.5515
Prediction time:  0.07853460311889648
----------------------------------------------------------------------------------------------
Hyperparameter Tuning:
Best parameters: {'metric': 'minkowski', 'n_neighbors': 21, 'weights': 'uniform'}
Best cross-validation accuracy: 0.5522
Best cross-validation precision: 0.5537
Best cross-validation recall: 0.5522
Best cross-validation F1 Score: 0.5515
Time for Hypertuning:  6.82965087890625


In [5]:
print("----------------------------------------------------------------------------------------------")
print("------------------------------------ CENSUS INCOME (KDD) ------------------------------------")
print("----------------------------------------------------------------------------------------------")

KDD_train = pd.read_csv("C:/Users/ameli/OneDrive/Studium/TU Wien/WS2024/ML/Exercise 3/Datasets/KDD_train.csv")
KDD_test = pd.read_csv("C:/Users/ameli/OneDrive/Studium/TU Wien/WS2024/ML/Exercise 3/Datasets/KDD_test.csv")

# Define target variable
X_train = KDD_train.drop('income', axis=1)
y_train = KDD_train['income']
X_test = KDD_test.drop('income', axis=1)
y_test = KDD_test['income']

y_train = y_train.map({'-50000': 0, ' 50000+.': 1})
y_test = y_test.map({'-50000': 0, ' 50000+.': 1})

X_train, X_test = scale_data(KDD_train, KDD_test)
X_train, X_test = get_dummies_all(X_train, X_test)

# Align columns
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

print("----------------------------------------------------------------------------------------------")
print("KDD - With Scaling:")
knn = KNeighborsClassifier(n_neighbors=3)
cross_validate_model(knn, X_train, y_train, cv=3, bivariate=True)

print("----------------------------------------------------------------------------------------------")
print("Finding optimal k:")
optimal_k = find_optimal_k(1, 25, X_train, y_train, cv=3, bivariate=True, save_path=None)
print(f"KDD INCOME CLASSIFICATION - With optimal k={optimal_k} (Cross-validation):")
k_knn = KNeighborsClassifier(n_neighbors=optimal_k)
cross_validate_model(k_knn, X_train, y_train, cv=3, bivariate=True)

print("----------------------------------------------------------------------------------------------")
print("Hyperparameter Tuning:")
optimal_params = hyperparameterTuning(1, 25, X_train, y_train, cv=3, bivariate=True)

----------------------------------------------------------------------------------------------
------------------------------------ CENSUS INCOME (KDD) ------------------------------------
----------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------
KDD - With Scaling:
Accuracy: 0.9861
Precision: 0.9818
Recall: 0.8574
F1 Score: 0.9153
Time: 31.070089101791382
----------------------------------------------------------------------------------------------
Finding optimal k:


KeyboardInterrupt: 

In [65]:
print("----------------------------------------------------------------------------------------------")
print("-------------------------------- CONTRACEPTIVE METHOD CHOICE --------------------------------")
print("----------------------------------------------------------------------------------------------")

CMC_train = pd.read_csv("C:/Users/ameli/OneDrive/Studium/TU Wien/WS2024/ML/Exercise 3/Datasets/CMC_train.csv")
CMC_train['contraceptive_method'] = CMC_train['contraceptive_method'].astype('category')
CMC_test = pd.read_csv("C:/Users/ameli/OneDrive/Studium/TU Wien/WS2024/ML/Exercise 3/Datasets/CMC_test.csv")
CMC_test['contraceptive_method'] = CMC_test['contraceptive_method'].astype('category')
CMC_train_scaled, CMC_test_scaled = scale_data(CMC_train, CMC_test)


X_train = CMC_train.drop('contraceptive_method', axis=1); y_train = CMC_train['contraceptive_method']
X_test = CMC_test.drop('contraceptive_method', axis=1); y_test = CMC_test['contraceptive_method']

print("----------------------------------------------------------------------------------------------")
print("CMC - With Scaling:")
knn = KNeighborsClassifier(n_neighbors=3)
cross_validate_model(knn, X_train, y_train, cv=3, bivariate = False)

#print("----------------------------------------------------------------------------------------------")
#print("Finding optimal k:")
#optimal_k = find_optimal_k(1, 30, X_train, y_train, cv=3, save_path=None)
#print(f"CONTRACEPTIVE METHOD CHOICE - With optimal k={optimal_k} (Cross-validation):")
#k_knn = KNeighborsClassifier(n_neighbors=optimal_k)
#cross_validate_model(k_knn, X_train, y_train, cv=3)

optimal_params = hyperparameterTuning(1, 25, X_train, y_train, cv=3, bivariate=False)



----------------------------------------------------------------------------------------------
-------------------------------- CONTRACEPTIVE METHOD CHOICE --------------------------------
----------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------
CMC - With Scaling:
Accuracy: 0.4860
Precision: 0.4773
Recall: 0.4860
F1 Score: 0.4774
Prediction time:  0.09001016616821289
Best parameters: {'metric': 'minkowski', 'n_neighbors': 21, 'weights': 'uniform'}
Best cross-validation accuracy: 0.5522
Best cross-validation precision: 0.5537
Best cross-validation recall: 0.5522
Best cross-validation F1 Score: 0.5515
Time for Hyperparameter Tuning: 3.384798049926758


In [7]:
######################################### CREDIT CARD DEFAULT #########################################
print("----------------------------------------------------------------------------------------------")
print("------------------------------------ CREDIT CARD DEFAULT -------------------------------------")
print("----------------------------------------------------------------------------------------------")

# Load CCD dataset
CCD_train = pd.read_csv("C:/Users/ameli/OneDrive/Studium/TU Wien/WS2024/ML/Exercise 3/Datasets/CCD_train.csv")
CCD_test = pd.read_csv("C:/Users/ameli/OneDrive/Studium/TU Wien/WS2024/ML/Exercise 3/Datasets/CCD_test.csv")

# Scale numerical features
X_train, X_test = scale_data(CCD_train, CCD_test)

# Define target variable
X_train = CCD_train.drop('default_payment_next_month', axis=1)
y_train = CCD_train['default_payment_next_month']
X_test = CCD_test.drop('default_payment_next_month', axis=1)
y_test = CCD_test['default_payment_next_month']

# One-hot encode categorical variables
X_train, X_test = get_dummies_all(X_train, X_test)

# Align columns
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

print("----------------------------------------------------------------------------------------------")
print("CCD - With Scaling:")
knn = KNeighborsClassifier(n_neighbors=3)
cross_validate_model(knn, X_train, y_train, cv=3, bivariate=True)

print("----------------------------------------------------------------------------------------------")
print("Finding optimal k:")
optimal_k = find_optimal_k(1, 25, X_train, y_train, cv=3, bivariate=True, save_path=None)
print(f"CREDIT CARD DEFAULT - With optimal k={optimal_k} (Cross-validation):")
k_knn = KNeighborsClassifier(n_neighbors=optimal_k)
cross_validate_model(k_knn, X_train, y_train, cv=3, bivariate=True)

print("----------------------------------------------------------------------------------------------")
print("Hyperparameter Tuning:")
optimal_params = hyperparameterTuning(1, 25, X_train, y_train, cv=3, bivariate=True)


----------------------------------------------------------------------------------------------
------------------------------------ CREDIT CARD DEFAULT -------------------------------------
----------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------
CCD - With Scaling:
Accuracy: 0.7366
Precision: 0.3465
Recall: 0.2260
F1 Score: 0.2732
Time: 8.5518057346344
----------------------------------------------------------------------------------------------
Finding optimal k:
Optimal k: 18
Time:  {189.89020919799805}
CREDIT CARD DEFAULT - With optimal k=18 (Cross-validation):
Accuracy: 0.7786
Precision: 0.4734
Recall: 0.0787
F1 Score: 0.1346
Time: 8.117784261703491
----------------------------------------------------------------------------------------------
Hyperparameter Tuning:


KeyboardInterrupt: 