In [5]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, ConfusionMatrixDisplay, make_scorer
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
import sys
import time
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [16]:
def cross_validate_model(model, X, y, cv, bivariate, printer=True):
    """Perform cross-validation with a Random Forest and print metrics if asked"""
    if not bivariate:
        scoring = {
            'accuracy': 'accuracy',
            'precision': make_scorer(precision_score, average='weighted', zero_division=1),
            'recall': make_scorer(recall_score, average='weighted', zero_division=1),
            'f1': make_scorer(f1_score, average='weighted', zero_division=1)
        }
    else:
        scoring = {
            'accuracy': 'accuracy',
            'precision': make_scorer(precision_score, average='binary', zero_division=1),
            'recall': make_scorer(recall_score, average='binary', zero_division=1),
            'f1': make_scorer(f1_score, average='binary', zero_division=1)
        }
    start = time.time()
    results = cross_validate(model, X, y, cv=cv, scoring=scoring)
    end = time.time()

    if printer:
        print(f"Accuracy: {results['test_accuracy'].mean():.4f}")
        print(f"Precision: {results['test_precision'].mean():.4f}")
        print(f"Recall: {results['test_recall'].mean():.4f}")
        print(f"F1 Score: {results['test_f1'].mean():.4f}")
        print("Prediction time: ", end - start)

    return results

def scale_data(X_train, X_test):
    """Scale only numerical features using StandardScaler."""
    numeric_cols = X_train.select_dtypes(include=['number']).columns  
    scaler = StandardScaler()
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()
    X_train_scaled[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
    X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])
    return X_train_scaled, X_test_scaled

def get_dummies_all(X_train, X_test):
    """
    Converts all categorical variables in a DataFrame into dummy (one-hot encoded) variables.
    """
    categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns  # Select categorical columns
    X_train_encoded = pd.get_dummies(X_train, columns=categorical_cols, drop_first=False)  # One-hot encode them
    categorical_cols = X_test.select_dtypes(include=['object', 'category']).columns  # Select categorical columns
    X_test_encoded = pd.get_dummies(X_test, columns=categorical_cols, drop_first=False)  # One-hot encode them
    return X_train_encoded, X_test_encoded

def oversampling(X, y):
    ros = RandomOverSampler(random_state=42)
    X_train_resampled, y_train_resampled = ros.fit_resample(X, y)
    print(f"Original class distribution: {y.value_counts()}")
    print(f"Resampled class distribution: {pd.Series(y_train_resampled).value_counts()}")
    return X_train_resampled, y_train_resampled

def hyperparameterTuning(X, y, cv, bivariate):
    param_grid = [
        {'kernel': ['linear'], 'C': [0.1, 1, 10]},
        {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': [0.001, 0.01, 0.1]}
        ]

    if bivariate == True:
        scoring = {
        'accuracy': 'accuracy',
        'precision_weighted': make_scorer(precision_score, average='macro', zero_division=1),
        'recall_weighted': make_scorer(recall_score, average='macro', zero_division=1),
        'f1_weighted': make_scorer(f1_score, average='macro', zero_division=1)
        }

        start = time.time()
        svm = SVC(random_state=42)
        grid_search = GridSearchCV(svm, param_grid, cv=cv, scoring=scoring, refit='accuracy', n_jobs=-1)
        grid_search.fit(X, y)
        print(f"Best parameters: {grid_search.best_params_}")
        print(f"Best cross-validation accuracy: {grid_search.cv_results_['mean_test_accuracy'][grid_search.best_index_]:.4f}")
        print(f"Best cross-validation precision: {grid_search.cv_results_['mean_test_precision_weighted'][grid_search.best_index_]:.4f}")
        print(f"Best cross-validation recall: {grid_search.cv_results_['mean_test_recall_weighted'][grid_search.best_index_]:.4f}")
        print(f"Best cross-validation F1 Score: {grid_search.cv_results_['mean_test_f1_weighted'][grid_search.best_index_]:.4f}")
        print("Time for Hypertuning: ", time.time()-start)
    else:
        scoring = {
        'accuracy': 'accuracy',
        'precision_weighted': make_scorer(precision_score, average='weighted', zero_division=1),
        'recall_weighted': make_scorer(recall_score, average='weighted', zero_division=1),
        'f1_weighted': make_scorer(f1_score, average='weighted', zero_division=1)
        }
        start = time.time()
        svm = SVC(random_state=42)
        grid_search = GridSearchCV(svm, param_grid, cv=cv, scoring=scoring, refit='accuracy', n_jobs=-1)
        grid_search.fit(X, y)
        print(f"Best parameters: {grid_search.best_params_}")
        print(f"Best cross-validation accuracy: {grid_search.cv_results_['mean_test_accuracy'][grid_search.best_index_]:.4f}")
        print(f"Best cross-validation precision: {grid_search.cv_results_['mean_test_precision_weighted'][grid_search.best_index_]:.4f}")
        print(f"Best cross-validation recall: {grid_search.cv_results_['mean_test_recall_weighted'][grid_search.best_index_]:.4f}")
        print(f"Best cross-validation F1 Score: {grid_search.cv_results_['mean_test_f1_weighted'][grid_search.best_index_]:.4f}")
        print("Time for Hypertuning: ", time.time()-start)

In [None]:
print("----------------------------------------------------------------------------------------------")
print("------------------------------------ AIDS ----------------------------------")
print("----------------------------------------------------------------------------------------------")

AIDS_train = pd.read_csv("C:/Users/ameli/OneDrive/Studium/TU Wien/WS2024/ML/Exercise 3/Datasets/AIDS_train.csv")
AIDS_test = pd.read_csv("C:/Users/ameli/OneDrive/Studium/TU Wien/WS2024/ML/Exercise 3/Datasets/AIDS_test.csv")


X_train = AIDS_train.drop('infected', axis=1); y_train = AIDS_train['infected']
X_test = AIDS_test.drop('infected', axis=1); y_test = AIDS_test['infected']
X_train, X_test = get_dummies_all(X_train, X_test)
X_train, y_train = oversampling(X_train, y_train)

print("----------------------------------------------------------------------------------------------")
print("AIDS - With Scaling:")
svm = SVC(random_state=42)
cross_validate_model(svm, X_train, y_train, cv=3, bivariate = True)

print("----------------------------------------------------------------------------------------------")
print("Hyperparameter Tuning:")
optimal_params = hyperparameterTuning(X_train, y_train, cv=3, bivariate = True)

----------------------------------------------------------------------------------------------
------------------------------------ AIDS ----------------------------------
----------------------------------------------------------------------------------------------
Original class distribution: infected
0    1258
1     393
Name: count, dtype: int64
Resampled class distribution: infected
0    1258
1    1258
Name: count, dtype: int64
----------------------------------------------------------------------------------------------
AIDS - With Scaling:
Accuracy: 0.6737
Precision: 0.6665
Recall: 0.7027
F1 Score: 0.6831
Prediction time:  0.5170254707336426
----------------------------------------------------------------------------------------------
Hyperparameter Tuning:


In [10]:
print("----------------------------------------------------------------------------------------------")
print("------------------------------------ BONE MARROW TRANSPLANT ----------------------------------")
print("----------------------------------------------------------------------------------------------")

BMT_train = pd.read_csv("C:/Users/ameli/OneDrive/Studium/TU Wien/WS2024/ML/Exercise 3/Datasets/BMT_train.csv")
BMT_train['survival_status'] = BMT_train['survival_status'].astype('category')
BMT_test = pd.read_csv("C:/Users/ameli/OneDrive/Studium/TU Wien/WS2024/ML/Exercise 3/Datasets/BMT_test.csv")
BMT_test['survival_status'] = BMT_test['survival_status'].astype('category')

X_train = BMT_train.drop('survival_status', axis=1); y_train = BMT_train['survival_status']
X_test = BMT_test.drop('survival_status', axis=1); y_test = BMT_test['survival_status']
X_train, X_test = get_dummies_all(X_train, X_test)

print("----------------------------------------------------------------------------------------------")
print("BMT - With Scaling:")
svm = SVC(random_state=42)
cross_validate_model(svm, X_train, y_train, cv=3, bivariate = True)

print("----------------------------------------------------------------------------------------------")
print("Hyperparameter Tuning:")
optimal_params = hyperparameterTuning(X_train, y_train, cv=3, bivariate = True)

----------------------------------------------------------------------------------------------
------------------------------------ BONE MARROW TRANSPLANT ----------------------------------
----------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------
BMT - With Scaling:
Accuracy: 0.6533
Precision: 0.7341
Recall: 0.4306
F1 Score: 0.5411
Prediction time:  0.05372214317321777
----------------------------------------------------------------------------------------------
Hyperparameter Tuning:
Best parameters: {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
Best cross-validation accuracy: 0.6733
Best cross-validation precision: 0.6789
Best cross-validation recall: 0.6704
Best cross-validation F1 Score: 0.6688
Time for Hypertuning:  0.2445080280303955


In [11]:
print("----------------------------------------------------------------------------------------------")
print("------------------------------------- CONTRACEPTIVE METHOD -----------------------------------")
print("----------------------------------------------------------------------------------------------")

# Load CMC dataset
CMC_train = pd.read_csv("C:/Users/ameli/OneDrive/Studium/TU Wien/WS2024/ML/Exercise 3/Datasets/CMC_train.csv")
CMC_test = pd.read_csv("C:/Users/ameli/OneDrive/Studium/TU Wien/WS2024/ML/Exercise 3/Datasets/CMC_test.csv")

# Define target variable
X_train = CMC_train.drop('contraceptive_method', axis=1)
y_train = CMC_train['contraceptive_method']
X_test = CMC_test.drop('contraceptive_method', axis=1)
y_test = CMC_test['contraceptive_method']

# One-hot encode categorical variables
X_train, X_test = get_dummies_all(X_train, X_test)
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

print("----------------------------------------------------------------------------------------------")
print("CMC - With Scaling:")
svm = SVC(random_state=42)
cross_validate_model(svm, X_train, y_train, cv=3, bivariate = False)

print("----------------------------------------------------------------------------------------------")
print("Hyperparameter Tuning:")
optimal_params = hyperparameterTuning(X_train, y_train, cv=3, bivariate=False)

----------------------------------------------------------------------------------------------
------------------------------------- CONTRACEPTIVE METHOD -----------------------------------
----------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------
CMC - With Scaling:
Accuracy: 0.5148
Precision: 0.6283
Recall: 0.5148
F1 Score: 0.4490
Prediction time:  0.1585712432861328
----------------------------------------------------------------------------------------------
Hyperparameter Tuning:
Best parameters: {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
Best cross-validation accuracy: 0.5598
Best cross-validation precision: 0.5586
Best cross-validation recall: 0.5598
Best cross-validation F1 Score: 0.5525
Time for Hypertuning:  0.8174443244934082


In [12]:
print("----------------------------------------------------------------------------------------------")
print("------------------------------------ CENSUS INCOME (KDD) ------------------------------------")
print("----------------------------------------------------------------------------------------------")

KDD_train = pd.read_csv("C:/Users/ameli/OneDrive/Studium/TU Wien/WS2024/ML/Exercise 3/Datasets/KDD_train.csv")
KDD_test = pd.read_csv("C:/Users/ameli/OneDrive/Studium/TU Wien/WS2024/ML/Exercise 3/Datasets/KDD_test.csv")

# Define target variable
X_train = KDD_train.drop('income', axis=1)
y_train = KDD_train['income']
X_test = KDD_test.drop('income', axis=1)
y_test = KDD_test['income']

y_train = y_train.map({'-50000': 0, ' 50000+.': 1})
y_test = y_test.map({'-50000': 0, ' 50000+.': 1})

X_train, X_test = get_dummies_all(X_train, X_test)
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

print("----------------------------------------------------------------------------------------------")
print("KDD - With Scaling:")
svm = SVC(random_state=42)
cross_validate_model(svm, X_train, y_train, cv=3, bivariate = True)

print("----------------------------------------------------------------------------------------------")
print("Hyperparameter Tuning:")
optimal_params = hyperparameterTuning(X_train, y_train, cv=3, bivariate=True)

----------------------------------------------------------------------------------------------
------------------------------------ CENSUS INCOME (KDD) ------------------------------------
----------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------
KDD - With Scaling:
Accuracy: 0.9230
Precision: 0.8147
Recall: 0.1545
F1 Score: 0.2597
Prediction time:  569.9808101654053
----------------------------------------------------------------------------------------------
Hyperparameter Tuning:


KeyboardInterrupt: 

In [None]:
######################################### CREDIT CARD DEFAULT #########################################
print("----------------------------------------------------------------------------------------------")
print("------------------------------------ CREDIT CARD DEFAULT -------------------------------------")
print("----------------------------------------------------------------------------------------------")

# Load CCD dataset
CCD_train = pd.read_csv("C:/Users/ameli/OneDrive/Studium/TU Wien/WS2024/ML/Exercise 3/Datasets/CCD_train.csv")
CCD_test = pd.read_csv("C:/Users/ameli/OneDrive/Studium/TU Wien/WS2024/ML/Exercise 3/Datasets/CCD_test.csv")

# Scale numerical features
X_train, X_test = scale_data(CCD_train, CCD_test)

# Define target variable
X_train = CCD_train.drop('default_payment_next_month', axis=1)
y_train = CCD_train['default_payment_next_month']
X_test = CCD_test.drop('default_payment_next_month', axis=1)
y_test = CCD_test['default_payment_next_month']

# One-hot encode categorical variables
X_train, X_test = get_dummies_all(X_train, X_test)

# Align columns
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

print("----------------------------------------------------------------------------------------------")
print("CCD - With Scaling:")
svm = SVC(random_state=42)
cross_validate_model(svm, X_train, y_train, cv=3, bivariate = True)

print("----------------------------------------------------------------------------------------------")
print("Hyperparameter Tuning:")
optimal_params = hyperparameterTuning(X_train, y_train, cv=3, bivariate=True)