<h1 style="color:yellow;">Concurrent Data - Machine Learning</h1>

# Imports

In [None]:
import numpy as np
import pandas as pd
import os
import sys
import time
# Resolve the parent of the notebook's working directory and make it importable,
# so the `utils` import that follows can be found.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Guard the append: re-running this cell must not pile up duplicate sys.path entries.
if project_root not in sys.path:
    sys.path.append(project_root)
from utils import print_evaluation_metrics, log_metrics, plot_individual_metrics, preprocessing

# Load the dataframes

In [None]:

# 1. Directory holding the saved splits (relative to the notebook's working directory)
save_dir = os.path.join('..', 'data frames', 'main', 'MLP', 'PCA_0.99')

# 2. Load the feature matrices (X)
X_train = pd.read_csv(os.path.join(save_dir, 'X_train_PCA_0.99.csv'))
X_val = pd.read_csv(os.path.join(save_dir, 'X_val_PCA_0.99.csv'))
X_test = pd.read_csv(os.path.join(save_dir, 'X_test_PCA_0.99.csv'))

# 3. Load the target vectors (y)
# .values.ravel() flattens the loaded frame into a 1D array (expected for labels)
y_train = pd.read_csv(os.path.join(save_dir, 'y_train_PCA_0.99.csv')).values.ravel()
y_val = pd.read_csv(os.path.join(save_dir, 'y_val_PCA_0.99.csv')).values.ravel()
y_test = pd.read_csv(os.path.join(save_dir, 'y_test_PCA_0.99.csv')).values.ravel()

# Fail fast when features and labels are misaligned. ravel() would silently
# flatten a label file with more than one column (e.g. one saved together with
# its index), producing a vector longer than the feature matrix.
for _split_name, _features, _labels in (
    ('train', X_train, y_train),
    ('val', X_val, y_val),
    ('test', X_test, y_test),
):
    if len(_features) != len(_labels):
        raise ValueError(
            f"{_split_name} split is misaligned: {len(_features)} feature rows "
            f"vs {len(_labels)} labels"
        )

# 4. Load the label encoder classes
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects; only
# load this file if it comes from a trusted source.
label_classes = np.load(os.path.join(save_dir, 'label_classes_PCA_0.99.npy'), allow_pickle=True)

print("All datasets loaded successfully.")

In [None]:
from sklearn.preprocessing import LabelEncoder

# Rebuild the label encoder from the saved class array.
# fit() derives classes_ on its own, so assigning classes_ beforehand is a dead
# store and is not needed.
output_encoder = LabelEncoder()
output_encoder.fit(label_classes)

# Presumably the integer labels in y_* index into the saved class array, so the
# refitted encoder has to reproduce that array exactly (same entries, same
# order). Stop here rather than silently mislabel every report downstream.
if not np.array_equal(output_encoder.classes_, label_classes):
    raise ValueError(
        "Restored LabelEncoder classes differ from the saved label_classes "
        "(different entries or ordering); the label mapping cannot be trusted."
    )

print(f"Encoder restored. Classes: {output_encoder.classes_}")

# Arguments

In [None]:
# Resampling strategy handed to the preprocessing step. Supported values:
#   none, smote, adasyn, borderline_smote, hybrid_100k, undersample
# When a resampling method is active, first remove "class_weight = 'balanced'"
# from the model training cells.
# NOTE(review): the model cells in this notebook still pass
# class_weight='balanced' while 'hybrid_100k' is selected -- confirm that is intended.
sampling_method = 'hybrid_100k'

# Run tag; it is used as the folder name under 'figures'.
version = f"{sampling_method}_OptunaPCA99"

plot_distributions = True

# Collects one entry per evaluated model (filled via log_metrics).
model_results = list()

# Output folders for this run.
# NOTE(review): 'evaluatoin' looks like a typo for 'evaluation'; it is kept
# unchanged because it is the on-disk folder name.
results_dir, evaluation_dir = (
    os.path.join('figures', version, leaf)
    for leaf in ('results', 'evaluatoin charts')
)

# Process the dataframes

In [None]:
# Run the project's preprocessing helper (imported from utils) on all three splits.
# evaluation_dir is the folder argument -- presumably where the helper saves
# its plots (e.g. class distributions) when plot_distributions is True.
(
    X_train_proc, X_val_proc, X_test_proc,
    y_train_proc, y_val_proc, y_test_proc,
) = preprocessing(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    output_encoder,
    evaluation_dir,
    version,
    sampling_method,
    plot_distributions,
)

# Machine Learning

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import VotingClassifier
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import f1_score

import optuna
import xgboost as xgb


## Logistic Regression

In [None]:
# Logistic regression baseline: SAGA solver, at most 1000 iterations,
# 'balanced' class weights.
linear_model = LogisticRegression(
    solver='saga',
    max_iter=1000,
    random_state=42,
    class_weight='balanced',
)

# Durations are measured with time.perf_counter(): it is monotonic and
# high-resolution, so short runs are not rounded to zero or skewed by
# wall-clock adjustments the way time.time() differences can be.
start_time = time.perf_counter()
linear_model.fit(X_train_proc, y_train_proc)
training_time = time.perf_counter() - start_time

# Time inference on the validation split
start_time = time.perf_counter()
y_pred_lr = linear_model.predict(X_val_proc)
prediction_time = time.perf_counter() - start_time

# Arguments for the evaluation function
results_file_name = 'logistic_regression_results.txt'
cm_title = 'Logistic Regression Confusion Matrix'

print("Logistic Regression Evaluation:")
accuracy, precision, recall, f1 = print_evaluation_metrics(
    y_val_proc, y_pred_lr, training_time, prediction_time,
    output_encoder, results_dir, version, results_file_name, cm_title
)
# Record this model's scores and timings in model_results for the bar-chart cell
log_metrics(model_results, 'LR', accuracy, precision, recall, f1, training_time, prediction_time)

## Random Forest

In [None]:
def objective_rf(trial):
    """Optuna objective for the Random Forest search.

    Samples one hyperparameter set from `trial`, fits a forest on the
    processed training split and returns its weighted F1 score on the
    processed validation split (the value the study maximizes).
    """
    # Search space -- the dict is evaluated top to bottom, so the suggestions
    # are requested in the order listed.
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 5, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 15),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
    }

    # Fixed settings plus the sampled ones
    forest = RandomForestClassifier(
        random_state=42,
        n_jobs=-1,
        class_weight='balanced',
        **sampled,
    )
    forest.fit(X_train_proc, y_train_proc)

    # Score on the validation split
    val_predictions = forest.predict(X_val_proc)
    return f1_score(y_val_proc, val_predictions, average='weighted')

# Create Study and Optimize
# The sampler is seeded so that repeated runs explore the same trial sequence;
# without a seed every Restart & Run All yields different "best" parameters.
print("Optimizing Random Forest with Optuna...")
study_rf = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_rf.optimize(objective_rf, n_trials=40) # Increase n_trials (e.g., 50 or 100) for better results

print("Best Random Forest Params:", study_rf.best_params)

# Train Final Model with Best Params (same fixed settings as in objective_rf)
best_params_rf = study_rf.best_params
rnd_forest = RandomForestClassifier(
    **best_params_rf,
    random_state=42,
    n_jobs=-1,
    class_weight='balanced'
)

# time.perf_counter() is monotonic and high-resolution, which makes it the
# appropriate clock for measuring elapsed durations.
start_time = time.perf_counter()
rnd_forest.fit(X_train_proc, y_train_proc)
training_time = time.perf_counter() - start_time

start_time = time.perf_counter()
y_pred_rf = rnd_forest.predict(X_val_proc)
prediction_time = time.perf_counter() - start_time

# Evaluation
# NOTE(review): the hyperparameters were selected on this same validation
# split, so these scores are optimistic; X_test_proc is never used here.
results_file_name = 'random_forest_optuna_results.txt'
cm_title = 'Random Forest (Optuna) Confusion Matrix'

print("Random Forest (Optuna) Evaluation:")
accuracy, precision, recall, f1 = print_evaluation_metrics(
    y_val_proc, y_pred_rf, training_time, prediction_time,
    output_encoder, results_dir, version, results_file_name, cm_title
)
log_metrics(model_results, 'RF', accuracy, precision, recall, f1, training_time, prediction_time)


## XGBoost

In [None]:
# Pre-compute the 'balanced' per-sample weights once; they are the same for
# every trial and for the final fit.
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train_proc)

# Settings shared by every trial and by the final model (defined once so the
# search and the final fit cannot drift apart).
# 'multi:softprob' is used instead of 'multi:softmax': predict() returns the
# same argmax class, while predict_proba() is guaranteed to return per-class
# probabilities, which the soft-voting ensemble later relies on.
xgb_fixed_params = {
    'objective': 'multi:softprob',
    'num_class': len(output_encoder.classes_),
    'eval_metric': 'mlogloss',
    'n_jobs': -1,
    'random_state': 42,
}


def objective_xgb(trial):
    """Optuna objective for the XGBoost search.

    Samples one hyperparameter set from `trial`, fits a classifier on the
    processed training split using the pre-computed sample weights and returns
    its weighted F1 score on the processed validation split.
    """
    # 1. Define search space (fixed settings first, then the tuned ones)
    params = {
        **xgb_fixed_params,
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
    }

    # 2. Initialize model
    clf = xgb.XGBClassifier(**params)

    # 3. Train (using sample weights)
    clf.fit(X_train_proc, y_train_proc, sample_weight=sample_weights)

    # 4. Evaluate
    y_pred = clf.predict(X_val_proc)
    return f1_score(y_val_proc, y_pred, average='weighted')


# Create Study and Optimize
# The sampler is seeded so that repeated runs explore the same trial sequence.
print("Optimizing XGBoost with Optuna...")
study_xgb = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_xgb.optimize(objective_xgb, n_trials=40) # Increase n_trials for better results

print("Best XGBoost Params:", study_xgb.best_params)

# Train Final Model with Best Params
# best_params only contains the tuned values, so the fixed settings are merged back in.
best_params_xgb = {**study_xgb.best_params, **xgb_fixed_params}

xgb_model = xgb.XGBClassifier(**best_params_xgb)

# time.perf_counter() is monotonic and high-resolution, which makes it the
# appropriate clock for measuring elapsed durations.
start_time = time.perf_counter()
xgb_model.fit(X_train_proc, y_train_proc, sample_weight=sample_weights)
training_time = time.perf_counter() - start_time

start_time = time.perf_counter()
y_pred_xgb = xgb_model.predict(X_val_proc)
prediction_time = time.perf_counter() - start_time

# Evaluation
# NOTE(review): the hyperparameters were selected on this same validation
# split, so these scores are optimistic; X_test_proc is never used here.
results_file_name = 'xgboost_optuna_results.txt'
cm_title = 'XGBoost (Optuna) Confusion Matrix'

print("XGBoost (Optuna) Evaluation:")
accuracy, precision, recall, f1 = print_evaluation_metrics(
    y_val_proc, y_pred_xgb, training_time, prediction_time,
    output_encoder, results_dir, version, results_file_name, cm_title
)
log_metrics(model_results, 'XGB', accuracy, precision, recall, f1, training_time, prediction_time)

## Naive Bayes

In [None]:
# Gaussian Naive Bayes with default settings
nb_model = GaussianNB()

# Train the model
# time.perf_counter() is used for timing: Naive Bayes fits and predicts very
# quickly, and a coarse wall clock such as time.time() can report such short
# durations as 0.
start_time = time.perf_counter()
nb_model.fit(X_train_proc, y_train_proc)
training_time = time.perf_counter() - start_time

# Make predictions on the validation split
start_time = time.perf_counter()
y_pred_nb = nb_model.predict(X_val_proc)
prediction_time = time.perf_counter() - start_time

# Arguments for the evaluation function
results_file_name = 'naive_bayes_results.txt'
cm_title = 'Naive Bayes Confusion Matrix'

print("Naive Bayes Evaluation:")
accuracy, precision, recall, f1 = print_evaluation_metrics(
    y_val_proc, y_pred_nb, training_time, prediction_time,
    output_encoder, results_dir, version, results_file_name, cm_title
)
log_metrics(model_results, 'NB', accuracy, precision, recall, f1, training_time, prediction_time)

## SVM

In [None]:


# 1. Define Linear SVM
# dual=False is generally preferred when n_samples > n_features.
linear_svc = LinearSVC(dual=False, random_state=42, max_iter=200000, class_weight='balanced')

# 2. Wrap in CalibratedClassifierCV to enable predict_proba
# LinearSVC has no predict_proba of its own; the soft-voting ensemble needs one.
calibrated_svc = CalibratedClassifierCV(estimator=linear_svc, method='sigmoid', cv=3)

# 3. Wrap in OneVsRestClassifier
# n_jobs=4 trains the per-class models with up to 4 parallel jobs.
ovr_classifier = OneVsRestClassifier(estimator=calibrated_svc, n_jobs=4)

# Train
# time.perf_counter() is monotonic and high-resolution, which makes it the
# appropriate clock for measuring elapsed durations.
print("Training One-vs-Rest Linear SVM (Calibrated)...")
start_time = time.perf_counter()
ovr_classifier.fit(X_train_proc, y_train_proc)
training_time = time.perf_counter() - start_time

# Predict
print("Predicting...")
start_time = time.perf_counter()
y_pred_svm = ovr_classifier.predict(X_val_proc)
prediction_time = time.perf_counter() - start_time

# Arguments for evaluation
results_file_name = 'svm_linear_ovr_results.txt'
cm_title = 'SVM (Linear OvR) Confusion Matrix'

print("SVM (Linear OvR) Evaluation:")
# print_evaluation_metrics and log_metrics come from utils.py
accuracy, precision, recall, f1 = print_evaluation_metrics(
    y_val_proc, y_pred_svm, training_time, prediction_time,
    output_encoder, results_dir, version, results_file_name, cm_title
)

# Log metrics
log_metrics(model_results, 'SVM', accuracy, precision, recall, f1, training_time, prediction_time)


## KNN

In [None]:
# Create k-NN classifier
# n_neighbors=5 is the library default; n_jobs=-1 parallelizes the neighbor
# search across all cores, which matters for prediction speed.
knn_model = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)

# Train
# time.perf_counter() is used for timing: the k-NN fit step can be very short,
# and a coarse wall clock such as time.time() may report it as 0.
print("Training k-NN ...")
start_time = time.perf_counter()
knn_model.fit(X_train_proc, y_train_proc)
training_time = time.perf_counter() - start_time

# Predict
# WARNING: k-NN is slow at prediction time!
print("Predicting k-NN ...")
start_time = time.perf_counter()
y_pred_knn = knn_model.predict(X_val_proc)
prediction_time = time.perf_counter() - start_time

# Evaluation
results_file_name = 'knn_results.txt'
cm_title = 'k-NN Confusion Matrix'

print("k-NN Evaluation:")
accuracy, precision, recall, f1 = print_evaluation_metrics(
    y_val_proc, y_pred_knn, training_time, prediction_time,
    output_encoder, results_dir, version, results_file_name, cm_title
)
log_metrics(model_results, 'KNN', accuracy, precision, recall, f1, training_time, prediction_time)

## Decision Tree

In [None]:
# Create Decision Tree
# max_depth is left at its default (None), so the tree grows fully and can overfit.
dt_model = DecisionTreeClassifier(random_state=42, class_weight='balanced')

# Train
# time.perf_counter() is monotonic and high-resolution, which makes it the
# appropriate clock for measuring elapsed durations.
print("Training Decision Tree...")
start_time = time.perf_counter()
dt_model.fit(X_train_proc, y_train_proc)
training_time = time.perf_counter() - start_time

# Predict
print("Predicting...")
start_time = time.perf_counter()
y_pred_dt = dt_model.predict(X_val_proc)
prediction_time = time.perf_counter() - start_time

# Evaluation
results_file_name = 'decision_tree_results.txt'
cm_title = 'Decision Tree Confusion Matrix'

print("Decision Tree Evaluation:")
accuracy, precision, recall, f1 = print_evaluation_metrics(
    y_val_proc, y_pred_dt, training_time, prediction_time,
    output_encoder, results_dir, version, results_file_name, cm_title
)
log_metrics(model_results, 'DT', accuracy, precision, recall, f1, training_time, prediction_time)

## MLP

In [None]:
# Create MLP (feed-forward neural network)
# hidden_layer_sizes=(128, 96, 64, 48) builds four hidden layers of those widths.
# max_iter=300 only caps the number of training iterations; it does not
# guarantee convergence (scikit-learn warns if the limit is reached first).
mlp_model = MLPClassifier(
    hidden_layer_sizes=(128, 96, 64, 48),
    max_iter=300,
    random_state=42,
    activation='relu',
    solver='adam',
    alpha=0.005,
    verbose=True,
)

# time.perf_counter() is monotonic and high-resolution, which makes it the
# appropriate clock for measuring elapsed durations.
print("Training MLP Classifier...")
start_time = time.perf_counter()
mlp_model.fit(X_train_proc, y_train_proc)
training_time = time.perf_counter() - start_time

print("Predicting...")
start_time = time.perf_counter()
y_pred_mlp = mlp_model.predict(X_val_proc)
prediction_time = time.perf_counter() - start_time

# Evaluation
results_file_name = 'mlp_results.txt'
cm_title = 'MLP Classifier Confusion Matrix'

print("MLP Evaluation:")
accuracy, precision, recall, f1 = print_evaluation_metrics(
    y_val_proc, y_pred_mlp, training_time, prediction_time,
    output_encoder, results_dir, version, results_file_name, cm_title
)
log_metrics(model_results, 'MLP', accuracy, precision, recall, f1, training_time, prediction_time)

## LDA

In [None]:
# Create LDA (Linear Discriminant Analysis) with default settings
lda_model = LinearDiscriminantAnalysis()

# time.perf_counter() is used for timing: LDA predicts very quickly, and a
# coarse wall clock such as time.time() can report such short durations as 0.
print("Training LDA...")
start_time = time.perf_counter()
lda_model.fit(X_train_proc, y_train_proc)
training_time = time.perf_counter() - start_time

print("Predicting...")
start_time = time.perf_counter()
y_pred_lda = lda_model.predict(X_val_proc)
prediction_time = time.perf_counter() - start_time

# Evaluation
results_file_name = 'lda_results.txt'
cm_title = 'LDA Confusion Matrix'

print("LDA Evaluation:")
accuracy, precision, recall, f1 = print_evaluation_metrics(
    y_val_proc, y_pred_lda, training_time, prediction_time,
    output_encoder, results_dir, version, results_file_name, cm_title
)
log_metrics(model_results, 'LDA', accuracy, precision, recall, f1, training_time, prediction_time)

## Voting Classifier

In [None]:
# 1. Define the models to combine (Estimators)
# These objects are created in the cells above, so run those cells first.
# VotingClassifier fits clones of them: the ensemble is retrained from scratch
# with the same hyperparameters and the already-fitted originals stay untouched.
estimators = [
    ('rf', rnd_forest),
    ('xgb', xgb_model),
    ('svm', ovr_classifier),
    ('knn', knn_model),
    ('dt', dt_model),
    ('mlp', mlp_model)
]

# 2. Create Voting Classifier
# voting='soft' averages the predicted class probabilities ('hard' would count
# votes), so every estimator above must provide predict_proba.
# NOTE(review): inside the ensemble the XGBoost clone is fitted without the
# sample weights used in its own cell -- confirm that is acceptable.
voting_model = VotingClassifier(estimators=estimators, voting='soft', n_jobs=-1)

# time.perf_counter() is monotonic and high-resolution, which makes it the
# appropriate clock for measuring elapsed durations.
print("Training Voting Classifier (Ensemble)...")
start_time = time.perf_counter()
voting_model.fit(X_train_proc, y_train_proc)
training_time = time.perf_counter() - start_time

print("Predicting...")
start_time = time.perf_counter()
y_pred_voting = voting_model.predict(X_val_proc)
prediction_time = time.perf_counter() - start_time

# Evaluation
results_file_name = 'voting_ensemble_results.txt'
cm_title = 'Voting Ensemble Confusion Matrix'

print("Voting Ensemble Evaluation:")
accuracy, precision, recall, f1 = print_evaluation_metrics(
    y_val_proc, y_pred_voting, training_time, prediction_time,
    output_encoder, results_dir, version, results_file_name, cm_title
)
log_metrics(model_results, 'VC', accuracy, precision, recall, f1, training_time, prediction_time)


# Bar charts

In [None]:
# Draw the per-metric comparison charts from everything logged in model_results;
# evaluation_dir and version are passed through to the utils helper.
plot_individual_metrics(
    model_results,
    evaluation_dir,
    version,
)