In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score, accuracy_score, roc_auc_score
from joblib import dump
import os
from myFunc import ALGORITHMS, FEATURE_TYPES

In [None]:
# Load Data
# Read the combined dataset from its Parquet file into a single DataFrame.
combined_file = './dataset/final/combined_dataset.parquet'
df = pd.read_parquet(path=combined_file)

In [None]:
# Preprocess Data
# Pull the target column out first, then keep every remaining column as a feature.
y = df['Label']
X = df.drop(labels=['Label'], axis=1)

In [None]:
# Split the data into training and testing sets
# The held-out fraction and the shuffle seed are named so they are easy to
# spot and tune; the seed keeps the split reproducible between runs.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [None]:
# Standardize the features
# The scaler is fitted on the training split only and then applied to both
# splits, so no statistics from the test split enter the fit.
# NOTE: X_train / X_test are overwritten with the scaled output, so re-running
# this cell without re-running the split cell would scale already-scaled data.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Save the scaler
# Persist the fitted scaler so the same transformation can be re-applied later.
scaler_file = './models/scaler.joblib'
# The target directory is not guaranteed to exist at this point in the
# notebook (it is only created in a later cell), so create it here; otherwise
# dump() fails on a fresh checkout.
os.makedirs(os.path.dirname(scaler_file), exist_ok=True)
dump(scaler, scaler_file)
print(f'Scaler saved to {scaler_file}')

In [None]:
# Train Models
# Each ALGORITHMS entry is unpacked as an (estimator, parameter-dict) pair.
# The parameters are applied to the estimator, it is fitted on the training
# split, and the fitted object is stored under the algorithm's name.
trained_models = {}
for name in ALGORITHMS:
    model, params = ALGORITHMS[name]
    print(f'Training {name}...')
    model.set_params(**params)
    model.fit(X_train, y_train)
    trained_models[name] = model
    print(f'{name} trained.')

In [None]:
# Evaluate Models
# Score every trained model on the held-out test split and collect one result
# row (a dict) per model in `results`.
results = []
for name, model in trained_models.items():
    print(f'Evaluating {name}...')
    y_pred = model.predict(X_test)

    # Class-weighted averages of the per-class scores.
    f1 = f1_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    accuracy = accuracy_score(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)

    try:
        y_proba = np.asarray(model.predict_proba(X_test))
        if y_proba.ndim == 2 and y_proba.shape[1] == 2:
            # Two-class case: roc_auc_score expects a 1-D score for the
            # positive class and rejects the full (n_samples, 2) matrix.
            # Column 1 is the second entry of model.classes_ under the
            # scikit-learn convention, i.e. the greater label.
            auc_roc = roc_auc_score(y_test, y_proba[:, 1])
        else:
            auc_roc = roc_auc_score(y_test, y_proba, multi_class='ovr')
    except AttributeError:
        auc_roc = 'N/A'  # Some models do not have predict_proba method
    except ValueError as err:
        # AUC is undefined for this model/split (for example the classes seen
        # in y_test do not line up with the probability columns). Keep the
        # other metrics instead of aborting the whole evaluation loop.
        auc_roc = 'N/A'
        print(f'{name} AUC-ROC could not be computed: {err}')

    results.append({
        'Model': name,
        'F1 Score': f1,
        'Precision': precision,
        'Recall': recall,
        'Accuracy': accuracy,
        'AUC-ROC': auc_roc,
        'Confusion Matrix': confusion
    })
    print(f'{name} F1 Score: {f1}')
    print(f'{name} Precision: {precision}')
    print(f'{name} Recall: {recall}')
    print(f'{name} Accuracy: {accuracy}')
    print(f'{name} AUC-ROC: {auc_roc}')
    print(f'{name} Confusion Matrix:\n{confusion}')

In [None]:
# Collect the per-model result dicts into one table (one row per model)
# and print it.
results_df = pd.DataFrame(data=results)
print(results_df)

In [None]:
# Save Results
# Write the metrics table to CSV without the DataFrame index column.
results_file = './supportFiles/dissertation/fscore_b_combined.csv'
# to_csv() does not create missing directories, so make sure the output
# folder exists before writing.
os.makedirs(os.path.dirname(results_file), exist_ok=True)
results_df.to_csv(results_file, index=False)
print(f'Results saved to {results_file}')

In [None]:
# Save Models
# Directory that receives the serialized models. exist_ok=True makes this
# cell safe to re-run and avoids the check-then-create race of a separate
# os.path.exists() test.
models_dir = './models'
os.makedirs(models_dir, exist_ok=True)

In [None]:
# Serialize every trained model to '<name>_model.joblib' inside models_dir.
for name in trained_models:
    model = trained_models[name]
    model_file = os.path.join(models_dir, f'{name}_model.joblib')
    dump(model, model_file)
    print(f'{name} model saved to {model_file}')