In [1]:
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import make_scorer, fbeta_score, precision_score, recall_score
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd
import numpy as np

In [2]:
# Load the cleaned splits from disk. Features are cast to float32 and labels
# to int64. Note that the "validation" split is read from the *_test files.
X_train, X_val = (
    pd.read_csv('artifacts/clean_data' + suffix).astype(np.float32)
    for suffix in ('/X_train.csv', '/X_test.csv')
)
y_train, y_val = (
    np.asarray(np.load('artifacts/clean_data' + suffix), np.int64)
    for suffix in ('/y_train.npy', '/y_test.npy')
)

In [3]:
# Report the in-memory size of the training features: pandas' per-column byte
# counts are summed and divided by 1024**2 (i.e. the "MB" printed is mebibytes).
print(f'Memory usage X_train: {X_train.memory_usage().sum() / 1024 ** 2:.2f} MB')

Memory usage X_train: 79.53 MB


In [4]:
def f2_scorer(y_true, y_pred):
    """Compute the macro-averaged F-beta score with beta=2.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels.

    Returns
    -------
    float
        Unweighted mean over classes of the per-class F2 score
        (beta=2 weights recall more heavily than precision).
    """
    macro_f2 = fbeta_score(y_true, y_pred, average='macro', beta=2)
    return macro_f2

In [5]:
# Metrics tracked for every candidate during the search, all macro-averaged.
# The 'f2_macro' key is the one the search refits on.
scoring = dict(
    f2_macro=make_scorer(f2_scorer),
    precision_macro=make_scorer(precision_score, average='macro'),
    recall_macro=make_scorer(recall_score, average='macro'),
)

In [6]:
# Search space for the MLP step of the pipeline (keys use the `classifier__`
# step prefix).
#
# `warm_start` is intentionally NOT searched: the search fits a freshly cloned,
# unfitted estimator for every candidate and fold, so there is never a previous
# solution to reuse and warm_start=True behaves exactly like False. Keeping it
# only doubled the nominal space with duplicate configurations.
#
# The space now holds 3 * 4 * 2 * 2 = 48 distinct combinations; a randomized
# search asking for more iterations than that simply covers all of them.
param_distributions = {
    'classifier__hidden_layer_sizes': [(64,), (64, 32), (128, 64, 32)],  # Number of layers and units
    'classifier__learning_rate_init': [1e-4, 1e-3, 1e-2, 1e-1],  # Initial learning rate
    'classifier__activation': ['relu', 'tanh'],  # Activation function
    'classifier__solver': ['adam', 'sgd'],  # Optimizers
}

In [None]:
# Apply sampling separately
smote_tomek = SMOTETomek()
X_resampled, y_resampled = smote_tomek.fit_resample(X_train, y_train)


In [7]:
mlp = MLPClassifier(max_iter=500, learning_rate='adaptive', alpha=1e-2, batch_size=1024)
pipeline = Pipeline([
    ('classifier', mlp)
])


In [8]:
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=50,
    scoring=scoring,
    refit='f2_macro',
    n_jobs=-5,
    cv=3,
    verbose=2
)

# Fit the model using resampled data
random_search.fit(X_resampled, y_resampled)


In [None]:
# Score the refit best estimator on the held-out split with the same three
# macro-averaged metrics used during the search.
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_val)

f2_score_macro = fbeta_score(y_true=y_val, y_pred=y_pred, beta=2, average='macro')
prec = precision_score(y_true=y_val, y_pred=y_pred, average='macro')
recall = recall_score(y_true=y_val, y_pred=y_pred, average='macro')

print("Best Parameters Found:", random_search.best_params_)
print(
    f"Best F2 Score (Macro) on Validation Set: {f2_score_macro}",
    f"Precision on Validation Set: {prec}",
    f"Recall on Validation Set: {recall}",
    sep="\n",
)

In [None]:
# Per-class breakdown of the validation metrics.
#
# With average=None the metrics return one score per label in sorted label
# order. The label list is built explicitly and passed to every metric so the
# scores and the printed class names stay aligned even when the labels are not
# exactly 0..K-1 (a positional index would mislabel them in that case).
class_labels = np.unique(np.concatenate([np.ravel(y_val), np.ravel(y_pred)]))

f2_scores_per_class = fbeta_score(y_val, y_pred, beta=2, average=None, labels=class_labels)
prec_per_class = precision_score(y_val, y_pred, average=None, labels=class_labels)
recall_per_class = recall_score(y_val, y_pred, average=None, labels=class_labels)

# Loop names are deliberately distinct from the macro-level `prec` / `recall`
# computed in the previous cell, so running this cell does not overwrite them.
for label, f2_cls, prec_cls, rec_cls in zip(
    class_labels, f2_scores_per_class, prec_per_class, recall_per_class
):
    print(f"Class {label}:")
    print(f"  F2 Score: {f2_cls}")
    print(f"  Precision: {prec_cls}")
    print(f"  Recall: {rec_cls}")