In [97]:
import numpy as np

import pandas as pd

In [98]:
df = pd.read_csv('../Data/Model_Data.csv')

In [99]:
X = df.drop(columns='label')

y = df['label']

In [100]:
# SPLIT DU Lieu

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [101]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, make_scorer, roc_auc_score
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE

# Define the pipeline
pipeline = Pipeline([
    ('smoteenn', SMOTEENN(smote=SMOTE(k_neighbors=5))),  
    ('scaler', MinMaxScaler()), 
    ('classification', LogisticRegression(solver="liblinear", max_iter=5000)) 
])

# Define the parameter grid for Logistic Regression
param_grid = {
    'classification__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],  
    'classification__penalty': ['l1', 'l2'], 
    'classification__class_weight': ['balanced', None],  # Dealing with imbalance
}

# Define StratifiedKFold cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Use ROC AUC as the scoring metric
scoring = make_scorer(roc_auc_score)

# RandomizedSearchCV for hyperparameter tuning
random_search = RandomizedSearchCV(
    pipeline, 
    param_grid, 
    n_iter=28,  # Number of random parameter settings sampled
    cv=cv,  # Cross-validation strategy
    n_jobs=-1,  # Use all processors
    verbose=1,  # Output process details
    random_state=42,  # For reproducibility
    scoring=scoring  # ROC AUC as evaluation metric
)

# Fit the model using RandomizedSearchCV
random_search.fit(X_train, y_train)

# Display best hyperparameters and accuracy
print("Tuned hyperparameters: (best parameters)", random_search.best_params_)
print("\033[1m" + "Best ROC AUC:", random_search.best_score_)

# Predict on test set
y_pred = random_search.predict(X_test)

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Fitting 5 folds for each of 28 candidates, totalling 140 fits
Tuned hyperparameters: (best parameters) {'classification__penalty': 'l1', 'classification__class_weight': 'balanced', 'classification__C': 0.1}
[1mBest ROC AUC: 0.7601631925959261

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.81      0.86      1745
           1       0.54      0.73      0.62       539

    accuracy                           0.79      2284
   macro avg       0.73      0.77      0.74      2284
weighted avg       0.82      0.79      0.80      2284



In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE

pipeline = Pipeline([
    ('smoteenn', SMOTEENN(smote=SMOTE(k_neighbors=5))),  
    ('scaler', MinMaxScaler()), 
    ('classification', LogisticRegression(solver="liblinear", max_iter=5000)) 
])


param_grid = {
    'classification__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],  
    'classification__penalty': ['l1', 'l2'], 
    'classification__class_weight': ['balanced', None],  # Dealing with imbalance
}

# Use StratifiedKFold to maintain class distribution in folds
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(pipeline, param_grid, cv=cv, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Display best hyperparameters and accuracy
print("Tuned hyperparameters: (best parameters)", grid_search.best_params_)
print("\033[1m" + "Accuracy:", grid_search.best_score_)

# Predict on test set
y_pred = grid_search.predict(X_test)

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 28 candidates, totalling 140 fits
