In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, cross_val_score, learning_curve
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline
from xgboost import XGBClassifier
from scipy.stats import randint, uniform
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Load the entire dataset
data = pd.read_excel('data.xlsx')[:103]
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
X = data.drop('Class', axis=1)
y = data['Class']

numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Split the data
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.2, random_state=42, stratify=y_train_val)

# Preprocessing pipelines
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Classifiers with class_weight adjustments
rf = RandomForestClassifier(random_state=42, class_weight='balanced')
xgb = XGBClassifier(random_state=42, scale_pos_weight=1)
lr = LogisticRegression(random_state=42, max_iter=5000, tol=1e-4, class_weight='balanced')
sgd = SGDClassifier(random_state=42, max_iter=1000, tol=1e-3, class_weight='balanced')
svm = SVC(random_state=42, probability=True, class_weight='balanced')

# Pipeline without SMOTE
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))),
    ('classifier', VotingClassifier(
        estimators=[
            ('rf', rf),
            ('xgb', xgb),
            ('lr', lr),
            ('sgd', sgd),
            ('svm', svm)
        ],
        voting='soft'
    ))
])

# Adjusted hyperparameter distributions
param_dist = {
    'classifier__rf__n_estimators': randint(100, 500),
    'classifier__rf__max_depth': randint(5, 20),
    'classifier__rf__min_samples_split': randint(2, 10),
    'classifier__rf__min_samples_leaf': randint(1, 5),
    'classifier__xgb__n_estimators': randint(100, 500),
    'classifier__xgb__max_depth': randint(3, 10),
    'classifier__xgb__learning_rate': uniform(0.01, 0.3),
    'classifier__xgb__subsample': uniform(0.7, 0.3),
    'classifier__xgb__colsample_bytree': uniform(0.7, 0.3),
    'classifier__xgb__scale_pos_weight': [1, 2, 5, 10],
    'classifier__lr__C': uniform(0.1, 10),
    'classifier__lr__penalty': ['l2'],
    'classifier__lr__solver': ['saga'],
    'classifier__sgd__alpha': uniform(1e-5, 1e-1),
    'classifier__sgd__penalty': ['l2', 'l1', 'elasticnet'],
    'classifier__sgd__loss': ['log_loss', 'modified_huber'],
    'classifier__svm__C': uniform(0.1, 10),
    'classifier__svm__kernel': ['rbf', 'poly', 'sigmoid']
}

# Cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# RandomizedSearchCV with F1-score optimization
search = RandomizedSearchCV(
    pipeline, param_distributions=param_dist, n_iter=400,
    cv=cv, scoring='f1', n_jobs=-1, random_state=42, verbose=1)

# Fit the model
search.fit(X_train, y_train)
best_model = search.best_estimator_

# Evaluate on validation set
y_val_pred = best_model.predict(X_val)
y_val_pred_proba = best_model.predict_proba(X_val)[:, 1]

print("\nValidation Set Performance:")
print(f"Accuracy: {accuracy_score(y_val, y_val_pred):.3f}")
print(f"F1 Score: {f1_score(y_val, y_val_pred):.3f}")
print(f"ROC AUC: {roc_auc_score(y_val, y_val_pred_proba):.3f}")
print(f"Precision: {precision_score(y_val, y_val_pred):.3f}")
print(f"Recall: {recall_score(y_val, y_val_pred):.3f}")

# Evaluate on test set
y_test_pred = best_model.predict(X_test)
y_test_pred_proba = best_model.predict_proba(X_test)[:, 1]

print("\nTest Set Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_test_pred):.3f}")
print(f"F1 Score: {f1_score(y_test, y_test_pred):.3f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_test_pred_proba):.3f}")
print(f"Precision: {precision_score(y_test, y_test_pred):.3f}")
print(f"Recall: {recall_score(y_test, y_test_pred):.3f}")

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))


Fitting 5 folds for each of 400 candidates, totalling 2000 fits

Validation Set Performance:
Accuracy: 0.882
F1 Score: 0.900
ROC AUC: 0.931
Precision: 0.818
Recall: 1.000

Test Set Performance:
Accuracy: 0.952
F1 Score: 0.957
ROC AUC: 0.982
Precision: 0.917
Recall: 1.000

Confusion Matrix:
[[ 9  1]
 [ 0 11]]


In [12]:
best_model = search.best_estimator_

print("\nBest Hyperparameters found by RandomizedSearchCV:")
print(search.best_params_)


Best Hyperparameters found by RandomizedSearchCV:
{'classifier__lr__C': 3.3337156211552075, 'classifier__lr__penalty': 'l2', 'classifier__lr__solver': 'saga', 'classifier__rf__max_depth': 15, 'classifier__rf__min_samples_leaf': 2, 'classifier__rf__min_samples_split': 2, 'classifier__rf__n_estimators': 295, 'classifier__sgd__alpha': 0.04425099364806696, 'classifier__sgd__loss': 'modified_huber', 'classifier__sgd__penalty': 'l1', 'classifier__svm__C': 6.7960261613396185, 'classifier__svm__kernel': 'sigmoid', 'classifier__xgb__colsample_bytree': 0.8920504671484497, 'classifier__xgb__learning_rate': 0.09134423095005544, 'classifier__xgb__max_depth': 4, 'classifier__xgb__n_estimators': 166, 'classifier__xgb__scale_pos_weight': 2, 'classifier__xgb__subsample': 0.7692487567462036}


In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
import warnings
warnings.filterwarnings('ignore')

data = pd.read_excel('data.xlsx')[:103]
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
X = data.drop('Class', axis=1)
y = data['Class']

X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.2, random_state=42, stratify=y_train_val)

numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

rf = RandomForestClassifier(
    n_estimators=295, max_depth=15, min_samples_leaf=2, min_samples_split=2, random_state=42, class_weight='balanced')

xgb = XGBClassifier(
    n_estimators=700, 
    max_depth=4,
    learning_rate=0.09134423095005544,
    subsample=0.7692487567462036,
    colsample_bytree=0.8920504671484497,
    scale_pos_weight=2,
    random_state=42
)

lr = LogisticRegression(
    C=3.3337156211552075, penalty='l2', solver='saga', max_iter=10000, random_state=42, class_weight='balanced'
)

sgd = SGDClassifier(
    alpha=0.04425099364806696, loss='modified_huber', penalty='l1', max_iter=5000, random_state=42, class_weight='balanced'
)

svm = SVC(
    C=6.7960261613396185, kernel='sigmoid', probability=True, random_state=42, class_weight='balanced'
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))),
    ('classifier', VotingClassifier(
        estimators=[
            ('rf', rf),
            ('xgb', xgb),
            ('lr', lr),
            ('sgd', sgd),
            ('svm', svm)
        ],
        voting='soft'
    ))
])

pipeline.fit(X_train, y_train)

y_val_pred = pipeline.predict(X_val)
y_val_pred_proba = pipeline.predict_proba(X_val)[:, 1]

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, confusion_matrix

print("\nValidation Set Performance:")
print(f"Accuracy: {accuracy_score(y_val, y_val_pred):.3f}")
print(f"F1 Score: {f1_score(y_val, y_val_pred):.3f}")
print(f"ROC AUC: {roc_auc_score(y_val, y_val_pred_proba):.3f}")
print(f"Precision: {precision_score(y_val, y_val_pred):.3f}")
print(f"Recall: {recall_score(y_val, y_val_pred):.3f}")

y_test_pred = pipeline.predict(X_test)
y_test_pred_proba = pipeline.predict_proba(X_test)[:, 1]

print("\nTest Set Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_test_pred):.3f}")
print(f"F1 Score: {f1_score(y_test, y_test_pred):.3f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_test_pred_proba):.3f}")
print(f"Precision: {precision_score(y_test, y_test_pred):.3f}")
print(f"Recall: {recall_score(y_test, y_test_pred):.3f}")

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))



Validation Set Performance:
Accuracy: 0.882
F1 Score: 0.900
ROC AUC: 0.931
Precision: 0.818
Recall: 1.000

Test Set Performance:
Accuracy: 0.952
F1 Score: 0.957
ROC AUC: 0.973
Precision: 0.917
Recall: 1.000

Confusion Matrix:
[[ 9  1]
 [ 0 11]]


In [19]:
data = pd.read_excel('data.xlsx')
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
X = data.drop('Class', axis=1)
y_pred = pipeline.predict(X)

results_df = pd.DataFrame({
    'user_id': data['user_id'],  
    'Predicted_Class': y_pred
})

results_df.to_csv('final_predictions.csv', index=False)
print("\nPredictions on the whole dataset have been saved to 'final_predictions.csv'.")



Predictions on the whole dataset have been saved to 'final_predictions.csv'.
