In [33]:
import pandas as pd

df=pd.read_csv('finalPCA.csv')

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.combine import SMOTETomek
from collections import Counter
from sklearn.model_selection import RandomizedSearchCV





X = df.drop(["y"], axis=1)
Y = df["y"]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
rf = RandomForestClassifier(random_state=42)


param_grid = {
    'n_estimators': [200,250,300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [10,15,20],
    'min_samples_leaf': [5,10,15],
    'max_features': ['sqrt','log2'],
}



random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=30,
    cv=3,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, Y_train)
print("Best Hyperparameters: ", random_search.best_params_)






Best Hyperparameters:  {'n_estimators': 200, 'min_samples_split': 15, 'min_samples_leaf': 5, 'max_features': 'log2', 'max_depth': 20}


In [35]:
count_zeros = df['y'].value_counts().get(0, 0)
count_ones = df['y'].value_counts().get(1, 0)
print("Negative class: ", count_zeros)
print("Positive class: ", count_ones)

Negative class:  39922
Positive class:  5289


In [36]:

smote_tomek = SMOTETomek(random_state=42)
X_resampled, y_resampled = smote_tomek.fit_resample(X_train, Y_train)

print("Class distribution after SMOTETomek:", Counter(y_resampled))


rf = RandomForestClassifier(min_samples_leaf=5,n_estimators=200,min_samples_split=15,max_features="log2",class_weight='balanced',max_depth=20,random_state=42)
rf.fit(X_resampled, y_resampled)

y_train_pred = rf.predict(X_resampled)
training_accuracy = accuracy_score(y_resampled, y_train_pred)
print(f"Training Accuracy: {training_accuracy * 100:.2f}%")


y_val_pred = rf.predict(X_test)
validation_accuracy = accuracy_score(Y_test, y_val_pred)
print(f"Validation Accuracy: {validation_accuracy * 100:.2f}%")




Class distribution after SMOTETomek: Counter({0: 31354, 1: 31354})
Training Accuracy: 96.89%
Validation Accuracy: 89.23%


In [37]:

thersholds=[0.4,0.3,0.2,0.8]
for Num in thersholds:
    y_prob = rf.predict_proba(X_test)[:, 1]
    threshold= Num
    y_pred = (y_prob > threshold).astype(int)
    print(Num, end=" ")
    print(classification_report(Y_test, y_pred))



0.4               precision    recall  f1-score   support

           0       0.98      0.89      0.93      7952
           1       0.50      0.84      0.63      1091

    accuracy                           0.88      9043
   macro avg       0.74      0.86      0.78      9043
weighted avg       0.92      0.88      0.89      9043

0.3               precision    recall  f1-score   support

           0       0.98      0.85      0.91      7952
           1       0.46      0.90      0.61      1091

    accuracy                           0.86      9043
   macro avg       0.72      0.88      0.76      9043
weighted avg       0.92      0.86      0.88      9043

0.2               precision    recall  f1-score   support

           0       0.99      0.81      0.89      7952
           1       0.41      0.95      0.57      1091

    accuracy                           0.83      9043
   macro avg       0.70      0.88      0.73      9043
weighted avg       0.92      0.83      0.85      9043

0.8    

In [38]:
Num=0.4
y_prob = rf.predict_proba(X_test)[:, 1]
threshold= Num
y_pred = (y_prob > threshold).astype(int)
print(Num, end=" ")
print(classification_report(Y_test, y_pred))

0.4               precision    recall  f1-score   support

           0       0.98      0.89      0.93      7952
           1       0.50      0.84      0.63      1091

    accuracy                           0.88      9043
   macro avg       0.74      0.86      0.78      9043
weighted avg       0.92      0.88      0.89      9043



In [39]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(Y_test, y_pred))

[[7050  902]
 [ 172  919]]


In [40]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rf = RandomForestClassifier(class_weight='balanced',random_state=42)
scores = cross_val_score(rf, X, Y, cv=stratified_kfold, scoring='f1_macro')
print("Cross-Validation scores:", scores)
print("Mean:", scores.mean())

Cross-Validation scores: [0.73570973 0.73392247 0.73157678 0.74098158 0.73347746]
Mean: 0.735133602323593
