In [3]:
import pandas as pd

df=pd.read_csv('finalPCA.csv')

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.combine import SMOTETomek
from collections import Counter
from sklearn.model_selection import RandomizedSearchCV





X = df.drop(["y"], axis=1)
Y = df["y"]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
rf = RandomForestClassifier(random_state=42)


param_grid = {
    'n_estimators': [200,250,300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [10,15,20],
    'min_samples_leaf': [5,10,15],
    'max_features': ['sqrt','log2'],
}



random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=30,
    cv=3,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, Y_train)
print("Best Hyperparameters: ", random_search.best_params_)






Best Hyperparameters:  {'n_estimators': 250, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'max_depth': 30}


In [10]:

smote_tomek = SMOTETomek(random_state=42)
X_resampled, y_resampled = smote_tomek.fit_resample(X_train, Y_train)

print("Class distribution after SMOTETomek:", Counter(y_resampled))


rf = RandomForestClassifier(min_samples_leaf=5,n_estimators=250,min_samples_split=10,max_features="sqrt",class_weight='balanced',max_depth=30,random_state=42)
rf.fit(X_resampled, y_resampled)

y_train_pred = rf.predict(X_resampled)
training_accuracy = accuracy_score(y_resampled, y_train_pred)
print(f"Training Accuracy: {training_accuracy * 100:.2f}%")


y_val_pred = rf.predict(X_test)
validation_accuracy = accuracy_score(Y_test, y_val_pred)
print(f"Validation Accuracy: {validation_accuracy * 100:.2f}%")




Class distribution after SMOTETomek: Counter({0: 31365, 1: 31365})
Training Accuracy: 97.42%
Validation Accuracy: 89.36%


In [11]:
thersholds=[0.4,0.3,0.2,0.8]
for Num in thersholds:
    y_prob = rf.predict_proba(X_test)[:, 1]
    threshold= Num
    y_pred = (y_prob > threshold).astype(int)
    print(Num, end=" ")
    print(classification_report(Y_test, y_pred))

0.4               precision    recall  f1-score   support

           0       0.98      0.88      0.93      7952
           1       0.50      0.84      0.63      1091

    accuracy                           0.88      9043
   macro avg       0.74      0.86      0.78      9043
weighted avg       0.92      0.88      0.89      9043

0.3               precision    recall  f1-score   support

           0       0.99      0.85      0.91      7952
           1       0.45      0.91      0.61      1091

    accuracy                           0.86      9043
   macro avg       0.72      0.88      0.76      9043
weighted avg       0.92      0.86      0.88      9043

0.2               precision    recall  f1-score   support

           0       0.99      0.81      0.89      7952
           1       0.41      0.95      0.57      1091

    accuracy                           0.83      9043
   macro avg       0.70      0.88      0.73      9043
weighted avg       0.92      0.83      0.85      9043

0.8    

In [12]:
Num=0.4
y_prob = rf.predict_proba(X_test)[:, 1]
threshold= Num
y_pred = (y_prob > threshold).astype(int)
print(Num, end=" ")
print(classification_report(Y_test, y_pred))

0.4               precision    recall  f1-score   support

           0       0.98      0.88      0.93      7952
           1       0.50      0.84      0.63      1091

    accuracy                           0.88      9043
   macro avg       0.74      0.86      0.78      9043
weighted avg       0.92      0.88      0.89      9043



In [8]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rf = RandomForestClassifier(class_weight='balanced',random_state=42)
scores = cross_val_score(rf, X, Y, cv=stratified_kfold, scoring='f1_macro')
print("Stratified Cross-Validation F1-Scores:", scores)
print("Mean F1-Score:", scores.mean())

Stratified Cross-Validation F1-Scores: [0.72952106 0.72796026 0.71720947 0.73609051 0.7303796 ]
Mean F1-Score: 0.7282321789094428


In [13]:
df.head()

Unnamed: 0.1,Unnamed: 0,age,education,balance,day,month,duration,campaign,pdays,previous,y,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8
0,0,58,2,2143,5,5,261,1,-1,0,0,1.969508,0.291367,1.114691,0.18342,-0.604762,-0.828776,0.309383,1.723728
1,1,44,1,29,5,5,151,1,-1,0,0,1.53112,0.073696,1.320453,-2.24711,0.942523,-0.072258,-0.377493,-1.644236
2,2,33,1,2,5,5,76,1,-1,0,0,3.168875,-0.476858,-1.869663,1.438016,1.65978,0.160939,-0.038836,0.182354
3,3,47,3,1506,5,5,92,1,-1,0,0,2.812119,-0.016881,1.23786,0.46311,-0.865411,0.068642,-0.777813,0.041188
4,4,33,3,1,5,5,198,1,-1,0,0,-0.073835,2.991254,1.203994,-1.282372,0.628338,1.92001,-1.352203,0.105009
