In [20]:
import pandas as pd

df=pd.read_csv('finalPCA.csv')

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.combine import SMOTETomek
from collections import Counter
from sklearn.model_selection import RandomizedSearchCV





X = df.drop(["y"], axis=1)
Y = df["y"]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
rf = RandomForestClassifier(random_state=42)


param_grid = {
    'n_estimators': [200,250,300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [10,15,20],
    'min_samples_leaf': [5,10,15],
    'max_features': ['sqrt','log2'],
}



random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=30,
    cv=3,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, Y_train)
print("Best Hyperparameters: ", random_search.best_params_)






Best Hyperparameters:  {'n_estimators': 200, 'min_samples_split': 15, 'min_samples_leaf': 5, 'max_features': 'log2', 'max_depth': 20}


In [24]:

smote_tomek = SMOTETomek(random_state=42)
X_resampled, y_resampled = smote_tomek.fit_resample(X_train, Y_train)

print("Class distribution after SMOTETomek:", Counter(y_resampled))


rf = RandomForestClassifier(min_samples_leaf=5,n_estimators=200,min_samples_split=15,max_features="log2",class_weight='balanced',max_depth=20,random_state=42)
rf.fit(X_resampled, y_resampled)

y_train_pred = rf.predict(X_resampled)
training_accuracy = accuracy_score(y_resampled, y_train_pred)
print(f"Training Accuracy: {training_accuracy * 100:.2f}%")


y_val_pred = rf.predict(X_test)
validation_accuracy = accuracy_score(Y_test, y_val_pred)
print(f"Validation Accuracy: {validation_accuracy * 100:.2f}%")

y_prob = rf.predict_proba(X_test)[:, 1]
threshold = 0.4  # Custom threshold
y_pred = (y_prob > threshold).astype(int)

print(classification_report(Y_test, y_pred))


Class distribution after SMOTETomek: Counter({0: 29832, 1: 29832})
Training Accuracy: 97.10%
Validation Accuracy: 89.67%
              precision    recall  f1-score   support

           0       0.97      0.89      0.93      7952
           1       0.51      0.83      0.63      1091

    accuracy                           0.88      9043
   macro avg       0.74      0.86      0.78      9043
weighted avg       0.92      0.88      0.89      9043



In [18]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rf = RandomForestClassifier(class_weight='balanced',random_state=42)
scores = cross_val_score(rf, X, Y, cv=stratified_kfold, scoring='f1_macro')
print("Stratified Cross-Validation F1-Scores:", scores)
print("Mean F1-Score:", scores.mean())

Stratified Cross-Validation F1-Scores: [0.73558631 0.73696711 0.72815943 0.74254025 0.73222176]
Mean F1-Score: 0.7350949722936242


In [21]:
df.head()

Unnamed: 0.1,Unnamed: 0,age,education,balance,day,month,duration,campaign,pdays,previous,y,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8
0,0,1.651055,2,0.756822,5,5,0.40591,-0.870092,0.0,0.0,0,-0.977435,0.490974,-0.169525,-0.370608,0.931344,-0.254142,-0.108786,-0.208366
1,1,0.302248,1,0.367072,5,5,0.234837,-0.870092,0.0,0.0,0,-0.846006,-0.540116,-1.107939,-0.091377,-0.069491,0.645389,-0.299522,-0.469378
2,2,-0.757528,1,0.362094,5,5,0.118196,-0.870092,0.0,0.0,0,-1.143216,0.589516,-0.002114,1.109814,0.2517,-0.020617,-0.261821,-0.172624
3,3,0.591279,3,0.639381,5,5,0.143079,-0.870092,0.0,0.0,0,-1.24711,0.619359,-0.125332,-0.306701,-0.38212,-0.258093,0.275624,-0.034303
4,4,-0.757528,3,0.36191,5,5,0.307932,-0.870092,0.0,0.0,0,0.268562,0.149515,-1.509453,-0.077746,-0.17588,-0.182524,-0.176405,-0.256351
