In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.combine import SMOTETomek
from collections import Counter
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd

# Load the dataset from final.csv. Column "y" is the target; every other
# column is used as a feature.
df = pd.read_csv('final.csv')

X = df.drop(columns=["y"])
Y = df["y"]

# Hold out 20% of the rows for testing with a fixed seed.
# stratify=Y keeps the class proportions of the target the same in the train
# and test partitions. The notebook rebalances the training classes further
# down, which suggests the target is imbalanced -- without stratification a
# random split can leave the minority class under-represented on one side.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
    stratify=Y,
)
# Base estimator handed to the search; the seed makes the forests reproducible.
rf = RandomForestClassifier(random_state=42)

# Candidate values for each hyperparameter (3 * 4 * 3 * 3 * 2 = 216 combinations).
param_grid = {
    'n_estimators': [200, 250, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [10, 15, 20],
    'min_samples_leaf': [5, 10, 15],
    'max_features': ['sqrt', 'log2'],
}

# Randomly sample 30 of the combinations and score each with 3-fold CV on the
# training partition, using all available cores.
# scoring='f1_macro' replaces the default classifier scorer (accuracy) so that
# candidates are ranked by the same metric the notebook's stratified
# cross-validation cell reports; plain accuracy rewards majority-class
# predictions when the classes are not balanced.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=30,
    scoring='f1_macro',
    cv=3,
    random_state=42,
    n_jobs=-1,
)

random_search.fit(X_train, Y_train)
print("Best Hyperparameters: ", random_search.best_params_)






In [None]:

# Resample the training partition only; X_test / Y_test are left untouched.
smote_tomek = SMOTETomek(random_state=42)
X_resampled, y_resampled = smote_tomek.fit_resample(X_train, Y_train)
print("Class distribution after SMOTETomek:", Counter(y_resampled))

# Random forest with hand-set hyperparameters, fitted on the resampled data.
# max_depth is not given, so the library default applies.
rf = RandomForestClassifier(
    n_estimators=200,
    max_features="log2",
    min_samples_split=15,
    min_samples_leaf=10,
    class_weight='balanced',
    random_state=42,
)
rf.fit(X_resampled, y_resampled)

# Accuracy on the same (resampled) rows the model was fitted on.
y_train_pred = rf.predict(X_resampled)
training_accuracy = accuracy_score(y_true=y_resampled, y_pred=y_train_pred)
print(f"Training Accuracy: {training_accuracy * 100:.2f}%")

# Accuracy on the held-out X_test / Y_test partition, using predict() as-is.
y_val_pred = rf.predict(X_test)
validation_accuracy = accuracy_score(y_true=Y_test, y_pred=y_val_pred)
print(f"Validation Accuracy: {validation_accuracy * 100:.2f}%")

# Re-evaluate the held-out rows with a custom decision threshold instead of
# the labels returned by predict().
# Column 1 of predict_proba is the probability of rf.classes_[1].
# NOTE(review): this assumes a binary target -- confirm that Y has exactly
# two classes.
y_prob = rf.predict_proba(X_test)[:, 1]
threshold = 0.4  # Custom threshold

# Translate the boolean decision back into the model's own class labels
# (index 0 -> rf.classes_[0], index 1 -> rf.classes_[1]). For a 0/1 target this
# is identical to casting the mask to int, but it also stays correct when the
# labels are encoded differently (e.g. strings or -1/1), where a hard-coded
# 0/1 prediction would not match Y_test.
y_pred = rf.classes_[(y_prob > threshold).astype(int)]

print(classification_report(Y_test, y_pred))


In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

# 5-fold stratified splitter, shuffled with a fixed seed for repeatability.
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# NOTE: this rebinds `rf` to a new estimator that sets only class_weight and
# random_state; every other hyperparameter is left at the library default.
rf = RandomForestClassifier(class_weight='balanced', random_state=42)

# Macro-averaged F1 per fold, computed over the full dataset (X, Y).
scores = cross_val_score(rf, X, Y, scoring='f1_macro', cv=stratified_kfold)
print("Stratified Cross-Validation F1-Scores:", scores)
print("Mean F1-Score:", scores.mean())
