# In [None]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Load the feature-selected dataset. Nothing below can run without it, so a
# missing file aborts the script with a failing exit status.
try:
    df = pd.read_csv('feature_selected_dataset.csv')
except FileNotFoundError:
    # `raise SystemExit(msg)` writes the message to stderr and exits with
    # status 1. The site-provided `exit()` helper is meant for interactive
    # sessions and, called without arguments, reports success (status 0).
    raise SystemExit("Error: 'feature_selected_dataset.csv' not found.") from None
else:
    print("Feature-selected dataset loaded.")

# Separate the predictors from the 'target' label column.
X = df.drop('target', axis=1)
y = df['target']

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Baseline: standardize the features, then fit a random forest with default
# hyperparameters (seeded for reproducibility).
# The step names matter: the 'classifier__...' keys in the parameter grids
# further down address the step called 'classifier'.
# StandardScaler is provided by sklearn.preprocessing and must be imported
# with the other scikit-learn names for this pipeline to be constructible.
base_model = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)
base_model.fit(X_train, y_train)

# Score the untuned pipeline on the held-out test split.
y_pred_base = base_model.predict(X_test)
print("--- Baseline Random Forest Performance ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred_base):.4f}")


# Candidate hyperparameter values for the randomized search. Keys follow the
# "<pipeline step>__<parameter>" convention and target the 'classifier' step.
# 3 * 4 * 3 * 3 * 2 = 216 combinations in total.
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__bootstrap': [True, False],
}

print("\n--- Running RandomizedSearchCV ---")
# Evaluate 100 sampled parameter settings with 5-fold cross-validation,
# using every available core (n_jobs=-1). The seed fixes which settings
# are drawn.
rf_random = RandomizedSearchCV(
    base_model,
    param_grid,
    n_iter=100,
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
rf_random.fit(X_train, y_train)
print("Best parameters from RandomizedSearch:", rf_random.best_params_)


# Exhaustive grid over the 'classifier' step of the pipeline:
# 3 * 3 * 3 * 3 = 81 combinations, each scored with 5-fold cross-validation.
param_grid_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [10, 20, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
}

print("\n--- Running GridSearchCV ---")
# n_jobs=-1 spreads the fits across all available cores.
grid_search = GridSearchCV(
    base_model,
    param_grid_grid,
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
print("Best parameters from GridSearchCV:", grid_search.best_params_)


# The grid search refits its best parameter combination on the full training
# split (scikit-learn's default refit=True); use that pipeline as the final
# model and predict on the held-out test split.
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

print("\n--- Final Model Performance after Hyperparameter Tuning ---")
# Per-class precision, recall and F1 for the tuned model.
final_report = classification_report(y_true=y_test, y_pred=y_pred_best)
print(final_report)


# Persist the tuned pipeline (scaler + classifier) to disk.
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first; otherwise the script fails with
# FileNotFoundError only after all the expensive searches have finished.
model_path = 'models/final_model.pkl'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, model_path)
print(f"\nOptimized model saved to '{model_path}'")