In [1]:
import pandas as pd
import numpy as np
import pickle
import tarfile
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Load the input CSV (path is relative to the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='trainable.csv')

In [3]:
# Separate the label column from the remaining columns, which serve as features.
y = df.loc[:, 'final_status']
X = df.drop(columns=['final_status'])

In [4]:
# Stage 1: hold back 30% of the rows, preserving the label proportions of y.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)
# Stage 2: cut the held-back rows in half, giving a validation part and a test
# part (roughly 15% of the data each), again stratified on the labels.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

In [5]:
# Candidate values the randomized search draws from for each hyperparameter.
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
}

# Base estimator handed to the search; seeded with a fixed random_state.
rf = RandomForestClassifier(random_state=42)

In [6]:
# Try 20 sampled hyperparameter combinations, each scored by 3-fold
# cross-validated F1 on the training split; n_jobs=-1 runs fits in parallel.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=20,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Keep the search's best estimator for evaluation and export.
best_rf = random_search.best_estimator_

In [7]:
# Predict labels for the test split with the selected model.
y_pred = best_rf.predict(X=X_test)

In [8]:
# Text report comparing the test labels with the model's predictions.
report = classification_report(y_true=y_test, y_pred=y_pred)

In [9]:
# Confusion matrix of the test predictions. Unpacking it into exactly four
# counts only works when the problem has two classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
tn, fp, fn, tp = cm.reshape(-1)
# Fraction of actual positives the model missed (false negatives over all
# actual positives).
risk_score = fn / (tp + fn)

In [10]:
# Append the risk score on its own line after the report, then write the
# combined text to disk.
report_text = "".join([report, f"\nRisk Score: {risk_score:.4f}"])
with open('classification_report.txt', "w") as report_file:
    report_file.write(report_text)

In [11]:
# Serialize the selected model to a pickle file in the working directory.
with open('random_forest_model.pkl', "wb") as model_file:
    pickle.Pickler(model_file).dump(best_rf)

In [14]:
# Bundle the pickled model and the text report into one gzip-compressed tar file.
with tarfile.open('random_forest.tar.gz', "w:gz") as archive:
    for artifact in ('random_forest_model.pkl', 'classification_report.txt'):
        archive.add(artifact)

print("Training complete.")

Training complete.


In [13]:
# Final summary of the run. The archive name listed here must match the file
# actually written by the tarfile cell, which is random_forest.tar.gz (the
# message previously named a non-existent model_artifacts.tar.gz).
print(f"Risk Score: {risk_score:.4f}")
print("Artifacts saved: random_forest_model.pkl, classification_report.txt, random_forest.tar.gz")

Risk Score: 0.0563
Artifacts saved: random_forest_model.pkl, classification_report.txt, model_artifacts.tar.gz
