In [None]:
# Model_Training.ipynb

# Importing libraries
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split

# Read the modelling dataset from disk.
# NOTE(review): the file is called raw_data.csv although this step treats it
# as preprocessed input -- confirm this is the intended file.
data_path = "../data/inputs/raw_data.csv"
data = pd.read_csv(data_path)

# Split the frame into the label column ("target") and the remaining columns,
# which are all used as features.
y = data["target"]
X = data.drop("target", axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training portion (also seeded).
# fit() returns the estimator itself, so `model` is the fitted classifier.
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(
    X_train, y_train
)

# Persist the fitted model with joblib.
# The output directory is created first: joblib.dump does not create missing
# parent directories, so on a fresh checkout the dump would otherwise fail
# with FileNotFoundError.
model_path = Path("../data/outputs/model.pkl")
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)

# Hard class predictions on the held-out rows and the text report built
# from them.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)

# Column 1 of predict_proba is used as the score for ROC AUC.
# NOTE(review): this assumes a binary target whose positive class is the
# second entry of model.classes_ -- confirm against the data.
y_pred_proba = model.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred_proba)

# Echo both metrics to the notebook output.
print("Classification Report:", report, sep="\n")
print("\nAUC Score:", auc_score)

# Write the classification report and AUC score to a plain-text summary.
# The reports directory is created if missing so open() cannot fail with
# FileNotFoundError, and the encoding is pinned so the file is written the
# same way on every platform.
summary_path = Path("../data/outputs/reports/model_summary.txt")
summary_path.parent.mkdir(parents=True, exist_ok=True)
with open(summary_path, "w", encoding="utf-8") as f:
    f.write(f"Classification Report:\n{report}\n")
    f.write(f"AUC Score: {auc_score}\n")

# Plot the ROC curve for the held-out predictions, save it as a PNG and
# display it.
# The charts directory is created up front because plt.savefig does not
# create missing parent directories.
roc_chart_path = Path("../data/outputs/charts/roc_curve.png")
roc_chart_path.parent.mkdir(parents=True, exist_ok=True)

# The thresholds returned by roc_curve are not needed for the plot.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f"AUC = {auc_score:.2f}")
# Dashed red diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], linestyle="--", color="red")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend(loc="lower right")
# Save before show(): the figure may be cleared once it has been displayed.
plt.savefig(roc_chart_path)
plt.show()
