# Notebook 5: Improved Model - Random Forest
**Author:** Enerita 
**Date:** December 2025

In [None]:
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
# Suppress every warning category for the rest of the session so that cell
# output stays compact (no message or module pattern is given, so the filter
# matches all warnings).
warnings.filterwarnings(action='ignore')

print("Setup complete")

In [None]:
# Load the indicator dataset, parsing the 'date' column as datetimes.
data = pd.read_csv(
    '../data/raw/stock_prices_with_indicators.csv',
    parse_dates=['date'],
)

# Model inputs, in the column order used for training and prediction.
features = [
    'momentum_10', 'momentum_20', 'sma_20', 'sma_50', 'rsi_14',
    'macd', 'atr_14', 'volatility_20', 'volume_ratio', 'bb_width',
]

# Keep only the inputs, the target and the date; discard rows with any
# missing value; then order the remaining rows by date.
data = data[[*features, 'trend_label', 'date']]
data = data.dropna()
data = data.sort_values(by='date')

# Chronological hold-out: the first 70% of the date-ordered rows are used for
# training and the remaining 30% for testing.
split_idx = int(len(data) * 0.7)
train_rows = data.iloc[:split_idx]
test_rows = data.iloc[split_idx:]

X_train, y_train = train_rows[features], train_rows['trend_label']
X_test, y_test = test_rows[features], test_rows['trend_label']

print(f"Data ready: Train={len(X_train)}, Test={len(X_test)}")

## 1. Train Random Forest

In [None]:
# First-pass Random Forest with fixed settings: 100 trees, depth capped at 20,
# class weights balanced, a fixed seed, and all CPU cores used for fitting.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    random_state=42,
    class_weight='balanced',
    n_jobs=-1,
)
rf.fit(X_train, y_train)

# Score the untuned model on the held-out test rows.
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Random Forest Accuracy: {accuracy:.4f}")

## 2. Hyperparameter Tuning

In [None]:
# Search space: 3 x 3 x 2 = 18 candidate forests.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5]
}

# The training rows are ordered by date (the data is sorted before the
# train/test split), so cross-validation must respect that order. An integer
# `cv` would use stratified k-fold for a classifier, which validates on rows
# that come *before* some of the rows the model was fitted on. That look-ahead
# leakage inflates the CV scores and is inconsistent with the chronological
# hold-out split. TimeSeriesSplit always validates on rows that follow the
# training rows.
cv_splitter = TimeSeriesSplit(n_splits=3)

grid = GridSearchCV(RandomForestClassifier(random_state=42, class_weight='balanced'),
                   param_grid, cv=cv_splitter, n_jobs=-1)
grid.fit(X_train, y_train)

# GridSearchCV refits the best candidate on all of X_train by default.
best_rf = grid.best_estimator_
print(f"Best params: {grid.best_params_}")

## 3. Evaluate Best Model

In [None]:
# Score the tuned model on the held-out test rows.
y_pred_best = best_rf.predict(X_test)
accuracy_best = accuracy_score(y_true=y_test, y_pred=y_pred_best)

print(f"Best Model Accuracy: {accuracy_best:.4f}")

# Per-class precision / recall / F1 for the tuned model.
report_text = classification_report(y_test, y_pred_best)
print("\nClassification Report:")
print(report_text)

## 4. Confusion Matrix

In [None]:
# Sorted union of the classes present in the true and predicted labels. This
# is the same label set and order confusion_matrix uses by default, so the
# matrix values are unchanged; it is made explicit so the plot can be labelled.
class_labels = np.union1d(y_test, y_pred_best)
cm = confusion_matrix(y_test, y_pred_best, labels=class_labels)

plt.figure(figsize=(8, 6))
# Show the actual class values on both axes instead of positional indices
# 0..k-1, which are misleading when the labels are not 0-based integers.
sns.heatmap(cm, annot=True, fmt='d', cmap='Greens',
            xticklabels=class_labels, yticklabels=class_labels)
plt.title('Confusion Matrix - Random Forest')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.savefig('../visuals/model_performance/confusion_matrix_rf.png', dpi=300)
print("Saved: confusion_matrix_rf.png")

## 5. Feature Importance

In [None]:
# Pair each input column with the tuned forest's importance score and rank
# the features from most to least important.
importance = pd.DataFrame(
    {'Feature': features, 'Importance': best_rf.feature_importances_}
)
importance = importance.sort_values(by='Importance', ascending=False)

print(importance)

# Horizontal bar chart; the y-axis is flipped so the top-ranked feature is
# drawn at the top of the plot.
plt.figure(figsize=(10, 6))
ax = plt.gca()
ax.barh(importance['Feature'], importance['Importance'])
ax.set_xlabel('Importance')
ax.set_title('Feature Importance')
ax.invert_yaxis()
plt.tight_layout()
plt.savefig('../visuals/model_performance/feature_importance_rf.png', dpi=300)
print("\nSaved: feature_importance_rf.png")

## 6. Model Comparison

In [None]:
# Accuracy of the baseline model, read from the first row of the saved
# metrics file.
baseline = pd.read_csv('../models/baseline_metrics.csv')
baseline_acc = baseline['accuracy'].iloc[0]

comparison = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest'],
    'Accuracy': [baseline_acc, accuracy_best]
})

print(comparison)

# Default view is the 0.5-0.8 window. A fixed window hides any bar whose
# accuracy is at or below 0.5 and clips any bar above 0.8, so widen it just
# enough (with a 0.05 margin, clamped to the valid [0, 1] range) to keep both
# bars fully visible.
y_low = max(0.0, min(0.5, comparison['Accuracy'].min() - 0.05))
y_high = min(1.0, max(0.8, comparison['Accuracy'].max() + 0.05))

plt.figure(figsize=(8, 5))
plt.bar(comparison['Model'], comparison['Accuracy'])
plt.ylabel('Accuracy')
plt.title('Model Comparison')
plt.ylim([y_low, y_high])
plt.tight_layout()
plt.savefig('../visuals/model_performance/model_comparison.png', dpi=300)
print("\nSaved: model_comparison.png")

## 7. Save Model

In [None]:
# Persist the tuned forest itself.
joblib.dump(best_rf, '../models/random_forest_model.pkl')

# One-row summary of the tuned model: its name, test accuracy and the winning
# hyperparameters (stored as their string representation).
metrics = pd.DataFrame({
    'model': ['Random Forest'],
    'accuracy': [accuracy_best],
    'best_params': [str(grid.best_params_)],
})
metrics.to_csv('../models/rf_metrics.csv', index=False)

# Ranked feature importances, saved next to the model artefacts.
importance.to_csv('../models/feature_importance.csv', index=False)

print("Model saved")
print(f"\nFinal Accuracy: {accuracy_best:.2%}")
# Difference between the tuned forest's accuracy and the baseline accuracy.
print(f"Improvement: {(accuracy_best - baseline_acc):.2%}")