IMPORTS

In [1]:
import pandas as pd
import pickle
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import (mean_absolute_error, mean_squared_error, 
                            r2_score, accuracy_score, f1_score)
import numpy as np

LOAD DATA AND EVALUATE SAVED MODELS

In [2]:
# Read the processed TSLA dataset (parsing 'Date' as datetimes) and discard every
# row that contains a missing value. Later cells use X_test / y_test, so this
# cell has to run before them.
df = pd.read_csv("data/processed/TSLA_enhanced.csv", parse_dates=['Date']).dropna()

# The target is the 'next_day_change' column; the predictors are every other
# column except 'Date'.
target = df['next_day_change']
features = df.drop(columns=['next_day_change', 'Date'])

# Unshuffled 80/20 split: row order is preserved, so the test set is the
# trailing 20% of the rows. (random_state only matters when shuffling.)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    shuffle=False,
    random_state=42,
)

In [3]:
# Evaluate every saved model on the held-out test set.
# `results` maps a model's display name to a dict of its metrics; models whose
# pickle file is missing are reported and left out of `results`.
results = {}
model_names = ["Linear Regression", "SVM", "Random Forest", "XGBoost"]

# Direction labels (True = positive next-day change) are identical for every
# model, so compute them once instead of twice per loop iteration.
y_test_up = y_test > 0

for name in model_names:
    # "Random Forest" -> "models/random_forest.pkl"
    model_path = f"models/{name.lower().replace(' ', '_')}.pkl"

    # Only the file load is guarded, so a FileNotFoundError raised anywhere else
    # is not misreported as a missing model file.
    try:
        # NOTE(security): pickle.load can execute arbitrary code while
        # unpickling; only load model files that come from a trusted source.
        with open(model_path, 'rb') as f:
            model = pickle.load(f)
    except FileNotFoundError:
        print(f"Model file not found: {name}")
        continue

    # Regression metrics use the raw predictions; Accuracy and F1 treat the
    # sign of the value (> 0) as a binary up/down label.
    y_pred = model.predict(X_test)
    y_pred_up = y_pred > 0
    results[name] = {
        'MAE': mean_absolute_error(y_test, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
        'R2': r2_score(y_test, y_pred),
        'Accuracy': accuracy_score(y_test_up, y_pred_up),
        'F1': f1_score(y_test_up, y_pred_up)
    }


SELECTION CRITERIA

In [4]:
# Weight applied to each metric when ranking models. RMSE carries a negative
# weight because lower is better, so a larger error reduces the score.
# NOTE(review): RMSE is expressed in the target's units while R2 and F1 are
# unitless, so the three terms are not on a common scale -- confirm intended.
weights = {'R2': 0.5, 'F1': 0.3, 'RMSE': -0.2}

# One (model name, weighted score) pair per model that was actually evaluated,
# in the same order as `model_names`. Models absent from `results` are skipped.
model_scores = [
    (
        name,
        sum(results[name][metric] * weight for metric, weight in weights.items()),
    )
    for name in model_names
    if name in results
]

SELECT THE BEST MODEL

In [5]:
# Pick the model with the highest weighted score and load it from disk.
# `model_scores` is empty when no model file could be evaluated; fail with an
# actionable message instead of the opaque error max() gives on an empty list.
if not model_scores:
    raise ValueError(
        "No models were scored, so no best model can be selected; "
        "check that the expected .pkl files exist under models/."
    )

# Each entry is (model name, score): compare on the score, keep the name.
best_model_name = max(model_scores, key=lambda entry: entry[1])[0]
print(f"Best Model: {best_model_name}")

# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load model files that come from a trusted source.
with open(f"models/{best_model_name.lower().replace(' ', '_')}.pkl", 'rb') as f:
    best_model = pickle.load(f)

Best Model: XGBoost


SAVE THE BEST MODEL WITH METADATA

In [6]:
# Persist the winning model under a fixed filename, plus a JSON description of it.
# The selection and load are repeated here, so this cell only needs
# `model_scores`, `results` and `X_test` to exist.
best_model_name = max(model_scores, key=lambda entry: entry[1])[0]

# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load model files that come from a trusted source.
with open(f"models/{best_model_name.lower().replace(' ', '_')}.pkl", 'rb') as f:
    best_model = pickle.load(f)

# Create metadata
model_metadata = {
    'model_name': best_model_name,
    # Record the columns the models were evaluated on. Listing every CSV column
    # would also include 'Date' and the target 'next_day_change', which are
    # dropped before the train/test split and are not model inputs.
    'features': list(X_test.columns),
    # Cast to built-in float so JSON serialization does not depend on which
    # numpy scalar type each metric function returned.
    'metrics': {
        metric: float(value)
        for metric, value in results[best_model_name].items()
    },
    # NOTE: this is the date this cell ran (when the metadata was written),
    # which may differ from the date the model was originally fit.
    'training_date': pd.Timestamp.now().strftime('%Y-%m-%d')
}

# Save to files
with open("models/best_model.pkl", 'wb') as f:
    pickle.dump(best_model, f)

with open("models/model_metadata.json", 'w') as f:
    json.dump(model_metadata, f)