In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
import pickle
import pandas as pd
import numpy as np

# --- Configuration -----------------------------------------------------------
# NOTE(review): these are absolute, machine-specific paths; consider deriving
# them from a configurable project directory so the notebook runs elsewhere.
DATA_PATH = '/Users/ananyaaggarwal/Desktop/Movie Prediction Project/data/final_processed_data.csv'
MODEL_PATH = '/Users/ananyaaggarwal/Desktop/Movie Prediction Project/models/sequel_model_60_40.pkl'
SEED = 42          # shared by the split and the forest so a re-run reproduces the metrics
TEST_SIZE = 0.4    # 60% train / 40% test (before unusable rows are removed)


def _drop_non_finite_rows(features, target):
    """Return (features, target) without rows that contain NaN or +/-inf.

    Infinities are first mapped to NaN, then every row with a missing feature
    is removed. The target is re-indexed with the surviving row labels so the
    two objects stay aligned. Neither input is modified.
    """
    cleaned = features.replace([np.inf, -np.inf], np.nan).dropna()
    return cleaned, target.loc[cleaned.index]


# --- Load data ---------------------------------------------------------------
merged_df = pd.read_csv(DATA_PATH)

# Target: 1 when success_ratio > 1, else 0. A NaN ratio compares False and is
# therefore labelled 0, exactly as the row-wise lambda did.
y = (merged_df['success_ratio'] > 1).astype(int)

# Features used for prediction: three numeric columns plus every column whose
# name starts with 'genres_'.
feature_cols = ['budget', 'director_avg_revenue', 'cast_avg_revenue'] + \
               list(merged_df.columns[merged_df.columns.str.startswith('genres_')])
X = merged_df[feature_cols]

# --- Split -------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED
)

# Clean BOTH partitions the same way. Previously only the training rows were
# dropped, so the test set could still carry NaNs into predict() and y_test was
# never re-aligned.
X_train, y_train = _drop_non_finite_rows(X_train, y_train)
X_test, y_test = _drop_non_finite_rows(X_test, y_test)

assert len(X_train) == len(y_train), "train features/labels out of sync"
assert len(X_test) == len(y_test), "test features/labels out of sync"

# --- Train -------------------------------------------------------------------
# Seed the forest; without random_state every run yields a different model.
model = RandomForestClassifier(random_state=SEED)
model.fit(X_train, y_train)

# --- Evaluate ----------------------------------------------------------------
y_pred = model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))

# --- Persist -----------------------------------------------------------------
# Store the feature list next to the model so a consumer can rebuild the input
# columns in the same order the model was fitted on.
model_package = {
    'model': model,
    'features': feature_cols
}

with open(MODEL_PATH, 'wb') as f:
    pickle.dump(model_package, f)


Accuracy: 0.9786235662148071
Precision: 0.9708920187793427
Recall: 0.9904214559386973
