In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

# Read the raw movie table (the file is decoded as latin1)
movies_data = pd.read_csv('movies_data.csv', encoding='latin1')

# Derive three extra columns on movies_data:
#   Log_Budget            - log(1 + Budget)
#   Release_Decade        - release year floored to its decade (1997 -> 1990)
#   Running_Time_Category - running time bucketed into (0, 90], (90, 120], (120, inf]
movies_data = movies_data.assign(
    Log_Budget=lambda df: np.log1p(df['Budget']),
    Release_Decade=lambda df: (df['Release year'] // 10) * 10,
    Running_Time_Category=lambda df: pd.cut(
        df['Running time'],
        bins=[0, 90, 120, np.inf],
        labels=['Short', 'Medium', 'Long'],
    ),
)

# Keep only rows where budget, target, running-time bucket and genre are all present
filtered_data = movies_data.dropna(
    subset=['Log_Budget', 'Box Office', 'Running_Time_Category', 'Genre']
)

# Model inputs and target (Log_Earnings is deliberately left out to reduce overfitting)
X = filtered_data[[
    'Log_Budget',
    'IMDb score',
    'Director Box Office %',
    'Actors Box Office %',
    'Release_Decade',
    'Running_Time_Category',
    'Genre',
]]
y = filtered_data['Box Office']

# Preprocessing for numerical and categorical data:
#   'num' - standardize the four numeric columns
#   'cat' - one-hot encode decade, running-time bucket and genre
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Log_Budget', 'IMDb score', 'Director Box Office %', 'Actors Box Office %']),
        # handle_unknown='ignore': the train/test split below is random, so a rare
        # category can end up only in the test rows. With the default ('error') the
        # encoder raises ValueError at predict time; with 'ignore' such a value is
        # encoded as all zeros for that feature instead.
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Release_Decade', 'Running_Time_Category', 'Genre'])
    ]
)

# Random Forest constrained to reduce overfitting: 200 trees, depth capped at 10,
# at least 10 samples to split a node and at least 5 samples per leaf.
model_rf = RandomForestRegressor(random_state=42, n_estimators=200, max_depth=10, min_samples_split=10, min_samples_leaf=5)

# Chain preprocessing and the model so both are fitted and applied together
pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model_rf)
])

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit the pipeline on the training rows, then predict the held-out rows
# (fit returns the fitted pipeline, so the two calls can be chained)
y_pred = pipeline_rf.fit(X_train, y_train).predict(X_test)

# Score the predictions against the true box-office values
mse_rf, r2_rf = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Report the metrics, one per line, followed by a blank line
print(
    "Optimized Random Forest Model Performance Metrics:",
    f"Mean Squared Error (MSE): {mse_rf:.2f}",
    f"R-squared (R2): {r2_rf:.2f}\n",
    sep="\n",
)

# Put actual and predicted box office next to the test features and show 10 rows
results_rf = X_test.assign(**{
    'Actual Box Office': y_test.values,
    'Predicted Box Office': y_pred,
})
print("Random Forest Predictions (first 10 rows):")
print(results_rf.head(10))

# Rank the transformed (post-preprocessing) features by the forest's importance score
feature_names = preprocessor.get_feature_names_out()
importances = pipeline_rf.named_steps['model'].feature_importances_
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
).sort_values(by='Importance', ascending=False)
print("Feature Importance:")
print(importance_df)


Optimized Random Forest Model Performance Metrics:
Mean Squared Error (MSE): 10329533995213360.00
R-squared (R2): 0.67

Random Forest Predictions (first 10 rows):
      Log_Budget  IMDb score  Director Box Office %  Actors Box Office %  \
2354   17.034386         3.5                   0.00                66.67   
495    17.822844         6.5                  42.85                33.33   
3953   16.380460         3.8                  50.00                63.88   
3153   15.846681         7.2                   0.00                33.33   
1178   16.118096         7.3                   0.00                83.33   
2002   15.894952         7.7                  50.00               100.00   
1902   17.766754         7.6                   0.00                25.00   
2589   16.012735         5.4                   0.00                16.67   
2413   17.034386         6.8                   0.00                32.25   
1644   17.281246         7.5                  14.28                25.71   

