In [None]:
import os
import pickle

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [3]:
# Location of the cleaned training CSV. The default is the original author's
# local path; set the TRAIN_DATA_PATH environment variable to run the notebook
# on another machine without editing this cell.
DATA_PATH = os.environ.get(
    'TRAIN_DATA_PATH',
    r'C:\Users\USER\Desktop\Data_Science\Predicting_Movie_Box_Office_Success\train_cleaned.csv',
)
df = pd.read_csv(DATA_PATH)

In [7]:
# Model inputs, assembled from two groups. Order matters: the combined list is
# what selects (and orders) the columns fed to the model.
numeric_cols = ['budget', 'popularity', 'runtime']
# Presumably one indicator column per genre -- TODO confirm against the CSV.
genre_cols = [
    'Comedy', 'Drama', 'Family',
    'Romance', 'Thriller', 'Action',
    'Animation', 'Adventure', 'Horror',
]
features = numeric_cols + genre_cols

# Column the model is trained to predict.
target = 'revenue'

In [8]:
# Replace missing values in the three numeric inputs with that column's median.
# Columns without any missing value are left untouched.
# NOTE(review): the medians are computed on the full frame, before the
# train/validation split, so validation rows contribute to the fill value.
for name in ('budget', 'popularity', 'runtime'):
    column = df[name]
    if column.isna().any():
        df[name] = column.fillna(column.median())

# Feature matrix and target vector used by the rest of the notebook.
X, y = df[features], df[target]

In [9]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2)

In [10]:
# Prepare LightGBM datasets.
train_data = lgb.Dataset(X_train, label=y_train)
# LightGBM expects validation data to be aligned with the training data;
# passing reference=train_data makes that link explicit instead of relying on
# the training call to set it up.
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

In [11]:
# LightGBM training configuration: regression objective, MAE as the reported
# metric, library logging silenced via verbosity=-1, gradient-boosted decision
# trees, and a fixed seed.
params = dict(
    objective='regression',
    metric='mae',
    verbosity=-1,
    boosting_type='gbdt',
    random_state=42,
)

In [13]:
# Fit the booster: at most 1000 boosting rounds, evaluated on the validation
# set, stopping early once the validation score has gone 50 rounds without
# improving.
model = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=1000,
    valid_sets=[val_data],
    callbacks=[lgb.early_stopping(stopping_rounds=50)],
)


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[34]	valid_0's l1: 3.94668e+07


In [14]:
# Score the validation rows, restricting the booster to the iteration that
# early stopping recorded as best.
y_pred = model.predict(
    X_val,
    num_iteration=model.best_iteration,
)

In [15]:
# Mean absolute error between the validation targets and the predictions,
# printed with two decimals.
# NOTE(review): this is the same split that drove early stopping during
# training, so the figure is likely optimistic for unseen data.
mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)
print(f"Validation MAE: {mae:.2f}")

Validation MAE: 39466779.58


In [16]:
# Persist the trained booster as a pickle in the current working directory.
# NOTE(review): pickle files can execute code on load; whatever loads
# model.pkl should only read it from a trusted location.
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)

In [17]:
# Persist the ordered list of feature names next to the model, for later use
# in the Flask app.
with open('features.pkl', mode='wb') as features_file:
    pickle.dump(features, features_file)