In [None]:
# ============================================================
# MOVIE REVENUE PREDICTION - FULL EXPERIMENT PIPELINE
# ============================================================

import pandas as pd
import numpy as np
import joblib
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# ============================================================
# MODELS (FULL LIST)
# ============================================================

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor, AdaBoostRegressor,
    GradientBoostingRegressor, BaggingRegressor
)
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor



# ============================================================
# ADJUSTED R² FUNCTION
# ============================================================

def adjusted_r2(r2, n, p):
    """Return the adjusted R^2 for a fit scored on ``n`` samples with ``p`` predictors.

    Computes ``1 - (1 - r2) * (n - 1) / (n - p - 1)``.

    Args:
        r2: Ordinary coefficient of determination.
        n: Number of samples the score was computed on.
        p: Number of predictor columns.

    Returns:
        The adjusted R^2, or ``nan`` when ``n - p - 1 <= 0``. Without positive
        residual degrees of freedom the correction is undefined: zero would
        divide by zero, and a negative value flips the sign of the penalty so
        that worse fits receive higher scores.
    """
    dof = n - p - 1
    if dof <= 0:
        return float("nan")
    return 1 - (1 - r2) * (n - 1) / dof


# ============================================================
# MODEL LIST
# ============================================================

# Seed for every estimator with a stochastic component, so repeated runs
# produce the same ranking (the train/test splits are already seeded with 42).
_MODEL_SEED = 42

# Display name -> unfitted estimator. Insertion order is the evaluation order.
models = {
    # Linear family
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),

    # Trees and scikit-learn ensembles
    "Decision Tree": DecisionTreeRegressor(random_state=_MODEL_SEED),
    "Random Forest": RandomForestRegressor(random_state=_MODEL_SEED),
    "Ada Boost": AdaBoostRegressor(random_state=_MODEL_SEED),
    "Gradient Boost": GradientBoostingRegressor(random_state=_MODEL_SEED),

    # Third-party gradient boosting
    "LGBM": LGBMRegressor(random_state=_MODEL_SEED),
    "XGBoost": XGBRegressor(random_state=_MODEL_SEED),

    # Kernel / instance-based / bagging
    "SVM": SVR(),
    "KNN": KNeighborsRegressor(),
    "Bagging Regressor": BaggingRegressor(random_state=_MODEL_SEED),
}


# ============================================================
# EVALUATION FUNCTION (ALL MODELS)
# ============================================================

def evaluate_all_models(stage_name, X_train, X_test, y_train, y_test):
    """Fit every estimator in ``models`` and rank them on the held-out split.

    Each estimator is wrapped in a ``StandardScaler`` -> model pipeline, fitted
    on the training split and scored on both splits.

    Args:
        stage_name: Label printed in the stage banner.
        X_train: Training features; its column count is used as the predictor
            count for adjusted R^2.
        X_test: Held-out features.
        y_train: Training targets.
        y_test: Held-out targets.

    Returns:
        DataFrame with columns ``Model``, ``Train Adj R2``, ``Test Adj R2``,
        ``RMSE`` (test split) and ``Overfit Gap`` (absolute difference between
        train and test adjusted R^2), sorted by highest test score with ties
        broken by the smaller gap.
    """
    # Local import: keeps this function usable without touching the
    # module-level import block.
    from sklearn.base import clone

    print("\n\n==============================")
    print(f"📌 STAGE: {stage_name}")
    print("==============================")

    results = []
    p = X_train.shape[1]

    for name, model in models.items():

        pipe = Pipeline([
            ("scaler", StandardScaler()),
            # Pipeline does not copy its steps, so fit a clone; otherwise the
            # shared instance in `models` is refit in place on every stage.
            ("model", clone(model)),
        ])

        pipe.fit(X_train, y_train)

        train_pred = pipe.predict(X_train)
        test_pred = pipe.predict(X_test)

        train_r2 = r2_score(y_train, train_pred)
        test_r2 = r2_score(y_test, test_pred)

        train_adj = adjusted_r2(train_r2, len(y_train), p)
        test_adj = adjusted_r2(test_r2, len(y_test), p)

        test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))

        # Distance between train and test scores, used as an overfitting signal.
        gap = abs(train_adj - test_adj)

        results.append([name, train_adj, test_adj, test_rmse, gap])

    df = pd.DataFrame(results, columns=[
        "Model", "Train Adj R2", "Test Adj R2", "RMSE", "Overfit Gap"
    ])

    # Best test score first; among equal scores prefer the smaller gap.
    df = df.sort_values(by=["Test Adj R2", "Overfit Gap"], ascending=[False, True])

    print("\nTop 5 Models:")
    print(df.head(5))

    return df


# ============================================================
# STEP 1: LOAD DATA
# ============================================================

# Read the raw dataset, then separate the revenue target from the predictors.
data = pd.read_csv("bollywood_movie_data.csv")

y = data["Revenue(INR)"]
X = data.drop(columns="Revenue(INR)")

print("\nDataset Loaded:", data.shape)


# ============================================================
# STAGE 1: BASELINE (OneHot Encoding)
# ============================================================

# One-hot encode the predictors, dropping the first level of each encoded column.
X_base = pd.get_dummies(X, drop_first=True)

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
baseline_split = train_test_split(X_base, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = baseline_split

baseline_results = evaluate_all_models(
    "Baseline (OneHot Encoding Only)", *baseline_split
)


# ============================================================
# STAGE 2: FREQUENCY ENCODING HIGH CARDINALITY
# ============================================================

X_freq = X.copy()

high_card_cols = [
    "Movie Name", "Lead Star", "Director",
    "Music Director", "New Actor",
    "New Director", "New Music Director"
]

# Identify the training rows BEFORE computing any frequencies. The shuffle in
# train_test_split depends only on the number of rows and random_state, so
# these positions are the same rows that land in X_train in the split below.
train_pos, _ = train_test_split(
    np.arange(len(X_freq)), test_size=0.2, random_state=42
)

# Column -> {category: count} maps, kept so the identical encoding can be
# re-applied to new data.
freq_maps = {}

for col in high_card_cols:
    if col in X_freq.columns:
        # Count on training rows only, so held-out rows do not leak into the
        # feature they are later evaluated with.
        freq_map = X_freq[col].iloc[train_pos].value_counts().to_dict()
        freq_maps[col] = freq_map
        # Categories absent from the training rows (and missing values) get 0.
        X_freq[col] = X_freq[col].map(freq_map).fillna(0)

# One-hot encode whatever non-numeric columns remain.
X_freq = pd.get_dummies(X_freq, drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(
    X_freq, y, test_size=0.2, random_state=42
)

freq_results = evaluate_all_models(
    "After Frequency Encoding",
    X_train, X_test, y_train, y_test
)


# ============================================================
# STAGE 3: LOG TARGET TRANSFORMATION
# ============================================================

# Model log(1 + revenue) instead of raw revenue; the features are unchanged
# from the frequency-encoding stage.
y_log = np.log1p(y)

# Same test fraction and seed as the earlier stages.
log_split = train_test_split(X_freq, y_log, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = log_split

log_results = evaluate_all_models("After Log Target Transform", *log_split)


# ============================================================
# STAGE 4: HYPERPARAMETER TUNING (Best Model Only)
# ============================================================

print("\n\n==============================")
print("📌 STAGE: Hyperparameter Tuning")
print("==============================")

# log_results is sorted best-first, so row 0 is the top-ranked model.
# NOTE(review): that ranking is based on test-split scores, so test metrics
# reported afterwards for the selected model are optimistic.
best_model_name = log_results.iloc[0]["Model"]
print("Best Model Before Tuning:", best_model_name)

best_model = models[best_model_name]

# Search grids, keyed by model name. Parameter names carry the "model__"
# prefix because the estimator is the "model" step of the pipeline below.
# Only RandomForest / XGB / LGBM have a grid; any other model is fitted as-is.
_boosting_grid = {
    "model__n_estimators": [100, 200],
    "model__learning_rate": [0.05, 0.1],
}
_param_grids = {
    "Random Forest": {
        "model__n_estimators": [100, 200],
        "model__max_depth": [5, 10, None],
    },
    "XGBoost": _boosting_grid,
    "LGBM": _boosting_grid,
}
param_grid = _param_grids.get(best_model_name, {})

pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("model", best_model)
])

if param_grid:
    # 3-fold cross-validated search on the training split only.
    grid = GridSearchCV(pipe, param_grid, cv=3, scoring="r2", n_jobs=-1)
    grid.fit(X_train, y_train)

    final_pipeline = grid.best_estimator_
    print("Best Params:", grid.best_params_)

else:
    # Say so explicitly instead of skipping the tuning step silently.
    print(
        f"No tuning grid defined for {best_model_name}; "
        "fitting with current hyperparameters."
    )
    final_pipeline = pipe.fit(X_train, y_train)


# ============================================================
# FINAL EVALUATION
# ============================================================

test_pred = final_pipeline.predict(X_test)

# y_test comes from the Stage 3 split of y_log, so both the targets and the
# predictions are log1p(revenue); these two metrics are on that scale.
final_rmse = np.sqrt(mean_squared_error(y_test, test_pred))
final_r2 = r2_score(y_test, test_pred)

# Undo the log1p transform so the error is expressed in INR and is comparable
# with the RMSE values reported by the stages that modelled raw revenue.
final_rmse_inr = np.sqrt(
    mean_squared_error(np.expm1(y_test), np.expm1(test_pred))
)

print("\n✅ FINAL MODEL PERFORMANCE")
print("Final RMSE (log1p scale):", final_rmse)
print("Final R² (log1p scale):", final_r2)
print("Final RMSE (INR):", final_rmse_inr)


# ============================================================
# SAVE FINAL MODEL + COLUMNS
# ============================================================

# Persist the fitted pipeline together with the feature column order it was
# trained on. The pipeline was fitted on log1p(revenue), so callers must apply
# np.expm1 to its predictions to get revenue in INR.
joblib.dump(final_pipeline, "best_movie_model.pkl")
joblib.dump(X_freq.columns.tolist(), "training_columns.pkl")

print("\n🎉 FINAL MODEL SAVED!")
print("Files:")
print("✅ best_movie_model.pkl")
print("✅ training_columns.pkl")


ModuleNotFoundError: No module named 'catboost'