In [None]:
# train_models.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
from scripts.preprocess import preprocess_movies_df
from datetime import datetime
import warnings
import optuna

# Opt in to pandas' future behaviour for downcasting (the
# 'future.no_silent_downcasting' option), which is why the fillna calls in
# this file are followed by an explicit infer_objects(copy=False).
pd.set_option('future.no_silent_downcasting', True)
# Hide FutureWarning messages for the whole process.
warnings.filterwarnings('ignore', category=FutureWarning)

# Module-level placeholders. basic_clean declares both names ``global`` but
# nothing in the visible code assigns or reads them.
vectorizer = None
svd = None


def objective(trial, X, y):
    """Score one Optuna trial by the mean 5-fold CV RMSE of an XGBoost regressor.

    Args:
        trial: Optuna trial used to sample hyperparameters and to report the
            running mean RMSE after every fold (enabling pruning).
        X: Feature DataFrame; rows are selected positionally with ``.iloc``.
        y: Target Series aligned with ``X``.

    Returns:
        Mean RMSE over the five validation folds, as a built-in ``float``.

    Raises:
        optuna.TrialPruned: When the study's pruner asks to stop the trial
            after any fold.
    """
    # Hyperparameters sampled by Optuna (wide search space).
    tuned = {
        "n_estimators": trial.suggest_int("n_estimators", 800, 2000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.15, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 3.0, log=True),
    }
    # Settings held constant for every trial.
    constant = dict(
        tree_method="hist",
        objective="reg:squarederror",
        n_jobs=-1,
        random_state=42,
    )
    xgb_kwargs = {**tuned, **constant}

    # Cast once; slicing the cast frame yields the same float32 folds.
    features = X.astype(np.float32)
    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_rmses = []

    for fold_no, (fit_rows, held_rows) in enumerate(splitter.split(features), start=1):
        X_fit = features.iloc[fit_rows]
        X_held = features.iloc[held_rows]
        y_fit = y.iloc[fit_rows]
        y_held = y.iloc[held_rows]

        regressor = XGBRegressor(**xgb_kwargs)
        regressor.fit(X_fit, y_fit, eval_set=[(X_held, y_held)], verbose=False)

        held_pred = regressor.predict(X_held)
        fold_rmses.append(np.sqrt(mean_squared_error(y_held, held_pred)))

        # Report the running mean so the pruner can compare partial trials.
        trial.report(np.mean(fold_rmses), step=fold_no)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return float(np.mean(fold_rmses))


def run_optuna_two_phase(X, y):
    """Tune an XGBoost regressor with a broad search followed by a local refinement.

    Phase 1 runs 80 trials of the module-level ``objective`` over its wide
    search space. Phase 2 runs 100 trials whose ranges are narrowed around
    the phase-1 optimum. Both phases minimise the mean 5-fold CV RMSE and use
    a median pruner.

    Args:
        X: Feature DataFrame.
        y: Target Series aligned with ``X``.

    Returns:
        An ``XGBRegressor`` fitted on all of ``X``/``y`` using the best
        phase-2 hyperparameters plus the fixed (non-tuned) settings.
    """
    # Settings that are not tuned. ``study.best_params`` only contains the
    # *suggested* values, so these must be merged back in when the final
    # model is built; otherwise the returned model would be trained with a
    # different configuration (unseeded, default tree method) than the one
    # that was cross-validated.
    fixed_params = {
        "tree_method": "hist",
        "objective": "reg:squarederror",
        "n_jobs": -1,
        "random_state": 42,
    }

    print("\n🔎 Fase 1: Busca ampla (80 trials)")
    study1 = optuna.create_study(direction="minimize", pruner=optuna.pruners.MedianPruner())
    study1.optimize(lambda t: objective(t, X, y), n_trials=80)

    best_params = study1.best_params
    print("\n🏆 Melhores parâmetros Fase 1:")
    for k, v in best_params.items():
        print(f"   {k}: {v}")
    print(f"✅ Melhor RMSE (CV 5-fold): {study1.best_value:.4f}")

    # Cast once; reused by every phase-2 fold and by the final fit.
    X32 = X.astype(np.float32)

    def objective_refine(trial):
        """Phase-2 objective: same CV protocol, ranges centred on ``best_params``."""
        params = {
            "n_estimators": trial.suggest_int(
                "n_estimators", max(500, best_params["n_estimators"] - 200), best_params["n_estimators"] + 200
            ),
            "learning_rate": trial.suggest_float(
                "learning_rate", max(0.005, best_params["learning_rate"] * 0.7),
                best_params["learning_rate"] * 1.3, log=True
            ),
            "max_depth": trial.suggest_int(
                "max_depth", max(2, best_params["max_depth"] - 2), min(12, best_params["max_depth"] + 2)
            ),
            "min_child_weight": trial.suggest_int(
                "min_child_weight", max(1, best_params["min_child_weight"] - 2), best_params["min_child_weight"] + 2
            ),
            "subsample": trial.suggest_float(
                "subsample", max(0.5, best_params["subsample"] - 0.1), min(1.0, best_params["subsample"] + 0.1)
            ),
            "colsample_bytree": trial.suggest_float(
                "colsample_bytree", max(0.5, best_params["colsample_bytree"] - 0.1), min(1.0, best_params["colsample_bytree"] + 0.1)
            ),
            "reg_lambda": trial.suggest_float(
                "reg_lambda", max(1e-3, best_params["reg_lambda"] * 0.5), best_params["reg_lambda"] * 1.5, log=True
            ),
            "reg_alpha": trial.suggest_float(
                "reg_alpha", max(1e-3, best_params["reg_alpha"] * 0.5), best_params["reg_alpha"] * 1.5, log=True
            ),
            **fixed_params,
        }

        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        rmses = []

        for train_idx, valid_idx in kf.split(X32):
            X_tr, X_va = X32.iloc[train_idx], X32.iloc[valid_idx]
            y_tr, y_va = y.iloc[train_idx], y.iloc[valid_idx]

            model = XGBRegressor(**params)
            model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose=False)
            pred = model.predict(X_va)
            rmses.append(np.sqrt(mean_squared_error(y_va, pred)))

            # Report the running mean so the pruner can stop weak trials early.
            trial.report(np.mean(rmses), step=len(rmses))
            if trial.should_prune():
                raise optuna.TrialPruned()

        return float(np.mean(rmses))

    print("\n🔎 Fase 2: Refinamento (100 trials)")
    study2 = optuna.create_study(direction="minimize", pruner=optuna.pruners.MedianPruner())
    # Evaluate the phase-1 optimum first (it lies inside every refined range
    # by construction) so phase 2 can never end up worse than phase 1.
    study2.enqueue_trial(best_params)
    study2.optimize(objective_refine, n_trials=100)

    best_params_final = study2.best_params
    print("\n🏆 Melhores parâmetros finais (Fase 2):")
    for k, v in best_params_final.items():
        print(f"   {k}: {v}")
    print(f"✅ Melhor RMSE (CV 5-fold): {study2.best_value:.4f}")

    # Rebuild the exact configuration that was cross-validated.
    best_model = XGBRegressor(**best_params_final, **fixed_params)
    best_model.fit(X32, y)
    return best_model

def basic_clean(df: pd.DataFrame) -> pd.DataFrame:
    """Clean a movies DataFrame and derive the numeric features used for modelling.

    Works on a copy of ``df``. Steps, in order: strip column names, drop the
    ``id``/``Series_Title`` columns, drop every row containing a missing
    value, run the project helper ``preprocess_movies_df`` (its behaviour is
    not visible in this file), derive year/votes/gross/meta-score/runtime/
    genre/people features, drop the raw text columns, coerce remaining
    object columns to numeric, drop rows without ``IMDB_Rating`` (when that
    column exists) and fill any remaining NaN with 0.

    Each feature group is only computed when its source column is present.

    Args:
        df: Raw movies frame; it is not modified.

    Returns:
        The cleaned frame with engineered features.
    """
    # Declared global but never assigned or read inside this function.
    global vectorizer, svd
    COLS_TO_DROP = ["id", "Series_Title"]

    df = df.copy()
    df.columns = [c.strip() for c in df.columns]

    for col in COLS_TO_DROP:
        if col in df.columns:
            df.drop(columns=col, inplace=True)

    # NOTE(review): dropna() removes every row with any missing value, so the
    # fillna/median imputations below only matter if preprocess_movies_df
    # reintroduces NaN or coercion to numeric fails - confirm this is intended.
    df = df.dropna()
    df = preprocess_movies_df(df)

    if 'Released_Year' in df.columns:
        # Age is relative to the year the code runs, so these features shift
        # over time.
        current_year = datetime.now().year
        df['Released_Year'] = df['Released_Year'].fillna(df['Released_Year'].median()).infer_objects(copy=False)
        df['Movie_Age'] = current_year - df['Released_Year']
        df['Is_Recent'] = (df['Movie_Age'] <= 5).astype(int)
        df['Is_Classic'] = (df['Movie_Age'] >= 30).astype(int)

    if 'No_of_Votes' in df.columns:
        df['Log_Votes'] = np.log1p(df['No_of_Votes'].fillna(0))
        # Threshold is the 75th percentile of the frame passed to this call.
        df['High_Votes'] = (df['No_of_Votes'] > df['No_of_Votes'].quantile(0.75)).astype(int)

    if 'Gross' in df.columns:
        filled_gross = pd.to_numeric(df['Gross'], errors="coerce").fillna(0)
        df['Log_Gross'] = np.log1p(filled_gross)
        # Computed from the column before numeric coercion, so a non-null but
        # unparseable value still counts as "has gross".
        df['Has_Gross'] = df['Gross'].notna().astype(int)
        df['Gross'] = filled_gross

    if 'Meta_score' in df.columns:
        # Presence flag is taken before imputation.
        df['Has_Meta_Score'] = df['Meta_score'].notna().astype(int)
        filled_meta_score = pd.to_numeric(df['Meta_score'], errors="coerce")
        meta_median = filled_meta_score.median()
        df['Meta_score'] = filled_meta_score.fillna(meta_median)

    if 'Runtime' in df.columns:
        # NOTE(review): assumes Runtime is already numeric-like here (e.g.
        # converted by preprocess_movies_df); a raw "142 min" string would be
        # coerced to NaN - confirm.
        filled_runtime = pd.to_numeric(df['Runtime'], errors="coerce")
        runtime_median = filled_runtime.median()
        df['Runtime_filled'] = filled_runtime.fillna(runtime_median)
        df['Is_Long_Movie'] = (df['Runtime_filled'] > 120).astype(int)
        df['Is_Short_Movie'] = (df['Runtime_filled'] < 90).astype(int)
        df['Runtime'] = filled_runtime.fillna(runtime_median)

    if "Genre" in df.columns:
        df['Genre'] = df['Genre'].fillna('Unknown')
        # NOTE(review): Main_Genre is text; if it is still object dtype at the
        # coercion loop below it becomes NaN and then 0 - confirm whether it
        # is meant to be encoded.
        df["Main_Genre"] = df["Genre"].astype(str).str.split(",").str[0]
        # One-hot encodes the whole Genre value, so each distinct genre
        # string gets its own column.
        genre_dummies = pd.get_dummies(df["Genre"].astype(str), prefix="Genre")
        df = pd.concat([df, genre_dummies], axis=1)

    # Frequency-encode people columns using counts from the frame passed to
    # this call.
    for col in ["Director", "Star1", "Star2", "Star3", "Star4"]:
        if col in df.columns:
            df[col] = df[col].fillna('Unknown')
            freq = df[col].value_counts()
            df[f"{col}_Freq"] = df[col].map(freq).fillna(0)

    if "Released_Year" in df.columns:
        df["Decade"] = (df["Released_Year"] // 10) * 10

    # Raw text columns are removed once their derived features exist.
    for col in ["Certificate", "Overview", "Genre", "Director", "Star1", "Star2", "Star3", "Star4"]:
        if col in df.columns:
            df.drop(columns=col, inplace=True)

    # Any remaining object column (except the target) is forced to numeric;
    # unparseable values become NaN and are zero-filled at the end.
    for col in df.columns:
        if col != 'IMDB_Rating' and df[col].dtype == 'object':
            df[col] = pd.to_numeric(df[col], errors='coerce')

    if 'IMDB_Rating' in df.columns:
        df = df.dropna(subset=['IMDB_Rating'])

    df = df.fillna(0).infer_objects(copy=False)
    return df


def split_X_y(df: pd.DataFrame, target: str = "IMDB_Rating"):
    """Split a DataFrame into a feature matrix and a target vector.

    Args:
        df: Frame holding the target column alongside the feature columns.
            It is not modified.
        target: Name of the target column. Defaults to ``"IMDB_Rating"``.

    Returns:
        A tuple ``(X, y)`` where ``X`` is ``df`` without the target column
        and ``y`` is the target column as a Series.

    Raises:
        KeyError: If ``target`` is not a column of ``df``.
    """
    if target not in df.columns:
        raise KeyError(f"target column {target!r} not found in DataFrame")
    X = df.drop(columns=[target])
    y = df[target]
    return X, y

def _print_regression_metrics(y_true, y_pred):
    """Print RMSE, MAE and R² for one set of predictions."""
    print(f"   RMSE = {np.sqrt(mean_squared_error(y_true, y_pred)):.4f}")
    print(f"   MAE  = {mean_absolute_error(y_true, y_pred):.4f}")
    print(f"   R²   = {r2_score(y_true, y_pred):.4f}")


def _print_top_features(feature_names, importances, top_n=5):
    """Print the ``top_n`` features ranked by descending importance."""
    ranking = (
        pd.DataFrame({'feature': feature_names, 'importance': importances})
        .sort_values('importance', ascending=False)
        .head(top_n)
    )
    for rank, (_, row) in enumerate(ranking.iterrows(), 1):
        print(f"   {rank}. {row['feature']}: {row['importance']:.4f}")


def train_models(csv_path: str, run_optuna: bool = False):
    """Train and evaluate RandomForest and XGBoost rating models from a CSV.

    Reads the CSV, cleans it with ``basic_clean``, holds out 20% of the rows,
    fits a RandomForest and a baseline XGBoost regressor, prints hold-out
    metrics and the top-5 feature importances, and finally prints the
    predictions for a hard-coded sample film.

    Args:
        csv_path: Path of the CSV file to load.
        run_optuna: When True, additionally tune an XGBoost model with
            ``run_optuna_two_phase`` (slow: 180 cross-validated trials) and
            include it in the report. Defaults to False.

    Returns:
        ``(df_clean, rf, xgb, xgb_best)``; ``xgb_best`` is ``None`` when
        ``run_optuna`` is False. Returns ``None`` if no rows survive cleaning.
    """
    df_raw = pd.read_csv(csv_path, sep=",", quotechar='"', encoding="utf-8", low_memory=False)
    # The first column is dropped by position.
    # NOTE(review): presumably a saved index column - confirm against the CSV.
    df_raw = df_raw.drop(df_raw.columns[0], axis=1)
    df_clean = basic_clean(df_raw)

    if df_clean.shape[0] == 0:
        print("❌ Erro: Nenhum dado restou após o processamento!")
        return None

    X, y = split_X_y(df_clean)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print(f"🔢 Número de amostras: {X.shape[0]}")
    print(f"🎛️ Número de features: {X.shape[1]}")

    # RandomForest
    rf = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)
    rf.fit(X_train, y_train)
    y_pred_rf = rf.predict(X_test)

    # XGBoost baseline
    xgb = XGBRegressor(
        n_estimators=600, learning_rate=0.09, max_depth=6,
        subsample=0.94, colsample_bytree=1, random_state=42,
        reg_lambda=1.0, reg_alpha=0.09, verbosity=0,
        tree_method='hist', n_jobs=-1, objective='reg:squarederror',
    )
    xgb.fit(X_train.astype(np.float32), y_train)
    y_pred_xgb = xgb.predict(X_test.astype(np.float32))

    print("\n" + "="*60)
    print("📊 MÉTRICAS HOLD-OUT (20% Test)")
    print("="*60)

    print("\n🌲 RANDOM FOREST:")
    _print_regression_metrics(y_test, y_pred_rf)

    print("\n⚡ XGBOOST:")
    _print_regression_metrics(y_test, y_pred_xgb)

    # Optuna-tuned model (opt-in). It stays None otherwise, so the code below
    # never references an undefined name.
    xgb_best = None
    if run_optuna:
        # Tune on the training split only; tuning on the full data would leak
        # the hold-out rows into the model evaluated on them.
        xgb_best = run_optuna_two_phase(X_train, y_train)
        y_pred_xgb_best = xgb_best.predict(X_test.astype(np.float32))

        print("\n⚡ XGBOOST (Optuna Tuned):")
        _print_regression_metrics(y_test, y_pred_xgb_best)

    print("\n" + "="*60)
    print("🎯 TOP 5 FEATURES MAIS IMPORTANTES")
    print("="*60)

    print("\n🌲 RANDOM FOREST:")
    _print_top_features(X.columns, rf.feature_importances_)

    print("\n⚡ XGBOOST:")
    _print_top_features(X.columns, xgb.feature_importances_)

    # Test on sample films
    print("\n" + "="*60)
    print("🎬 TESTE: The Shawshank Redemption")
    print("="*60)

    shawshank = {
        'Series_Title': 'The Shawshank Redemption',
        'Released_Year': 1994,
        'Certificate': 'A',
        'Runtime': '142 min',
        'Genre': 'Drama',
        'Overview': 'Two imprisoned men bond over a number of years.',
        'Meta_score': 80.0,
        'Director': 'Frank Darabont',
        'Star1': 'Tim Robbins',
        'Star2': 'Morgan Freeman',
        'Star3': 'Bob Gunton',
        'Star4': 'William Sadler',
        'No_of_Votes': 2343110,
        'Gross': '28341469'
    }
    godfather = {
        "Series_Title": "The Godfather",
        "Released_Year": 1972,
        "Certificate": "A",
        "Runtime": "175 min",
        "Genre": "Crime, Drama",
        "Overview": "An organized crime dynasty's aging patriarch transfers control of his clandestine empire to his reluctant son.",
        "Meta_score": 100.0,
        "Director": "Francis Ford Coppola",
        "Star1": "Marlon Brando",
        "Star2": "Al Pacino",
        "Star3": "James Caan",
        "Star4": "Diane Keaton",
        "No_of_Votes": 1620367,
        "Gross": "134966411"
    }

    # Both films go through the same cleaning, but only the first row
    # (Shawshank) is reported below.
    df_test = pd.DataFrame([shawshank, godfather])
    df_test_clean = basic_clean(df_test)

    if df_test_clean.shape[0] > 0:
        # Align the sample frame with the training feature set: add any
        # column the samples lack as zeros, then enforce the training order.
        training_cols = X.columns
        missing_cols = [col for col in training_cols if col not in df_test_clean.columns]
        if missing_cols:
            df_test_clean = pd.concat([df_test_clean, pd.DataFrame(0, index=df_test_clean.index, columns=missing_cols)], axis=1)
        df_test_clean = df_test_clean[training_cols].astype(np.float32)

        pred_rf = rf.predict(df_test_clean)[0]
        pred_xgb = xgb.predict(df_test_clean)[0]
        pred_xgb_best = xgb_best.predict(df_test_clean)[0] if xgb_best is not None else None

        print("\n🎯 RESULTADOS:")
        print(f"🌲 RandomForest: {pred_rf:.2f}")
        print(f"⚡ XGBoost: {pred_xgb:.2f}")
        if pred_xgb_best is not None:
            print(f"⚡ XGBoost (Optuna): {pred_xgb_best:.2f}")
        print("📊 Rating Real: 9.30")
        print(f"🎯 Erro RF: {abs(pred_rf - 9.3):.2f}")
        print(f"🎯 Erro XGB: {abs(pred_xgb - 9.3):.2f}")
        if pred_xgb_best is not None:
            print(f"🎯 Erro XGB Optuna: {abs(pred_xgb_best - 9.3):.2f}")
    else:
        print("❌ Erro no processamento do exemplo")

    return df_clean, rf, xgb, xgb_best


if __name__ == "__main__":
    # Script entry point: run the full training/evaluation pipeline on the
    # challenge CSV (path is relative to the current working directory).
    train_models("data/raw/desafio_indicium_imdb.csv")

: 