<a href="https://colab.research.google.com/github/DivyaRoopa123/Intern-week--1/blob/main/week_2_mini_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [43]:



import os
import ast
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


import joblib


from matplotlib.backends.backend_pdf import PdfPages


pd.set_option('display.max_columns', 40)


In [44]:


path = "/mnt/data/tmdb_5000_movies.csv"

if os.path.exists(path):
    print("Found dataset at", path)
    df = pd.read_csv(path)
else:
    try:

        from google.colab import files
        print("Open the file picker and upload tmdb_5000_movies.csv")
        uploaded = files.upload()

        filename = list(uploaded.keys())[0]
        df = pd.read_csv(filename)
    except Exception as exc:
        raise RuntimeError("Could not load dataset. Please ensure tmdb_5000_movies.csv is uploaded.") from exc

print("Dataset loaded. Shape:", df.shape)
df.head()


Open the file picker and upload tmdb_5000_movies.csv


Saving tmdb_5000_movies.csv to tmdb_5000_movies (11).csv
Dataset loaded. Shape: (4803, 20)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [None]:
Introduction — dataset, prediction goal (e.g., predict movie revenue).

Data — summary of columns,

Preprocessing — steps you applied: fill/convert, feature engineering, why you removed zero revenues (if done).

Modeling — models tried, final chosen model, and brief rationale.

Evaluation — metrics on test set (include RMSE, MAE, R2 both log and original scale), confusion for classification if applicable. Include plots: actual vs predicted, feature importances.

In [45]:

from matplotlib.backends.backend_pdf import PdfPages


def metrics_dict(y_true, y_pred):
    res = {}
    res['rmse_log'] = mean_squared_error(y_true, y_pred, squared=False)
    res['mae_log'] = mean_absolute_error(y_true, y_pred)
    res['r2_log'] = r2_score(y_true, y_pred)
    y_true_o = np.expm1(y_true); y_pred_o = np.expm1(y_pred)
    res['rmse_orig'] = mean_squared_error(y_true_o, y_pred_o, squared=False)
    res['mae_orig'] = mean_absolute_error(y_true_o, y_pred_o)
    return res

m_lr = metrics_dict(y_test, y_pred_lr)
m_rfr = metrics_dict(y_test, y_pred_rfr)


pdf_path = "model_report_tmdb.pdf"
with PdfPages(pdf_path) as pdf:

    fig = plt.figure(figsize=(8.27, 11.69))  # A4 portrait in inches
    fig.text(0.1, 0.92, "TMDB Revenue Prediction - Short Report", fontsize=18, weight='bold')
    fig.text(0.1, 0.86, f"Rows (after filtering revenue>0): {len(df_clean)}", fontsize=11)
    fig.text(0.1, 0.82, f"Features used: {', '.join(features)}", fontsize=11)
    fig.text(0.1, 0.78, "Preprocessing summary:", fontsize=12, weight='bold')
    txt = (
        "- Parsed release_date; extracted release_year\n"
        "- Filled missing numeric values with median\n"
        "- Created genre_count and encoded original_language\n"
        "- Log-transformed target (log1p) to reduce skew\n"
    )
    fig.text(0.1, 0.68, txt, fontsize=10)
    fig.text(0.1, 0.56, "Modeling: Linear Regression (baseline) and Random Forest Regressor (primary).", fontsize=11)
    pdf.savefig(fig); plt.close(fig)


    fig = plt.figure(figsize=(8.27, 11.69))
    fig.text(0.1, 0.92, "Evaluation Metrics", fontsize=16, weight='bold')
    fig.text(0.1, 0.86, "Linear Regression (log1p target):", fontsize=12, weight='bold')
    fig.text(0.1, 0.82, f"RMSE (log): {m_lr['rmse_log']:.4f}, MAE (log): {m_lr['mae_log']:.4f}, R2: {m_lr['r2_log']:.4f}")
    fig.text(0.1, 0.78, f"RMSE (orig): {m_lr['rmse_orig']:,.2f}, MAE (orig): {m_lr['mae_orig']:,.2f}")
    fig.text(0.1, 0.72, "Random Forest (log1p target):", fontsize=12, weight='bold')
    fig.text(0.1, 0.68, f"RMSE (log): {m_rfr['rmse_log']:.4f}, MAE (log): {m_rfr['mae_log']:.4f}, R2: {m_rfr['r2_log']:.4f}")
    fig.text(0.1, 0.64, f"RMSE (orig): {m_rfr['rmse_orig']:,.2f}, MAE (orig): {m_rfr['mae_orig']:,.2f}")

    if hasattr(rfr, 'feature_importances_'):
        fi_text = "\n".join([f"{i+1}. {feat}: {imp:.4f}" for i,(feat,imp) in enumerate(zip(features, rfr.feature_importances_))])
        fig.text(0.1, 0.56, "Feature importances (Random Forest):", fontsize=12, weight='bold')
        fig.text(0.1, 0.40, fi_text, fontsize=10)
    pdf.savefig(fig); plt.close(fig)



    if hasattr(rfr, 'feature_importances_'):
        fig, ax = plt.subplots(figsize=(8,6))
        fi_series = pd.Series(rfr.feature_importances_, index=features).sort_values(ascending=False)
        fi_series.plot(kind='bar', ax=ax)
        ax.set_title("Feature importances (Random Forest)")
        ax.set_ylabel("Importance")
        plt.tight_layout()
        pdf.savefig(fig); plt.close(fig)


    fig, ax = plt.subplots(figsize=(8,6))
    ax.scatter(y_test_orig, y_pred_rfr_orig, alpha=0.4)
    ax.plot([0, y_test_orig.max()], [0, y_test_orig.max()], linestyle='--')
    ax.set_xlabel("Actual revenue")
    ax.set_ylabel("Predicted revenue")
    ax.set_title("Actual vs Predicted revenue (Random Forest)")
    plt.tight_layout()
    pdf.savefig(fig); plt.close(fig)

print(f"PDF report saved to: {pdf_path}")


TypeError: got an unexpected keyword argument 'squared'

In [None]:
evaluate_reg(y_test, y_pred_lr, label="Linear Regression")
evaluate_reg(y_test, y_pred_rfr, label="Random Forest")


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

def evaluate_reg(y_true, y_pred, label="Model"):
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f"\n{label} (log1p target) -> RMSE: {rmse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")


    y_true_orig = np.expm1(y_true)
    y_pred_orig = np.expm1(y_pred)

    rmse_orig = mean_squared_error(y_true_orig, y_pred_orig, squared=False)
    mae_orig = mean_absolute_error(y_true_orig, y_pred_orig)
    r2_orig = r2_score(y_true_orig, y_pred_orig)

    print(f"{label} (original revenue) -> RMSE: {rmse_orig:,.2f}, MAE: {mae_orig:,.2f}, R2: {r2_orig:.4f}")


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor


lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)


rfr = RandomForestRegressor(n_estimators=200, random_state=42)
rfr.fit(X_train, y_train)
y_pred_rfr = rfr.predict(X_test)

print("Models trained successfully.")


In [None]:


import os, ast
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

print("Starting recovery + evaluation...")


def evaluate_reg(y_true, y_pred, label="Model"):
    if y_true is None or y_pred is None:
        raise ValueError(f"{label}: y_true or y_pred is None.")
    if len(y_true) != len(y_pred):
        raise ValueError(f"{label}: length mismatch: len(y_true)={len(y_true)} vs len(y_pred)={len(y_pred)}")
    rmse_log = mean_squared_error(y_true, y_pred, squared=False)
    mae_log = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"\n{label} (log1p target) -> RMSE: {rmse_log:.4f}, MAE: {mae_log:.4f}, R2: {r2:.4f}")

    y_true_orig = np.expm1(np.array(y_true))
    y_pred_orig = np.expm1(np.array(y_pred))
    y_true_orig = np.nan_to_num(y_true_orig, nan=0.0, posinf=0.0, neginf=0.0)
    y_pred_orig = np.nan_to_num(y_pred_orig, nan=0.0, posinf=0.0, neginf=0.0)
    rmse_orig = mean_squared_error(y_true_orig, y_pred_orig, squared=False)
    mae_orig = mean_absolute_error(y_true_orig, y_pred_orig)
    print(f"{label} (original revenue) -> RMSE: {rmse_orig:,.2f}, MAE: {mae_orig:,.2f}")


if 'df_clean' not in globals():
    print("df_clean not found in workspace — attempting to rebuild from 'df' (raw dataframe).")
    if 'df' not in globals():
        raise NameError("Neither 'df_clean' nor raw 'df' exist. Please run the upload cell (or re-upload the CSV).")

    df_raw = df.copy()
    df_raw['release_date'] = pd.to_datetime(df_raw.get('release_date', None), errors='coerce')
    df_raw['release_year'] = df_raw['release_date'].dt.year
    if df_raw['release_year'].notna().sum() > 0:
        df_raw['release_year'] = df_raw['release_year'].fillna(int(df_raw['release_year'].median()))
    else:
        df_raw['release_year'] = df_raw['release_year'].fillna(0).astype(int)
    if 'runtime' in df_raw.columns:
        df_raw['runtime'] = pd.to_numeric(df_raw['runtime'], errors='coerce').fillna(df_raw['runtime'].median())
    for c in ['budget','popularity','vote_average','vote_count']:
        if c in df_raw.columns:
            df_raw[c] = pd.to_numeric(df_raw[c], errors='coerce').fillna(df_raw[c].median())
    def count_genres(x):
        if pd.isna(x): return 0
        try:
            parsed = ast.literal_eval(x) if isinstance(x, str) else x
            if isinstance(parsed, list): return len(parsed)
        except Exception:
            s = str(x)
            return s.count("'name'") + s.count('"name"')
        return 0
    if 'genres' in df_raw.columns:
        df_raw['genre_count'] = df_raw['genres'].apply(count_genres)
    else:
        df_raw['genre_count'] = 0
    if 'original_language' in df_raw.columns:
        le_temp = LabelEncoder()
        df_raw['original_language'] = df_raw['original_language'].fillna('unknown')
        df_raw['orig_lang_enc'] = le_temp.fit_transform(df_raw['original_language'])
    else:
        df_raw['orig_lang_enc'] = 0
    df_raw['revenue'] = pd.to_numeric(df_raw.get('revenue', 0), errors='coerce').fillna(0)
    initial_rows = len(df_raw)
    df_clean = df_raw[df_raw['revenue'] > 0].copy()
    print(f"Rebuilt df_clean from df. Dropped {initial_rows - len(df_clean)} rows with revenue <=0. Remaining: {len(df_clean)}")
else:
    print("df_clean already present. Using existing df_clean.")


default_features = [f for f in ['budget','popularity','runtime','vote_average','vote_count','release_year','genre_count','orig_lang_enc'] if f in df_clean.columns]
if 'features' not in globals() or not isinstance(features, (list,tuple)) or len(features)==0:
    features = default_features
    print("Set 'features' to:", features)
else:

    features = [f for f in features if f in df_clean.columns]
    if len(features)==0:
        features = default_features
    print("Using features:", features)

if len(features) == 0:
    raise ValueError("No valid features found. Check your dataframe columns.")


if 'X_train' not in globals() or 'X_test' not in globals() or 'y_train' not in globals() or 'y_test' not in globals():
    print("Train/test split missing — creating new split.")
    X = df_clean[features].copy()
    y = np.log1p(df_clean['revenue'].astype(float))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
    print("Created X_train/X_test shapes:", X_train.shape, X_test.shape)
else:
    print("Train/test variables already present; using them.")

if 'scaler' not in globals():
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    print("Fitted new StandardScaler.")
else:

    try:
        X_train_scaled = scaler.transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        print("Used existing scaler to transform features.")
    except Exception:
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        print("Refit scaler due to transform error.")


need_train = False
if 'y_pred_lr' not in globals() or 'y_pred_rfr' not in globals():
    need_train = True
if need_train:
    print("Predictions missing — training quick baseline models now (this may take a short while).")

    lr = LinearRegression()
    lr.fit(X_train_scaled, y_train)
    y_pred_lr = lr.predict(X_test_scaled)

    rfr = RandomForestRegressor(n_estimators=100, max_depth=20, random_state=42, n_jobs=-1)
    rfr.fit(X_train_scaled, y_train)
    y_pred_rfr = rfr.predict(X_test_scaled)
    print("Training complete. Predictions created: y_pred_lr, y_pred_rfr.")
else:
    print("Predictions found in workspace; using them.")
try:
    evaluate_reg(y_test, y_pred_lr, label="Linear Regression")
    evaluate_reg(y_test, y_pred_rfr, label="Random Forest")
    print("\nEvaluation finished successfully.")
except Exception as e:
    print("Evaluation error encountered:", type(e).__name__, str(e))

    print("Diagnostics:")
    for name in ['y_test','y_pred_lr','y_pred_rfr','X_train','X_test']:
        print(f" - {name} in globals():", name in globals())

    try:
        print("Shapes -> y_test:", getattr(y_test, 'shape', None),
              "y_pred_lr:", getattr(y_pred_lr, 'shape', None),
              "y_pred_rfr:", getattr(y_pred_rfr, 'shape', None))
    except Exception:
        pass
    raise


In [None]:
# --- Step 7: Robust evaluation function and calls ---
# Make sure required imports are present
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

def evaluate_reg(y_true, y_pred, label="Model"):
    """
    Evaluate regression predictions.
    Expects y_true and y_pred to be on the same scale (here: log1p(revenue)).
    Prints RMSE/MAE/R2 on the log scale and RMSE/MAE on the original revenue scale.
    """
    # Basic checks
    if y_true is None or y_pred is None:
        raise ValueError(f"{label}: y_true or y_pred is None.")
    if len(y_true) != len(y_pred):
        raise ValueError(f"{label}: length mismatch: len(y_true)={len(y_true)} vs len(y_pred)={len(y_pred)}")

    # Metrics on transformed (log1p) target
    rmse_log = mean_squared_error(y_true, y_pred, squared=False)
    mae_log = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"\n{label} (log1p target) -> RMSE: {rmse_log:.4f}, MAE: {mae_log:.4f}, R2: {r2:.4f}")

    # Convert back to original revenue scale for intuitive interpretation
    y_true_orig = np.expm1(np.array(y_true))
    y_pred_orig = np.expm1(np.array(y_pred))

    # If any negative/NaN values after expm1 (shouldn't happen), clip to 0
    y_true_orig = np.nan_to_num(y_true_orig, nan=0.0, posinf=0.0, neginf=0.0)
    y_pred_orig = np.nan_to_num(y_pred_orig, nan=0.0, posinf=0.0, neginf=0.0)

    rmse_orig = mean_squared_error(y_true_orig, y_pred_orig, squared=False)
    mae_orig = mean_absolute_error(y_true_orig, y_pred_orig)
    print(f"{label} (original revenue) -> RMSE: {rmse_orig:,.2f}, MAE: {mae_orig:,.2f}")



_missing = []
for varname in ['y_test', 'y_pred_lr', 'y_pred_rfr']:
    if varname not in globals() and varname not in locals():
        _missing.append(varname)
if _missing:
    raise NameError(f"The following required variables are missing: {_missing}. "
                    "Make sure you have run the training cells and they produced these variables "
                    "(y_test, y_pred_lr, y_pred_rfr).")

# Now call evaluate_reg for both models
evaluate_reg(y_test, y_pred_lr, label="Linear Regression")
evaluate_reg(y_test, y_pred_rfr, label="Random Forest")


In [None]:
def evaluate_reg(y_true, y_pred, label="Model"):
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"\n{label} (log1p target) -> RMSE: {rmse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")

    y_true_orig = np.expm1(y_true)
    y_pred_orig = np.expm1(y_pred)
    rmse_orig = mean_squared_error(y_true_orig, y_pred_orig, squared=False)
    mae_orig = mean_absolute_error(y_true_orig, y_pred_orig)
    print(f"{label} (original revenue) -> RMSE: {rmse_orig:,.2f}, MAE: {mae_orig:,.2f}")


evaluate_reg(y_test, y_pred_lr, label="Linear Regression")
evaluate_reg(y_test, y_pred_rfr, label="Random Forest")


In [None]:

lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)


rfr = RandomForestRegressor(n_estimators=100, max_depth=20, random_state=42, n_jobs=-1)
rfr.fit(X_train_scaled, y_train)
y_pred_rfr = rfr.predict(X_test_scaled)

print("Models trained.")


In [None]:
X = df_clean[features]
y = df_clean['log_revenue']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:

df_clean = df.copy()


df_clean['release_date'] = pd.to_datetime(df_clean.get('release_date', None), errors='coerce')
df_clean['release_year'] = df_clean['release_date'].dt.year

if df_clean['release_year'].notna().sum() > 0:
    df_clean['release_year'] = df_clean['release_year'].fillna(int(df_clean['release_year'].median()))
else:
    df_clean['release_year'] = df_clean['release_year'].fillna(0).astype(int)


if 'runtime' in df_clean.columns:
    df_clean['runtime'] = pd.to_numeric(df_clean['runtime'], errors='coerce')
    df_clean['runtime'] = df_clean['runtime'].fillna(df_clean['runtime'].median())


numeric_candidates = ['budget','popularity','vote_average','vote_count']
for c in numeric_candidates:
    if c in df_clean.columns:
        df_clean[c] = pd.to_numeric(df_clean[c], errors='coerce')
        df_clean[c] = df_clean[c].fillna(df_clean[c].median())


def count_genres(x):
    if pd.isna(x):
        return 0
    try:
        parsed = ast.literal_eval(x) if isinstance(x, str) else x
        if isinstance(parsed, list):
            return len(parsed)
    except Exception:

        s = str(x)
        return s.count("'name'") + s.count('"name"') if isinstance(s, str) else 0
    return 0

if 'genres' in df_clean.columns:
    df_clean['genre_count'] = df_clean['genres'].apply(count_genres)
else:
    df_clean['genre_count'] = 0


if 'original_language' in df_clean.columns:
    le = LabelEncoder()
    df_clean['original_language'] = df_clean['original_language'].fillna('unknown')
    df_clean['orig_lang_enc'] = le.fit_transform(df_clean['original_language'])
else:
    df_clean['orig_lang_enc'] = 0


df_clean['revenue'] = pd.to_numeric(df_clean.get('revenue', 0), errors='coerce').fillna(0)


initial_rows = len(df_clean)
df_clean = df_clean[df_clean['revenue'] > 0].copy()
dropped = initial_rows - len(df_clean)
print(f"Dropped {dropped} rows with non-positive revenue. Remaining rows: {len(df_clean)}")


df_clean['log_revenue'] = np.log1p(df_clean['revenue'])


features = [f for f in ['budget','popularity','runtime','vote_average','vote_count','release_year','genre_count','orig_lang_enc'] if f in df_clean.columns]
print("Features to use:", features)


df_clean[features + ['revenue','log_revenue']].head()


In [None]:
print("Columns:", df.columns.tolist())
print("\nData types:\n", df.dtypes)
print("\nMissing counts (top 20):\n", df.isna().sum().sort_values(ascending=False).head(20))
print("\nBasic stats for numeric cols:\n", df.describe().T[['count','mean','std','min','max']])
