In [19]:
import pandas as pd
import numpy as np
import re
import pickle
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import RandomForestRegressor

# ===== Helper Functions =====
def extract_year(x):
    """Return the first 19xx/20xx year found in ``x`` as an int.

    ``x`` is converted to text before searching, so numbers and strings
    such as ``"(2019)"`` both work. Missing values (per ``pd.isna``) and
    text without such a year yield ``np.nan``.
    """
    if not pd.isna(x):
        found = re.search(r"(19|20)\d{2}", str(x))
        if found is not None:
            return int(found.group())
    return np.nan

def parse_duration(x):
    """Return a duration in minutes as a float, or ``np.nan``.

    The lower-cased text of ``x`` is checked first for a number followed
    by ``min`` (e.g. ``"109 min"``), then for a bare integer on its own
    (e.g. ``"133"``). Missing values and anything else yield ``np.nan``.
    """
    if pd.isna(x):
        return np.nan
    text = str(x).lower()
    # Patterns are tried in order; the first one that matches wins.
    for pattern in (r"(\d+)\s*min", r"^\s*(\d+)\s*$"):
        hit = re.search(pattern, text)
        if hit is not None:
            return float(hit.group(1))
    return np.nan

def parse_votes(x):
    """Return the first run of digits in ``x`` as a float, or ``np.nan``.

    Commas are removed before searching, so ``"12,345"`` becomes
    ``12345.0``. Missing values and text without any digit yield
    ``np.nan``.
    """
    if pd.isna(x):
        return np.nan
    cleaned = str(x).replace(",", "").strip()
    digits = re.search(r"(\d+)", cleaned)
    if digits is None:
        return np.nan
    return float(digits.group(1))

def split_genres(x):
    """Split a genre string on ``|``, ``,`` or ``/`` into a list of names.

    Each piece is stripped of surrounding whitespace and empty pieces are
    dropped. Missing values (per ``pd.isna``) yield an empty list.
    """
    if pd.isna(x):
        return []
    pieces = re.split(r"[|,/]", str(x))
    # filter(None, ...) discards the pieces that are empty after stripping.
    return list(filter(None, map(str.strip, pieces)))

def frequency_encode(series, reference):
    """Replace each value in ``series`` by its occurrence count in ``reference``.

    Missing values in both inputs are first replaced by the label
    ``"Unknown"`` and everything is compared as text. Values of ``series``
    that never occur in ``reference`` are encoded as ``0``. The result is
    a float Series.
    """
    reference_labels = reference.fillna("Unknown").astype(str)
    occurrences = reference_labels.value_counts()
    labels = series.fillna("Unknown").astype(str)
    encoded = labels.map(occurrences)
    # Labels absent from the reference come back as NaN from map().
    return encoded.fillna(0).astype(float)

# ===== Load Data and Train Model =====
df = pd.read_csv("/content/drive/MyDrive/IMDb Movies India.csv", encoding="latin1")

# Prepare training data: parse the raw text columns into numeric ones.
for _raw_col, _num_col, _parser in (
    ("Year", "Year_num", extract_year),
    ("Duration", "Duration_min", parse_duration),
    ("Votes", "Votes_num", parse_votes),
):
    df[_num_col] = df[_raw_col].apply(_parser)
df["Votes_log1p"] = np.log1p(df["Votes_num"])

# Multi-hot encode genres; one "Genre_<name>" column per class the binarizer learns.
mlb = MultiLabelBinarizer()
genre_lists = df["Genre"].apply(split_genres)
genre_mat = mlb.fit_transform(genre_lists)
genre_df = pd.DataFrame(genre_mat, columns=[f"Genre_{g}" for g in mlb.classes_])

# Encode each person column by how often the name occurs in that same column.
for _person_col, _freq_col in (
    ("Director", "Director_freq"),
    ("Actor 1", "Actor1_freq"),
    ("Actor 2", "Actor2_freq"),
    ("Actor 3", "Actor3_freq"),
):
    df[_freq_col] = frequency_encode(df[_person_col], df[_person_col])

base_features = df[["Year_num", "Duration_min", "Votes_num", "Votes_log1p",
                    "Director_freq", "Actor1_freq", "Actor2_freq", "Actor3_freq"]]
X = pd.concat([base_features, genre_df], axis=1)

# Column medians are kept so new rows can be imputed with the same values.
impute_medians = X.median()
X = X.fillna(impute_medians)
y = df["Rating"]

# Fit only on the rows that actually have a rating.
has_rating = y.notna()
model = RandomForestRegressor(random_state=42)
model.fit(X[has_rating], y[has_rating])

# ===== Take Movie Input =====
def _ask(label):
    """Prompt for one field; return the stripped answer, or NaN if left blank.

    Stripping keeps stray spaces from breaking name lookups, and mapping a
    blank answer to NaN sends it down the same missing-value path the
    training rows use (median imputation / the "Unknown" frequency bucket).
    """
    answer = input(label).strip()
    return answer if answer else np.nan


print("\nEnter movie details for prediction:\n")
year = _ask("Year (e.g. 2023): ")
duration = _ask("Duration (e.g. 150 min): ")
votes = _ask("Votes (e.g. 12,345): ")
genres = _ask("Genres (comma-separated, e.g. Drama, Romance): ")
director = _ask("Director Name: ")
actor1 = _ask("Actor 1: ")
actor2 = _ask("Actor 2: ")
actor3 = _ask("Actor 3: ")

# ===== Preprocess New Movie =====
# dtype=object keeps text and NaN in the same columns without dtype
# coercion, so fillna("Unknown") inside frequency_encode is always valid.
new_movie = pd.DataFrame([{
    "Year": year,
    "Duration": duration,
    "Votes": votes,
    "Genre": genres,
    "Director": director,
    "Actor 1": actor1,
    "Actor 2": actor2,
    "Actor 3": actor3
}], dtype=object)

new_movie["Year_num"] = new_movie["Year"].apply(extract_year)
new_movie["Duration_min"] = new_movie["Duration"].apply(parse_duration)
new_movie["Votes_num"] = new_movie["Votes"].apply(parse_votes)
new_movie["Votes_log1p"] = np.log1p(new_movie["Votes_num"])

# Match the entered genres against the classes the binarizer was fitted on,
# ignoring case, and tell the user which entries could not be matched
# instead of letting them vanish from the feature vector unnoticed.
_known_genres = {str(g).casefold(): g for g in mlb.classes_}
matched_genres = []
unknown_genres = []
for _entered in split_genres(genres):
    _key = _entered.casefold()
    if _key in _known_genres:
        matched_genres.append(_known_genres[_key])
    else:
        unknown_genres.append(_entered)
if unknown_genres:
    print(f"\nNote: ignoring unrecognized genre(s): {'; '.join(unknown_genres)}")

genre_mat_new = mlb.transform([matched_genres])
genre_df_new = pd.DataFrame(genre_mat_new, columns=genre_df.columns)

# Frequencies are looked up in the training columns, not in the single new row.
new_movie["Director_freq"] = frequency_encode(new_movie["Director"], df["Director"])
new_movie["Actor1_freq"] = frequency_encode(new_movie["Actor 1"], df["Actor 1"])
new_movie["Actor2_freq"] = frequency_encode(new_movie["Actor 2"], df["Actor 2"])
new_movie["Actor3_freq"] = frequency_encode(new_movie["Actor 3"], df["Actor 3"])

X_new = pd.concat([
    new_movie[["Year_num", "Duration_min", "Votes_num", "Votes_log1p",
               "Director_freq", "Actor1_freq", "Actor2_freq", "Actor3_freq"]],
    genre_df_new
], axis=1)

# Fill anything that could not be parsed with the training medians.
X_new = X_new.fillna(impute_medians)

# ===== Predict =====
predicted_rating = model.predict(X_new)[0]
print(f"\nPredicted IMDb Rating: {round(float(predicted_rating), 2)}")


Enter movie details for prediction:

Year (e.g. 2023): 2025
Duration (e.g. 150 min): 133
Votes (e.g. 12,345): 36
Genres (comma-separated, e.g. Drama, Romance): Mythological horror, also described as horror drama or supernatural thriller.
Director Name: Vishal Furia
Actor 1: Kajol
Actor 2: Indraneil Sengupta
Actor 3: Ronit Roy

Predicted IMDb Rating: 5.88


