# Import

In [2]:
import pandas as pd
import ast
import os

# Paths & reading data

In [3]:
DATA_DIR = "../../Data"
movies = pd.read_csv(os.path.join(DATA_DIR, "movies_metadata.csv"), low_memory=False)
credits = pd.read_csv(os.path.join(DATA_DIR, "credits.csv"))
keywords = pd.read_csv(os.path.join(DATA_DIR, "keywords.csv"))

In [4]:
# Colonnes utiles dans movies_metadata
movies = movies[[
    "id", "title", "genres", "runtime", "budget",
    "vote_average", "vote_count", "popularity"
]]

# Conversion types
movies["id"] = pd.to_numeric(movies["id"], errors="coerce")
movies["budget"] = pd.to_numeric(movies["budget"], errors="coerce")
movies["runtime"] = pd.to_numeric(movies["runtime"], errors="coerce")

movies = movies.dropna(subset=["id", "title"])
movies["id"] = movies["id"].astype(int)

In [5]:
def parse_json_list(json_str, key):
    try:
        data = ast.literal_eval(json_str)
        return [item[key] for item in data]
    except:
        return []

In [6]:
movies["genres"] = movies["genres"].apply(lambda x: parse_json_list(x, "name"))
credits["actors"] = credits["cast"].apply(lambda x: parse_json_list(x, "name"))
credits["crew"] = credits["crew"].apply(lambda x: parse_json_list(x, "name"))
keywords["keywords"] = keywords["keywords"].apply(lambda x: parse_json_list(x, "name"))

In [7]:
df = movies.merge(
    credits[["id", "actors", "crew"]],
    on="id",
    how="left"
)

df = df.merge(
    keywords[["id", "keywords"]],
    on="id",
    how="left"
)

In [8]:
df["weighted_rating"] = (
    df["vote_average"] * df["vote_count"]
) / (df["vote_count"] + 100)

In [9]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=42
)

In [10]:
from collections import Counter

actor_popularity = Counter()

for actors in train_df["actors"]:
    actor_popularity.update(actors)


In [11]:
crew_popularity = Counter()

for crew in train_df["crew"]:
    crew_popularity.update(crew)


In [17]:
def popularity_features(names, popularity_dict):
    # Si NaN ou pas une liste
    if not isinstance(names, list) or len(names) == 0:
        return 0.0, 0.0, 0

    pops = [popularity_dict.get(name, 0) for name in names]

    return (
        sum(pops) / len(pops),   # popularité moyenne
        max(pops),               # popularité max
        sum(p > 5 for p in pops) # nb de personnes "connues"
    )


In [18]:
train_df[[
    "actor_pop_mean",
    "actor_pop_max",
    "actor_pop_known"
]] = train_df["actors"].apply(
    lambda x: pd.Series(popularity_features(x, actor_popularity))
)

test_df[[
    "actor_pop_mean",
    "actor_pop_max",
    "actor_pop_known"
]] = test_df["actors"].apply(
    lambda x: pd.Series(popularity_features(x, actor_popularity))
)


In [19]:
train_df[[
    "crew_pop_mean",
    "crew_pop_max",
    "crew_pop_known"
]] = train_df["crew"].apply(
    lambda x: pd.Series(popularity_features(x, crew_popularity))
)

test_df[[
    "crew_pop_mean",
    "crew_pop_max",
    "crew_pop_known"
]] = test_df["crew"].apply(
    lambda x: pd.Series(popularity_features(x, crew_popularity))
)


In [None]:
train_df[[
    "actor_pop_mean", "actor_pop_max", "actor_pop_known",
    "crew_pop_mean", "crew_pop_max", "crew_pop_known"
]].describe()


Unnamed: 0,actor_pop_mean,actor_pop_max,actor_pop_known,crew_pop_mean,crew_pop_max,crew_pop_known
count,37300.0,37300.0,37300.0,37300.0,37300.0,37300.0
mean,10.470147,31.556005,5.783164,9.653932,26.166676,4.542708
std,8.635252,26.485943,6.25464,9.759163,32.159583,7.160077
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.75,10.0,1.0,3.0,5.0,0.0
50%,8.5,27.0,4.0,6.647059,14.0,2.0
75%,15.357143,47.0,8.0,12.928571,34.0,6.0
max,121.0,196.0,140.0,122.0,214.0,90.0


: 