In [41]:
# Cody Giles - Student ID: 010506641
# C964 Capstone - Movie Audience Rating Predictor aka The MARP

import pandas as pd
import numpy as np
from xgboost import XGBRegressor
# read data
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder


In [42]:
class GenreEncoder:
    def fit(self, df: pd.DataFrame) -> None:
        genres = set()
        for row in df['genre']:
            for genre in row:
                genres.add(genre)
        genres.remove('nan')
        genres = list(genres)
        self.genres = genres

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        a = np.zeros((df.shape[0], len(self.genres)))
        for i, row in enumerate(df.iloc):
            for j, genre in enumerate(self.genres):
                if genre in row['genre']:
                    a[i, j] = 1
        df = df.reset_index().join(pd.DataFrame(a, columns=self.genres)).set_index('index')
        return df

    def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        self.fit(df)
        return self.transform(df)

In [58]:
class ActorEncoder:
    def __init__(self, n_actors=3, multipliers=None):
        self.n_actors = n_actors
        self.multipliers = multipliers if multipliers else [3, 2, 1.5, 1]

    def fit(self, X: pd.DataFrame, y: pd.DataFrame) -> None:
        df = X.join(y)
        actor_scores_df = (
            df[['cast', 'date', 'score']]
            .explode('cast')
        )

        actor_scores_df['adjusted_score'] = actor_scores_df['score']

        # Apply multipliers based on user_score and actor order
        for idx, row in df.iterrows():
            actor_list = row['cast']
            user_score = row['score']

            current_multipliers = [1] * len(actor_list) if user_score < 50 else self.multipliers

            for i, actor in enumerate(actor_list):
                multiplier = current_multipliers[i] if i < len(current_multipliers) else 1
                weighted_score = multiplier * user_score
                actor_scores_df.loc[(actor_scores_df['cast'] == actor) &
                                    (actor_scores_df['date'] == row['date']), 'adjusted_score'] = weighted_score

        # Compute expanding average of adjusted scores
        actor_scores_df['avg_score'] = (
            actor_scores_df
            .groupby('cast')['adjusted_score']
            .expanding()
            .mean()
            .reset_index(drop=True)
        )

        actor_scores_df = (
            actor_scores_df
            .drop(['score', 'adjusted_score'], axis=1)
            .sort_values(['cast', 'date'])
            .rename({'cast': 'actor'}, axis=1)
        )

        actor_scores_df['actor'] = actor_scores_df['actor'].apply(lambda x: str(x))
        self.actor_scores_df = actor_scores_df

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        df = df.copy()
        for i in range(self.n_actors):
            df = df.reset_index()
            df[f'actor_{i}'] = df['cast'].apply(lambda x: str(x[i]) if len(x) > i else '~')
            df2 = pd.merge(df, self.actor_scores_df, left_on=f'actor_{i}', right_on='actor')
            df2 = df2[df2['date_x'] > df2['date_y']].sort_values(['actor', 'date_x']).groupby('index').last()[['avg_score']]
            df = df.set_index('index').join(df2)
            df[f'actor_{i}'] = df['avg_score']
            df = df.drop('avg_score', axis=1)

        df['actor_mean'] = 0
        for i in range(self.n_actors):
            df['actor_mean'] += df[f'actor_{i}']

        df['actor_mean'] /= self.n_actors

        return df

    def fit_transform(self, X: pd.DataFrame, y: pd.DataFrame) -> pd.DataFrame:
        self.fit(X, y)
        return self.transform(X)

In [44]:
class OneHotEncoderWraper(OneHotEncoder):
    def fit(self, X: pd.DataFrame) -> pd.DataFrame:
        super().fit(X[['lang', 'country']])

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        df = pd.DataFrame(super().transform(X[['lang', 'country']]).toarray(), columns=self.get_feature_names_out())
        return pd.concat([X.reset_index(),df], axis=1).set_index('index')

    def fit_transform(self, X: pd.DataFrame) -> pd.DataFrame:
        self.fit(X)
        return self.transform(X)

In [48]:
class PipeLine:
    def __init__(self, **kwargs):
        self.ohe = OneHotEncoderWraper(handle_unknown='ignore')
        self.ge = GenreEncoder()
        self.ae = ActorEncoder(n_actors=9)
        self.sc = StandardScaler()
        self.model = XGBRegressor(**kwargs)

    def fit(self, X, y):
        X = self._preprocess(X)
        X = self.ae.fit_transform(X, y)
        X = self.ge.fit_transform(X)
        X = self.ohe.fit_transform(X)
        X = X.select_dtypes(['number'])
        cols = X.columns.to_list()
        # X = self.sc.fit_transform(X)
        self.model.fit(X, y)

        self.feature_importance = (
            pd.DataFrame(
                list(zip(cols, self.model.feature_importances_)),
                columns=['feature', 'importance'])
            .sort_values('importance', ascending=False)
        )

    def _preprocess(self, X):
        X = X.copy()
        X['year'] = X['date'].dt.year
        X['month_x'] = (np.sin(2 * np.pi * X['date'].dt.month/12)+1)/2
        X['month_y'] = (np.cos(2 * np.pi * X['date'].dt.month/12)+1)/2
        X['day_x'] = (np.sin(2 * np.pi * X['date'].dt.day/X['date'].dt.days_in_month)+1)/2
        X['day_y'] = (np.cos(2 * np.pi * X['date'].dt.day/X['date'].dt.days_in_month)+1)/2
        X['dow_x'] = (np.sin(2 * np.pi * X['date'].dt.day_of_week/7)+1)/2
        X['dow_y'] = (np.cos(2 * np.pi * X['date'].dt.day_of_week/7)+1)/2
        return X

    def _transform(self, X):
        X = X.copy()
        X = self._preprocess(X)
        X = self.ae.transform(X)
        X = self.ge.transform(X)
        X = self.ohe.transform(X)
        X = X.select_dtypes(['number'])
        # X = self.sc.transform(X)
        return X

    def predict(self, X):
        X = self._transform(X)
        return self.model.predict(X)

    def score(self, X, y):
        X = self._transform(X)
        return self.model.score(X, y)

    def inference(self, date, genre, cast,
                  lang, budget, country):
        df = pd.DataFrame(
            {
                'date': [date],
                'genre': [genre],
                'cast': [cast],
                'lang': [lang],
                'budget': [budget],
                'country': [country]
            }
        )
        df['date'] = pd.to_datetime(df['date'])
        return float(self.predict(df)[0])

In [49]:
df = pd.read_csv('imdb_movies.csv')
df = df[df['status'] == ' Released']
df = (
    df
    .drop(['names', 'overview', 'orig_title', 'status', 'revenue'], axis=1)
    .rename({'date_x':'date', 'orig_lang':'lang', 'crew':'cast', 'budget_x':'budget'}, axis=1)
)

df['score'] = df['score'] / 10
df['cast'] = df['cast'].apply(lambda x: str(x).split(', ')[::2])
df['date'] = pd.to_datetime(df['date'])
df['genre'] = df['genre'].apply(lambda x: str(x).replace('\xa0', ' ')).str.split(', ')
df['lang'] = df['lang'].str.replace(' ', '').apply(lambda x: x.split(',')[0])
df = df.sort_values('date').reset_index(drop=True)

print(df.shape[0])
df.head()

10077


Unnamed: 0,date,score,genre,cast,lang,budget,country
0,1903-05-15,6.3,"[Drama, History]","[Madame Moreau, Monsieur Moreau]",French,106400000.0,FR
1,1907-06-20,8.0,"[Adventure, Science Fiction]","[Georges Meliès, Bleuette Bernon, François Lal...",French,5985.0,AU
2,1915-02-08,6.1,"[Drama, History, War]","[Lillian Gish, Mae Marsh, Henry B. Walthall, M...",English,10000000.0,US
3,1915-02-08,6.1,"[Drama, History, War]","[Lillian Gish, Mae Marsh, Henry B. Walthall, M...",English,110000.0,US
4,1920-02-27,8.0,"[Drama, Horror, Thriller, Crime]","[Werner Krauß, Conrad Veidt, Friedrich Feher, ...",German,18000.0,DE


In [59]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('score', axis=1), df['score'], shuffle=False, test_size=.2)

pipeline = PipeLine()
pipeline.fit(X_train, y_train)
pipeline.score(X_test, y_test)

0.21754560944272927

In [70]:
pipeline.inference(
    date = '2024-07-26',
    genre = ['Buddy Comedy', 'Action', 'Comedy', 'Superhero'],
    cast = ['Ryan Reynolds', 'Hugh Jackman', 'Emma Corrin', 'Matthew Macfadyen', 'Dafne Keen', 'Jon Favreau', 'Morena Baccarin', 'Rob Delaney', 'Leslie Uggams'],
    lang = 'English',
    budget = 200_000_000.00,
    country = 'AU'
)

7.717780113220215

In [71]:
v = pipeline.feature_importance
v

Unnamed: 0,feature,importance
108,country_KR,0.124504
79,country_AU,0.077915
33,Horror,0.058011
30,Drama,0.054595
26,Animation,0.054410
...,...,...
78,country_AT,0.000000
59,lang_Latin,0.000000
57,lang_Kannada,0.000000
54,lang_Indonesian,0.000000
