In [13]:
# Cody Giles - Student ID: 010506641
# C964 Capstone - Movie Audience Rating Predictor aka The MARP

import pandas as pd
import numpy as np
import seaborn as sns
import datetime
from xgboost import XGBRegressor
# read data
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error, mean_squared_error


In [14]:
class GenreEncoder:
    def fit(self, df: pd.DataFrame) -> None:
        genres = set()
        for row in df['genre']:
            for genre in row:
                genres.add(genre)
        genres.remove('nan')
        genres = list(genres)
        self.genres = genres

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        a = np.zeros((df.shape[0], len(self.genres)))
        for i, row in enumerate(df.iloc):
            for j, genre in enumerate(self.genres):
                if genre in row['genre']:
                    a[i, j] = 1
        df = df.reset_index().join(pd.DataFrame(a, columns=self.genres)).set_index('index')
        return df

    def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
        self.fit(df)
        return self.transform(df)

In [15]:
class ActorEncoder:
    def __init__(self, n_actors=3):
        self.n_actors = n_actors

    def fit(self, X: pd.DataFrame, y: pd.DataFrame) -> None:
        df = X.join(y)
        actor_scores_df = (
            df[['cast', 'date', 'score']]
            .explode('cast')
        )

        actor_scores_df['avg_score'] = (
            actor_scores_df
            .groupby('cast')['score']
            .expanding()
            .mean()
            .reset_index(drop=True)
        )

        actor_scores_df = (
            actor_scores_df
            .drop('score', axis=1)
            .sort_values(['cast', 'date'])
            .rename({'cast': 'actor'}, axis=1)
        )

        actor_scores_df['actor'] = actor_scores_df['actor'].apply(lambda x: str(x))
        self.actor_scores_df = actor_scores_df


    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        df = df.copy()
        for i in range(self.n_actors):
            df = df.reset_index()
            df[f'actor_{i}'] = df['cast'].apply(lambda x: str(x[i]) if len(x) > i else '~')
            df2 = pd.merge(df, self.actor_scores_df, left_on=f'actor_{i}', right_on='actor')
            df2 = df2[df2['date_x'] > df2['date_y']].sort_values(['actor', 'date_x']).groupby('index').last()[['avg_score']]
            df = df.set_index('index').join(df2)
            df[f'actor_{i}'] = df['avg_score']
            df = df.drop('avg_score', axis=1)

        df['actor_mean'] = 0
        df['n_scores'] = 0

        for i in range(self.n_actors):
            df['actor_mean'] += df[f'actor_{i}'].apply(lambda x: x if x >= 0 else 0)
            df['n_scores'] += df[f'actor_{i}'].apply(lambda x: 1 if x >= 0 else 0)

        df['actor_mean'] = np.divide(df['actor_mean'], df['n_scores'])

        for i in range(self.n_actors):
            df[f'actor_{i}'] = df[[f'actor_{x+i}' for x in range(self.n_actors-i)]].bfill(axis=1).iloc[:,0]

        return df

    def fit_transform(self, X: pd.DataFrame, y: pd.DataFrame) -> pd.DataFrame:
        self.fit(X, y)
        return self.transform(X)

In [16]:
class OneHotEncoderWrapper(OneHotEncoder):
    def fit(self, X: pd.DataFrame) -> pd.DataFrame:
        super().fit(X[['lang', 'country']])

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        df = pd.DataFrame(super().transform(X[['lang', 'country']]).toarray(), columns=self.get_feature_names_out())
        return pd.concat([X.reset_index(),df], axis=1).set_index('index')

    def fit_transform(self, X: pd.DataFrame) -> pd.DataFrame:
        self.fit(X)
        return self.transform(X)

In [17]:
class PipeLine:
    def __init__(self, **kwargs):
        self.ohe = OneHotEncoderWrapper(handle_unknown='ignore')
        self.ge = GenreEncoder()
        self.ae = ActorEncoder(n_actors=9)
        self.sc = StandardScaler()
        self.model = XGBRegressor(**kwargs)

    def fit(self, X, y):
        X = self._preprocess(X)
        X = self.ae.fit_transform(X, y)
        X = self.ge.fit_transform(X)
        X = self.ohe.fit_transform(X)
        X = X.select_dtypes(['number'])
        cols = X.columns.to_list()
        # X = self.sc.fit_transform(X)
        self.model.fit(X, y)

        self.feature_importance = (
            pd.DataFrame(
                list(zip(cols, self.model.feature_importances_)),
                columns=['feature', 'importance'])
            .sort_values('importance', ascending=False)
        )

# From Towards Data Science to 
    def _preprocess(self, X):
        X = X.copy()
        X['year'] = X['date'].dt.year
        X['month_x'] = (np.sin(2 * np.pi * X['date'].dt.month/12)+1)/2
        X['month_y'] = (np.cos(2 * np.pi * X['date'].dt.month/12)+1)/2
        X['day_x'] = (np.sin(2 * np.pi * X['date'].dt.day/X['date'].dt.days_in_month)+1)/2
        X['day_y'] = (np.cos(2 * np.pi * X['date'].dt.day/X['date'].dt.days_in_month)+1)/2
        X['dow_x'] = (np.sin(2 * np.pi * X['date'].dt.day_of_week/7)+1)/2
        X['dow_y'] = (np.cos(2 * np.pi * X['date'].dt.day_of_week/7)+1)/2
        return X

    def _transform(self, X):
        X = X.copy()
        X = self._preprocess(X)
        X = self.ae.transform(X)
        X = self.ge.transform(X)
        X = self.ohe.transform(X)
        X = X.select_dtypes(['number'])
        # X = self.sc.transform(X)
        return X

    def predict(self, X):
        X = self._transform(X)
        return self.model.predict(X)

    def score(self, X, y):
        X = self._transform(X)
        return self.model.score(X, y)

    def inference(self, date, genre, cast,
                  lang, budget, country):
        df = pd.DataFrame(
            {
                'date': [date],
                'genre': [genre],
                'cast': [cast],
                'lang': [lang],
                'budget': [budget],
                'country': [country]
            }
        )
        df['date'] = pd.to_datetime(df['date'])
        return float(self.predict(df)[0])

In [31]:
df = pd.read_csv('imdb_movies.csv')
df = df[(df['status'] == ' Released')]
df = (
    df
    .drop(['names', 'overview', 'orig_title', 'status', 'revenue'], axis=1)
    .rename({'date_x':'date', 'orig_lang':'lang', 'crew':'cast', 'budget_x':'budget'}, axis=1)
)

df['score'] = df['score'] / 10
df['cast'] = df['cast'].apply(lambda x: str(x).split(', ')[::2])
df['date'] = pd.to_datetime(df['date'])
df['genre'] = df['genre'].apply(lambda x: str(x).replace('\xa0', ' ')).str.split(', ')
df['lang'] = df['lang'].str.replace(' ', '').apply(lambda x: x.split(',')[0])
df = df.sort_values('date').reset_index(drop=True)

print(df.shape[0])
df.head()

10075


Unnamed: 0,date,score,genre,cast,lang,budget,country
0,1903-05-15,6.3,"[Drama, History]","[Madame Moreau, Monsieur Moreau]",French,106400000.0,FR
1,1907-06-20,8.0,"[Adventure, Science Fiction]","[Georges Meliès, Bleuette Bernon, François Lal...",French,5985.0,AU
2,1915-02-08,6.1,"[Drama, History, War]","[Lillian Gish, Mae Marsh, Henry B. Walthall, M...",English,110000.0,US
3,1920-02-27,8.0,"[Drama, Horror, Thriller, Crime]","[Werner Krauß, Conrad Veidt, Friedrich Feher, ...",German,18000.0,DE
4,1923-04-01,7.9,"[Comedy, Romance, Thriller]","[Harold Lloyd, Mildred Davis, Bill Strother, N...",English,121000.0,US


In [35]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('score', axis=1), df['score'], shuffle=False, test_size=.25)

pipeline = PipeLine()
pipeline.fit(X_train, y_train)
# pipeline.score(X_train, y_train)
pipeline.score(X_test, y_test)

0.32163048938400574

In [21]:
# pipeline = PipeLine(
#     learning_rate=0.1,
#     max_depth=5,
#     reg_alpha=0.3,
#     reg_lambda=0.05,
#     subsample=1
# )

In [24]:
# Deadpool vs Wolverine (actual is an 8.0)
pipeline.inference(
    date = '2024-07-26',
    genre = ['Buddy Comedy', 'Action', 'Comedy', 'Superhero', 'Adventure'],
    cast = ['Ryan Reynolds', 'Hugh Jackman', 'Emma Corrin', 'Matthew Macfadyen', 'Dafne Keen', 'Jon Favreau', 'Morena Baccarin', 'Rob Delaney', 'Leslie Uggams'],
    lang = 'English',
    budget = 200_000_000.00,
    country = 'AU'
)

6.689740180969238

In [36]:
while True:
    date_input = input("Year of movie? YYYY-MM-DD format")
    user_genres = input("Genre of the movie? Can be multiple, separate with a comma e.g. Comedy, Action")
    genres = [s.strip() for s in user_genres.split(",")]
    cast_input = input("Who's starring in the movie? Can be multiple, separate with a comma.")
    lang_input = input("What language is the movie in?")
    budget_input = input("What is the budget of the movie?")
    country_input = input("Where is the movie from? Use ISO 3166 country codes, e.g. US, AU, KR (Korea), etc.")
    
    pipeline.inference(date_input, user_genres, cast_input, lang_input, budget_input, country_input)
    

ValueError: feature_names mismatch: ['budget', 'year', 'month_x', 'month_y', 'day_x', 'day_y', 'dow_x', 'dow_y', 'actor_0', 'actor_1', 'actor_2', 'actor_3', 'actor_4', 'actor_5', 'actor_6', 'actor_7', 'actor_8', 'actor_mean', 'n_scores', 'Fantasy', 'Western', 'Comedy', 'War', 'Crime', 'History', 'Horror', 'Romance', 'Family', 'Music', 'Adventure', 'Drama', 'Mystery', 'Action', 'Documentary', 'Science Fiction', 'Thriller', 'Animation', 'TV Movie', 'lang_Arabic', 'lang_Basque', 'lang_Bokmal', 'lang_Cantonese', 'lang_CentralKhmer', 'lang_Chinese', 'lang_Czech', 'lang_Danish', 'lang_Dutch', 'lang_English', 'lang_Finnish', 'lang_French', 'lang_German', 'lang_Greek', 'lang_Hindi', 'lang_Hungarian', 'lang_Icelandic', 'lang_Indonesian', 'lang_Italian', 'lang_Japanese', 'lang_Kannada', 'lang_Korean', 'lang_Latin', 'lang_Malay', 'lang_Malayalam', 'lang_NoLanguage', 'lang_Norwegian', 'lang_Persian', 'lang_Polish', 'lang_Portuguese', 'lang_Russian', 'lang_Serbian', 'lang_Serbo-Croatian', 'lang_Spanish', 'lang_Swedish', 'lang_Tagalog', 'lang_Telugu', 'lang_Thai', 'lang_Turkish', 'country_AR', 'country_AT', 'country_AU', 'country_BE', 'country_BO', 'country_BR', 'country_CA', 'country_CH', 'country_CL', 'country_CN', 'country_CO', 'country_CZ', 'country_DE', 'country_DK', 'country_DO', 'country_ES', 'country_FI', 'country_FR', 'country_GB', 'country_GR', 'country_GT', 'country_HK', 'country_HU', 'country_ID', 'country_IE', 'country_IL', 'country_IN', 'country_IR', 'country_IS', 'country_IT', 'country_JP', 'country_KR', 'country_MX', 'country_MY', 'country_NL', 'country_NO', 'country_PE', 'country_PH', 'country_PL', 'country_PR', 'country_PY', 'country_RU', 'country_SE', 'country_SU', 'country_TH', 'country_TR', 'country_TW', 'country_UA', 'country_US', 'country_XC', 'country_ZA'] ['year', 'month_x', 'month_y', 'day_x', 'day_y', 'dow_x', 'dow_y', 'actor_0', 'actor_1', 'actor_2', 'actor_3', 'actor_4', 'actor_5', 'actor_6', 'actor_7', 'actor_8', 'actor_mean', 'n_scores', 'Fantasy', 'Western', 'Comedy', 'War', 'Crime', 'History', 'Horror', 'Romance', 'Family', 'Music', 'Adventure', 'Drama', 'Mystery', 'Action', 'Documentary', 'Science Fiction', 'Thriller', 'Animation', 'TV Movie', 'lang_Arabic', 'lang_Basque', 'lang_Bokmal', 'lang_Cantonese', 'lang_CentralKhmer', 'lang_Chinese', 'lang_Czech', 'lang_Danish', 'lang_Dutch', 'lang_English', 'lang_Finnish', 'lang_French', 'lang_German', 'lang_Greek', 'lang_Hindi', 'lang_Hungarian', 'lang_Icelandic', 'lang_Indonesian', 'lang_Italian', 'lang_Japanese', 'lang_Kannada', 'lang_Korean', 'lang_Latin', 'lang_Malay', 'lang_Malayalam', 'lang_NoLanguage', 'lang_Norwegian', 'lang_Persian', 'lang_Polish', 'lang_Portuguese', 'lang_Russian', 'lang_Serbian', 'lang_Serbo-Croatian', 'lang_Spanish', 'lang_Swedish', 'lang_Tagalog', 'lang_Telugu', 'lang_Thai', 'lang_Turkish', 'country_AR', 'country_AT', 'country_AU', 'country_BE', 'country_BO', 'country_BR', 'country_CA', 'country_CH', 'country_CL', 'country_CN', 'country_CO', 'country_CZ', 'country_DE', 'country_DK', 'country_DO', 'country_ES', 'country_FI', 'country_FR', 'country_GB', 'country_GR', 'country_GT', 'country_HK', 'country_HU', 'country_ID', 'country_IE', 'country_IL', 'country_IN', 'country_IR', 'country_IS', 'country_IT', 'country_JP', 'country_KR', 'country_MX', 'country_MY', 'country_NL', 'country_NO', 'country_PE', 'country_PH', 'country_PL', 'country_PR', 'country_PY', 'country_RU', 'country_SE', 'country_SU', 'country_TH', 'country_TR', 'country_TW', 'country_UA', 'country_US', 'country_XC', 'country_ZA']
expected budget in input data

In [40]:
# GUI
def validate_date(date_str):
    try:
        datetime.datetime.strptime(date_str, '%Y-%m-%d')
        return True
    except ValueError:
        print("Invalid date format. Please use YYYY-MM-DD format.")
        return False

def validate_budget(budget_str):
    try:
        budget = float(budget_str.replace(',', '').replace('_', ''))
        return budget
    except ValueError:
        print("Invalid budget format. Please enter a numeric value.")
        return None

def validate_input():
    while True:
        date_input = input("What is the release date of the movie? YYYY-MM-DD format: ")
        if not validate_date(date_input):
            continue

        user_genres = input("Genre of the movie? Can be multiple, separate with a comma e.g. Comedy, Action: ")
        genres = [s.strip() for s in user_genres.split(",")]

        cast_input = input("Who's starring in the movie? Can be multiple, separate with a comma: ")
        cast = [s.strip() for s in cast_input.split(",")]

        lang_input = input("What language is the movie in? ")

        budget_input = input("What is the budget of the movie? ")
        budget = validate_budget(budget_input)
        if budget is None:
            continue

        country_input = input("Where is the movie from? Use ISO 3166 country codes, e.g. US, AU, KR (Korea), etc.: ")

        return {
            'date': date_input,
            'genre': genres,
            'cast': cast,
            'lang': lang_input,
            'budget': budget,
            'country': country_input
        }

while True:
    inputs = validate_input()
    output = pipeline.inference(
        date=inputs['date'],
        genre=inputs['genre'],
        cast=inputs['cast'],
        lang=inputs['lang'],
        budget=inputs['budget'],
        country=inputs['country']
    )
    print(f"Your movie will have an estimated audience/user score of: {round(output, 1)}")
    break

Your movie will have an estimated audience/user score of: 6.5


In [None]:

v = pipeline.feature_importance
v

In [None]:
sns.histplot(df['score'], bins=20)


In [None]:
r2_score(y_test, y_train)

In [None]:
resids = y_test - y_pred
sns.histplot(resids, bins=50)
plt.show()

In [None]:
sns.scatterplot(df, x='month_x', y='month_y')
plt.show()