In [7]:
# Cody Giles - Student ID: 010506641
# C964 Capstone - Movie Audience Rating Predictor aka The MARP

import pandas as pd
from xgboost import XGBRegressor
# read data
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [22]:
class ActorEncoder:
    def __init__(self, n_actors=3, multipliers=None):
        # Initialize with the number of actors and custom multipliers
        self.n_actors = n_actors
        self.multipliers = multipliers if multipliers else [3, 2, 1.5, 1]
        self.actor_scores = {}
        self.actor_movie_count = {}

    def fit(self, X: pd.DataFrame, y: pd.Series) -> None:
        """
        Fits the model to the data, calculating the actor scores.
        """
        for idx, row in X.iterrows():
            actor_list = row['crew']
            user_score = y.iloc[idx]
            
            # If user_score is < 50 then multiplier is set to 1
            current_multipliers = [1] * len(actor_list) if user_score < 50 else self.multipliers
            
            for i, actor in enumerate(actor_list):
                multiplier = current_multipliers[i] if i < len(current_multipliers) else 1
                weighted_score = multiplier * user_score
                # Update the actor's score
                if actor in self.actor_scores:
                    self.actor_scores[actor] += weighted_score
                    self.actor_movie_count[actor] += 1
                else:
                    self.actor_scores[actor] = weighted_score
                    self.actor_movie_count[actor] = 1

        # Calculate final actor scores
        self.final_actor_scores = {actor: self.actor_scores[actor] / self.actor_movie_count[actor]
                                   for actor in self.actor_scores}

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Transforms the input DataFrame by adding the calculated actor scores.
        """
        df = df.copy()
        for i in range(self.n_actors):
            df[f'actor_{i}_score'] = df['crew'].apply(
                lambda x: self.final_actor_scores.get(str(x[i]), 0) if len(x) > i else 0
            )

        # Calculate the mean score across the actors
       # df['actor_mean'] = df[[f'actor_{i}_score' for i in range(self.n_actors)]].mean(axis=1)

        return df

    def fit_transform(self, X: pd.DataFrame, y: pd.Series) -> pd.DataFrame:
        """
        Combines fit and transform operations for convenience.
        """
        self.fit(X, y)
        return self.transform(X)

# Prepare the data
movies_df = pd.read_csv('imdb_movies.csv', encoding='utf-8')
movies_df['crew'] = movies_df['crew'].apply(lambda x: [y for y in str(x).split(', ')[::2]])

# Select the features and target
X = movies_df[['crew', 'budget_x']]
y = movies_df['score']

# Initialize and apply the ActorEncoder
encoder = ActorEncoder(n_actors=3)
encoded_df = encoder.fit_transform(X, y)

# Display the transformed DataFrame
print(encoded_df.head(10))


                                                crew     budget_x  \
0  [Michael B. Jordan, Tessa Thompson, Jonathan M...   75000000.0   
1  [Sam Worthington, Zoe Saldana, Sigourney Weave...  460000000.0   
2  [Chris Pratt, Anya Taylor-Joy, Charlie Day, Ja...  100000000.0   
3  [Oscar Barberan, Ana Esther Alborg, Luis Perez...   12300000.0   
4  [Skeet Ulrich, Anne Heche, Daniel Diemer, Jord...   77000000.0   
5  [Keri Russell, Alden Ehrenreich, O'Shea Jackso...   35000000.0   
6  [Keanu Reeves, Donnie Yen, Bill Skarsgard, Ian...  100000000.0   
7  [Antonio Banderas, Salma Hayek, Harvey Guillen...   90000000.0   
8  [Paul Bianchi, Erin Coker, Jack Pearson, Antho...   71000000.0   
9  [Chloe Guidry, Nhedrick Jabier, Carmina Garay,...  119200000.0   

   actor_0_score  actor_1_score  actor_2_score  
0     163.714286     110.227273     142.200000  
1     151.363636     102.942308     115.671429  
2     134.203704     128.571429      89.636364  
3     205.500000     104.000000     105.0000

0        73
1        78
2        76
3        70
4        61
         ..
10119    73
10120    54
10121    61
10122    55
10123    70
Name: score, Length: 10124, dtype: int64

In [20]:
class Processor:
    def __init__(self, **kwargs):
        self.ae = ActorEncoder(n_actors=9)
        self.sc = StandardScaler()
        self.model = XGBRegressor(**kwargs)

    def fit(self, X, y):
        X = self._preprocess(X)
        X = self.ae.fit_transform(X, y)
        X = X.select_dtypes(['number'])
        cols = X.columns.to_list()
        # X = self.sc.fit_transform(X)
        self.model.fit(X, y)

        self.feature_importance = (
            pd.DataFrame(
                list(zip(cols, self.model.feature_importances_)),
                columns=['feature', 'importance'])
            .sort_values('importance', ascending=False)
        )
    def _preprocess(self, X):
        X = X.copy()
        # X['year'] = X['date'].dt.year
        # X['month_x'] = (np.sin(2 * np.pi * X['date'].dt.month/12)+1)/2
        # X['month_y'] = (np.cos(2 * np.pi * X['date'].dt.month/12)+1)/2
        # X['day_x'] = (np.sin(2 * np.pi * X['date'].dt.day/X['date'].dt.days_in_month)+1)/2
        # X['day_y'] = (np.cos(2 * np.pi * X['date'].dt.day/X['date'].dt.days_in_month)+1)/2
        # X['dow_x'] = (np.sin(2 * np.pi * X['date'].dt.day_of_week/7)+1)/2
        # X['dow_y'] = (np.cos(2 * np.pi * X['date'].dt.day_of_week/7)+1)/2
        return X
    
    def _transform(self, X):
        X = X.copy()
        X = self._preprocess(X)
        X = self.ae.transform(X)
        X = X.select_dtypes(['number'])
        # X = self.sc.transform(X)
        return X

    def predict(self, X):
        X = self._transform(X)
        return self.model.predict(X)

    def score(self, X, y):
        X = self._transform(X)
        return self.model.score(X, y)

    def inference(self, crew, budget):
        df = pd.DataFrame(
            {
            'crew': [crew],
            'budget': [budget],
            }
        )
        return float(self.predict(df)[0])

In [10]:
df = pd.read_csv('imdb_movies.csv')
df = df[df['status'] == ' Released']
df = (
    df
    .drop(['names', 'overview', 'orig_title', 'status', 'revenue'], axis=1)
    .rename({'date_x':'date', 'orig_lang':'lang', 'budget_x':'budget'}, axis=1)
)

df['score'] = df['score'] / 10
df['crew'] = df['crew'].apply(lambda x: str(x).split(', ')[::2])
df['date'] = pd.to_datetime(df['date'])
df['genre'] = df['genre'].apply(lambda x: str(x).replace('\xa0', ' ')).str.split(', ')
df['lang'] = df['lang'].str.replace(' ', '').apply(lambda x: x.split(',')[0])
df = df.sort_values('date').reset_index(drop=True)

print(df.shape[0])
df.head()

10077


Unnamed: 0,date,score,genre,cast,lang,budget,country
0,1903-05-15,6.3,"[Drama, History]","[Madame Moreau, Monsieur Moreau]",French,106400000.0,FR
1,1907-06-20,8.0,"[Adventure, Science Fiction]","[Georges Meliès, Bleuette Bernon, François Lal...",French,5985.0,AU
2,1915-02-08,6.1,"[Drama, History, War]","[Lillian Gish, Mae Marsh, Henry B. Walthall, M...",English,10000000.0,US
3,1915-02-08,6.1,"[Drama, History, War]","[Lillian Gish, Mae Marsh, Henry B. Walthall, M...",English,110000.0,US
4,1920-02-27,8.0,"[Drama, Horror, Thriller, Crime]","[Werner Krauß, Conrad Veidt, Friedrich Feher, ...",German,18000.0,DE


In [18]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('score', axis=1), df['score'], shuffle=False, test_size=.2)

processor = Processor()
processor.fit(X_train, y_train)
processor.score(X_test, y_test)

KeyError: 'crew'

In [7]:
data = load_iris()
X_train, X_test, y_train, y_test = train_test_split(selected_columns.drop('score', axis=1), selected_columns['score'], shuffle=False, test_size=.2)
# create model instance
bst = XGBRegressor(n_estimators=2, max_depth=2, learning_rate=1, objective='reg:squarederror')

# pipeline = Pipeline()
# pipeline.fit(X_train, y_train)
# pipeline.score(X_test, y_test)

# fit model
bst.fit(X_train, y_train)
# make predictions
preds = bst.predict(X_test)
bst.score(X_test, y_test)

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:names: object, crew: object

In [2]:

x = df3.select_dtypes(['number']).drop('score', axis=1)
y = df3['score']

from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBRegressor

X_train, X_test, y_train, y_test = train_test_split(X,y)

model.score(X_test.to_numpy(), y_test.to_numpy())


Unnamed: 0,crew,score
0,Michael B. Jordan,2336.0
1,Tessa Thompson,1212.5
2,Jonathan Majors,711.0
3,Wood Harris,446.0
4,Phylicia Rashād,702.0
...,...,...
39486,Jesse Collins,55.0
39487,Jack Langedijk,55.0
39488,Nina Herzog,210.0
39489,Gardner Jaas,70.0
