## Imports

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import sklearn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

Regressors = {}
Hyperparams = {}

## Cleaning

### General

In [2]:
data = pd.read_csv('raw_complete.csv')
data.drop(data.columns[[0,1]], axis=1, inplace=True)
data.head(2)

#Target Feature
target = ["Rating"]

#Clean the budget
data['Budget'] = data['Budget'].str.replace(',', '', regex=False).str.extract(r'\$(\d+)')
budget = pd.to_numeric(data['Budget'], errors='coerce')
data['Budget'] = budget

#Make features everything but the target
features = list(data.columns)
features.remove(target[0])
feature_dict = {}

print(f'Dataset is now {len(data.columns)}-dimensional')
data.tail(2)

Dataset is now 12-dimensional


Unnamed: 0,Title,Year,Plot,Rating,Budget,Cast,MPA Rating,Directors,Distributors,Producers,Genre,Runtime in Minutes
3242,14. Jamie Foxx: What Had Happened Was...,2024,A stand-up special with Jamie Foxx performing ...,5.8,,"Jamie Foxx,Craig Brockman,Nisan Stewart,Bennet...",TV-MA,Hamish Hamilton,"Netflix,Netflix","Done and Dusted Productions,Foxxhole Productio...","Documentary,Comedy",68
3243,15. Jack in Time for Christmas,2024,Jack Whitehall races against time to get from ...,4.6,,"Jack Whitehall,Dave Bautista,Michael Bublé,Dai...",TV-MA,Phil Ashton,"Amazon Prime Video,Amazon Prime Video","Jackpot Productions,Workerbee",Comedy,66


### MPA Ratings

In [3]:
Ratings = {
    'G': 1,
    'PG': 2,
    'PG-13': 3,
    'R': 4,
    'NC-17':5}

data['MPA Rating'] = data['MPA Rating'].map(Ratings)
data['MPA Rating'] = data['MPA Rating'].fillna(6)

print(f'Dataset is now {len(data.columns)}-dimensional')
data.head(2)

Dataset is now 12-dimensional


Unnamed: 0,Title,Year,Plot,Rating,Budget,Cast,MPA Rating,Directors,Distributors,Producers,Genre,Runtime in Minutes
0,1. Whiplash,2014,A promising young drummer enrolls at a cut-thr...,8.5,3300000.0,"Miles Teller,J.K. Simmons,Paul Reiser,Melissa ...",6.0,Damien Chazelle,"Sony Pictures Classics,Sony Pictures Worldwide...","Bold Films,Blumhouse Productions,Right of Way ...","Drama,Music",106
1,2. RoboCop,2014,"In 2028 Detroit, when Alex Murphy, a loving hu...",6.1,100000000.0,"Joel Kinnaman,Gary Oldman,Michael Keaton,Abbie...",6.0,José Padilha,"Columbia Pictures,ACME,ACME,ACME,B&H Film Dist...","Metro-Goldwyn-Mayer (MGM),Columbia Pictures,St...","Action,Crime,Sci-Fi,Thriller",117


In [4]:
# Calculate quartiles and IQR
Q1 = data['Budget'].quantile(0.25)
Q3 = data['Budget'].quantile(0.75)
IQR = Q3 - Q1

# Define bounds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter outliers
outliers_iqr = data[(data['Budget'] < lower_bound) | (data['Budget'] > upper_bound)]
outliers_iqr.sort_values(by=['Budget']).tail(10)

Unnamed: 0,Title,Year,Plot,Rating,Budget,Cast,MPA Rating,Directors,Distributors,Producers,Genre,Runtime in Minutes
2119,2. Zack Snyder's Justice League,2021,Determined to ensure that Superman's ultimate ...,7.9,300000000.0,"Ben Affleck,Henry Cavill,Amy Adams,Gal Gadot,R...",4.0,Zack Snyder,"Warner Bros.,HBO Max,Amazon Prime Video,Cosmot...","Atlas Entertainment,DC Entertainment,HBO Max,R...","Action,Adventure,Fantasy,Sci-Fi",242
1164,3. Star Wars: Episode VIII - The Last Jedi,2017,Rey develops her abilities with the help of Lu...,6.9,317000000.0,"Mark Hamill,Carrie Fisher,Adam Driver,Daisy Ri...",3.0,Rian Johnson,"2i Film,B&H Film Distribution,Cinecolor Films ...","Lucasfilm,Ram Bergman Productions,Bad Robot","Action,Adventure,Fantasy,Sci-Fi",152
1260,1. Avengers: Infinity War,2018,The Avengers and their allies must be willing ...,8.4,321000000.0,"Robert Downey Jr.,Chris Hemsworth,Mark Ruffalo...",3.0,"Anthony Russo,Joe Russo","Walt Disney Studios Motion Pictures,B&H Film D...","Marvel Studios,Jason Roberts Productions,South...","Action,Adventure,Sci-Fi",149
2768,2. Fast X,2023,Dom Toretto and his family are targeted by the...,5.7,340000000.0,"Vin Diesel,Michelle Rodriguez,Jason Statham,Jo...",3.0,Louis Leterrier,"Universal Pictures,B&H Film Distribution,Cinem...","Universal Pictures,China Film Co., Ltd.,Dentsu...","Action,Adventure,Crime,Mystery,Thriller",141
2643,2. Avatar: The Way of Water,2022,Jake Sully lives with his newfound family form...,7.5,350000000.0,"Sam Worthington,Zoe Saldaña,Sigourney Weaver,S...",3.0,James Cameron,"20th Century Studios,Walt Disney Studios Motio...","20th Century Studios,TSG Entertainment,Lightst...","Action,Adventure,Fantasy,Sci-Fi",192
2770,4. The Little Mermaid,2023,A young mermaid makes a deal with a sea witch ...,7.2,355100000.0,"Halle Bailey,Jonah Hauer-King,Melissa McCarthy...",2.0,Rob Marshall,"Walt Disney Studios Motion Pictures,Feelgood E...","Walt Disney Pictures,Lucamar Productions,Marc ...","Adventure,Family,Fantasy,Musical,Romance",135
1557,1. Avengers: Endgame,2019,After the devastating events of Avengers: Infi...,8.4,356000000.0,"Robert Downey Jr.,Chris Evans,Mark Ruffalo,Chr...",3.0,"Anthony Russo,Joe Russo","B&H Film Distribution,CJ CGV Viet Nam,Cinecolo...",Marvel Studios,"Action,Adventure,Sci-Fi",181
2772,6. Indiana Jones and the Dial of Destiny,2023,Archaeologist Indiana Jones races against time...,6.5,387200000.0,"Harrison Ford,Phoebe Waller-Bridge,Antonio Ban...",3.0,James Mangold,"Walt Disney Studios Motion Pictures,Feelgood E...","Walt Disney Pictures,Lucasfilm,Paramount Pictures","Action,Adventure,Sci-Fi",154
2472,6. Doctor Strange in the Multiverse of Madness,2022,Doctor Strange teams up with a mysterious teen...,6.9,414900000.0,"Benedict Cumberbatch,Elizabeth Olsen,Chiwetel ...",3.0,Sam Raimi,"Feelgood Entertainment,Forum Hungary,HKC Enter...",Marvel Studios,"Action,Adventure,Fantasy,Sci-Fi",126
573,3. Star Wars: Episode VII - The Force Awakens,2015,"As a new threat to the galaxy rises, Rey, a de...",7.8,533000000.0,"Harrison Ford,Mark Hamill,Carrie Fisher,Adam D...",3.0,J.J. Abrams,"Walt Disney Studios Motion Pictures,B&H Film D...","Lucasfilm,Bad Robot","Action,Adventure,Sci-Fi",138


In [9]:
# I checked, they should be the same.
data.loc[2770, 'Budget'] = 40_000_000
data.loc[573, 'Budget'] = 245_000_000
data.loc[669, 'Budget'] = 30_000_000

('4. The Little Mermaid',
 '3. Star Wars: Episode VII - The Force Awakens',
 "24. Pee-wee's Big Holiday")

In [None]:
# Calculate quartiles and IQR
Q1 = data['Runtime in Minutes'].quantile(0.25)
Q3 = data['Runtime in Minutes'].quantile(0.75)
IQR = Q3 - Q1

# Define bounds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter outliers
outliers_iqr = data[(data['Runtime in Minutes'] < lower_bound) | (data['Runtime in Minutes'] > upper_bound)]
outliers_iqr.sort_values(by=['Runtime in Minutes']).tail(1)

In [None]:
# Calculate dynamic bin edges
min_val = data['Rating'].min()
max_val = data['Rating'].max()
q1 = data['Rating'].quantile(0.25)
q2 = data['Rating'].quantile(0.50)
q3 = data['Rating'].quantile(0.75)

# Create bins with these values
bins = [min_val, q2, max_val]

# Use pd.cut to assign categorical labels based on the computed bins
data['Rating'] = pd.cut(
    data['Rating'],
    bins=bins,
    labels=['bad','good'],
    include_lowest=True
)

## Split

In [6]:
# Split data -> nontest & test
x_nontest, x_test, y_nontest, y_test = train_test_split(data[features], data[target], test_size=0.2, random_state=1337)

# 60% of the entire dataset = train
# 20% of the entire dataset = val
x_train, x_val, y_train, y_val = train_test_split(x_nontest, y_nontest, test_size=0.25, random_state=1337)

print(f'Val Size: {len(x_val)}')
print(f'Train size: {len(x_train)}')
print(f'Test size: {len(x_test)}')

Val Size: 649
Train size: 1946
Test size: 649


## Vectorization

In [7]:
# fit using Train Set
vectorizer = TfidfVectorizer(token_pattern =r'[a-zA-Z]+', min_df=0.001, stop_words=list(sklearn.feature_extraction.text.ENGLISH_STOP_WORDS))
vectorizer.fit(x_train['Plot'])
tokens = vectorizer.get_feature_names_out()
feature_dict['tokens'] = list(tokens)

# Train Set
trainIDF = vectorizer.transform(x_train['Plot'])
x_train_wordvec = pd.DataFrame(trainIDF.toarray(), columns=tokens, index=x_train.index)
x_train_temp = pd.concat([x_train, x_train_wordvec], axis=1)
x_train.drop(columns=['Plot'], inplace=True)

# Test Set
testIDF = vectorizer.transform(x_test['Plot'])
x_test_wordvec = pd.DataFrame(testIDF.toarray(), columns=tokens, index=x_test.index)
x_test_temp = pd.concat([x_test, x_test_wordvec], axis=1)
x_test.drop(columns=['Plot'], inplace=True)

# Validation Set
valIDF = vectorizer.transform(x_val['Plot'])
x_val_wordvec = pd.DataFrame(valIDF.toarray(), columns=tokens, index=x_val.index)
x_val_temp = pd.concat([x_val, x_val_wordvec], axis=1)
x_val.drop(columns=['Plot'], inplace=True)

print(f'Dataset is now {len(x_val_temp.columns)}-dimensional')
x_val.head(2)

Dataset is now 12207-dimensional


Unnamed: 0,Title,Year,Budget,Cast,MPA Rating,Directors,Distributors,Producers,Genre,Runtime in Minutes
2313,22. The Manor,2021,,"Barbara Hershey,Bruce Davison,Nicholas Alexand...",6.0,Axelle Carolyn,Amazon Prime Video,"Amazon Studios,Blumhouse Television,Storm King...","Horror,Mystery",81
1309,25. The Toybox,2018,,"Denise Richards,Mischa Barton,Jeff Denton,Bria...",6.0,Tom Nagel,"High Fliers Films,Skyline Entertainment,Sunfil...","Steel House Productions,Millman Productions,Sk...",Horror,95


In [8]:
to_be_encoded = ['Cast', 'Directors', 'Distributors', 'Producers', 'Genre']

for feat in to_be_encoded:
    vectorizer = CountVectorizer(token_pattern =r'(.+?),', max_features = 100)
    vectorizer.fit(x_train[feat])
    tokens = vectorizer.get_feature_names_out()
    feature_dict[feat] = list(tokens)

    #Train Set
    trainIDF = vectorizer.transform(x_train[feat])
    x_train_wordvec = pd.DataFrame(trainIDF.toarray(), columns=tokens, index=x_train.index)
    x_train = pd.concat([x_train, x_train_wordvec], axis=1)

    #Test Set
    testIDF = vectorizer.transform(x_test[feat])
    x_test_wordvec = pd.DataFrame(testIDF.toarray(), columns=tokens, index=x_test.index)
    x_test = pd.concat([x_test, x_test_wordvec], axis=1)

    #Validation Set
    valIDF = vectorizer.transform(x_val[feat])
    x_val_wordvec = pd.DataFrame(valIDF.toarray(), columns=tokens, index=x_test.index)
    x_val = pd.concat([x_val, x_val_wordvec], axis=1)

x_val.drop(columns=to_be_encoded, inplace=True)
x_nontest.drop(columns=to_be_encoded, inplace=True)
x_test.drop(columns=to_be_encoded, inplace=True)

print(f'Dataset is now {len(x_test.columns)}-dimensional')
x_test.head(2)

ValueError: np.nan is an invalid document, expected byte or unicode string.

## PCA