In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
from ast import literal_eval

In [4]:
# Load the raw titles table. The two list-valued columns are run through
# literal_eval while reading so they arrive as Python lists, not strings.
titleData = pd.read_csv(
    "../resources/titles.csv",
    converters=dict.fromkeys(('genres', 'production_countries'), literal_eval),
)
titleData.head()

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,ts300399,Five Came Back: The Reference Films,SHOW,This collection includes 12 World War II-era p...,1945,TV-MA,51,[documentation],[US],1.0,,,,0.6,
1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"[drama, crime]",[US],,tt0075314,8.2,808582.0,40.965,8.179
2,tm154986,Deliverance,MOVIE,Intent on seeing the Cahulawassee River before...,1972,R,109,"[drama, action, thriller, european]",[US],,tt0068473,7.7,107673.0,10.01,7.3
3,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,"[fantasy, action, comedy]",[GB],,tt0071853,8.2,534486.0,15.461,7.811
4,tm120801,The Dirty Dozen,MOVIE,12 American military prisoners in World War II...,1967,,150,"[war, action]","[GB, US]",,tt0061578,7.7,72662.0,20.398,7.6


In [5]:
# Print the column names, non-null counts and dtypes of the raw titles table.
titleData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5850 entries, 0 to 5849
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    5850 non-null   object 
 1   title                 5849 non-null   object 
 2   type                  5850 non-null   object 
 3   description           5832 non-null   object 
 4   release_year          5850 non-null   int64  
 5   age_certification     3231 non-null   object 
 6   runtime               5850 non-null   int64  
 7   genres                5850 non-null   object 
 8   production_countries  5850 non-null   object 
 9   seasons               2106 non-null   float64
 10  imdb_id               5447 non-null   object 
 11  imdb_score            5368 non-null   float64
 12  imdb_votes            5352 non-null   float64
 13  tmdb_popularity       5759 non-null   float64
 14  tmdb_score            5539 non-null   float64
dtypes: float64(5), int64(

In [6]:
# Build the show table: keep only rows whose type is "SHOW" and the columns
# listed below, drop titles without an IMDb score, and renumber rows from 0.
# The mask compares the Series with the scalar "SHOW" directly; comparing
# `.values` with a one-element list would only work via NumPy broadcasting.
showData = (
    titleData.loc[
        titleData["type"] == "SHOW",
        [
            "id", "title", "release_year", "age_certification", "runtime",
            "genres", "production_countries", "seasons", "imdb_score", "imdb_votes",
        ],
    ]
    .dropna(subset=['imdb_score'])
    .reset_index(drop=True)
)
showData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1939 entries, 0 to 1938
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    1939 non-null   object 
 1   title                 1939 non-null   object 
 2   release_year          1939 non-null   int64  
 3   age_certification     1731 non-null   object 
 4   runtime               1939 non-null   int64  
 5   genres                1939 non-null   object 
 6   production_countries  1939 non-null   object 
 7   seasons               1939 non-null   float64
 8   imdb_score            1939 non-null   float64
 9   imdb_votes            1939 non-null   float64
dtypes: float64(3), int64(2), object(5)
memory usage: 151.6+ KB


In [7]:
# Build the movie table: keep only rows whose type is "MOVIE" and the columns
# listed below (no "seasons" column here), drop titles without an IMDb score,
# and renumber rows from 0.
# The mask compares the Series with the scalar "MOVIE" directly; comparing
# `.values` with a one-element list would only work via NumPy broadcasting.
movieData = (
    titleData.loc[
        titleData["type"] == "MOVIE",
        [
            "id", "title", "release_year", "age_certification", "runtime",
            "genres", "production_countries", "imdb_score", "imdb_votes",
        ],
    ]
    .dropna(subset=['imdb_score'])
    .reset_index(drop=True)
)
movieData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3429 entries, 0 to 3428
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    3429 non-null   object 
 1   title                 3429 non-null   object 
 2   release_year          3429 non-null   int64  
 3   age_certification     1302 non-null   object 
 4   runtime               3429 non-null   int64  
 5   genres                3429 non-null   object 
 6   production_countries  3429 non-null   object 
 7   imdb_score            3429 non-null   float64
 8   imdb_votes            3413 non-null   float64
dtypes: float64(2), int64(2), object(5)
memory usage: 241.2+ KB


In [8]:
# Load the credits table and collapse each credit into a single "crew" token
# of the form "<name> <person_id> <role>", keeping only the remaining columns.
creditData = (
    pd.read_csv("../resources/credits.csv")
    .assign(
        # person_id must be a string before it can be concatenated below.
        person_id=lambda credits: credits['person_id'].astype(str),
        crew=lambda credits: credits['name'] + " " + credits['person_id'] + " " + credits['role'],
    )
    .drop(columns=['character', 'name', 'role', 'person_id'])
)
creditData

Unnamed: 0,id,crew
0,tm84618,Robert De Niro 3748 ACTOR
1,tm84618,Jodie Foster 14658 ACTOR
2,tm84618,Albert Brooks 7064 ACTOR
3,tm84618,Harvey Keitel 3739 ACTOR
4,tm84618,Cybill Shepherd 48933 ACTOR
...,...,...
77796,tm1059008,Adelaida Buscato 736339 ACTOR
77797,tm1059008,Luz Stella Luengas 399499 ACTOR
77798,tm1059008,Inés Prieto 373198 ACTOR
77799,tm1059008,Isabel Gaona 378132 ACTOR


In [9]:
# One row per title id, holding the list of all crew tokens credited on it.
creditListData = (
    creditData
    .groupby(['id'])['crew']
    .apply(list)
    .to_frame()
)
creditListData

Unnamed: 0_level_0,crew
id,Unnamed: 1_level_1
tm1000037,"[Luna Wedler 251702 ACTOR, Jannis Niewöhner 41..."
tm1000147,"[Guy Pearce 5080 ACTOR, Matilda Anna Ingrid Lu..."
tm100015,"[Idris Elba 1675 ACTOR, Paul Walker 2673 ACTOR..."
tm1000166,"[Glenn Fredly 248002 ACTOR, Marcello Tahitoe 1..."
tm1000185,"[Adrianna Chlebicka 1562688 ACTOR, Mateusz Ban..."
...,...
ts97584,"[Sebastian Perry 1323916 ACTOR, William Mitche..."
ts9794,[Jerry Seinfeld 32954 ACTOR]
ts98252,"[Tom McGrath 9240 ACTOR, John DiMaggio 1950 AC..."
ts98316,[John Hurt 4824 ACTOR]


In [10]:
# Attach the per-title crew list to every show (left join keeps shows that
# have no credits), then replace every remaining NaN with 0.
# NOTE(review): the fill is applied to all columns, so missing
# age_certification and crew values become the integer 0 rather than a string
# or a list - confirm downstream code expects that marker.
showMerged = showData.merge(creditListData, how='left', on='id').fillna(0)
showMerged.head()

Unnamed: 0,id,title,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_score,imdb_votes,crew
0,ts22164,Monty Python's Flying Circus,1969,TV-14,30,"[comedy, european]",[GB],4.0,8.8,73424.0,"[Graham Chapman 11472 ACTOR, Michael Palin 114..."
1,ts45948,Monty Python's Fliegender Zirkus,1972,TV-MA,43,[comedy],[],1.0,8.1,2151.0,"[Graham Chapman 11472 ACTOR, John Cleese 1549 ..."
2,ts20681,Seinfeld,1989,TV-PG,24,[comedy],[US],9.0,8.9,308824.0,"[Jerry Seinfeld 32954 ACTOR, Jason Alexander 1..."
3,ts22082,Knight Rider,1982,TV-PG,51,"[scifi, action, crime, drama]",[US],4.0,6.9,34115.0,"[David Hasselhoff 8464 ACTOR, Edward Mulhare 6..."
4,ts21715,Thomas & Friends,1984,TV-Y,10,"[animation, family, comedy, fantasy, drama, ac...",[GB],24.0,6.5,5104.0,[Rachael Louise Miller 1381431 ACTOR]


In [11]:
# Attach the per-title crew list to every movie (left join keeps movies that
# have no credits), then replace every remaining NaN with 0.
# NOTE(review): the fill is applied to all columns, so missing
# age_certification / crew values become the integer 0 and missing imdb_votes
# become 0.0 - confirm this is intended, since imdb_votes is later used as a
# prediction target.
movieMerged = movieData.merge(creditListData, how='left', on='id').fillna(0)
movieMerged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3429 entries, 0 to 3428
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    3429 non-null   object 
 1   title                 3429 non-null   object 
 2   release_year          3429 non-null   int64  
 3   age_certification     3429 non-null   object 
 4   runtime               3429 non-null   int64  
 5   genres                3429 non-null   object 
 6   production_countries  3429 non-null   object 
 7   imdb_score            3429 non-null   float64
 8   imdb_votes            3429 non-null   float64
 9   crew                  3429 non-null   object 
dtypes: float64(2), int64(2), object(6)
memory usage: 268.0+ KB


In [12]:
# Persist the merged show table as CSV.
# NOTE(review): to_csv is called with its defaults, so the row index is written
# as an extra unnamed first column and list-valued cells are written as their
# string representation - confirm readers of this file expect that.
showMerged.to_csv('../resources/showData.csv')

In [13]:
# Persist the merged movie table as CSV.
# NOTE(review): to_csv is called with its defaults, so the row index is written
# as an extra unnamed first column and list-valued cells are written as their
# string representation - confirm readers of this file expect that.
movieMerged.to_csv('../resources/movieData.csv')

# Deep Learning Pre-processing

In [14]:
# Movie score task: imdb_score is the target, every other merged column is a feature.
XMovieScore = movieMerged.drop(columns=['imdb_score'])
yMovieScore = movieMerged['imdb_score']

In [15]:
# Movie votes task: imdb_votes is the target, every other merged column is a feature.
XMovieVotes = movieMerged.drop(columns=['imdb_votes'])
yMovieVotes = movieMerged['imdb_votes']

In [16]:
# Show score task: imdb_score is the target, every other merged column is a feature.
XShowScore = showMerged.drop(columns=['imdb_score'])
yShowScore = showMerged['imdb_score']

In [17]:
# Show votes task: imdb_votes is the target, every other merged column is a feature.
XShowVotes = showMerged.drop(columns=['imdb_votes'])
yShowVotes = showMerged['imdb_votes']

In [18]:
# Flatten the three list-valued columns of every feature table, one after the
# other, so each output row carries a single genre, country and crew token.
# NOTE(review): pandas turns an empty list into NaN when exploding, so rows
# with e.g. an empty production_countries list get NaN here even though NaNs
# were filled earlier - confirm later steps handle that.
XMovieScoreEx, XMovieVotesEx, XShowScoreEx, XShowVotesEx = (
    features.explode('genres').explode('production_countries').explode('crew')
    for features in (XMovieScore, XMovieVotes, XShowScore, XShowVotes)
)

