In [73]:
import pandas as pd

IMDB (https://datasets.imdbws.com/) has data available for use, but it requires some preprocessing. For starters, the name of the movie and the rating aren't in the same dataset. So we will combine those to one pandas dataframe.

In [74]:
imdb_basics = pd.read_csv('data_imdb_basics.tsv', sep='\t')
imdb_basics.shape

  exec(code_obj, self.user_global_ns, self.user_ns)


(4805392, 9)

In [75]:
imdb_ratings = pd.read_csv('data_imdb_ratings.tsv', sep='\t')
imdb_ratings.shape

(1280237, 3)

In [76]:
netflix = pd.read_csv('netflix_titles.csv')
netflix.shape

(8807, 12)

In [77]:
print(f"NETFLIX:\n{netflix.iloc[:1]}\n\n\nIMDB BASICS:\n{imdb_basics.iloc[:5]}\n\n\nIMDB RATINGS:\n{imdb_ratings.iloc[:5]}")

NETFLIX:
  show_id   type                 title         director cast        country  \
0      s1  Movie  Dick Johnson Is Dead  Kirsten Johnson  NaN  United States   

           date_added  release_year rating duration      listed_in  \
0  September 25, 2021          2020  PG-13   90 min  Documentaries   

                                         description  
0  As her father nears the end of his life, filmm...  


IMDB BASICS:
      tconst titleType            primaryTitle           originalTitle  \
0  tt0000001     short              Carmencita              Carmencita   
1  tt0000002     short  Le clown et ses chiens  Le clown et ses chiens   
2  tt0000003     short          Pauvre Pierrot          Pauvre Pierrot   
3  tt0000004     short             Un bon bock             Un bon bock   
4  tt0000005     short        Blacksmith Scene        Blacksmith Scene   

  isAdult startYear endYear runtimeMinutes                    genres  
0       0      1894      \N              1        

In [78]:
imdb_combined = pd.concat([imdb_basics, imdb_ratings], axis=1, join='inner')
print(f"{imdb_combined.shape}")

print(f"\nIMDB COMBINED:\n{imdb_combined.iloc[:5]}")

(1280237, 12)

IMDB COMBINED:
      tconst titleType            primaryTitle           originalTitle  \
0  tt0000001     short              Carmencita              Carmencita   
1  tt0000002     short  Le clown et ses chiens  Le clown et ses chiens   
2  tt0000003     short          Pauvre Pierrot          Pauvre Pierrot   
3  tt0000004     short             Un bon bock             Un bon bock   
4  tt0000005     short        Blacksmith Scene        Blacksmith Scene   

  isAdult startYear endYear runtimeMinutes                    genres  \
0       0      1894      \N              1         Documentary,Short   
1       0      1892      \N              5           Animation,Short   
2       0      1892      \N              4  Animation,Comedy,Romance   
3       0      1892      \N             12           Animation,Short   
4       0      1893      \N              1              Comedy,Short   

      tconst  averageRating  numVotes  
0  tt0000001            5.7      1952  
1  tt0000002

Data is now in two sets, Netflix information, and IMDB information. Lets create one data set that contains Netflix, and IMDB movies together.

In [79]:
netflix_imdb_combined = pd.merge(netflix,imdb_combined, suffixes=['_netflix','_imdb'], left_on='title', right_on='originalTitle')
print(f"{netflix_imdb_combined.shape}\n\nNETFLIX IMDB COMBINED:\n{netflix_imdb_combined.iloc[:1]}")

(13351, 24)

NETFLIX IMDB COMBINED:
  show_id     type          title       director  \
0      s6  TV Show  Midnight Mass  Mike Flanagan   

                                                cast country  \
0  Kate Siegel, Zach Gilford, Hamish Linklater, H...     NaN   

           date_added  release_year rating  duration  ...   primaryTitle  \
0  September 24, 2021          2021  TV-MA  1 Season  ...  Midnight Mass   

   originalTitle isAdult startYear endYear runtimeMinutes genres     tconst  \
0  Midnight Mass       0      1999      \N             \N  Drama  tt0216854   

  averageRating numVotes  
0           5.3       11  

[1 rows x 24 columns]


Now we have a combined dataframe of the shows available on netflix, and their imdb information. Lets filter out the TV shows

In [80]:
netflix_imdb_combined_no_tv = netflix_imdb_combined[(netflix_imdb_combined['type'] == 'Movie')]
print(f"{netflix_imdb_combined_no_tv.shape}\n\nCOMBINED NO TV:\n{netflix_imdb_combined_no_tv.iloc[:1]}")

(9317, 24)

COMBINED NO TV:
  show_id   type                             title  \
2      s7  Movie  My Little Pony: A New Generation   

                        director  \
2  Robert Cullen, José Luis Ucha   

                                                cast country  \
2  Vanessa Hudgens, Kimiko Glenn, James Marsden, ...     NaN   

           date_added  release_year rating duration  ...  \
2  September 24, 2021          2021     PG   91 min  ...   

                       primaryTitle                     originalTitle isAdult  \
2  My Little Pony: A New Generation  My Little Pony: A New Generation       0   

  startYear endYear runtimeMinutes                      genres     tconst  \
2      2021      \N             90  Adventure,Animation,Comedy  tt4485950   

  averageRating numVotes  
2           7.9       12  

[1 rows x 24 columns]


 Lets start by creating our test/training data split (85/15)

In [81]:
from sklearn.model_selection import train_test_split

In [82]:
train, test = train_test_split(netflix_imdb_combined_no_tv, test_size=0.15)

print(f"TRAINING:\t{train.shape}\n{train.iloc[:1]}\n\n\nTESTING:\t{test.shape}\n{test.iloc[:1]}")

TRAINING:	(7919, 24)
     show_id   type                title         director  \
9524   s6493  Movie  Clash of the Titans  Louis Leterrier   

                                                   cast  \
9524  Sam Worthington, Liam Neeson, Ralph Fiennes, J...   

                                       country      date_added  release_year  \
9524  United States, United Kingdom, Australia  August 1, 2019          2010   

     rating duration  ...         primaryTitle        originalTitle isAdult  \
9524  PG-13  106 min  ...  Clash of the Titans  Clash of the Titans       0   

     startYear endYear runtimeMinutes                   genres     tconst  \
9524      1981      \N            118  Action,Adventure,Family  tt0108877   

     averageRating numVotes  
9524           8.5       45  

[1 rows x 24 columns]


TESTING:	(1398, 24)
      show_id   type        title                      director cast  \
11829   s8062  Movie  Sour Grapes  Jerry Rothwell, Reuben Atlas  NaN   

            

In [83]:
# Generate list of all genres
genre_list = []

for genres in train['genres']:
  genre_sep = genres.split(',')
  genre_list = genre_list + genre_sep

# Find unique genres from our data.
list_set = set(genre_list)
unique_list_genre = (list(list_set))
for genre in unique_list_genre:
  print(genre)

Documentary
History
Talk-Show
Romance
\N
Game-Show
Short
Adult
Sport
Adventure
Family
Crime
Western
Drama
Film-Noir
Musical
Animation
Sci-Fi
Thriller
Action
Music
Fantasy
Mystery
News
Reality-TV
Biography
War
Comedy
Horror


In [84]:
Documentary = train.loc[(train['genres'].str.contains('Documentary'))]
print(f'Documentary:\t{Documentary.shape}\n{Documentary.iloc[:1]}')


History = train.loc[(train['genres'].str.contains('History'))]
print(f'\n\nHistory:\t{History.shape}\n{History.iloc[:1]}')


Romance = train.loc[(train['genres'].str.contains('Romance'))]
print(f'\n\nRomance:\t{Romance.shape}\n{Romance.iloc[:1]}')


Short = train.loc[(train['genres'].str.contains('Short'))]
print(f'\n\nShort:\t{Short.shape}\n{Short.iloc[:1]}')


Sport = train.loc[(train['genres'].str.contains('Sport'))]
print(f'\n\nSport:\t{Sport.shape}\n{Sport.iloc[:1]}')


Adventure = train.loc[(train['genres'].str.contains('Adventure'))]
print(f'\n\nAdventure:\t{Adventure.shape}\n{Adventure.iloc[:1]}')


Family = train.loc[(train['genres'].str.contains('Family'))]
print(f'\n\nFamily:\t{Family.shape}\n{Family.iloc[:1]}')


Crime = train.loc[(train['genres'].str.contains('Crime'))]
print(f'\n\nCrime:\t{Crime.shape}\n{Crime.iloc[:1]}')


Western = train.loc[(train['genres'].str.contains('Western'))]
print(f'\n\nWestern:\t{Western.shape}\n{Western.iloc[:1]}')


Drama = train.loc[(train['genres'].str.contains('Drama'))]
print(f'\n\nDrama:\t{Drama.shape}\n{Drama.iloc[:1]}')


Film_Noir = train.loc[(train['genres'].str.contains('Film-Noir'))]
print(f'\n\nFilm-Noir:\t{Film_Noir.shape}\n{Film_Noir.iloc[:1]}')


Musical = train.loc[(train['genres'].str.contains('Musical'))]
print(f'\n\nMusical:\t{Musical.shape}\n{Musical.iloc[:1]}')


Animation = train.loc[(train['genres'].str.contains('Animation'))]
print(f'\n\nAnimation:\t{Animation.shape}\n{Animation.iloc[:1]}')


Sci_Fi = train.loc[(train['genres'].str.contains('Sci-Fi'))]
print(f'\n\nSci-Fi:\t{Sci_Fi.shape}\n{Sci_Fi.iloc[:1]}')


Thriller = train.loc[(train['genres'].str.contains('Thriller'))]
print(f'\n\nThriller:\t{Thriller.shape}\n{Thriller.iloc[:1]}')


Action = train.loc[(train['genres'].str.contains('Action'))]
print(f'\n\nAction:\t{Action.shape}\n{Action.iloc[:1]}')


Music = train.loc[(train['genres'].str.contains('Music'))]
print(f'\n\nMusic:\t{Music.shape}\n{Music.iloc[:1]}')


Fantasy = train.loc[(train['genres'].str.contains('Fantasy'))]
print(f'\n\nFantasy:\t{Fantasy.shape}\n{Fantasy.iloc[:1]}')


Mystery = train.loc[(train['genres'].str.contains('Mystery'))]
print(f'\n\nMystery:\t{Mystery.shape}\n{Mystery.iloc[:1]}')


War = train.loc[(train['genres'].str.contains('War'))]
print(f'\n\nWar:\t{War.shape}\n{War.iloc[:1]}')


Comedy = train.loc[(train['genres'].str.contains('Comedy'))]
print(f'\n\nComedy:\t{Comedy.shape}\n{Comedy.iloc[:1]}')


Horror = train.loc[(train['genres'].str.contains('Horror'))]
print(f'\n\nHorror:\t{Horror.shape}\n{Horror.iloc[:1]}')

Documentary:	(477, 24)
     show_id   type   title            director  \
3019   s1822  Movie  Listen  Philippe Aractingi   

                                                   cast  country  \
3019  Hadi Bou Ayash, Ruba Zarour, Yara Bou Nassar, ...  Lebanon   

            date_added  release_year rating duration  ... primaryTitle  \
3019  October 19, 2020          2017  TV-MA  103 min  ...       Listen   

     originalTitle isAdult startYear endYear runtimeMinutes       genres  \
3019        Listen       0      2004      \N             \N  Documentary   

         tconst averageRating numVotes  
3019  tt2763990           7.7       11  

[1 rows x 24 columns]


History:	(103, 24)
      show_id   type                 title         director  \
10632   s7157  Movie  K-19: The Widowmaker  Kathryn Bigelow   

                                                    cast  \
10632  Harrison Ford, Liam Neeson, Peter Sarsgaard, S...   

                                              country        

Now all of our data is split by genre as well.