In [80]:
import pandas as pd

IMDB (https://datasets.imdbws.com/) has data available for use, but it requires some preprocessing. For starters, the name of the movie and the rating aren't in the same dataset. So we will combine those to one pandas dataframe. We can also combine any of the IMDB datasets for further data mining, as they have a shared constant for every row (tconst)


In [81]:
imdb_basics = pd.read_csv('data_imdb_basics.tsv', sep='\t')
imdb_basics.shape

  imdb_basics = pd.read_csv('data_imdb_basics.tsv', sep='\t')


(9621894, 9)

In [82]:
imdb_ratings = pd.read_csv('data_imdb_ratings.tsv', sep='\t')
imdb_ratings.shape

(1280237, 3)

In [83]:
# DEPRECATED DATASET WE ORIGINALLY WANTED TO USE
#netflix = pd.read_csv('netflix_titles.csv')
#netflix.shape

In [84]:
# DEPRECATED DATASET
# NETFLIX:\n{netflix.iloc[:1]}\n\n\n
print(f"IMDB BASICS:\n{imdb_basics.iloc[:5]}\n\n\nIMDB RATINGS:\n{imdb_ratings.iloc[:5]}")

IMDB BASICS:
      tconst titleType            primaryTitle           originalTitle  \
0  tt0000001     short              Carmencita              Carmencita   
1  tt0000002     short  Le clown et ses chiens  Le clown et ses chiens   
2  tt0000003     short          Pauvre Pierrot          Pauvre Pierrot   
3  tt0000004     short             Un bon bock             Un bon bock   
4  tt0000005     short        Blacksmith Scene        Blacksmith Scene   

  isAdult startYear endYear runtimeMinutes                    genres  
0       0      1894      \N              1         Documentary,Short  
1       0      1892      \N              5           Animation,Short  
2       0      1892      \N              4  Animation,Comedy,Romance  
3       0      1892      \N             12           Animation,Short  
4       0      1893      \N              1              Comedy,Short  


IMDB RATINGS:
      tconst  averageRating  numVotes
0  tt0000001            5.7      1952
1  tt0000002            

In [85]:
imdb_combined = pd.concat([imdb_basics, imdb_ratings], axis=1, join='inner')
print(f"{imdb_combined.shape}")

print(f"\nIMDB COMBINED:\n{imdb_combined.iloc[:5]}")

(1280237, 12)

IMDB COMBINED:
      tconst titleType            primaryTitle           originalTitle  \
0  tt0000001     short              Carmencita              Carmencita   
1  tt0000002     short  Le clown et ses chiens  Le clown et ses chiens   
2  tt0000003     short          Pauvre Pierrot          Pauvre Pierrot   
3  tt0000004     short             Un bon bock             Un bon bock   
4  tt0000005     short        Blacksmith Scene        Blacksmith Scene   

  isAdult startYear endYear runtimeMinutes                    genres  \
0       0      1894      \N              1         Documentary,Short   
1       0      1892      \N              5           Animation,Short   
2       0      1892      \N              4  Animation,Comedy,Romance   
3       0      1892      \N             12           Animation,Short   
4       0      1893      \N              1              Comedy,Short   

      tconst  averageRating  numVotes  
0  tt0000001            5.7      1952  
1  tt0000002

Data is now in two sets, Netflix information, and IMDB information. Lets create one data set that contains Netflix, and IMDB movies together.

In [86]:
# DEPRECATED DATASET
#netflix_imdb_combined = pd.merge(netflix,imdb_combined, suffixes=['_netflix','_imdb'], left_on='title', right_on='originalTitle')
#print(f"{netflix_imdb_combined.shape}\n\nNETFLIX IMDB COMBINED:\n{netflix_imdb_combined.iloc[:1]}")

Now we have a combined dataframe of the shows available on netflix, and their imdb information. Lets filter out the TV shows

In [87]:
# DEPRECATED DATASET
#netflix_imdb_combined_no_tv = netflix_imdb_combined[(netflix_imdb_combined['type'] == 'Movie')]
#print(f"{netflix_imdb_combined_no_tv.shape}\n\nCOMBINED NO TV:\n{netflix_imdb_combined_no_tv.iloc[:1]}")

Lets also filter out an NaN rows

In [88]:

imdb_combined = imdb_combined.dropna()

 Lets start by creating our test/training data split (85/15)

In [89]:
from sklearn.model_selection import train_test_split

In [90]:
train, test = train_test_split(imdb_combined, test_size=0.15)

print(f"TRAINING:\t{train.shape}\n{train.iloc[:1]}\n\n\nTESTING:\t{test.shape}\n{test.iloc[:1]}")

TRAINING:	(1088200, 12)
           tconst titleType primaryTitle originalTitle isAdult startYear  \
385198  tt0401901     movie   Temptation         Yuhok       0      1969   

       endYear runtimeMinutes          genres     tconst  averageRating  \
385198      \N             95  Drama,Thriller  tt0793028            7.6   

        numVotes  
385198        62  


TESTING:	(192036, 12)
           tconst titleType  primaryTitle originalTitle isAdult startYear  \
212757  tt0222033     short  In der Nacht  In der Nacht       0      1931   

       endYear runtimeMinutes genres     tconst  averageRating  numVotes  
212757      \N              7  Short  tt0380628            6.5        53  


In [91]:
# Generate list of all genres
genre_list = []

genres = train['genres'].unique()
for genresgroup in genres:
  if genresgroup != genresgroup:
    print(genresgroup)
  genre_sep = genresgroup.split(',')
  genre_list = genre_list + genre_sep

# Find unique genres from our data.
list_set = set(genre_list)
unique_list_genre = (list(list_set))
unique_list_genre.remove("\\N")
for genre in unique_list_genre:
  print(genre)

Western
Sci-Fi
Animation
Game-Show
Crime
Musical
Film-Noir
Adult
Horror
War
History
Thriller
Biography
Reality-TV
Short
Music
Drama
Family
Action
Romance
Adventure
Talk-Show
Mystery
News
Fantasy
Sport
Comedy
Documentary


In [92]:
# Create dictionary for all genres
genre_split = {}
# Add each genre as a key, and its dictionary as the value
for genre in unique_list_genre:
  genre_split[genre] = train.loc[(train['genres'].str.contains(genre))]

print(genre_split['Horror'].iloc[:1])

             tconst titleType         primaryTitle        originalTitle  \
1139847  tt10309384     short  Grammar Nazi Killer  Grammar Nazi Killer   

        isAdult startYear endYear runtimeMinutes        genres     tconst  \
1139847       0      2019      \N             10  Horror,Short  tt6446234   

         averageRating  numVotes  
1139847            8.2         5  


Now all of our data is split by genre as well.

In [95]:
genre_split_avg = {}
# for each genre
for genre in unique_list_genre:
  #initalize values
  avg = 0
  count = 0
  #declare working genre
  print(f"GENRE: {genre}:")
  #iterate over dataframe to find the average rating, and number of movies
  for index, row in genre_split[genre].iterrows():
    #print(row['primaryTitle'], row['averageRating'])
    avg = avg + row['averageRating']
    count = count + 1
  avg = avg/count
  print(f"\nAverage: {avg}, Number: {count}\n\n")
  #split dataframe above and below average
  genre_split_avg['B'+genre] = genre_split[genre][genre_split[genre]['averageRating'] <= avg]
  genre_split_avg['A'+genre] = genre_split[genre][genre_split[genre]['averageRating'] > avg]

GENRE: Western:

Average: 6.807561017238401, Number: 17577


GENRE: Sci-Fi:

Average: 6.9022174336160464, Number: 14386


GENRE: Animation:

Average: 6.918637813380806, Number: 53928


GENRE: Game-Show:

Average: 7.055489436619745, Number: 28400


GENRE: Crime:

Average: 6.923916035787982, Number: 81368


GENRE: Musical:

Average: 6.768030219273988, Number: 10854


GENRE: Film-Noir:

Average: 6.197754293262883, Number: 757


GENRE: Adult:

Average: 6.8731170610867975, Number: 37946


GENRE: Horror:

Average: 6.877791274955127, Number: 18934


GENRE: War:

Average: 6.755327996673252, Number: 9619


GENRE: History:

Average: 6.904943273905983, Number: 16042


GENRE: Thriller:

Average: 6.848720772619444, Number: 23608


GENRE: Biography:

Average: 6.861423499298138, Number: 12111


GENRE: Reality-TV:

Average: 7.071633939685979, Number: 24903


GENRE: Short:

Average: 6.92291333212695, Number: 143698


GENRE: Music:

Average: 6.961449907702556, Number: 59590


GENRE: Drama:

Average: 6.9

In [96]:
print(f"{genre_split_avg['AAction'].iloc[:1]}\n\n\n{genre_split_avg['BAction'].iloc[:1]}")

           tconst titleType    primaryTitle     originalTitle isAdult  \
159762  tt0165239     movie  Attack to Kill  Da jiang nan bei       0   

       startYear endYear runtimeMinutes        genres     tconst  \
159762      1975      \N             88  Action,Drama  tt0263729   

        averageRating  numVotes  
159762            8.6         6  


           tconst titleType          primaryTitle         originalTitle  \
281623  tt0294181  tvSeries  Sei jûshi Bisumaruku  Sei jûshi Bisumaruku   

       isAdult startYear endYear runtimeMinutes                      genres  \
281623       0      1984    1985             30  Action,Adventure,Animation   

           tconst  averageRating  numVotes  
281623  tt0540647            6.8         5  
