In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('display.max_rows', None, 'display.max_columns', None)

## Reading in the original Netflix data set
movie = pd.read_csv('Movie_Data.csv')

## Printing the first five observations
movie.head(1)

Unnamed: 0,Title,Genre,Tags,Languages,Series or Movie,Hidden Gem Score,Country Availability,Runtime,Director,Writer,Actors,View Rating,IMDb Score,Rotten Tomatoes Score,Metacritic Score,Awards Received,Awards Nominated For,Boxoffice,Release Date,Netflix Release Date,Production House,Netflix Link,IMDb Link,Summary,IMDb Votes,Image,Poster,TMDb Trailer,Trailer Site
0,Lets Fight Ghost,"Crime, Drama, Fantasy, Horror, Romance","Comedy Programmes,Romantic TV Comedies,Horror ...","Swedish, Spanish",Series,4.3,Thailand,< 30 minutes,Tomas Alfredson,John Ajvide Lindqvist,"Lina Leandersson, Kåre Hedebrant, Per Ragnar, ...",R,7.9,98.0,82.0,74.0,57.0,"$2,122,065",12 Dec 2008,2021-03-04,"Canal+, Sandrew Metronome",https://www.netflix.com/watch/81415947,https://www.imdb.com/title/tt1139797,A med student with a supernatural gift tries t...,205926.0,https://occ-0-4708-64.1.nflxso.net/dnm/api/v6/...,https://m.media-amazon.com/images/M/MV5BOWM4NT...,https://www.youtube.com/watch?v=LqB6XJix-dM,YouTube


In [2]:
movie.shape

(9425, 29)

In [3]:
movie.columns

Index(['Title', 'Genre', 'Tags', 'Languages', 'Series or Movie',
       'Hidden Gem Score', 'Country Availability', 'Runtime', 'Director',
       'Writer', 'Actors', 'View Rating', 'IMDb Score',
       'Rotten Tomatoes Score', 'Metacritic Score', 'Awards Received',
       'Awards Nominated For', 'Boxoffice', 'Release Date',
       'Netflix Release Date', 'Production House', 'Netflix Link', 'IMDb Link',
       'Summary', 'IMDb Votes', 'Image', 'Poster', 'TMDb Trailer',
       'Trailer Site'],
      dtype='object')

### Data Cleaning

In [4]:
## Changing variable types for all necessary columns

movie['Languages'] = movie['Languages'].astype(str)
movie['Genre'] = movie['Genre'].astype(str)
movie['Tags'] = movie['Tags'].astype(str)
movie['Actors'] = movie['Actors'].astype(str)
movie['Runtime'] = movie['Runtime'].astype(str)
movie['View Rating'] = movie['View Rating'].astype(str)
movie['Release Date'] = movie['Release Date'].astype(str)

In [5]:
## Changing the Languages, Genre, Tags, and Actors variables to lists

for i in range(0, movie.shape[0]):
    movie['Languages'][i] = movie['Languages'][i].split(", ")
    movie['Genre'][i] = movie['Genre'][i].split(", ")
    movie['Tags'][i] = movie['Tags'][i].split(", ")
    movie['Actors'][i] = movie['Actors'][i].split(", ")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie['Languages'][i] = movie['Languages'][i].split(", ")
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie['Genre'][i] = movie['Genre'][i].split(", ")
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie['Tags'][i] = movie['Tags'][i].split(", ")
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie['Actor

In [6]:
## Dropping unnecessary columns
movie = movie.drop(columns = ['Country Availability', 'Writer', 'Netflix Release Date', 'Production House', 'IMDb Votes', 
                             'Trailer Site', 'Boxoffice'])

In [7]:
## Creating a new data point for each "Language" option using the explode() function from Pandas

movie = movie.explode('Languages').reset_index(drop = True)

In [8]:
## Dropping observations missing a languages 

movie = movie[movie['Languages'] != 'nan']

## Resetting the index
movie = movie.reset_index(drop = True)

In [9]:
## Only keeping languages with more than 100 observations

top_languages = ['English', 'Japanese', 'Spanish', 'French', 'Korean', 'German', 'Hindi', 'Mandarin', 
                'Italian', 'Russian', 'Arabic', 'Cantonese', 'Portuguese', 'Thai', 'Dutch']


for i in range(0, movie.shape[0]):
    
    ## If the movie language is not a top language:
    if np.isin(movie['Languages'][i], top_languages, invert = True):
        
        ## Drop that observation from the data set
        movie = movie.drop([i])

## Resetting the index
movie = movie.reset_index(drop = True)

### Extra Cleaning

In [10]:
## Runtime
movie.at[8447, 'Runtime'] = '30-60 mins'

In [11]:
## View Rating
movie['View Rating'] = np.where(movie['View Rating'] == 'NOT RATED', 'Not Rated', movie['View Rating'])
movie['View Rating'] = np.where(movie['View Rating'] == 'Unrated', 'Not Rated', movie['View Rating'])
movie['View Rating'] = np.where(movie['View Rating'] == 'nan', 'Not Rated', movie['View Rating'])

movie['View Rating'] = np.where(movie['View Rating'] == 'X', 'NC-17', movie['View Rating'])
movie['View Rating'] = np.where(movie['View Rating'] == 'MA-17', 'NC-17', movie['View Rating'])
movie['View Rating'] = np.where(movie['View Rating'] == 'M/PG', 'R', movie['View Rating'])
movie['View Rating'] = np.where(movie['View Rating'] == 'Approved', 'PG-13', movie['View Rating'])
movie['View Rating'] = np.where(movie['View Rating'] == 'Passed', 'PG-13', movie['View Rating'])
movie['View Rating'] = np.where(movie['View Rating'] == 'GP', 'PG', movie['View Rating'])
movie['View Rating'] = np.where(movie['View Rating'] == 'E', 'G', movie['View Rating'])
movie['View Rating'] = np.where(movie['View Rating'] == 'AL', 'G', movie['View Rating'])

movie['View Rating'] = np.where(movie['View Rating'] == 'TV-Y', 'G', movie['View Rating'])
movie['View Rating'] = np.where(movie['View Rating'] == 'TV-Y7', 'G', movie['View Rating'])
movie['View Rating'] = np.where(movie['View Rating'] == 'TV-Y7-FV', 'G', movie['View Rating'])
movie['View Rating'] = np.where(movie['View Rating'] == 'TV-G', 'G', movie['View Rating'])
movie['View Rating'] = np.where(movie['View Rating'] == 'TV-PG', 'PG', movie['View Rating'])
movie['View Rating'] = np.where(movie['View Rating'] == 'TV-13', 'PG-13', movie['View Rating'])
movie['View Rating'] = np.where(movie['View Rating'] == 'TV-14', 'PG-13', movie['View Rating'])
movie['View Rating'] = np.where(movie['View Rating'] == 'TV-MA', 'NC-17', movie['View Rating'])

### Variable Engineering

In [12]:
movie.head()

Unnamed: 0,Title,Genre,Tags,Languages,Series or Movie,Hidden Gem Score,Runtime,Director,Actors,View Rating,IMDb Score,Rotten Tomatoes Score,Metacritic Score,Awards Received,Awards Nominated For,Release Date,Netflix Link,IMDb Link,Summary,Image,Poster,TMDb Trailer
0,Lets Fight Ghost,"[Crime, Drama, Fantasy, Horror, Romance]","[Comedy Programmes,Romantic TV Comedies,Horror...",Spanish,Series,4.3,< 30 minutes,Tomas Alfredson,"[Lina Leandersson, Kåre Hedebrant, Per Ragnar,...",R,7.9,98.0,82.0,74.0,57.0,12 Dec 2008,https://www.netflix.com/watch/81415947,https://www.imdb.com/title/tt1139797,A med student with a supernatural gift tries t...,https://occ-0-4708-64.1.nflxso.net/dnm/api/v6/...,https://m.media-amazon.com/images/M/MV5BOWM4NT...,https://www.youtube.com/watch?v=LqB6XJix-dM
1,HOW TO BUILD A GIRL,[Comedy],"[Dramas,Comedies,Films Based on Books,British]",English,Movie,7.0,1-2 hour,Coky Giedroyc,"[Cleo, Paddy Considine, Beanie Feldstein, Dóna...",R,5.8,79.0,69.0,1.0,,08 May 2020,https://www.netflix.com/watch/81041267,https://www.imdb.com/title/tt4193072,"When nerdy Johanna moves to London, things get...",https://occ-0-1081-999.1.nflxso.net/dnm/api/v6...,https://m.media-amazon.com/images/M/MV5BZGUyN2...,https://www.youtube.com/watch?v=eIbcxPy4okQ
2,The Con-Heartist,"[Comedy, Romance]","[Romantic Comedies,Comedies,Romantic Films,Tha...",Thai,Movie,8.6,> 2 hrs,Mez Tharatorn,"[Kathaleeya McIntosh, Nadech Kugimiya, Pimchan...",Not Rated,7.4,,,,,03 Dec 2020,https://www.netflix.com/watch/81306155,https://www.imdb.com/title/tt13393728,After her ex-boyfriend cons her out of a large...,https://occ-0-2188-64.1.nflxso.net/dnm/api/v6/...,https://m.media-amazon.com/images/M/MV5BODAzOG...,https://www.youtube.com/watch?v=md3CmFLGK6Y
3,Snowroller,[Comedy],"[Sports Movies,Sports Comedies,Comedies,Swedis...",English,Movie,5.3,1-2 hour,Lasse Åberg,"[Lasse Åberg, Cecilia Walton, Eva Millberg, Jo...",Not Rated,6.6,,,,,04 Oct 1985,https://www.netflix.com/watch/81382187,https://www.imdb.com/title/tt0090115,"Two friends take a ski trip to the Alps, where...",https://occ-0-2851-41.1.nflxso.net/dnm/api/v6/...,https://m.media-amazon.com/images/M/MV5BZDY2NG...,https://www.youtube.com/watch?v=tjWouBLwe3c
4,Snowroller,[Comedy],"[Sports Movies,Sports Comedies,Comedies,Swedis...",German,Movie,5.3,1-2 hour,Lasse Åberg,"[Lasse Åberg, Cecilia Walton, Eva Millberg, Jo...",Not Rated,6.6,,,,,04 Oct 1985,https://www.netflix.com/watch/81382187,https://www.imdb.com/title/tt0090115,"Two friends take a ski trip to the Alps, where...",https://occ-0-2851-41.1.nflxso.net/dnm/api/v6/...,https://m.media-amazon.com/images/M/MV5BZDY2NG...,https://www.youtube.com/watch?v=tjWouBLwe3c


In [13]:
## Runtime Categoricals

movie['Runtime_1'] = np.where(movie['Runtime'] == '< 30 minutes', 1, 0)
movie['Runtime_2'] = np.where(movie['Runtime'] == '30-60 mins', 1, 0)
movie['Runtime_3'] = np.where(movie['Runtime'] == '1-2 hour', 1, 0)
movie['Runtime_4'] = np.where(movie['Runtime'] == '> 2 hrs', 1, 0)

In [14]:
movie['Runtime'].value_counts()

1-2 hour        6812
< 30 minutes    2812
> 2 hrs         2350
30-60 mins       135
Name: Runtime, dtype: int64

In [15]:
## Average Score

## Putting all scores on the same scale
movie['Hidden Gem Score'] = movie['Hidden Gem Score'] * 10
movie['IMDb Score'] = movie['IMDb Score'] * 10

## Calculating the average of all scores
movie['Score'] = movie[['Hidden Gem Score', 'IMDb Score', 'Rotten Tomatoes Score', 'Metacritic Score']].mean(axis = 1)

In [16]:
movie['Score'].describe()

count    12109.000000
mean        62.935633
std         15.116867
min         10.250000
25%         54.000000
50%         65.500000
75%         75.000000
max         95.500000
Name: Score, dtype: float64

In [17]:
## View Rating Categoricals

movie['Rating_G'] = np.where(movie['View Rating'] == 'G', 1, 0)
movie['Rating_PG'] = np.where(movie['View Rating'] == 'PG', 1, 0)
movie['Rating_PG-13'] = np.where(movie['View Rating'] == 'PG-13', 1, 0)
movie['Rating_R'] = np.where(movie['View Rating'] == 'R', 1, 0)
movie['Rating_NC-17'] = np.where(movie['View Rating'] == 'NC-17', 1, 0)
movie['Rating_Not_Rated'] = np.where(movie['View Rating'] == 'Not Rated', 1, 0)

In [35]:
## Release Year

movie['Release'] = 0
movie['Release'] = movie['Release'].astype(str)

for i in range(0, movie.shape[0]):
    movie['Release'][i] = movie['Release Date'][i][-4:]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie['Release'][i] = movie['Release Date'][i][-4:]


In [44]:
## Release Decade

movie['Release'] = np.where(movie['Release'] == 'nan', 0, movie['Release'])
movie['Release'] = movie['Release'].astype(int)

movie['Release_0'] = np.where((movie['Release'] >= 1900) & (movie['Release'] < 1940), 1, 0)
movie['Release_40'] = np.where((movie['Release'] >= 1940) & (movie['Release'] < 1950), 1, 0)
movie['Release_50'] = np.where((movie['Release'] >= 1950) & (movie['Release'] < 1960), 1, 0)
movie['Release_60'] = np.where((movie['Release'] >= 1960) & (movie['Release'] < 1970), 1, 0)
movie['Release_70'] = np.where((movie['Release'] >= 1970) & (movie['Release'] < 1980), 1, 0)
movie['Release_80'] = np.where((movie['Release'] >= 1980) & (movie['Release'] < 1990), 1, 0)
movie['Release_90'] = np.where((movie['Release'] >= 1990) & (movie['Release'] < 2000), 1, 0)
movie['Release_00'] = np.where((movie['Release'] >= 2000) & (movie['Release'] < 2010), 1, 0)
movie['Release_10'] = np.where((movie['Release'] >= 2010) & (movie['Release'] < 2020), 1, 0)
movie['Release_20'] = np.where((movie['Release'] >= 2020) & (movie['Release'] < 2030), 1, 0)

  res_values = method(rvalues)


In [None]:
## Genre

for i in range(0, 10):

movie['Genre_Crime'] = ''
movie['Genre_Drama'] = ''
movie['Genre_Fantasy'] = ''
movie['Genre_Horror'] = ''
movie['Genre_Romance'] = ''
movie['Genre_Comedy'] = ''
movie['Genre_Mystery'] = ''
movie['Genre_Thriller'] = ''
movie['Genre_Short'] = ''
movie['Genre_Action'] = ''
movie['Genre_Adventure'] = ''
movie['Genre_Sci-Fi'] = ''
movie['Genre_Animation'] = ''
movie['Genre_Family'] = ''
movie['Genre_Biography'] = ''
movie['Genre_War'] = ''
movie['Genre_History'] = ''
movie['Genre_Documentary'] = ''
movie['Genre_Film-Noir'] = ''
movie['Genre_Sport'] = ''
movie['Genre_Game-Show'] = ''
movie['Genre_Western'] = ''
movie['Genre_Music'] = ''
movie['Genre_Musical'] = ''
movie['Genre_ nan '] = ''
movie['Genre_Reality-TV'] = ''
movie['Genre_News'] = ''
movie['Genre_Talk-Show'] = ''
movie['Genre_Adult'] = ''

In [59]:
genres = []

for i in range(0, movie.shape[0]):
    for j in range(0, len(movie['Genre'][i])):
          if np.isin(movie['Genre'][i][j], genres, invert = True):
                   genres.append(movie['Genre'][i][j])

genres

['Crime',
 'Drama',
 'Fantasy',
 'Horror',
 'Romance',
 'Comedy',
 'Mystery',
 'Thriller',
 'Short',
 'Action',
 'Adventure',
 'Sci-Fi',
 'Animation',
 'Family',
 'Biography',
 'War',
 'History',
 'Documentary',
 'Film-Noir',
 'Sport',
 'Game-Show',
 'Western',
 'Music',
 'Musical',
 'nan',
 'Reality-TV',
 'News',
 'Talk-Show',
 'Adult']

In [67]:
movie[movie['Genre'] == ['nan']]

ValueError: Lengths must match to compare

In [61]:
len(genres)

29

In [73]:
for i in range(0, len(genres)):
    print("movie['Genre_", genres[i], "'] = ''")

movie['Genre_ Crime '] = ''
movie['Genre_ Drama '] = ''
movie['Genre_ Fantasy '] = ''
movie['Genre_ Horror '] = ''
movie['Genre_ Romance '] = ''
movie['Genre_ Comedy '] = ''
movie['Genre_ Mystery '] = ''
movie['Genre_ Thriller '] = ''
movie['Genre_ Short '] = ''
movie['Genre_ Action '] = ''
movie['Genre_ Adventure '] = ''
movie['Genre_ Sci-Fi '] = ''
movie['Genre_ Animation '] = ''
movie['Genre_ Family '] = ''
movie['Genre_ Biography '] = ''
movie['Genre_ War '] = ''
movie['Genre_ History '] = ''
movie['Genre_ Documentary '] = ''
movie['Genre_ Film-Noir '] = ''
movie['Genre_ Sport '] = ''
movie['Genre_ Game-Show '] = ''
movie['Genre_ Western '] = ''
movie['Genre_ Music '] = ''
movie['Genre_ Musical '] = ''
movie['Genre_ nan '] = ''
movie['Genre_ Reality-TV '] = ''
movie['Genre_ News '] = ''
movie['Genre_ Talk-Show '] = ''
movie['Genre_ Adult '] = ''


In [None]:
top_languages = ['English', 'Japanese', 'Spanish', 'French', 'Korean', 'German', 'Hindi', 'Mandarin', 
                'Italian', 'Russian', 'Arabic', 'Cantonese', 'Portuguese', 'Thai', 'Dutch']


for i in range(0, movie.shape[0]):
    
    ## If the movie language is not a top language:
    if np.isin(movie['Languages'][i], top_languages, invert = True):
        
        ## Drop that observation from the data set
        movie = movie.drop([i])

## Resetting the index
movie = movie.reset_index(drop = True)

In [53]:
movie['Genre'][10][0]

'Animation'

In [49]:
movie['Genre'].value_counts()

[Drama]                                                                                                      679
[Comedy]                                                                                                     621
[Drama, Romance]                                                                                             395
[Documentary]                                                                                                383
[Comedy, Drama, Romance]                                                                                     371
[Comedy, Drama]                                                                                              339
[Comedy, Romance]                                                                                            323
[Action, Crime, Thriller]                                                                                    239
[Crime, Drama, Thriller]                                                                        

In [47]:
movie.head(1)

Unnamed: 0,Title,Genre,Tags,Languages,Series or Movie,Hidden Gem Score,Runtime,Director,Actors,View Rating,IMDb Score,Rotten Tomatoes Score,Metacritic Score,Awards Received,Awards Nominated For,Release Date,Netflix Link,IMDb Link,Summary,Image,Poster,TMDb Trailer,Runtime_1,Runtime_2,Runtime_3,Runtime_4,Score,Rating_G,Rating_PG,Rating_PG-13,Rating_R,Rating_NC-17,Rating_Not_Rated,Release,Release_0,Release_40,Release_50,Release_60,Release_70,Release_80,Release_90,Release_00,Release_10,Release_20
0,Lets Fight Ghost,"[Crime, Drama, Fantasy, Horror, Romance]","[Comedy Programmes,Romantic TV Comedies,Horror...",Spanish,Series,43.0,< 30 minutes,Tomas Alfredson,"[Lina Leandersson, Kåre Hedebrant, Per Ragnar,...",R,79.0,98.0,82.0,74.0,57.0,12 Dec 2008,https://www.netflix.com/watch/81415947,https://www.imdb.com/title/tt1139797,A med student with a supernatural gift tries t...,https://occ-0-4708-64.1.nflxso.net/dnm/api/v6/...,https://m.media-amazon.com/images/M/MV5BOWM4NT...,https://www.youtube.com/watch?v=LqB6XJix-dM,1,0,0,0,75.5,0,0,0,1,0,0,2008,0,0,0,0,0,0,0,1,0,0


In [None]:
## Dropping unnecessary columns after engineering process
movie = movie.drop(columns = ['Release Date', 'Release', 'Hdden Gem Score', 'Runtime', 'IMDb Score', 'Rotten Tomatoes Score', 
                             'Metacritic Score', ''])



### Data Subsetting for Algorithm

In [None]:
## Creating "Movie" and "Series" subsets

movies = movie[movie['Series or Movie'] == 'Movie'].reset_index(drop = True)

series = movie[movie['Series or Movie'] == 'Series'].reset_index(drop = True)