In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MultiLabelBinarizer
## Never truncate rows or columns when a frame is rendered in the notebook
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

## Loading the original Netflix data set (path is relative to the working directory)
movie = pd.read_csv('Movie_Data.csv')

## Showing the first five observations
movie.head()

Unnamed: 0,Title,Genre,Tags,Languages,Series or Movie,Hidden Gem Score,Country Availability,Runtime,Director,Writer,Actors,View Rating,IMDb Score,Rotten Tomatoes Score,Metacritic Score,Awards Received,Awards Nominated For,Boxoffice,Release Date,Netflix Release Date,Production House,Netflix Link,IMDb Link,Summary,IMDb Votes,Image,Poster,TMDb Trailer,Trailer Site
0,Lets Fight Ghost,"Crime, Drama, Fantasy, Horror, Romance","Comedy Programmes,Romantic TV Comedies,Horror ...","Swedish, Spanish",Series,4.3,Thailand,< 30 minutes,Tomas Alfredson,John Ajvide Lindqvist,"Lina Leandersson, Kåre Hedebrant, Per Ragnar, ...",R,7.9,98.0,82.0,74.0,57.0,"$2,122,065",12 Dec 2008,2021-03-04,"Canal+, Sandrew Metronome",https://www.netflix.com/watch/81415947,https://www.imdb.com/title/tt1139797,A med student with a supernatural gift tries t...,205926.0,https://occ-0-4708-64.1.nflxso.net/dnm/api/v6/...,https://m.media-amazon.com/images/M/MV5BOWM4NT...,https://www.youtube.com/watch?v=LqB6XJix-dM,YouTube
1,HOW TO BUILD A GIRL,Comedy,"Dramas,Comedies,Films Based on Books,British",English,Movie,7.0,Canada,1-2 hour,Coky Giedroyc,Caitlin Moran,"Cleo, Paddy Considine, Beanie Feldstein, Dónal...",R,5.8,79.0,69.0,1.0,,"$70,632",08 May 2020,2021-03-04,"Film 4, Monumental Pictures, Lionsgate",https://www.netflix.com/watch/81041267,https://www.imdb.com/title/tt4193072,"When nerdy Johanna moves to London, things get...",2838.0,https://occ-0-1081-999.1.nflxso.net/dnm/api/v6...,https://m.media-amazon.com/images/M/MV5BZGUyN2...,https://www.youtube.com/watch?v=eIbcxPy4okQ,YouTube
2,The Con-Heartist,"Comedy, Romance","Romantic Comedies,Comedies,Romantic Films,Thai...",Thai,Movie,8.6,Thailand,> 2 hrs,Mez Tharatorn,"Pattaranad Bhiboonsawade, Mez Tharatorn, Thods...","Kathaleeya McIntosh, Nadech Kugimiya, Pimchano...",,7.4,,,,,,03 Dec 2020,2021-03-03,,https://www.netflix.com/watch/81306155,https://www.imdb.com/title/tt13393728,After her ex-boyfriend cons her out of a large...,131.0,https://occ-0-2188-64.1.nflxso.net/dnm/api/v6/...,https://m.media-amazon.com/images/M/MV5BODAzOG...,https://www.youtube.com/watch?v=md3CmFLGK6Y,YouTube
3,Gleboka woda,Drama,"TV Dramas,Polish TV Shows,Social Issue TV Dramas",Polish,Series,8.7,Poland,< 30 minutes,,,"Katarzyna Maciag, Piotr Nowak, Marcin Dorocins...",,7.5,,,2.0,4.0,,14 Jun 2011,2021-03-03,,https://www.netflix.com/watch/81307527,https://www.imdb.com/title/tt2300049,A group of social welfare workers led by their...,47.0,https://occ-0-2508-2706.1.nflxso.net/dnm/api/v...,https://m.media-amazon.com/images/M/MV5BMTc0Nz...,https://www.youtube.com/watch?v=5kyF2vy63r0,YouTube
4,Only a Mother,Drama,"Social Issue Dramas,Dramas,Movies Based on Boo...",Swedish,Movie,8.3,"Lithuania,Poland,France,Italy,Spain,Greece,Bel...",1-2 hour,Alf Sjöberg,Ivar Lo-Johansson,"Hugo Björne, Eva Dahlbeck, Ulf Palme, Ragnar F...",,6.7,,,2.0,1.0,,31 Oct 1949,2021-03-03,,https://www.netflix.com/watch/81382068,https://www.imdb.com/title/tt0041155,An unhappily married farm worker struggling to...,88.0,https://occ-0-2851-41.1.nflxso.net/dnm/api/v6/...,https://m.media-amazon.com/images/M/MV5BMjVmMz...,https://www.youtube.com/watch?v=H0itWKFwMpQ,YouTube


In [2]:
## Dimensions of the freshly loaded data as (rows, columns)
movie.shape

(9425, 29)

In [3]:
## Column labels of the freshly loaded data
movie.columns

Index(['Title', 'Genre', 'Tags', 'Languages', 'Series or Movie',
       'Hidden Gem Score', 'Country Availability', 'Runtime', 'Director',
       'Writer', 'Actors', 'View Rating', 'IMDb Score',
       'Rotten Tomatoes Score', 'Metacritic Score', 'Awards Received',
       'Awards Nominated For', 'Boxoffice', 'Release Date',
       'Netflix Release Date', 'Production House', 'Netflix Link', 'IMDb Link',
       'Summary', 'IMDb Votes', 'Image', 'Poster', 'TMDb Trailer',
       'Trailer Site'],
      dtype='object')

### Data Cleaning

In [4]:
## Casting every text-like column to str so that string operations and
## string comparisons can be applied to it in the cells that follow

text_columns = ['Languages', 'Genre', 'Tags', 'Actors', 'Runtime', 'View Rating', 'Release Date']

for column in text_columns:
    movie[column] = movie[column].astype(str)

In [5]:
## Changing the Languages, Genre, Tags, and Actors variables to lists
##
## Each column holds one delimited string per row. Note that Tags is
## separated by a bare comma while the other three use comma + space.
## The split is applied column-wise, so it does not depend on the index
## labels being 0..n-1 and avoids a Python-level loop over row labels.

list_separators = {'Languages': ', ', 'Genre': ', ', 'Tags': ',', 'Actors': ', '}

for column, separator in list_separators.items():
    ## `sep = separator` binds the current separator to this lambda
    movie[column] = movie[column].map(lambda text, sep = separator: text.split(sep))

In [6]:
## Removing the columns that are not used in the rest of the notebook
unused_columns = [
    'Country Availability',
    'Writer',
    'Netflix Release Date',
    'Production House',
    'IMDb Votes',
    'Trailer Site',
    'Boxoffice',
    'Hidden Gem Score',
    'Rotten Tomatoes Score',
    'Metacritic Score',
]

movie = movie.drop(columns = unused_columns)

In [7]:
## One row per (title, language) pair: explode() from Pandas repeats an
## observation once for every entry of its "Languages" list

movie = movie.explode('Languages')

## explode() repeats the original index labels, so renumber the rows
movie = movie.reset_index(drop = True)

In [8]:
## Dropping observations with a missing language (stored as the string 'nan')

has_language = movie['Languages'] != 'nan'

## Keeping the matching rows and renumbering the index in one step
movie = movie.loc[has_language].reset_index(drop = True)

In [9]:
## Only keeping languages with more than 100 observations
## NOTE(review): the list below is hard-coded; presumably it was derived from
## the language counts of this data set -- re-check it if the input changes.

top_languages = ['English', 'Japanese', 'Spanish', 'French', 'Korean', 'German', 'Hindi', 'Mandarin', 
                'Italian', 'Russian', 'Arabic', 'Cantonese', 'Portuguese', 'Thai', 'Dutch']

## A single boolean mask keeps the rows whose language is a top language.
## This replaces dropping rows one at a time inside a loop, which copied the
## whole frame on every drop and relied on the index labels matching the
## row positions.
movie = movie[movie['Languages'].isin(top_languages)]

## Resetting the index
movie = movie.reset_index(drop = True)

### Extra Cleaning

In [10]:
## Runtime
## Manual fix: overwrites the Runtime of the row labelled 8447 with '30-60 mins'.
## NOTE(review): 8447 is a row label that only exists after the preceding
## explode / filter / reset_index cells, so it will point at a different
## observation if the input file or any earlier cell changes. Confirm which
## title this is meant to fix and consider selecting it by a stable key.
movie.at[8447, 'Runtime'] = '30-60 mins'

In [11]:
## View Rating
##
## Collapsing every raw rating label onto one of six categories:
## G, PG, PG-13, R, NC-17 and Not Rated. The lookup table replaces a long
## chain of one-label-at-a-time passes over the column. No target label is
## also a source label, so a single pass gives the same result.

rating_map = {
    ## Missing or explicitly unrated
    'NOT RATED': 'Not Rated',
    'Unrated': 'Not Rated',
    'nan': 'Not Rated',

    ## Film rating labels
    'X': 'NC-17',
    'MA-17': 'NC-17',
    'M/PG': 'R',
    'Approved': 'PG-13',
    'Passed': 'PG-13',
    'GP': 'PG',
    'E': 'G',
    'AL': 'G',

    ## TV rating labels
    'TV-Y': 'G',
    'TV-Y7': 'G',
    'TV-Y7-FV': 'G',
    'TV-G': 'G',
    'TV-PG': 'PG',
    'TV-13': 'PG-13',
    'TV-14': 'PG-13',
    'TV-MA': 'NC-17',
}

## Labels that are not in the table are kept unchanged
movie['View Rating'] = movie['View Rating'].map(lambda rating: rating_map.get(rating, rating))

### Variable Engineering

In [12]:
## First five observations after cleaning (one row per title/language pair)
movie.head()

Unnamed: 0,Title,Genre,Tags,Languages,Series or Movie,Runtime,Director,Actors,View Rating,IMDb Score,Awards Received,Awards Nominated For,Release Date,Netflix Link,IMDb Link,Summary,Image,Poster,TMDb Trailer
0,Lets Fight Ghost,"[Crime, Drama, Fantasy, Horror, Romance]","[Comedy Programmes, Romantic TV Comedies, Horr...",Spanish,Series,< 30 minutes,Tomas Alfredson,"[Lina Leandersson, Kåre Hedebrant, Per Ragnar,...",R,7.9,74.0,57.0,12 Dec 2008,https://www.netflix.com/watch/81415947,https://www.imdb.com/title/tt1139797,A med student with a supernatural gift tries t...,https://occ-0-4708-64.1.nflxso.net/dnm/api/v6/...,https://m.media-amazon.com/images/M/MV5BOWM4NT...,https://www.youtube.com/watch?v=LqB6XJix-dM
1,HOW TO BUILD A GIRL,[Comedy],"[Dramas, Comedies, Films Based on Books, British]",English,Movie,1-2 hour,Coky Giedroyc,"[Cleo, Paddy Considine, Beanie Feldstein, Dóna...",R,5.8,1.0,,08 May 2020,https://www.netflix.com/watch/81041267,https://www.imdb.com/title/tt4193072,"When nerdy Johanna moves to London, things get...",https://occ-0-1081-999.1.nflxso.net/dnm/api/v6...,https://m.media-amazon.com/images/M/MV5BZGUyN2...,https://www.youtube.com/watch?v=eIbcxPy4okQ
2,The Con-Heartist,"[Comedy, Romance]","[Romantic Comedies, Comedies, Romantic Films, ...",Thai,Movie,> 2 hrs,Mez Tharatorn,"[Kathaleeya McIntosh, Nadech Kugimiya, Pimchan...",Not Rated,7.4,,,03 Dec 2020,https://www.netflix.com/watch/81306155,https://www.imdb.com/title/tt13393728,After her ex-boyfriend cons her out of a large...,https://occ-0-2188-64.1.nflxso.net/dnm/api/v6/...,https://m.media-amazon.com/images/M/MV5BODAzOG...,https://www.youtube.com/watch?v=md3CmFLGK6Y
3,Snowroller,[Comedy],"[Sports Movies, Sports Comedies, Comedies, Swe...",English,Movie,1-2 hour,Lasse Åberg,"[Lasse Åberg, Cecilia Walton, Eva Millberg, Jo...",Not Rated,6.6,,,04 Oct 1985,https://www.netflix.com/watch/81382187,https://www.imdb.com/title/tt0090115,"Two friends take a ski trip to the Alps, where...",https://occ-0-2851-41.1.nflxso.net/dnm/api/v6/...,https://m.media-amazon.com/images/M/MV5BZDY2NG...,https://www.youtube.com/watch?v=tjWouBLwe3c
4,Snowroller,[Comedy],"[Sports Movies, Sports Comedies, Comedies, Swe...",German,Movie,1-2 hour,Lasse Åberg,"[Lasse Åberg, Cecilia Walton, Eva Millberg, Jo...",Not Rated,6.6,,,04 Oct 1985,https://www.netflix.com/watch/81382187,https://www.imdb.com/title/tt0090115,"Two friends take a ski trip to the Alps, where...",https://occ-0-2851-41.1.nflxso.net/dnm/api/v6/...,https://m.media-amazon.com/images/M/MV5BZDY2NG...,https://www.youtube.com/watch?v=tjWouBLwe3c


In [13]:
## Scoring System

score_columns = ['IMDb Score', 'Awards Received', 'Awards Nominated For']

## Any value that is not strictly positive (missing values included) becomes zero
for column in score_columns:
    movie[column] = np.where(movie[column] > 0, movie[column], 0)

## Rescaling the three metrics to the common range [0, 1]
scaler = MinMaxScaler(feature_range = (0, 1))
movie[score_columns] = scaler.fit_transform(movie[score_columns])

## The popularity score is the plain sum of the three rescaled metrics
movie['Popularity_Score'] = movie['IMDb Score'] + movie['Awards Received'] + movie['Awards Nominated For']

In [14]:
## Runtime Categoricals
## One 0/1 indicator column per runtime bucket

runtime_flags = {
    'Runtime_1': '< 30 minutes',
    'Runtime_2': '30-60 mins',
    'Runtime_3': '1-2 hour',
    'Runtime_4': '> 2 hrs',
}

for flag_name, bucket in runtime_flags.items():
    movie[flag_name] = np.where(movie['Runtime'] == bucket, 1, 0)

In [15]:
## View Rating Categoricals
## One 0/1 indicator column per view rating category

rating_flags = {
    'Rating_G': 'G',
    'Rating_PG': 'PG',
    'Rating_PG-13': 'PG-13',
    'Rating_R': 'R',
    'Rating_NC-17': 'NC-17',
    'Rating_Not_Rated': 'Not Rated',
}

for flag_name, rating in rating_flags.items():
    movie[flag_name] = np.where(movie['View Rating'] == rating, 1, 0)

In [16]:
## Release Year
##
## The year is the last four characters of the 'Release Date' text
## (e.g. '12 Dec 2008' -> '2008'). A missing date is stored as the string
## 'nan', whose last four characters are still 'nan'.
## The slice is taken column-wise, so no placeholder column is needed and
## nothing depends on the index labels being 0..n-1.

movie['Release'] = movie['Release Date'].astype(str).str[-4:]

In [17]:
## Release Decade

## A missing year (the string 'nan') becomes 0; the column is then made integer
movie['Release'] = np.where(movie['Release'] == 'nan', 0, movie['Release'])
movie['Release'] = movie['Release'].astype(int)

## (indicator column, first year included, first year excluded)
## 'Release_0' covers everything from 1900 up to 1939; the rest are single decades
decade_bounds = [
    ('Release_0', 1900, 1940),
    ('Release_40', 1940, 1950),
    ('Release_50', 1950, 1960),
    ('Release_60', 1960, 1970),
    ('Release_70', 1970, 1980),
    ('Release_80', 1980, 1990),
    ('Release_90', 1990, 2000),
    ('Release_00', 2000, 2010),
    ('Release_10', 2010, 2020),
    ('Release_20', 2020, 2030),
]

for flag_name, first_year, end_year in decade_bounds:
    in_decade = (movie['Release'] >= first_year) & (movie['Release'] < end_year)
    movie[flag_name] = np.where(in_decade, 1, 0)

In [18]:
## Genre and Tags
## One 0/1 indicator column per distinct genre label and per distinct tag

mlb = MultiLabelBinarizer()

indicator_frames = []
for list_column in ['Genre', 'Tags']:
    ## The binarizer is refitted for each column; its classes_ are read
    ## right after the fit to label that column's indicator frame
    encoded = mlb.fit_transform(movie[list_column])
    indicator_frames.append(pd.DataFrame(encoded, columns = mlb.classes_, index = movie.index))

genres, tags = indicator_frames

## Appending the indicator columns to the right of the existing ones
movie = pd.concat([movie, genres, tags], axis = 1)

In [19]:
## Removing the source columns that the engineered variables replace
## NOTE(review): 'nan' is presumably the indicator column created for
## observations with missing genres/tags -- confirm before relying on it.
engineered_sources = [
    'Release Date',
    'Release',
    'Runtime',
    'IMDb Score',
    'Tags',
    'Awards Received',
    'Awards Nominated For',
    'Director',
    'Actors',
    'nan',
]

movie = movie.drop(columns = engineered_sources)

### Exporting final data set

In [20]:
## Dimensions of the final engineered data as (rows, columns)
movie.shape

(12109, 1004)

In [21]:
## Writing the final data set to 'blockbuster.csv' without the row index
movie.to_csv('blockbuster.csv', index = False)