In [90]:
import pandas as pd
import numpy as np
import re
import pickle

import sys
sys.path.append('../')

import src.support as sp

In [61]:
originals = pd.read_csv("../data/netflix_originals.csv", index_col = 0)

In [62]:
originals.head()

Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language
0,Enter the Anime,Documentary,"August 5, 2019",58,2.5,English/Japanese
1,Dark Forces,Thriller,"August 21, 2020",81,2.6,Spanish
2,The App,Science fiction/Drama,"December 26, 2019",79,2.6,Italian
3,The Open House,Horror thriller,"January 19, 2018",94,3.2,English
4,Kaali Khuhi,Mystery,"October 30, 2020",90,3.4,Hindi


In [63]:
originals.shape

(584, 6)

In [66]:
df= pd.read_csv("../data/netflix_titles.csv")
df.head(1)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."


In [67]:
originals_df = df.merge(originals, left_on= "title", right_on="Title", how="inner")

In [68]:
originals_df.drop(['release_year', 'rating', 'duration', 'listed_in', 'description','Title', "date_added"], axis = 1, inplace=True)

In [69]:
originals_df.head(2)

Unnamed: 0,show_id,type,title,director,cast,country,Genre,Premiere,Runtime,IMDB Score,Language
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,Documentary,"October 2, 2020",90,7.5,English
1,s142,Movie,Extraction,Steven C. Miller,"Bruce Willis, Kellan Lutz, Gina Carano, D.B. S...","United States, United Kingdom, Canada",Action,"April 24, 2020",117,6.7,English


# Cleaning

## Country

In [70]:
originals_df.isnull().sum()

show_id        0
type           0
title          0
director      22
cast          91
country       14
Genre          0
Premiere       0
Runtime        0
IMDB Score     0
Language       0
dtype: int64

In [71]:
originals_df["country"] = originals_df["country"].apply(lambda x: x.split(",")[0] if type(x) == str else x)
originals_df.head(1)

Unnamed: 0,show_id,type,title,director,cast,country,Genre,Premiere,Runtime,IMDB Score,Language
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,Documentary,"October 2, 2020",90,7.5,English


## Language

In [72]:
originals_df["Language"] = originals_df["Language"].apply(lambda x: x.split("/")[0] if type(x) == str else x)
originals_df.head(2)

Unnamed: 0,show_id,type,title,director,cast,country,Genre,Premiere,Runtime,IMDB Score,Language
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,Documentary,"October 2, 2020",90,7.5,English
1,s142,Movie,Extraction,Steven C. Miller,"Bruce Willis, Kellan Lutz, Gina Carano, D.B. S...",United States,Action,"April 24, 2020",117,6.7,English


## Cast

In [73]:
originals_df["cast"] = originals_df["cast"].apply(lambda x: x.split(",") if type(x) == str else x)
all_cast = originals_df.explode("cast")

In [45]:
originals_df.head()

Unnamed: 0,show_id,type,title,director,cast,country,Genre,Premiere,Runtime,IMDB Score,Language
1,s142,Movie,Extraction,Steven C. Miller,Bruce Willis,United States,Action,"April 24, 2020",117,6.7,English
1,s142,Movie,Extraction,Steven C. Miller,Kellan Lutz,United States,Action,"April 24, 2020",117,6.7,English
1,s142,Movie,Extraction,Steven C. Miller,Gina Carano,United States,Action,"April 24, 2020",117,6.7,English
1,s142,Movie,Extraction,Steven C. Miller,D.B. Sweeney,United States,Action,"April 24, 2020",117,6.7,English
1,s142,Movie,Extraction,Steven C. Miller,Joshua Mikel,United States,Action,"April 24, 2020",117,6.7,English


In [80]:
all_cast.head(3)

Unnamed: 0,show_id,type,title,director,cast,country,Genre,Premiere,Runtime,IMDB Score,Language
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,Documentary,"October 2, 2020",90,7.5,English
1,s142,Movie,Extraction,Steven C. Miller,Bruce Willis,United States,Action,"April 24, 2020",117,6.7,English
1,s142,Movie,Extraction,Steven C. Miller,Kellan Lutz,United States,Action,"April 24, 2020",117,6.7,English


In [89]:
top10_actors = all_cast.groupby("cast").size().reset_index().sort_values(by = 0, ascending = False)["cast"].iloc[:10].to_list()
top10_actors

['Adam Sandler',
 ' Maya Rudolph',
 ' Andrew Bachelor',
 ' Jacki Weaver',
 ' Zachary Quinto',
 ' Lakeith Stanfield',
 ' Nick Swardson',
 ' Keegan-Michael Key',
 ' Rob Schneider',
 ' Robbie Amell']

In [93]:

with open('../data/pickle/top10_actors.pickle', 'wb') as handle:
    pickle.dump(top10_actors, handle)

## Director

In [98]:
top10_directors = all_cast.groupby("director").size().reset_index().sort_values(by = 0, ascending = False)["director"].iloc[:10].to_list()
top10_directors

['McG',
 'John Schultz',
 'Amy Poehler',
 'Christopher Guest',
 'Steve Brill',
 'Kyle Newacheck',
 'Javier Colinas',
 'Frank Coraci',
 'Damien O’Connor',
 'Robert Rodriguez']

In [100]:
with open ('../data/pickle/top10_directors.pickle', "wb") as dir:
    pickle.dump(top10_directors, dir)

## Genre

In [94]:
originals_df["Genre"] = originals_df["Genre"].apply(lambda x: x.split("/")[0].split("-")[0].replace("film", "").replace("thriller", "").strip() if type(x) == str else x)
originals_df.head(2)

Unnamed: 0,show_id,type,title,director,cast,country,Genre,Premiere,Runtime,IMDB Score,Language
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,Documentary,"October 2, 2020",90,7.5,English
1,s142,Movie,Extraction,Steven C. Miller,"[Bruce Willis, Kellan Lutz, Gina Carano, D....",United States,Action,"April 24, 2020",117,6.7,English


In [95]:
def clean_genre(x):
    
    drama = re.findall("drama", x)
    romantic = re.findall("Roman", x)
    dance = re.findall("[D|d]ance|[M|m]us",x)
    comedy = re.findall("[C|c]ome", x)
    nulos = re.findall("^$|One", x)

    if drama:
        return ""
    elif dance:
        return "Musical"
    elif romantic:
        return "Romantic"
    elif comedy:
        return "Comedy"
    
    elif nulos:
        return np.nan
    else:
        return x

    

In [96]:
originals_df["Genre"] = originals_df["Genre"].apply(clean_genre)

In [50]:
originals_df.head()

Unnamed: 0,show_id,type,title,director,cast,country,Genre,Premiere,Runtime,IMDB Score,Language
1,s142,Movie,Extraction,Steven C. Miller,Bruce Willis,United States,Action,"April 24, 2020",117,6.7,English
1,s142,Movie,Extraction,Steven C. Miller,Kellan Lutz,United States,Action,"April 24, 2020",117,6.7,English
1,s142,Movie,Extraction,Steven C. Miller,Gina Carano,United States,Action,"April 24, 2020",117,6.7,English
1,s142,Movie,Extraction,Steven C. Miller,D.B. Sweeney,United States,Action,"April 24, 2020",117,6.7,English
1,s142,Movie,Extraction,Steven C. Miller,Joshua Mikel,United States,Action,"April 24, 2020",117,6.7,English


In [60]:
originals_df.groupby(["director"])["IMDB Score"].size().reset_index().sort_values(by = "IMDB Score", ascending = False)

Unnamed: 0,director,IMDB Score
238,McG,43
181,John Schultz,29
341,Steve Brill,23
17,Amy Poehler,23
205,Kyle Newacheck,22
...,...,...
186,Juan Carlos Rulfo,1
95,David Sampliner,1
355,Thom Zimny,1
357,Tom Donahue,1


In [20]:
originals_df.to_csv("../data/netflix_originals_clean.csv")