In [3]:
import pandas as pd
import numpy as np
import re
import pickle

# Adding the parent directory to the path so that the `src` directory can be imported.
import sys
sys.path.append('../')

# Importing the `support` module from the `src` directory.
import src.support as sp

In [4]:
# Read ../data/netflix_originals.csv into the `originals` DataFrame, using the
# file's first column as the index, and preview the first rows.
originals = pd.read_csv("../data/netflix_originals.csv", index_col = 0)
originals.head()

Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language
0,Enter the Anime,Documentary,"August 5, 2019",58,2.5,English/Japanese
1,Dark Forces,Thriller,"August 21, 2020",81,2.6,Spanish
2,The App,Science fiction/Drama,"December 26, 2019",79,2.6,Italian
3,The Open House,Horror thriller,"January 19, 2018",94,3.2,English
4,Kaali Khuhi,Mystery,"October 30, 2020",90,3.4,Hindi


In [5]:
# Read ../data/netflix_titles.csv into the `df` DataFrame and preview its first row.
df= pd.read_csv("../data/netflix_titles.csv")
df.head(1)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."


In [6]:
# Inner-join the two datasets on the film title (`title` in `df`, `Title` in
# `originals`), keeping only the titles that appear in both.
originals_df = pd.merge(
    df,
    originals,
    how="inner",
    left_on="title",
    right_on="Title",
)

In [7]:
# Drop the columns that are not needed for the analysis.
# The frame is rebound instead of modified with `inplace=True`, and
# `errors="ignore"` is passed, so the cell can be executed more than once:
# an in-place drop raises KeyError on the second run because the columns
# are already gone.
unused_columns = [
    "release_year",
    "rating",
    "duration",
    "listed_in",
    "description",
    "Title",
    "date_added",
]
originals_df = originals_df.drop(columns=unused_columns, errors="ignore")
originals_df.head(2)

Unnamed: 0,show_id,type,title,director,cast,country,Genre,Premiere,Runtime,IMDB Score,Language
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,Documentary,"October 2, 2020",90,7.5,English
1,s142,Movie,Extraction,Steven C. Miller,"Bruce Willis, Kellan Lutz, Gina Carano, D.B. S...","United States, United Kingdom, Canada",Action,"April 24, 2020",117,6.7,English


In [8]:
# Drop every row that has a NaN in any column and keep the result in a
# separate frame, `originals_df2`; `originals_df` itself is left unchanged.
# NOTE(review): `originals_df2` is not referenced by any other cell visible
# here — confirm it is still needed.
originals_df2 = originals_df.dropna()

# Cleaning

## Country

In [9]:
# Keep only the main (first-listed) production country of each title.
# Entries that are not strings (e.g. NaN for a missing country) pass through
# unchanged; `isinstance` is used for the type check instead of comparing
# `type(x)` with `==`.
originals_df["country"] = originals_df["country"].apply(
    lambda x: x.split(",")[0] if isinstance(x, str) else x
)
originals_df.head(1)

Unnamed: 0,show_id,type,title,director,cast,country,Genre,Premiere,Runtime,IMDB Score,Language
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,Documentary,"October 2, 2020",90,7.5,English


## Language

In [10]:
# Keep only the main (first-listed) language of each title; languages are
# separated by "/". Entries that are not strings (e.g. NaN) pass through
# unchanged; `isinstance` is used for the type check instead of comparing
# `type(x)` with `==`.
originals_df["Language"] = originals_df["Language"].apply(
    lambda x: x.split("/")[0] if isinstance(x, str) else x
)
originals_df.head(2)

Unnamed: 0,show_id,type,title,director,cast,country,Genre,Premiere,Runtime,IMDB Score,Language
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,Documentary,"October 2, 2020",90,7.5,English
1,s142,Movie,Extraction,Steven C. Miller,"Bruce Willis, Kellan Lutz, Gina Carano, D.B. S...",United States,Action,"April 24, 2020",117,6.7,English


## Cast

In [30]:
# Turn the comma-separated cast string of each film into a list of actor names.
# Each name is stripped while splitting so the lists stored in `originals_df`
# do not keep the space that follows every comma. Entries that are not strings
# (e.g. NaN, or a list from a previous run of this cell) pass through unchanged.
originals_df["cast"] = originals_df["cast"].apply(
    lambda x: [name.strip() for name in x.split(",")] if isinstance(x, str) else x
)

# `all_cast` holds one row per (film, actor) pair.
all_cast = originals_df.explode("cast")
# Kept as a safeguard for lists that were split without stripping.
all_cast["cast"] = all_cast["cast"].str.strip()
originals_df.head()

Unnamed: 0,show_id,type,title,director,cast,country,Genre,Premiere,Runtime,IMDB Score,Language
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,Documentary,"October 2, 2020",90,7.5,English
1,s142,Movie,Extraction,Steven C. Miller,"[Bruce Willis, Kellan Lutz, Gina Carano, D....",United States,Action,"April 24, 2020",117,6.7,English
2,s625,Movie,Prime Time,Jakub Piątek,"[Bartosz Bielenia, Magdalena Popławska, Andr...",,Thriller,"April 14, 2021",91,5.7,Polish
3,s835,Movie,Blue Miracle,Julio Quintana,"[Jimmy Gonzales, Dennis Quaid, Anthony Gonza...",United States,Drama,"May 27, 2021",95,6.7,English
4,s837,Movie,Ghost Lab,Paween Purijitpanya,"[Thanapob Leeratanakachorn, Paris Intarakomal...",Thailand,Horror,"May 26, 2021",117,5.2,Thai


## Director

In [22]:
# Get the ten most popular directors: rank directors by the number of rows
# they have in `all_cast` and keep the first ten names.
# NOTE(review): `all_cast` is the exploded frame (one row per cast member), so
# this counts cast rows rather than films per director — confirm that this is
# the intended measure of popularity.
rows_per_director = all_cast.groupby("director").size().reset_index()
ranked_directors = rows_per_director.sort_values(by=0, ascending=False)
top10_directors = ranked_directors["director"].iloc[:10].to_list()
top10_directors

['McG',
 'John Schultz',
 'Amy Poehler',
 'Christopher Guest',
 'Steve Brill',
 'Kyle Newacheck',
 'Javier Colinas',
 'Frank Coraci',
 'Damien O’Connor',
 'Robert Rodriguez']

In [23]:
# Write the `top10_directors` list to a pickle file.
# The file handle is named `pickle_file` rather than `dir` so that the
# built-in `dir()` is not shadowed for the rest of the notebook session.
with open('../data/pickle/top10_directors.pickle', "wb") as pickle_file:
    pickle.dump(top10_directors, pickle_file)

## Genre

In [24]:
# Reduce each genre label to its main genre.
def _main_genre(x):
    """Return the leading genre of a compound genre label.

    Keeps the text before the first "/" and before the first "-", removes the
    substrings "film" and "thriller", and trims surrounding whitespace.
    Values that are not strings (e.g. NaN) are returned unchanged.
    """
    if not isinstance(x, str):
        return x
    leading = x.split("/")[0].split("-")[0]
    return leading.replace("film", "").replace("thriller", "").strip()


originals_df["Genre"] = originals_df["Genre"].apply(_main_genre)
originals_df.head(2)

Unnamed: 0,show_id,type,title,director,cast,country,Genre,Premiere,Runtime,IMDB Score,Language
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,Documentary,"October 2, 2020",90,7.5,English
1,s142,Movie,Extraction,Steven C. Miller,"[Bruce Willis, Kellan Lutz, Gina Carano, D....",United States,Action,"April 24, 2020",117,6.7,English


In [25]:
def clean_genre(x):
    """Collapse a genre label into one of a few broad categories.

    The rules are checked in this order and the first match wins:

    1. contains "drama" (lower-case only)          -> "" (empty string)
    2. contains "Dance"/"dance" or "Mus"/"mus"     -> "Musical"
    3. contains "Roman"                            -> "Romantic"
    4. contains "Come"/"come"                      -> "Comedy"
    5. is the empty string or contains "One"       -> np.nan
    6. anything else                               -> returned unchanged

    Args:
        x: Genre label. Values that are not strings (e.g. NaN) are returned
            unchanged instead of raising TypeError inside ``re``; this also
            makes it safe to apply the function again to a column that
            already contains the NaN produced by rule 5.

    Returns:
        The category string, ``np.nan``, or ``x`` itself.
    """
    if not isinstance(x, str):
        return x

    # NOTE(review): compound labels such as "Crime drama" become an empty
    # string, while a plain "Drama" does not match (the pattern is lower-case)
    # and is kept by the last rule — confirm whether "Drama" or np.nan was
    # intended here.
    if re.search("drama", x):
        return ""
    # Character classes are written [Dd], not [D|d]: inside [...] the "|" is a
    # literal character, not an alternation.
    if re.search("[Dd]ance|[Mm]us", x):
        return "Musical"
    if re.search("Roman", x):
        return "Romantic"
    if re.search("[Cc]ome", x):
        return "Comedy"
    if re.search("^$|One", x):
        return np.nan
    return x

    

In [26]:
# Apply the clean_genre function to every genre label and store the result
# back in the `Genre` column.
genre_cleaned = originals_df["Genre"].apply(clean_genre)
originals_df["Genre"] = genre_cleaned

In [28]:
# Count the rows per director and show the directors with the most rows first.
# The count column is labelled "IMDB Score" only because that is the column
# selected before calling .size(); the values are row counts, not scores.
rows_by_director = originals_df.groupby(["director"])["IMDB Score"].size()
rows_by_director.reset_index().sort_values(by="IMDB Score", ascending=False)

Unnamed: 0,director,IMDB Score
285,McG,3
403,Spike Lee,2
48,"Bonni Cohen, Jon Shenk",2
143,Florent Bodin,2
260,Liz Garbus,2
...,...,...
156,George C. Wolfe,1
155,Genevieve Nnaji,1
154,Gaspar Antillo,1
153,Gareth Evans,1


In [29]:
# Write the cleaned frame to ../data/netflix_originals_clean.csv.
# The `cast` column was converted to Python lists in the Cast section, so it is
# written as the string form of those lists.
originals_df.to_csv("../data/netflix_originals_clean.csv")