In [1]:
# Core data handling and plotting
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Use a larger default font size for every matplotlib figure in the notebook
plt.rcParams.update({'font.size': 14})

# Date/time helpers
from time import strptime
from datetime import datetime, date 

import re

# Turns list-valued columns (genres, cast, ...) into 0/1 indicator columns
from sklearn.preprocessing import MultiLabelBinarizer

import warnings
# NOTE: this silences every warning for the rest of the session, deprecation notices included
warnings.filterwarnings("ignore")

# Bag-of-words term counts for the description and title text
from sklearn.feature_extraction.text import CountVectorizer

import spacy # NLP pipeline: tokens, stop-word flags and lemmas
# Load the English pipeline once; clean_text reuses it for every description
nlp = spacy.load('en_core_web_sm')
from nltk.stem import SnowballStemmer # stems the spaCy lemmas inside clean_text
import string

In [None]:
# Read the full Netflix catalogue (movies and TV shows)
netflix_titles = pd.read_csv('netflix_titles.csv')

# Keep the TV shows only and renumber the rows from 0
is_tv_show = netflix_titles["type"] == 'TV Show'
netflix_shows = netflix_titles.loc[is_tv_show].reset_index(drop=True)

# Preview the first rows of the filtered frame
netflix_shows.head()

In [None]:
# Summarise the TV-show frame: column names, dtypes and non-null counts
netflix_shows.info()

In [None]:
# Number of distinct values in each column
netflix_shows.nunique()

In [None]:
# Count the missing values in each column
netflix_shows.isnull().sum()

In [None]:
# show_id and type carry no information once only TV shows remain
netflix_shows = netflix_shows.drop(columns=["show_id", "type"])

# Show the raw string formats before they are converted
before_caption = "Before correcting data format"
display(netflix_shows[['duration', 'date_added']].head().style.set_caption(before_caption))

# duration looks like "<number> Season(s)": keep the leading number ...
season_count_text = netflix_shows["duration"].apply(lambda raw: raw.split(' ')[0])
# ... and store it as an integer
netflix_shows["duration"] = season_count_text.astype(int)


# Parse date_added (e.g. "September 25, 2021") into a real datetime and split it
# into month, day and year columns.
#
# Rows with no date_added get the placeholder date 1800-01-01 (year 1800, month 1,
# day 1), so the derived columns stay plain integers. A value that is present but
# does not match "<Month> <day>, <year>" raises instead of being silently replaced.
MISSING_DATE_ADDED = pd.Timestamp(year=1800, month=1, day=1)

# Some raw values start with a space; strip all surrounding whitespace before parsing
date_added_text = netflix_shows["date_added"].astype(str).str.strip()
# astype(str) renders missing values as "nan"; turn those (and blanks) back into NaN
date_added_text = date_added_text.mask(date_added_text.isin(["", "nan"]))

# One vectorised parse instead of per-row strptime/datetime calls; NaN becomes NaT
date_added_parsed = pd.to_datetime(date_added_text, format="%B %d, %Y")
netflix_shows["date_added"] = date_added_parsed.fillna(MISSING_DATE_ADDED)

# Derived calendar parts (created in the order month, day, year)
netflix_shows["date_added_month"] = netflix_shows["date_added"].dt.month.astype(int)
netflix_shows["date_added_day"] = netflix_shows["date_added"].dt.day.astype(int)
netflix_shows["date_added_year"] = netflix_shows["date_added"].dt.year.astype(int)

display(netflix_shows[['duration', 'date_added', 'date_added_day', 'date_added_month', 'date_added_year']].head().style.set_caption("After correcting data format"))

In [27]:
display(netflix_shows[['listed_in']].head().style.set_caption("Before using MultiLabelBinarizer"))

# Columns whose cells hold several labels separated by ", "
mlb_columns = ["listed_in", "cast", "director", "country"]
mlb = MultiLabelBinarizer()

# One 0/1 indicator frame per multi-valued column. They are collected here and
# attached with a single concat, which avoids re-copying an ever wider frame on
# every iteration.
indicator_frames = []
for col in mlb_columns:
    # Missing entries become the single label "Unknown"; each cell then becomes a list of labels
    netflix_shows[col] = netflix_shows[col].fillna("Unknown").apply(lambda x: x.split(", "))

    # The binarizer is re-fitted per column, so classes_ always describes the current column
    indicators = mlb.fit_transform(netflix_shows[col])
    indicator_frames.append(
        pd.DataFrame(
            indicators,
            # Align on the frame's own index rather than assuming it is 0..n-1
            index=netflix_shows.index,
            # Column name = "<source column>_<label with spaces removed>"
            columns=[f"{col}_{c.replace(' ', '')}" for c in mlb.classes_],
        )
    )

netflix_shows = pd.concat([netflix_shows] + indicator_frames, axis=1)


display(netflix_shows[netflix_shows.columns[netflix_shows.columns.str.startswith('listed_in')]].head().style.set_caption("After using MultiLabelBinarizer"))

Unnamed: 0,listed_in
0,"International TV Shows, TV Dramas, TV Mysteries"
1,"Crime TV Shows, International TV Shows, TV Action & Adventure"
2,"Docuseries, Reality TV"
3,"International TV Shows, Romantic TV Shows, TV Comedies"
4,"TV Dramas, TV Horror, TV Mysteries"


Unnamed: 0,listed_in,listed_in_AnimeSeries,listed_in_BritishTVShows,listed_in_Classic&CultTV,listed_in_CrimeTVShows,listed_in_Docuseries,listed_in_InternationalTVShows,listed_in_Kids'TV,listed_in_KoreanTVShows,listed_in_RealityTV,listed_in_RomanticTVShows,listed_in_Science&NatureTV,listed_in_Spanish-LanguageTVShows,listed_in_Stand-UpComedy&TalkShows,listed_in_TVAction&Adventure,listed_in_TVComedies,listed_in_TVDramas,listed_in_TVHorror,listed_in_TVMysteries,listed_in_TVSci-Fi&Fantasy,listed_in_TVShows,listed_in_TVThrillers,listed_in_TeenTVShows
0,"['International TV Shows', 'TV Dramas', 'TV Mysteries']",0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0
1,"['Crime TV Shows', 'International TV Shows', 'TV Action & Adventure']",0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,"['Docuseries', 'Reality TV']",0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,"['International TV Shows', 'Romantic TV Shows', 'TV Comedies']",0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0
4,"['TV Dramas', 'TV Horror', 'TV Mysteries']",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0


In [None]:
# Preview the raw descriptions before they are turned into a document-term matrix
description_preview = netflix_shows[['description']].head().style
description_preview = description_preview.set_table_attributes("style='display:inline'")
display(description_preview.set_caption("Before creating Document-Term Matrix"))


def clean_text(text):
    """Normalise a free-text description into a string of stemmed keywords.

    Steps, in order: lowercase, delete ASCII punctuation, replace runs of
    non-ASCII characters with a space, tokenise with the module-level spaCy
    pipeline ``nlp``, drop stop words and tokens of three characters or fewer,
    then lemmatise and Snowball-stem what is left.

    Parameters
    ----------
    text : str
        Raw description. Anything that is not a string (for example a NaN
        from a missing cell) is treated as an empty description.

    Returns
    -------
    str
        The kept stems, each followed by a single space (so a non-empty
        result ends with a space); ``""`` when nothing is kept.
    """
    # Missing descriptions reach us as float NaN / None, which have no .lower()
    if not isinstance(text, str):
        return ""

    # Build the stemmer on first use and reuse it, instead of once per description
    stemmer = getattr(clean_text, "_stemmer", None)
    if stemmer is None:
        stemmer = clean_text._stemmer = SnowballStemmer("english")

    # Convert text to lowercase
    text = text.lower()

    # Remove (ASCII) punctuation
    text = re.sub("[%s]" % re.escape(string.punctuation), "", text)

    # Replace every run of non-ASCII characters with a single space
    text = re.sub("([^\x00-\x7F])+", " ", text)

    # Tokenize
    doc = nlp(text)

    # Keep tokens that are not stop words and are longer than 3 characters,
    # then lemmatize and stem each of them
    return "".join(
        stemmer.stem(token.lemma_) + " "
        for token in doc
        if not token.is_stop and len(token) > 3
    )


def _vocabulary(vectorizer):
    """Return the fitted vocabulary in column order on old and new scikit-learn.

    ``get_feature_names`` was removed in scikit-learn 1.2 in favour of
    ``get_feature_names_out``; fall back to the old name on older releases.
    """
    if hasattr(vectorizer, "get_feature_names_out"):
        return vectorizer.get_feature_names_out()
    return vectorizer.get_feature_names()


def _document_term_matrix(texts, prefix):
    """Count unigrams in ``texts`` and return ``(vectorizer, sparse counts, dense frame)``.

    The frame has one row per entry of ``texts`` (same index) and one column
    per vocabulary term, named ``prefix + term``.
    """
    vectorizer = CountVectorizer(ngram_range=(1,1)) # Doesn't include bigrams
    counts = vectorizer.fit_transform(texts)
    frame = pd.DataFrame(
        counts.toarray(),
        index=texts.index,
        columns=[f"{prefix}{term}" for term in _vocabulary(vectorizer)],
    )
    return vectorizer, counts, frame


netflix_shows['description_clean'] = netflix_shows.description.apply(clean_text)

# Document-term matrix of the cleaned descriptions
cv, data_cv, data_dtm = _document_term_matrix(netflix_shows['description_clean'], "dtm_descr_")
netflix_shows = netflix_shows.join(data_dtm)

# Document-term matrix of the raw titles
# (cv, data_cv and data_dtm refer to the title matrix from here on)
cv, data_cv, data_dtm = _document_term_matrix(netflix_shows['title'], "dtm_title_")
netflix_shows = netflix_shows.join(data_dtm)

display(netflix_shows[netflix_shows.columns[netflix_shows.columns.str.startswith('dtm_descr_')]].head().style.set_caption("After creating Document-Term Matrix"))

# Feature Engineering

In [29]:
# The weekday on which the TV show was added to the library (Monday=0 ... Sunday=6)
netflix_shows['date_added_weekday'] = pd.DatetimeIndex(netflix_shows['date_added']).weekday

# Estimated release year of the first season of the TV show.
# Assumes one season per year counting back from release_year, so a show with a
# single season keeps its own release_year (hence duration - 1, not duration).
netflix_shows['first_release_year'] = netflix_shows.release_year - (netflix_shows.duration - 1)

# The time between the original release of the TV show and the TV show being added to the Netflix library.
# NOTE: rows without a date_added carry the placeholder year 1800, so their value here is
# a large negative number rather than a real gap.
netflix_shows['time_first_release_to_netflix'] = netflix_shows.date_added_year - netflix_shows.first_release_year

# Save current state to output
netflix_shows.to_csv("netflix_shows.csv", index=False)