Netflix wants to identify similar movies based on movie characteristics.

Data source (Kaggle): https://www.kaggle.com/shivamb/netflix-shows

In [100]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from wordcloud import WordCloud

# Display preferences
# NOTE: this silences every Python warning for the rest of the session,
# including pandas' chained-assignment warnings.
warnings.filterwarnings('ignore')

# Use the canonical lowercase option keys; the capitalised 'Display.*' spelling
# is not a registered key and only resolves through pandas' pattern matching.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 10000)

In [118]:
# Load the Netflix titles CSV from the project's data directory
netflix_csv_path = '../../data/netflix_titles.csv'
netflix = pd.read_csv(netflix_csv_path)

In [119]:
# Summarise the freshly loaded frame: column names, non-null counts and dtypes
netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6234 entries, 0 to 6233
Data columns (total 12 columns):
show_id         6234 non-null int64
type            6234 non-null object
title           6234 non-null object
director        4265 non-null object
cast            5664 non-null object
country         5758 non-null object
date_added      6223 non-null object
release_year    6234 non-null int64
rating          6224 non-null object
duration        6234 non-null object
listed_in       6234 non-null object
description     6234 non-null object
dtypes: int64(2), object(10)
memory usage: 584.6+ KB


In [120]:
# Clean-up in one pass:
#   * drop the 'director' column entirely
#   * discard rows with no cast or no date_added
#   * label missing country / rating values as 'Unknown'
netflix = (
    netflix
    .drop(columns=['director'])
    .dropna(subset=['cast', 'date_added'])
    .fillna({'country': 'Unknown', 'rating': 'Unknown'})
)

In [121]:
# Re-check the frame after cleaning: every remaining column should be fully populated
netflix.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5654 entries, 0 to 6222
Data columns (total 11 columns):
show_id         5654 non-null int64
type            5654 non-null object
title           5654 non-null object
cast            5654 non-null object
country         5654 non-null object
date_added      5654 non-null object
release_year    5654 non-null int64
rating          5654 non-null object
duration        5654 non-null object
listed_in       5654 non-null object
description     5654 non-null object
dtypes: int64(2), object(9)
memory usage: 530.1+ KB


In [53]:
# Gather the distinct show categories found in 'listed_in'.
# Each entry is a comma-separated list; labels are trimmed and lower-cased
# so they can be used directly as column names.
show_categories = {
    label.strip().lower()
    for listed_in in netflix['listed_in']
    for label in listed_in.split(',')
}

In [54]:
# Append one indicator column per category (alphabetical order),
# starting out as False for every row
for category_label in sorted(show_categories):
    netflix[category_label] = False

In [55]:
# Locate the positions that bracket the category indicator columns:
# they start right after 'description' and end at 'tv thrillers'
column_labels = list(netflix.columns)
description_index = column_labels.index('description')
tv_thriller_index = column_labels.index('tv thrillers')

In [56]:
# Category indicator columns sit between 'description' and 'tv thrillers'.
# Build the lookup once instead of re-slicing the column index for every row.
category_columns = set(
    netflix.columns[(description_index + 1):(tv_thriller_index + 1)]
)

# Iterate over the frame's own index labels. Rows were dropped during cleaning,
# so a positional counter from enumerate() no longer matches the labels and
# would flag the wrong rows. Writing through .at also avoids chained
# assignment (netflix[col][row] = ...), which may update a temporary copy.
for row_label, listed_in in netflix['listed_in'].items():

    for category in listed_in.split(','):

        # Remove whitespace and set to lowercase
        category = category.strip().lower()

        # Only touch known category indicator columns
        if category in category_columns:

            # Mark this show as belonging to the category
            netflix.at[row_label, category] = True

In [135]:
# Create netflix column labels for countries
countries = set()

for country in netflix['country']:

    # Normalise every comma-separated piece first ...
    show_countries = [name.strip().lower() for name in country.split(',')]

    for show_country in show_countries:
        # ... then skip blanks, so empty or whitespace-only pieces (e.g. from a
        # leading/trailing comma) never end up as a '' country label.
        if show_country:
            countries.add(show_country)

In [None]:
# Add one indicator column per country, initialised to False.
# Iterate in sorted order: plain set iteration order can differ between
# kernel sessions, which would shuffle the resulting column layout.
for country in sorted(countries):
    netflix[country] = False

In [96]:
def lead_role(cast):
    """Return the first name in a comma-separated cast string.

    Everything before the first comma is taken and surrounding whitespace
    is trimmed; a string without commas is returned (trimmed) as a whole.
    """
    first_billed, _, _ = cast.partition(',')
    return first_billed.strip()

In [97]:
# Derive each show's lead role (first listed cast member) as a new column
netflix['lead_role'] = netflix['cast'].map(lead_role)