In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [2]:
print('Reading csv file')
df = pd.read_csv('netflix.csv')

Reading csv file


In [3]:
df.shape

(8807, 12)

In [4]:
print('Viewing the data of the top 5 rows ')
df.head()

Viewing the data of the top 5 rows 


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [5]:
print('Checking the datatypes of columns')
df.info()

Checking the datatypes of columns
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [6]:
print('Checking for null values')
df.isnull().sum()

Checking for null values


show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [30]:
print('Filling missing values of data using mode')

Filling missing values of data using mode


In [9]:
df['duration'].value_counts()

duration
1 Season     1793
2 Seasons     425
3 Seasons     199
90 min        152
94 min        146
             ... 
16 min          1
186 min         1
193 min         1
189 min         1
191 min         1
Name: count, Length: 220, dtype: int64

In [10]:
df['duration'].fillna('1 Season', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['duration'].fillna('1 Season', inplace=True)


In [11]:
df.isnull().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             0
duration           0
listed_in          0
description        0
dtype: int64

In [12]:
df['country'] = df['country'].fillna(df['country'].mode()[0])
df['rating'] = df['rating'].fillna(df['rating'].mode()[0])

In [13]:
print('Dropping rows containing null values')
df.dropna(subset=['date_added'], inplace=True)

Dropping rows containing null values


In [14]:
df.isnull().sum()

show_id            0
type               0
title              0
director        2624
cast             825
country            0
date_added         0
release_year       0
rating             0
duration           0
listed_in          0
description        0
dtype: int64

In [15]:
print('replacing null values with unknown')
df['director'] = df['director'].fillna('Unknown')
df['cast'] = df['cast'].fillna('Unknown')

replacing null values with unknown


In [16]:
df.isnull().sum()

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64

In [17]:
print('Checking for duplicate values')
df.duplicated().sum()

Checking for duplicate values


0

In [18]:
print('Standardising text values ')
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].str.strip().str.lower()

Standardising text values 


In [19]:
df['country'].value_counts()

country
united states                             3642
india                                      972
united kingdom                             418
japan                                      244
south korea                                199
                                          ... 
romania, bulgaria, hungary                   1
uruguay, guatemala                           1
france, senegal, belgium                     1
mexico, united states, spain, colombia       1
united arab emirates, jordan                 1
Name: count, Length: 748, dtype: int64

In [20]:
print('Normalizing country names ')
df['country'] = df['country'].replace({
    'united states': 'usa',
    'united kingdom': 'uk',
    'united arab emirates': 'uae'
})

Normalizing country names 


In [21]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,movie,dick johnson is dead,kirsten johnson,unknown,usa,"september 25, 2021",2020,pg-13,90 min,documentaries,"as her father nears the end of his life, filmm..."
1,s2,tv show,blood & water,unknown,"ama qamata, khosi ngema, gail mabalane, thaban...",south africa,"september 24, 2021",2021,tv-ma,2 seasons,"international tv shows, tv dramas, tv mysteries","after crossing paths at a party, a cape town t..."
2,s3,tv show,ganglands,julien leclercq,"sami bouajila, tracy gotoas, samuel jouy, nabi...",usa,"september 24, 2021",2021,tv-ma,1 season,"crime tv shows, international tv shows, tv act...",to protect his family from a powerful drug lor...
3,s4,tv show,jailbirds new orleans,unknown,unknown,usa,"september 24, 2021",2021,tv-ma,1 season,"docuseries, reality tv","feuds, flirtations and toilet talk go down amo..."
4,s5,tv show,kota factory,unknown,"mayur more, jitendra kumar, ranjan raj, alam k...",india,"september 24, 2021",2021,tv-ma,2 seasons,"international tv shows, romantic tv shows, tv ...",in a city of coaching centers known to train i...


In [22]:
print('Converting Date Formats to Consistent Type')
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')

Converting Date Formats to Consistent Type


  df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')


In [23]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,movie,dick johnson is dead,kirsten johnson,unknown,usa,2021-09-25,2020,pg-13,90 min,documentaries,"as her father nears the end of his life, filmm..."
1,s2,tv show,blood & water,unknown,"ama qamata, khosi ngema, gail mabalane, thaban...",south africa,2021-09-24,2021,tv-ma,2 seasons,"international tv shows, tv dramas, tv mysteries","after crossing paths at a party, a cape town t..."
2,s3,tv show,ganglands,julien leclercq,"sami bouajila, tracy gotoas, samuel jouy, nabi...",usa,2021-09-24,2021,tv-ma,1 season,"crime tv shows, international tv shows, tv act...",to protect his family from a powerful drug lor...
3,s4,tv show,jailbirds new orleans,unknown,unknown,usa,2021-09-24,2021,tv-ma,1 season,"docuseries, reality tv","feuds, flirtations and toilet talk go down amo..."
4,s5,tv show,kota factory,unknown,"mayur more, jitendra kumar, ranjan raj, alam k...",india,2021-09-24,2021,tv-ma,2 seasons,"international tv shows, romantic tv shows, tv ...",in a city of coaching centers known to train i...


In [24]:
print('Dropping columns show_id as it is not needed')
df.drop(columns='show_id', inplace=True)

Dropping columns show_id as it is not needed


In [25]:
df['rating'].value_counts()

rating
tv-ma       3209
tv-14       2157
tv-pg        861
r            799
pg-13        490
tv-y7        333
tv-y         306
pg           287
tv-g         220
nr            79
g             41
tv-y7-fv       6
nc-17          3
ur             3
74 min         1
84 min         1
66 min         1
Name: count, dtype: int64

In [26]:
print('Since the list for rating is too long, we will bucket them into 3 categories; kids, teens, adults')
def simplify_rating(rating):
    kids = ['TV-Y', 'TV-Y7', 'G', 'PG']
    teens = ['TV-G', 'TV-PG', 'PG-13']
    adults = ['TV-14', 'TV-MA', 'R', 'NC-17']
    if rating in kids:
        return 'kids'
    elif rating in teens:
        return 'teens'
    elif rating in adults:
        return 'adults'
    else:
        return 'unknown'

df['rating_category'] = df['rating'].apply(simplify_rating)

Since the list for rating is too long, we will bucket them into 3 categories; kids, teens, adults


In [27]:
print('Final preprocessed table data')
df.head()

Final preprocessed table data


Unnamed: 0,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,rating_category
0,movie,dick johnson is dead,kirsten johnson,unknown,usa,2021-09-25,2020,pg-13,90 min,documentaries,"as her father nears the end of his life, filmm...",unknown
1,tv show,blood & water,unknown,"ama qamata, khosi ngema, gail mabalane, thaban...",south africa,2021-09-24,2021,tv-ma,2 seasons,"international tv shows, tv dramas, tv mysteries","after crossing paths at a party, a cape town t...",unknown
2,tv show,ganglands,julien leclercq,"sami bouajila, tracy gotoas, samuel jouy, nabi...",usa,2021-09-24,2021,tv-ma,1 season,"crime tv shows, international tv shows, tv act...",to protect his family from a powerful drug lor...,unknown
3,tv show,jailbirds new orleans,unknown,unknown,usa,2021-09-24,2021,tv-ma,1 season,"docuseries, reality tv","feuds, flirtations and toilet talk go down amo...",unknown
4,tv show,kota factory,unknown,"mayur more, jitendra kumar, ranjan raj, alam k...",india,2021-09-24,2021,tv-ma,2 seasons,"international tv shows, romantic tv shows, tv ...",in a city of coaching centers known to train i...,unknown
