#Data Loading and Initial Exploration

In [23]:
import pandas as pd

In [24]:
f_path = "/content/netflix_titles.xlsx"
netflix = pd.read_excel(f_path)

In [25]:
# Display the First Few Rows
netflix.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,2021-09-25 00:00:00,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24 00:00:00,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,2021-09-24 00:00:00,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,2021-09-24 00:00:00,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24 00:00:00,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [26]:
# overview of the dataset
netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [27]:
# Statistical Summary for Columns
netflix.describe()

Unnamed: 0,release_year
count,8807.0
mean,2014.180198
std,8.819312
min,1925.0
25%,2013.0
50%,2017.0
75%,2019.0
max,2021.0


# Data Cleaning

In [28]:
# missing values check
netflix.isnull().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [29]:
# checking unique values
netflix.nunique()

show_id         8807
type               2
title           8804
director        4528
cast            7692
country          748
date_added      1767
release_year      74
rating            17
duration         220
listed_in        514
description     8775
dtype: int64

In [30]:
# Dealing with Missing Values

mode_direct = netflix['director'].mode()[0]
mode_cast = netflix['cast'].mode()[0]
mode_country = netflix['country'].mode()[0]
mode_date_added = netflix['date_added'].mode()[0]
mode_rating = netflix['rating'].mode()[0]

median_duration = netflix['duration'].median

In [31]:
print(mode_direct)
print(mode_cast)
print(mode_country)
print(mode_date_added)
print(mode_rating)
print(median_duration)

Rajiv Chilaka
David Attenborough
United States
2020-01-01 00:00:00
TV-MA
<bound method NDFrame._add_numeric_operations.<locals>.median of 0          90 min
1       2 Seasons
2        1 Season
3        1 Season
4       2 Seasons
          ...    
8802      158 min
8803    2 Seasons
8804       88 min
8805       88 min
8806      111 min
Name: duration, Length: 8807, dtype: object>


In [32]:
netflix['director'].fillna(mode_direct, inplace=True)
netflix['cast'].fillna(mode_cast, inplace=True)
netflix['country'].fillna(mode_country, inplace=True)
netflix['date_added'].fillna(mode_date_added, inplace=True)
netflix['rating'].fillna(mode_rating, inplace=True)

netflix['duration'].fillna(median_duration, inplace=True)

In [33]:
# checking null values again

netflix.isnull().sum()

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64

In [34]:
# checking duplicates from data

netflix.duplicated().sum()

0

In [39]:
# changing column data type

netflix['date_added'] = pd.to_datetime(netflix['date_added'])
netflix['release_year'] = netflix['release_year'].astype(int)
netflix['rating'] = netflix['rating'].astype('category')
netflix['type'] = netflix['type'].astype('category')

In [41]:
netflix.dtypes

show_id                 object
type                  category
title                   object
director                object
cast                    object
country                 object
date_added      datetime64[ns]
release_year             int64
rating                category
duration                object
listed_in               object
description             object
dtype: object

# Data Transformation

In [42]:
# Creating New Features

netflix['year_added'] = netflix['date_added'].dt.year
netflix['month_added'] = netflix['date_added'].dt.month

In [43]:
# mapping countries to standardize

country_mapping = {
    'United States': 'USA',
    'India': 'India',
    'United Kingdom': 'UK',
    'Canada': 'Canada',
    'France': 'France',
    'Japan': 'Japan',
    'South Korea': 'South Korea',
    'Spain': 'Spain',
    'Germany': 'Germany',
    'Mexico': 'Mexico',
    'Australia': 'Australia',
    'China': 'China',
    'Hong Kong': 'Hong Kong',
    'Taiwan': 'Taiwan',
    'Turkey': 'Turkey',
    'Italy': 'Italy',
    'Brazil': 'Brazil',
    'Netherlands': 'Netherlands',
    'Argentina': 'Argentina',
    'Singapore': 'Singapore',
    'Belgium': 'Belgium',
    'Colombia': 'Colombia',
    'Sweden': 'Sweden',
    'Russia': 'Russia',
    'Norway': 'Norway',
    'Denmark': 'Denmark',
    'South Africa': 'South Africa',
    'Ireland': 'Ireland',
    'New Zealand': 'New Zealand',
    'Poland': 'Poland',
    'Switzerland': 'Switzerland',
    'Philippines': 'Philippines',
    'Finland': 'Finland',
    'Austria': 'Austria',
    'Israel': 'Israel',
    'Chile': 'Chile',
    'Thailand': 'Thailand',
    'Portugal': 'Portugal',
    'Malaysia': 'Malaysia',
    'United Arab Emirates': 'UAE',
    'Greece': 'Greece',
    'Czech Republic': 'Czech Republic',
    'Saudi Arabia': 'Saudi Arabia',
    'Egypt': 'Egypt',
    'Hungary': 'Hungary',
    'Pakistan': 'Pakistan',
    'Serbia': 'Serbia',
    'Vietnam': 'Vietnam',
    'Peru': 'Peru',
    'Bulgaria': 'Bulgaria',
    'Soviet Union': 'Russia',
    'West Germany': 'Germany',
    'Soviet Union (former)': 'Russia',
    'East Germany': 'Germany',
    'Federal Republic of Yugoslavia': 'Serbia',
    'Yugoslavia': 'Serbia',
    'Czechoslovakia': 'Czech Republic',
    'Federal Republic of Germany': 'Germany',
    'East Germany': 'Germany',
    'West Germany': 'Germany',
    'German Democratic Republic': 'Germany'
}

netflix['country'] = netflix['country'].map(country_mapping)

In [44]:
netflix.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,year_added,month_added
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,David Attenborough,USA,2021-09-25,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",2021,9
1,s2,TV Show,Blood & Water,Rajiv Chilaka,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",2021,9
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",USA,2021-09-24,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,2021,9
3,s4,TV Show,Jailbirds New Orleans,Rajiv Chilaka,David Attenborough,USA,2021-09-24,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",2021,9
4,s5,TV Show,Kota Factory,Rajiv Chilaka,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,2021,9


# Export Dataset

In [45]:
# export dataset to excel format

netflix.to_excel('netflix_cleaned.xlsx', index=False)