In [1]:
import pandas as pd

In [2]:
# importing the dataset
netflix = pd.read_csv(r'C:\Users\daved\Downloads\Download\netflix_titles.csv')

In [3]:
# Checking the dataset by viewing the top 5 rows
netflix.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [5]:
# Checking numbers of rows and columns
netflix.shape

(8807, 12)

In [6]:
# checking for the names of columns in the dataset
netflix.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [7]:
# checking for the data types of the columns in the dataset
netflix.dtypes

show_id         object
type            object
title           object
director        object
cast            object
country         object
date_added      object
release_year     int64
rating          object
duration        object
listed_in       object
description     object
dtype: object

In [8]:
# checking for null values in the dataset
netflix.isnull().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

#### Data Cleaning

In [9]:
# Cleaning our country columns with null by given it value 'unknown' 
netflix['country'] = netflix['country'].fillna('unknown')

In [12]:
# checking if it worked
netflix[['country']].head(10)

Unnamed: 0,country
0,United States
1,South Africa
2,unknown
3,unknown
4,India
5,unknown
6,unknown
7,"United States, Ghana, Burkina Faso, United Kin..."
8,United Kingdom
9,United States


In [13]:
# dropping rows with null values in title and type colums
netflix = netflix.dropna(subset=['title', 'type'])

In [14]:
# checking if null values still exist in title and type columns
netflix[['title', 'type']].isnull().sum()

title    0
type     0
dtype: int64

#### Flitering, Aggregating, and Performing Conditioninal functions on the dataset

In [16]:
# checking how many are Movies and Tv shows in the type column
netflix['type'].value_counts()

type
Movie      6131
TV Show    2676
Name: count, dtype: int64

In [18]:
# checking for top 10 most frequent countries
netflix['country'].value_counts().head(10)

country
United States     2818
India              972
unknown            831
United Kingdom     419
Japan              245
South Korea        199
Canada             181
Spain              145
France             124
Mexico             110
Name: count, dtype: int64

In [25]:
# converting date_added column to datetime format
netflix['date_added'] = pd.to_datetime(netflix['date_added'], errors= 'coerce')

In [None]:
# checking the data type of date_added columns
# it checked to datetime
netflix.dtypes

show_id                 object
type                    object
title                   object
director                object
cast                    object
country                 object
date_added      datetime64[ns]
release_year             int64
rating                  object
duration                object
listed_in               object
description             object
dtype: object

In [None]:
# creating new column year_added from date_added
netflix['year_added'] = netflix['date_added'].dt.year

In [29]:
# checking the year_added column
netflix[['title', 'date_added', 'year_added']].head(10)

Unnamed: 0,title,date_added,year_added
0,Dick Johnson Is Dead,NaT,
1,Blood & Water,NaT,
2,Ganglands,NaT,
3,Jailbirds New Orleans,NaT,
4,Kota Factory,NaT,
5,Midnight Mass,NaT,
6,My Little Pony: A New Generation,NaT,
7,Sankofa,NaT,
8,The Great British Baking Show,NaT,
9,The Starling,NaT,


In [30]:
# checking all shows released before the year 2000
netflix[netflix['release_year'] < 2000]

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,year_added
7,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...",NaT,1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s...",
22,s23,Movie,Avvai Shanmughi,K.S. Ravikumar,"Kamal Hassan, Meena, Gemini Ganesan, Heera Raj...",unknown,NaT,1996,TV-PG,161 min,"Comedies, International Movies",Newly divorced and denied visitation rights wi...,
24,s25,Movie,Jeans,S. Shankar,"Prashanth, Aishwarya Rai Bachchan, Sri Lakshmi...",India,NaT,1998,TV-14,166 min,"Comedies, International Movies, Romantic Movies",When the father of the man she loves insists t...,
26,s27,Movie,Minsara Kanavu,Rajiv Menon,"Arvind Swamy, Kajol, Prabhu Deva, Nassar, S.P....",unknown,NaT,1997,TV-PG,147 min,"Comedies, International Movies, Music & Musicals",A tangled love triangle ensues when a man fall...,
41,s42,Movie,Jaws,Steven Spielberg,"Roy Scheider, Robert Shaw, Richard Dreyfuss, L...",United States,NaT,1975,PG,124 min,"Action & Adventure, Classic Movies, Dramas",When an insatiable great white shark terrorize...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8745,s8746,Movie,Willy Wonka & the Chocolate Factory,Mel Stuart,"Gene Wilder, Jack Albertson, Peter Ostrum, Roy...","United States, East Germany, West Germany",NaT,1971,G,100 min,"Children & Family Movies, Classic Movies, Come...",Zany Willy Wonka causes a stir when he announc...,
8748,s8749,Movie,Winter of Our Dreams,John Duigan,"Judy Davis, Bryan Brown, Cathy Downes, Baz Luh...",Australia,NaT,1981,NR,86 min,"Classic Movies, Dramas","After the death of a long-ago lover, married p...",
8763,s8764,Movie,WWII: Report from the Aleutians,John Huston,,United States,NaT,1943,TV-PG,45 min,Documentaries,Filmmaker John Huston narrates this Oscar-nomi...,
8764,s8765,Movie,Wyatt Earp,Lawrence Kasdan,"Kevin Costner, Dennis Quaid, Gene Hackman, Dav...",United States,NaT,1994,PG-13,191 min,Action & Adventure,Legendary lawman Wyatt Earp is continually at ...,


In [31]:
# checking most common genre in listed_in column

from collections import Counter

# split by comma and flatten the list
genres = netflix['listed_in'].dropna().str.split(',')
genre_counts = Counter([genre for sublist in genres for genre in sublist])
genre_counts.most_common(1)

[(' International Movies', 2624)]

In [32]:
# showing show directed by 'Rajikumar Hirani'
netflix[netflix['director'] == 'Rajikumar Hirani']

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,year_added


In [34]:
# counting how many shows fall under each rating
netflix['rating'].value_counts()

rating
TV-MA       3207
TV-14       2160
TV-PG        863
R            799
PG-13        490
TV-Y7        334
TV-Y         307
PG           287
TV-G         220
NR            80
G             41
TV-Y7-FV       6
NC-17          3
UR             3
74 min         1
84 min         1
66 min         1
Name: count, dtype: int64

In [37]:
# filtering movies with duration more than 100 minutes

# Only apply to movies
movies = netflix[netflix['type'] == 'Movie'].copy()

# extracting duration as number
movies['content_length'] = movies['duration'].str.extract(r'(\d+)').astype(float)

# filter
movies_over_100 = movies[movies['content_length'] > 100]
movies_over_100[['title', 'duration', 'content_length']].head()

Unnamed: 0,title,duration,content_length
7,Sankofa,125 min,125.0
9,The Starling,104 min,104.0
12,Je Suis Karl,127 min,127.0
22,Avvai Shanmughi,161 min,161.0
24,Jeans,166 min,166.0


In [38]:
# creating 'content_length' column from duration
netflix['content_length'] = netflix['duration'].str.extract(r'(\d+)').astype(float)

In [39]:
# checking content_length from duration column
netflix[['title', 'duration', 'content_length']].head(10)

Unnamed: 0,title,duration,content_length
0,Dick Johnson Is Dead,90 min,90.0
1,Blood & Water,2 Seasons,2.0
2,Ganglands,1 Season,1.0
3,Jailbirds New Orleans,1 Season,1.0
4,Kota Factory,2 Seasons,2.0
5,Midnight Mass,1 Season,1.0
6,My Little Pony: A New Generation,91 min,91.0
7,Sankofa,125 min,125.0
8,The Great British Baking Show,9 Seasons,9.0
9,The Starling,104 min,104.0


In [41]:
# sorting the dataset by release_year in descending order

netflix_sorted = netflix.sort_values(by = 'release_year', ascending = False)

In [42]:
# checking the dataset sorted
netflix_sorted[['title', 'release_year']].head(10)

Unnamed: 0,title,release_year
693,Ali & Ratu Ratu Queens,2021
781,Black Holes | The Edge of All We Know,2021
762,Sweet & Sour,2021
763,Sweet Tooth,2021
764,Trippin' with the Kandasamys,2021
765,Xtreme,2021
766,Alan Saldaña: Locked Up,2021
767,Creator's File: GOLD,2021
768,Dancing Queens,2021
770,Myriam Fares: The Journey,2021


Exporting the Cleaned Dataset

In [43]:
netflix.to_csv('netflix_cleaned.csv', index = False)