In [195]:
#Load the dataset
import pandas as pd
df=pd.read_csv("netflix_titles.csv")
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [196]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [197]:
#Identify missing values
df.isnull().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [198]:
#Handle missing values
df["director"]=df["director"].fillna("Not Available")
df["cast"]=df["cast"].fillna("Not Available")
df["country"]=df["country"].fillna("Unknown")
df["rating"]=df["rating"].fillna("Not Rated")
df["date_added"] = pd.to_datetime(df["date_added"], errors="coerce")

In [199]:
df=df.dropna()

In [200]:
#Remove duplicates
df.duplicated().sum()
df = df.drop_duplicates()

In [201]:
#Standardize Text values
text_columns=["type","country","rating","listed_in"]
for col in text_columns:
    df[col]=df[col].str.strip().str.upper()
df["type"] = df["type"].replace({
    "movie": "Movie",
    "tv show": "TV Show"
})

In [202]:
#Convert data format to consistent type
df["date_added"] = df["date_added"].dt.strftime("%d-%m-%Y")

In [203]:
#Rename Column Headers
df.columns = df.columns.str.lower()
df.columns = df.columns.str.replace(" ", "_")

In [204]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8706 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8706 non-null   object
 1   type          8706 non-null   object
 2   title         8706 non-null   object
 3   director      8706 non-null   object
 4   cast          8706 non-null   object
 5   country       8706 non-null   object
 6   date_added    8706 non-null   object
 7   release_year  8706 non-null   int64 
 8   rating        8706 non-null   object
 9   duration      8706 non-null   object
 10  listed_in     8706 non-null   object
 11  description   8706 non-null   object
dtypes: int64(1), object(11)
memory usage: 884.2+ KB


In [205]:
df["date_added"] = pd.to_datetime(df["date_added"], errors="coerce",dayfirst=True)
df[["duration_value", "duration_unit"]] = df["duration"].str.split(" ", n=1, expand=True)
df["duration_value"] = pd.to_numeric(df["duration_value"], errors="coerce")
df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,duration_value,duration_unit
0,s1,MOVIE,Dick Johnson Is Dead,Kirsten Johnson,Not Available,UNITED STATES,2021-09-25,2020,PG-13,90 min,DOCUMENTARIES,"As her father nears the end of his life, filmm...",90,min
1,s2,TV SHOW,Blood & Water,Not Available,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",SOUTH AFRICA,2021-09-24,2021,TV-MA,2 Seasons,"INTERNATIONAL TV SHOWS, TV DRAMAS, TV MYSTERIES","After crossing paths at a party, a Cape Town t...",2,Seasons
2,s3,TV SHOW,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",UNKNOWN,2021-09-24,2021,TV-MA,1 Season,"CRIME TV SHOWS, INTERNATIONAL TV SHOWS, TV ACT...",To protect his family from a powerful drug lor...,1,Season
3,s4,TV SHOW,Jailbirds New Orleans,Not Available,Not Available,UNKNOWN,2021-09-24,2021,TV-MA,1 Season,"DOCUSERIES, REALITY TV","Feuds, flirtations and toilet talk go down amo...",1,Season
4,s5,TV SHOW,Kota Factory,Not Available,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",INDIA,2021-09-24,2021,TV-MA,2 Seasons,"INTERNATIONAL TV SHOWS, ROMANTIC TV SHOWS, TV ...",In a city of coaching centers known to train I...,2,Seasons
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,MOVIE,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",UNITED STATES,2019-11-20,2007,R,158 min,"CULT MOVIES, DRAMAS, THRILLERS","A political cartoonist, a crime reporter and a...",158,min
8803,s8804,TV SHOW,Zombie Dumb,Not Available,Not Available,UNKNOWN,2019-07-01,2018,TV-Y7,2 Seasons,"KIDS' TV, KOREAN TV SHOWS, TV COMEDIES","While living alone in a spooky town, a young g...",2,Seasons
8804,s8805,MOVIE,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",UNITED STATES,2019-11-01,2009,R,88 min,"COMEDIES, HORROR MOVIES",Looking to survive in a world taken over by zo...,88,min
8805,s8806,MOVIE,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",UNITED STATES,2020-01-11,2006,PG,88 min,"CHILDREN & FAMILY MOVIES, COMEDIES","Dragged from civilian life, a former superhero...",88,min


In [206]:
df.isnull().sum()
df.info()
df.shape

<class 'pandas.core.frame.DataFrame'>
Index: 8706 entries, 0 to 8806
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   show_id         8706 non-null   object        
 1   type            8706 non-null   object        
 2   title           8706 non-null   object        
 3   director        8706 non-null   object        
 4   cast            8706 non-null   object        
 5   country         8706 non-null   object        
 6   date_added      8706 non-null   datetime64[ns]
 7   release_year    8706 non-null   int64         
 8   rating          8706 non-null   object        
 9   duration        8706 non-null   object        
 10  listed_in       8706 non-null   object        
 11  description     8706 non-null   object        
 12  duration_value  8706 non-null   int64         
 13  duration_unit   8706 non-null   object        
dtypes: datetime64[ns](1), int64(2), object(11)
memory usage: 1020

(8706, 14)

In [207]:
#Cleaned dataset
df.to_csv("cleaned_netflix_titles.csv", index=False)