In [2]:
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/netflix_titles.csv')
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


## Identify and handle missing values

### Subtask:
Use `.isnull()` to identify missing values and handle them appropriately.


In [3]:
print(df.isnull().sum())

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64


In [4]:
df['director'].fillna('Unknown', inplace=True)
df['cast'].fillna('Unknown', inplace=True)
df['country'].fillna('Unknown', inplace=True)

df.dropna(subset=['date_added', 'rating', 'duration'], inplace=True)

print(df.isnull().sum())

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['director'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['cast'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alw

## Remove duplicate rows

### Subtask:
Use `.drop_duplicates()` to remove duplicate rows.


In [5]:
num_duplicates_before = df.duplicated().sum()
print(f"Number of duplicate rows before removal: {num_duplicates_before}")

df.drop_duplicates(inplace=True)

num_duplicates_after = df.duplicated().sum()
print(f"Number of duplicate rows after removal: {num_duplicates_after}")

Number of duplicate rows before removal: 0
Number of duplicate rows after removal: 0


## Standardize text values

### Subtask:
Standardize text values in relevant columns.


In [6]:
for col in ['type', 'rating', 'listed_in']:
    df[col] = df[col].str.lower().str.strip()

display(df.head())

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,movie,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,"September 25, 2021",2020,pg-13,90 min,documentaries,"As her father nears the end of his life, filmm..."
1,s2,tv show,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,tv-ma,2 Seasons,"international tv shows, tv dramas, tv mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,tv show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Unknown,"September 24, 2021",2021,tv-ma,1 Season,"crime tv shows, international tv shows, tv act...",To protect his family from a powerful drug lor...
3,s4,tv show,Jailbirds New Orleans,Unknown,Unknown,Unknown,"September 24, 2021",2021,tv-ma,1 Season,"docuseries, reality tv","Feuds, flirtations and toilet talk go down amo..."
4,s5,tv show,Kota Factory,Unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,tv-ma,2 Seasons,"international tv shows, romantic tv shows, tv ...",In a city of coaching centers known to train I...


## Convert date format

### Subtask:
Convert the 'date_added' column to a consistent datetime format.


In [16]:
df['date_added'] = pd.to_datetime(df['date_added'])
print(df.dtypes)
display(df.head())

show_id                 object
type                    object
title                   object
director                object
cast                    object
country                 object
date_added      datetime64[ns]
release_year             int64
rating                  object
duration                object
listed_in               object
description             object
dtype: object


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,movie,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,2021-09-25,2020,pg-13,90 min,documentaries,"As her father nears the end of his life, filmm..."
1,s2,tv show,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,tv-ma,2 Seasons,"international tv shows, tv dramas, tv mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,tv show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Unknown,2021-09-24,2021,tv-ma,1 Season,"crime tv shows, international tv shows, tv act...",To protect his family from a powerful drug lor...
3,s4,tv show,Jailbirds New Orleans,Unknown,Unknown,Unknown,2021-09-24,2021,tv-ma,1 Season,"docuseries, reality tv","Feuds, flirtations and toilet talk go down amo..."
4,s5,tv show,Kota Factory,Unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,tv-ma,2 Seasons,"international tv shows, romantic tv shows, tv ...",In a city of coaching centers known to train I...


In [8]:
df['date_added'] = pd.to_datetime(df['date_added'], format='mixed')
print(df.dtypes)
display(df.head())

show_id                 object
type                    object
title                   object
director                object
cast                    object
country                 object
date_added      datetime64[ns]
release_year             int64
rating                  object
duration                object
listed_in               object
description             object
dtype: object


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,movie,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,2021-09-25,2020,pg-13,90 min,documentaries,"As her father nears the end of his life, filmm..."
1,s2,tv show,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,tv-ma,2 Seasons,"international tv shows, tv dramas, tv mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,tv show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Unknown,2021-09-24,2021,tv-ma,1 Season,"crime tv shows, international tv shows, tv act...",To protect his family from a powerful drug lor...
3,s4,tv show,Jailbirds New Orleans,Unknown,Unknown,Unknown,2021-09-24,2021,tv-ma,1 Season,"docuseries, reality tv","Feuds, flirtations and toilet talk go down amo..."
4,s5,tv show,Kota Factory,Unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,tv-ma,2 Seasons,"international tv shows, romantic tv shows, tv ...",In a city of coaching centers known to train I...


## Rename column headers

### Subtask:
Rename column headers to be clean and uniform.


In [9]:
column_mapping = {
    'show_id': 'show_id',
    'type': 'type',
    'title': 'title',
    'director': 'director',
    'cast': 'cast',
    'country': 'country',
    'date_added': 'date_added',
    'release_year': 'release_year',
    'rating': 'rating',
    'duration': 'duration',
    'listed_in': 'listed_in',
    'description': 'description'
}
df.rename(columns=column_mapping, inplace=True)
print(df.columns)

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')


## Check and fix data types

### Subtask:
Ensure columns have appropriate data types.


In [10]:
print(df.dtypes)

show_id                 object
type                    object
title                   object
director                object
cast                    object
country                 object
date_added      datetime64[ns]
release_year             int64
rating                  object
duration                object
listed_in               object
description             object
dtype: object


## Display cleaned data

### Subtask:
Display the first few rows of the cleaned DataFrame.


In [11]:
display(df.head())

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,movie,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,2021-09-25,2020,pg-13,90 min,documentaries,"As her father nears the end of his life, filmm..."
1,s2,tv show,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,tv-ma,2 Seasons,"international tv shows, tv dramas, tv mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,tv show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Unknown,2021-09-24,2021,tv-ma,1 Season,"crime tv shows, international tv shows, tv act...",To protect his family from a powerful drug lor...
3,s4,tv show,Jailbirds New Orleans,Unknown,Unknown,Unknown,2021-09-24,2021,tv-ma,1 Season,"docuseries, reality tv","Feuds, flirtations and toilet talk go down amo..."
4,s5,tv show,Kota Factory,Unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,tv-ma,2 Seasons,"international tv shows, romantic tv shows, tv ...",In a city of coaching centers known to train I...


In [13]:
# Save the cleaned DataFrame to a new CSV file
df.to_csv('/content/drive/MyDrive/cleaned_netflix_titles.csv', index=False)
