<a href="https://colab.research.google.com/github/Divyanshupatil/Netflix_Content_Analysis/blob/main/Netflix_Content_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Loading

In [None]:
import pandas as pd

# Read the Netflix catalogue from a CSV path relative to the working directory
df = pd.read_csv(filepath_or_buffer='netflix_titles.csv')

# Preview the first five records (head() defaults to n=5)
print(df.head())

# Count the missing entries in each column
print(df.isna().sum())


  show_id     type                  title         director  \
0      s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  TV Show          Blood & Water              NaN   
2      s3  TV Show              Ganglands  Julien Leclercq   
3      s4  TV Show  Jailbirds New Orleans              NaN   
4      s5  TV Show           Kota Factory              NaN   

                                                cast        country  \
0                                                NaN  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...            NaN   
3                                                NaN            NaN   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

           date_added  release_year rating   duration  \
0  September 25, 2021          2020  PG-13     90 min   
1  September 24, 2021          2021  TV-MA  2 Seasons   
2  September 24, 2021        

# Data Cleaning

In [None]:
# Re-run guard: 'genres' (created further down in this cell) holds Python lists,
# which are unhashable and make drop_duplicates() raise. Remove it first;
# errors='ignore' turns this into a no-op on the first run, when the column
# does not exist yet.
df.drop(columns=['genres'], inplace=True, errors='ignore')
df.drop_duplicates(inplace=True)

# Convert 'date_added' to datetime. format='mixed' infers the format for each
# element individually (requires pandas >= 2.0).
df['date_added'] = pd.to_datetime(df['date_added'], format='mixed')

# Fill null values in the key text fields.
# The result is assigned back to the column on purpose: calling
# df[col].fillna(..., inplace=True) operates on a temporary selection, which
# emits a FutureWarning on recent pandas 2.x and leaves df unchanged once
# Copy-on-Write is enabled (the default in pandas 3.0).
df['director'] = df['director'].fillna('Not Available')
df['cast'] = df['cast'].fillna('Not Available')
df['country'] = df['country'].fillna('Unknown')

# Split 'listed_in' into a list of genres (one list per title)
df['genres'] = df['listed_in'].apply(lambda x: x.split(', '))

# Extract the year from 'date_added'
df['year_added'] = df['date_added'].dt.year



# Exploratory Data Analysis

In [None]:
from collections import Counter
from itertools import chain

# How many Movies vs TV Shows?
print(df['type'].value_counts())

# Top 10 values of the 'country' field. The 'Unknown' placeholder written by
# the cleaning step is counted like any other value.
# NOTE(review): if a title can list several comma-separated countries, each
# distinct combination is counted as its own entry here - confirm against the data.
print(df['country'].value_counts().head(10))

# Titles added per year, in chronological order
print(df['year_added'].value_counts().sort_index())

# Most common genres.
# Flatten the per-title genre lists with chain.from_iterable, which is linear
# in the total number of genres. sum(lists, []) builds a new, longer list at
# every step and is therefore quadratic.
all_genres = list(chain.from_iterable(df['genres']))
print(Counter(all_genres).most_common(10))


# Save Clean File

In [None]:
# Persist the cleaned frame without the row index.
# Note: list-valued cells (such as the 'genres' column) are written as their
# string representation, so they come back as plain strings when re-read.
df.to_csv(path_or_buf='netflix_titles_clean.csv', index=False)