In [None]:
import pandas as pd


# Location of the Netflix titles CSV inside the Kaggle input directory.
NETFLIX_CSV_PATH = '/kaggle/input/netflix-titles-dataset-20082021/netflix1.csv'
data = pd.read_csv(NETFLIX_CSV_PATH)

# Preview the first rows (rendered as the cell output).
data.head()

# 📊 Netflix Data Cleaning and Visualization

This notebook explores a dataset of Netflix titles from 2008 to 2021. We'll perform data cleaning and EDA (exploratory data analysis) using Python.


In [None]:
# Tally missing values per column before any cleaning is applied.
missing_per_column = data.isnull().sum()
print(missing_per_column)

In [None]:
# Remove fully duplicated rows in place, keeping the first occurrence of each.
data.drop_duplicates(keep='first', inplace=True)

In [None]:
# Parse the date_added column into pandas datetime values.
parsed_dates = pd.to_datetime(data['date_added'])
data['date_added'] = parsed_dates

In [None]:
# Show the dtype of every column.
column_types = data.dtypes
print(column_types)

In [None]:
# Split date_added into separate year / month / day columns,
# reusing a single datetime accessor for all three.
added_on = data['date_added'].dt
data['year_added'] = added_on.year
data['month_added'] = added_on.month
data['day_added'] = added_on.day

**Count Movies vs. TV Shows**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of how many rows fall under each value of the `type` column.
type_ax = sns.countplot(x='type', data=data, palette='Set2')
type_ax.set_title("Distribution of Content Types")
type_ax.set_xlabel("Type")
type_ax.set_ylabel("Count")
plt.show()

**Analyze Top 10 Countries**

In [None]:
# The ten most frequent values of the country column, with their row counts.
top_countries = data['country'].value_counts().iloc[:10]

# Counts go on the x-axis and country names on the y-axis.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_countries.values, y=top_countries.index, palette='flare', ax=ax)
ax.set_title("Top 10 Countries with Most Content on Netflix")
ax.set_xlabel("Number of Titles")
ax.set_ylabel("Country")
plt.show()

**Content Distribution: Movies vs TV Shows**

In [None]:
# Number of rows per content type; the pie slices are drawn from these counts.
type_counts = data['type'].value_counts()

fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    type_counts,
    labels=type_counts.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=['#66b3ff', '#ff9999'],
)
ax.set_title("Netflix Content Distribution: Movies vs TV Shows")
# Equal axis scaling keeps the pie circular.
ax.axis('equal')
plt.show()

**Analyze Ratings**

In [None]:
# Number of rows per rating label, most frequent first.
rating_counts = data['rating'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=rating_counts.index, y=rating_counts.values, palette='muted', ax=ax)
ax.set_title('Distribution of Content Ratings')
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
# Tilt the tick labels so the rating names do not overlap.
plt.xticks(rotation=45)
plt.show()

**Visualize Content Added Over Time**

In [None]:
# One bar per year_added value, showing how many rows were added in that year.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=data, x='year_added', palette='coolwarm', ax=ax)
ax.set_title("Content Added Over the Years")
ax.set_xlabel("Year")
ax.set_ylabel("Number of Titles")
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

**Monthly Trends for Movies vs TV Shows**

In [None]:
# Rows added per month number, split by content type and ordered by month.
is_movie = data['type'] == 'Movie'
is_tv_show = data['type'] == 'TV Show'
monthly_movie = data.loc[is_movie, 'month_added'].value_counts().sort_index()
monthly_tv = data.loc[is_tv_show, 'month_added'].value_counts().sort_index()

month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# One line per content type on a shared axis.
fig, ax = plt.subplots()
ax.plot(monthly_movie.index, monthly_movie.values, label='Movies')
ax.plot(monthly_tv.index, monthly_tv.values, label='TV Shows')
# Replace the numeric month ticks (1-12) with abbreviated month names.
ax.set_xticks(range(1, 13))
ax.set_xticklabels(month_labels)
ax.set_title("Monthly Content Release Trend")
ax.set_xlabel("Month")
ax.set_ylabel("Count")
ax.legend()
ax.grid(True)
plt.show()

In [None]:
from wordcloud import WordCloud

# Collect the titles of all rows whose type is 'Movie'. Missing titles are
# dropped and the rest coerced to str, so ' '.join() below cannot raise
# TypeError on NaN or other non-string values.
movie_titles = data.loc[data['type'] == 'Movie', 'title'].dropna().astype(str)

# Concatenate every title into one text blob and build the word cloud from it.
title_text = ' '.join(movie_titles)
wordcloud = WordCloud(width=800, height=400, background_color='black').generate(title_text)

# Render the word cloud as an image; the axes are hidden because they carry no meaning here.
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title("Word Cloud of Movie Titles")
plt.show()

## 📌 Conclusion & Insights

- Netflix has a higher number of movies than TV shows.
- The largest number of titles originates from the United States and India.
- TV-MA and TV-14 are the most common content ratings.
- Content addition peaked between 2018 and 2020.
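- Genres like Dramas, International, and Comedies appear to dominate Netflix's library (note: no genre breakdown is computed in this notebook, so treat this as an unverified observation).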
