<a href="https://colab.research.google.com/github/Aashir-Aqeel/netflix-data-analysis/blob/main/Netflix_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Dependencies and Load Data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Apply the ggplot theme so the figures below share one look
plt.style.use("ggplot")

# Read the dataset CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer="/content/netflix_professional_dataset.csv")

# Quick sanity check: dimensions first, then the leading rows
print("Shape:", df.shape)
print(df.head(n=5))


# Data Cleaning

In [2]:
# Remove rows that repeat an earlier row in every column (first occurrence is kept)
df = df.drop_duplicates(subset=None, keep="first")

In [3]:
# Replace missing entries with explicit placeholders, one per column;
# columns not listed here are left untouched
df = df.fillna(
    value={
        "Country": "Unknown",
        "Rating": "Unrated",
    }
)

In [4]:
# Convert Duration from text of the form "<n> min" to an integer number of minutes.
# Casting to str first keeps this cell safe to re-run: once the column is already
# int, the .str accessor would otherwise raise AttributeError.
# Values that are still not plain integers after stripping " min" (including
# missing ones) raise ValueError rather than being silently coerced.
df['Duration'] = (
    df['Duration']
    .astype(str)
    .str.replace(" min", "", regex=False)
    .astype(int)
)

# Content Type Distribution

In [None]:
# Count how often each 'Listed_In' value occurs and draw the counts as bars.
# NOTE(review): the section heading and x-label say "Type" while the column
# plotted is 'Listed_In' — confirm this is the intended column.
category_counts = df['Listed_In'].value_counts()

fig, ax = plt.subplots(figsize=(6, 5))
category_counts.plot.bar(ax=ax, color=['steelblue', 'orange'])
ax.set_title("Distribution of Content Categories")
ax.set_xlabel("Type")
ax.set_ylabel("Count")
plt.show()

# Country-wise Content



In [None]:
# Ten most frequent Country values, most common first.
# Note: missing countries were filled with "Unknown" during cleaning, so that
# placeholder is counted like any other value and may appear in this chart.
top_countries = df['Country'].value_counts().iloc[:10]

fig, ax = plt.subplots(figsize=(10, 6))
# rot=45 tilts the x tick labels so longer country names stay readable
top_countries.plot.bar(ax=ax, color='skyblue', rot=45)
ax.set_title("Top 10 Countries Producing Netflix Content")
ax.set_xlabel("Country")
ax.set_ylabel("Count")
plt.show()

# Rating Distribution (Pie Chart)


In [None]:
# Share of titles per rating, drawn as a pie with one-decimal percentage labels
rating_counts = df['Rating'].value_counts()

fig, ax = plt.subplots(figsize=(8, 8))
rating_counts.plot.pie(ax=ax, autopct="%1.1f%%", startangle=90)
ax.set_title("Content Ratings Distribution")
plt.show()

# Movie Duration Distribution (Histogram)

In [None]:
# Keep only the movie rows; the scatter-plot cell further down reuses movies_df.
# The original filter indexed df["Categorye"], which looks like a misspelling of
# "Category". Prefer the correctly spelled column when it exists and fall back to
# the original name otherwise, so either CSV header keeps working.
# NOTE(review): confirm the real header name and drop the fallback.
category_col = "Category" if "Category" in df.columns else "Categorye"
movies_df = df[df[category_col] == "Movie"]

# Histogram of movie durations in 20 equal-width bins
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(movies_df['Duration'], bins=20, color='lightgreen', edgecolor="black")
ax.set_title("Distribution of Movie Durations")
ax.set_xlabel("Duration (Minutes)")
ax.set_ylabel("Frequency")
plt.show()

# Scatter Plot (Release Year vs Duration)

In [None]:
# One point per movie: release year on x, duration on y.
# alpha=0.5 lets overlapping points show through as darker regions.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(
    movies_df['Release_Year'],
    movies_df['Duration'],
    alpha=0.5,
    color="purple",
)
ax.set(
    title="Movie Duration vs Release Year",
    xlabel="Release Year",
    ylabel="Duration (Minutes)",
)
plt.show()

In [None]:
# Pairwise Pearson correlation matrix for release year and duration.
# NOTE(review): this uses every row of df, whereas the scatter plot above uses
# movies_df only — confirm that mixing all content here is intended.
year_vs_duration = df[['Release_Year', 'Duration']]
print("\nCorrelation between Release_Year and Duration:")
print(year_vs_duration.corr(method="pearson"))