In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("netflix_titles.csv")

In [3]:
df.head(3)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...


# Data Preprocessing Or Data Cleaning

In [4]:
# Check for null values
print(df.isnull().sum().sum())

4307


In [5]:
# Check for duplicated rows
print(df.duplicated().sum().sum())

0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [23]:
df = df["date_added"].dropna(inplace=True)

In [8]:
df['date_added'] = pd.to_datetime(df['date_added'])

ValueError: time data " August 4, 2017" doesn't match format "%B %d, %Y", at position 1442. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [10]:
df['date_added'] = df['date_added'].dt.strftime("%Y-%m-%d")

AttributeError: Can only use .dt accessor with datetimelike values

In [None]:
df['Release_Year'] = df['Release_Date'].dt.year

In [None]:
df['Release_Month'] = df['Release_Date'].dt.strftime('%B')
df.head(1)

In [None]:
df = df.drop(['Overview', 'Poster_Url', 'Release_Date'], axis=1)

In [None]:
df['Original_Language'].value_counts().head(10)

data = {'Original_Language': ['en', 'ja', 'es', 'fr', 'ko', 'zh', 'it', 'cn', 'ru', 'de', 'pt']}
language = pd.DataFrame(data)
lang_map = {
    'en': 'English',
    'ja': 'Japanese',
    'es': 'Spanish',
    'fr': 'French',
    'ko': 'Korean',
    'zh': 'Chinese (Mandarin)',
    'it': 'Italian',
    'cn': 'Chinese (Simplified)',
    'ru': 'Russian',
    'de': 'German',
    'pt': 'Portuguese'
}
df['Original_Language'] = language['Original_Language'].replace(lang_map)

In [None]:
df.head(1)

In [None]:
def catigorize_columns(df, col, labels):
    category = [
        df[col].min(),               
        df[col].quantile(0.25),     
        df[col].quantile(0.50),     
        df[col].quantile(0.75),
        df[col].max() + 0.1 
    ]
    df['Vote_Avg'] = pd.cut(df[col], bins=category, labels=labels, include_lowest=True, duplicates='drop')
    return df

In [None]:
labels = ['Not_Popular', 'Below_Avg', 'Average', 'Popular']

df = catigorize_columns(df, 'Vote_Average', labels)
df.head(2)

In [None]:
df.nunique()

# EDA (Exploratory Data Analysis)

In [None]:
sns.set_style('whitegrid')

In [None]:
sns.countplot(data=df, x='Vote_Avg', palette='magma')
plt.title('Distribution of Vote Averages', fontsize=16)
plt.xlabel('Vote Average (Rounded)', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.show()

In [None]:
data = df.select_dtypes(include=['int64', 'float64'])

plt.figure(figsize=(8,4))
sns.heatmap(data.corr(), annot=True, cmap='coolwarm')

In [None]:
plt.figure(figsize=(10,5))
sns.pairplot(data)
plt.show()

# Data Visualization

## Most frequent genre of movies released on Netflix

In [None]:
genre = df['Genre'].value_counts(sort=True).head(12)
freq_genre = pd.DataFrame(genre)
freq_genre.columns = ['total_freq']
freq_genre = freq_genre.reset_index()
freq_genre.columns = ['genre', 'total_freq']

In [None]:
freq_genre['total_freq'][0]

freq_genre['percentage'] = (freq_genre['total_freq'] / freq_genre['total_freq'].sum()) * 100
freq_genre.head(5)

In [None]:
plt.figure(figsize=(9,5))
sns.barplot(data=freq_genre, x='genre', y = 'total_freq', palette="magma")
plt.title('Distribution of Genre Based On Frequency', fontsize=16)
plt.xlabel('Genre Name', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.xticks(rotation=60)
plt.show()

## Which has highest votes avg column?

In [None]:
df.head(3)

In [None]:
sns.catplot(y='Vote_Avg', data=df, kind='count',
    order=df['Vote_Avg'].value_counts().index, palette='magma')
plt.title('Distribution Of Votes', fontsize=16)
plt.xlabel('Votes Types', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.show()

## What movie got the highest popularity? What's its genre?

In [None]:
df.head(2)

In [None]:
df[df['Popularity'] == df['Popularity'].max()]

## What movie got the lowest popularity? What's its genre?

In [None]:
df[df['Popularity'] == df['Popularity'].min()]

## Which year has the most filmmed movies?

In [None]:
year = df['Release_Year'].value_counts().head(10)
release_year = pd.DataFrame(year)
release_year = release_year.reset_index()

release_year.columns = ['year', 'count']


In [None]:
plt.figure(figsize=(10, 6))
explode = [0.1 if i == release_year['count'].max() else 0 for i in release_year['count']]
plt.pie(
    release_year['count'],
    labels=release_year['year'],
    autopct='%1.1f%%',            
    startangle=140,              
    explode=explode,              
    shadow=True                  
)
plt.title('Movie Releases By Year', fontsize=14)
plt.axis('equal') 
plt.tight_layout()
plt.show()

## Total Movie Count per Release Month by Average Vote

In [None]:
month_order = ['January', 'February', 'March', 'April', 'May', 'June', 
               'July', 'August', 'September', 'October', 'November', 'December']
plt.figure(figsize=(15,7))
sns.countplot(data=df, x='Release_Month', hue='Vote_Avg', order=month_order, palette='magma')
plt.title('Movie Count per Release Month by Average Vote', fontsize=16)
plt.xlabel('Release Month', fontsize=12)
plt.ylabel('Number of Movies', fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## Top 10 movies based on their popularity release on the Netflix

In [None]:
top_10_movies = df.sort_values(by='Popularity', ascending=False).head(10)
top_10_movies[['Title', 'Popularity']].head(5)

In [None]:
plt.figure(figsize=(10,5))
sns.barplot(data=top_10_movies, x='Title', y='Popularity', palette='magma')
plt.title('Top 10 movies Based on their Popularity', fontsize=16)
plt.xlabel('Movies Title/Names', fontsize=12)
plt.ylabel('Popularity', fontsize=12)
plt.xticks(rotation=90)
plt.show()

### Q1: What is the most frequent genre in the dataset?
#### Drama genre is the most frequent genre in our dataset and has appeared more than 18.92% of the times among 19 other genres.

### Q2: What genres has highest votes?
#### We have 25.5% of our dataset with not popular vote (2560 rows). Drama again gets the highest popularity among fans by being having more than 18.5% of movies popularities.

### Q3: What movie got the highest popularity ? what's its genre?
#### Spider-Man; No Way Home has the highest popularity rate in our dataset and it has genres of Action , Adventure and Sience Fiction .

### Q4: What movie got the lowest popularity ? what's its genre?
#### The united states, thread' has the highest lowest rate in our dataset 'sci-fi' and history'. and it has genres of music , drama 'war'

### Q5: Which year has the most filmmed movies?
#### Year 2020 has the highest filmming rate in our dataset.