NETFLIX DATASET DATA ANALYSIS

In [17]:
import seaborn as sns
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import ast

In [3]:
df = pd.read_csv('titles.csv')

In [4]:
df.head()

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,ts300399,Five Came Back: The Reference Films,SHOW,This collection includes 12 World War II-era p...,1945,TV-MA,51,['documentation'],['US'],1.0,,,,0.6,
1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",['US'],,tt0075314,8.2,808582.0,40.965,8.179
2,tm154986,Deliverance,MOVIE,Intent on seeing the Cahulawassee River before...,1972,R,109,"['drama', 'action', 'thriller', 'european']",['US'],,tt0068473,7.7,107673.0,10.01,7.3
3,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,"['fantasy', 'action', 'comedy']",['GB'],,tt0071853,8.2,534486.0,15.461,7.811
4,tm120801,The Dirty Dozen,MOVIE,12 American military prisoners in World War II...,1967,,150,"['war', 'action']","['GB', 'US']",,tt0061578,7.7,72662.0,20.398,7.6


In [5]:
df.isnull().sum()

id                         0
title                      1
type                       0
description               18
release_year               0
age_certification       2619
runtime                    0
genres                     0
production_countries       0
seasons                 3744
imdb_id                  403
imdb_score               482
imdb_votes               498
tmdb_popularity           91
tmdb_score               311
dtype: int64

In [6]:
df.shape

(5850, 15)

DATA CLEANING PROCESS COLUMN BY COLUMN

In [7]:
#  --- Cleaning: `id` and `title` ---
print("\nProcessing 'id' and 'title'...")
# Check for nulls
if df['id'].isnull().any() or df['title'].isnull().any():
    print("Found missing values in 'id' or 'title'. Dropping rows...")
    df.dropna(subset=['id', 'title'], inplace=True)
# Check for duplicates
if df['id'].duplicated().any():
    print("Found duplicate IDs. Dropping duplicates...")
    df.drop_duplicates(subset=['id'], keep='first', inplace=True)
print("'id' and 'title' cleaned.")


Processing 'id' and 'title'...
Found missing values in 'id' or 'title'. Dropping rows...
'id' and 'title' cleaned.


In [9]:
df.shape

(5849, 15)

In [10]:
# --- Cleaning: `type` ---
print("\nProcessing 'type'...")
print(f"Unique values in 'type' before cleaning: {df['type'].unique()}")


Processing 'type'...
Unique values in 'type' before cleaning: ['SHOW' 'MOVIE']


In [32]:
df.loc[:,'description'].fillna('No description available', inplace=True)

In [33]:
df.loc[:,'age_certification'].fillna('Not Rated', inplace=True)
df.loc[:,'age_certification'] = df['age_certification'].replace({'NC-17': 'R', 'TV-Y7-FV': 'TV-Y7'})

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.loc[:,'age_certification'].fillna('Not Rated', inplace=True)


In [12]:
# --- Cleaning: `age_certification` ---
print("\nProcessing 'age_certification'...")
df['age_certification'].fillna('Not Rated', inplace=True)
# Consolidate similar ratings
df['age_certification'] = df['age_certification'].replace({'NC-17': 'R', 'TV-Y7-FV': 'TV-Y7'})
print(f"Age certifications standardized. Unique values now: {df['age_certification'].unique()}")


Processing 'age_certification'...
Age certifications standardized. Unique values now: ['TV-MA' 'R' 'PG' 'Not Rated' 'TV-14' 'PG-13' 'TV-PG' 'TV-Y' 'TV-G'
 'TV-Y7' 'G']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age_certification'].fillna('Not Rated', inplace=True)


In [18]:
# --- Cleaning: `genres` and `production_countries` ---
print("\nProcessing 'genres' and 'production_countries'...")
# Safely convert string representations of lists into actual lists
def safe_literal_eval(s):
    try:
        # It's already a list for some reason in some environments, check type first
        if isinstance(s, list):
            return s
        return ast.literal_eval(s)
    except (ValueError, SyntaxError):
        return [] # Return an empty list if parsing fails
df['genres'] = df['genres'].apply(safe_literal_eval)
df['production_countries'] = df['production_countries'].apply(safe_literal_eval)
print("'genres' and 'production_countries' converted to list objects.")


Processing 'genres' and 'production_countries'...
'genres' and 'production_countries' converted to list objects.


In [19]:
# --- Cleaning: `seasons` ---
print("\nProcessing 'seasons'...")
# Fill NaN for movies with 0, as they don't have seasons.
# For shows, NaN can mean data is missing, but 0 is a reasonable fill for consistency.
df['seasons'].fillna(0, inplace=True)
df['seasons'] = df['seasons'].astype(int)
print("'seasons' NaNs filled and column converted to integer.")


Processing 'seasons'...
'seasons' NaNs filled and column converted to integer.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['seasons'].fillna(0, inplace=True)


In [27]:
# Fill NaN in 'seasons' with 0 for movies and shows for consistency
df['seasons'] = df['seasons'].fillna(0).astype(int)

In [20]:
# --- Step 1: Load and Clean the Data for imdb---
try:
    # Load the dataset
    df = pd.read_csv('titles.csv')
    print(f"Successfully loaded {len(df)} rows.")

    # Drop rows where 'imdb_id' is missing, as it's essential for this task
    initial_rows = len(df)
    df.dropna(subset=['imdb_id'], inplace=True)
    print(f"Dropped {initial_rows - len(df)} rows with missing IMDb IDs.")
    print(f"Remaining rows: {len(df)}")

except FileNotFoundError:
    print("Error: titles.csv not found. Please ensure the file is in the correct directory.")
    exit()


Successfully loaded 5850 rows.
Dropped 403 rows with missing IMDb IDs.
Remaining rows: 5447


In [21]:
# --- Step 2: Convert 'imdb_id' to a Numeric Score (1-10 Scale) ---
print("\n--- 2. Converting IMDb ID to a Numeric Rating ---")

# Extract the numeric part of the ID and convert to an integer
# The 'tt' prefix is removed, and the remaining string is converted to a number.
# Errors are coerced to NaN, which are then dropped.
df['imdb_id_numeric'] = pd.to_numeric(df['imdb_id'].str.replace('tt', ''), errors='coerce')
df.dropna(subset=['imdb_id_numeric'], inplace=True)
df['imdb_id_numeric'] = df['imdb_id_numeric'].astype(int)

print("Extracted numeric part from IMDb IDs.")


--- 2. Converting IMDb ID to a Numeric Rating ---
Extracted numeric part from IMDb IDs.


In [22]:
# Normalize the numeric ID to a scale of 1 to 10
min_id = df['imdb_id_numeric'].min()
max_id = df['imdb_id_numeric'].max()

# Min-Max normalization formula: new_value = 1 + ( (value - min) * 9 / (max - min) )
# We scale from 1 to 10.
df['imdb_id_rating'] = 1 + ((df['imdb_id_numeric'] - min_id) * 9 / (max_id - min_id))

print("Created 'imdb_id_rating' by normalizing the numeric ID to a 1-10 scale.")


Created 'imdb_id_rating' by normalizing the numeric ID to a 1-10 scale.


In [28]:
df.head()

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,imdb_id_numeric,imdb_id_rating
1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",['US'],0,tt0075314,8.2,808582,40.965,8.179,75314,1.01181
2,tm154986,Deliverance,MOVIE,Intent on seeing the Cahulawassee River before...,1972,R,109,"['drama', 'action', 'thriller', 'european']",['US'],0,tt0068473,7.7,107673,10.01,7.3,68473,1.008905
3,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,"['fantasy', 'action', 'comedy']",['GB'],0,tt0071853,8.2,534486,15.461,7.811,71853,1.010341
4,tm120801,The Dirty Dozen,MOVIE,12 American military prisoners in World War II...,1967,,150,"['war', 'action']","['GB', 'US']",0,tt0061578,7.7,72662,20.398,7.6,61578,1.005978
5,ts22164,Monty Python's Flying Circus,SHOW,A British sketch comedy series with the shows ...,1969,TV-14,30,"['comedy', 'european']",['GB'],4,tt0063929,8.8,73424,17.617,8.306,63929,1.006976


In [24]:
# --- Cleaning: `imdb_score`, `imdb_votes`, `tmdb_popularity`, `tmdb_score` ---
# Fill remaining numerical NaNs with the median for robustness against outliers
for col in ['imdb_score', 'imdb_votes', 'tmdb_popularity', 'tmdb_score']:
    if df[col].isnull().any():
        median_val = df[col].median()
        df[col].fillna(median_val, inplace=True)
        print(f"Filled NaNs in '{col}' with median value ({median_val}).")

# Convert imdb_votes to integer
df['imdb_votes'] = df['imdb_votes'].astype(int)
print("'imdb_votes' converted to integer.")

Filled NaNs in 'imdb_score' with median value (6.6).
Filled NaNs in 'imdb_votes' with median value (2233.5).
Filled NaNs in 'tmdb_popularity' with median value (7.1105).
Filled NaNs in 'tmdb_score' with median value (6.9).
'imdb_votes' converted to integer.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always beha

In [34]:
print("\n--- Data Cleaning Complete ---")
print("Final dataset summary:")
df.info()



--- Data Cleaning Complete ---
Final dataset summary:
<class 'pandas.core.frame.DataFrame'>
Index: 5447 entries, 1 to 5849
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    5447 non-null   object 
 1   title                 5446 non-null   object 
 2   type                  5447 non-null   object 
 3   description           5447 non-null   object 
 4   release_year          5447 non-null   int64  
 5   age_certification     5447 non-null   object 
 6   runtime               5447 non-null   int64  
 7   genres                5447 non-null   object 
 8   production_countries  5447 non-null   object 
 9   seasons               5447 non-null   int64  
 10  imdb_id               5447 non-null   object 
 11  imdb_score            5447 non-null   float64
 12  imdb_votes            5447 non-null   int64  
 13  tmdb_popularity       5447 non-null   float64
 14  tmdb_score            

In [35]:
df.isnull().sum()

id                      0
title                   1
type                    0
description             0
release_year            0
age_certification       0
runtime                 0
genres                  0
production_countries    0
seasons                 0
imdb_id                 0
imdb_score              0
imdb_votes              0
tmdb_popularity         0
tmdb_score              0
imdb_id_numeric         0
imdb_id_rating          0
dtype: int64