In [15]:
# Load YAML configuration file
import yaml

with open("../config.yaml", "r") as file:
    config = yaml.safe_load(file)

# Load dataset with pandas
import pandas as pd
df = pd.read_csv(config['input_data']['file'])  # Load dataset based on YAML path
df.head()  # Show the first 5 rows of the dataset

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [16]:
# Explore structure and metadata
df.shape         # Check number of rows and columns
df.columns       # List all column names
df.info()        # Get data types and non-null counts per column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [17]:
# Check missing values in descending order
df.isnull().sum().sort_values(ascending=False)

director        2634
country          831
cast             825
date_added        10
rating             4
duration           3
show_id            0
type               0
title              0
release_year       0
listed_in          0
description        0
dtype: int64

In [18]:
# Number of unique values per column
df.nunique().sort_values()

# Distribution of key categorical columns
df['type'].value_counts(normalize=True)  # Distribution of 'Movie' vs 'TV Show'
df['country'].value_counts().head(10)    # Top 10 countries
df['rating'].value_counts().head(10)     # Most common ratings



rating
TV-MA    3207
TV-14    2160
TV-PG     863
R         799
PG-13     490
TV-Y7     334
TV-Y      307
PG        287
TV-G      220
NR         80
Name: count, dtype: int64

In [19]:
# View a sample of the genres column
df['listed_in'].sample(10)

2317                 Action & Adventure, Sci-Fi & Fantasy
6236                             Children & Family Movies
6394               Action & Adventure, Independent Movies
1576                   Children & Family Movies, Comedies
6249                                             Kids' TV
5433    International TV Shows, Korean TV Shows, Roman...
2914      Comedies, International Movies, Romantic Movies
7587                              Dramas, Romantic Movies
6730                           Dramas, Independent Movies
4455     Action & Adventure, Dramas, International Movies
Name: listed_in, dtype: object

In [20]:
# Check unique countries
df['country'].unique()[:20]

array(['United States', 'South Africa', nan, 'India',
       'United States, Ghana, Burkina Faso, United Kingdom, Germany, Ethiopia',
       'United Kingdom', 'Germany, Czech Republic', 'Mexico', 'Turkey',
       'Australia', 'United States, India, France', 'Finland',
       'China, Canada, United States',
       'South Africa, United States, Japan', 'Nigeria', 'Japan',
       'Spain, United States', 'France', 'Belgium',
       'United Kingdom, United States'], dtype=object)

In [21]:
# Split 'duration' column into duration value and type (e.g., "90 min" → 90 + 'min')
df[['duration_int', 'duration_type']] = df['duration'].str.extract(r'(\d+)\s*(\w+)')
df['duration_int'] = pd.to_numeric(df['duration_int'], errors='coerce')  # Convert to numeric
df[['duration', 'duration_int', 'duration_type']].head()  # Preview new columns

Unnamed: 0,duration,duration_int,duration_type
0,90 min,90.0,min
1,2 Seasons,2.0,Seasons
2,1 Season,1.0,Season
3,1 Season,1.0,Season
4,2 Seasons,2.0,Seasons


In [22]:
# Convert 'date_added' column to datetime format
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce') # Convert to datetime
df['date_added'].dtype  # Confirm conversion

dtype('<M8[ns]')

In [23]:
# Split genres into lists (for multi-label classification)
df['genres'] = df['listed_in'].str.split(', ') # Split genre strings into lists
df[['listed_in', 'genres']].sample(5)  # Check results

Unnamed: 0,listed_in,genres
3852,"Crime TV Shows, International TV Shows, Korean...","[Crime TV Shows, International TV Shows, Korea..."
1462,"Kids' TV, TV Comedies","[Kids' TV, TV Comedies]"
7564,"Action & Adventure, Children & Family Movies","[Action & Adventure, Children & Family Movies]"
71,Children & Family Movies,[Children & Family Movies]
3696,"Anime Series, International TV Shows","[Anime Series, International TV Shows]"


In [24]:
# Frequency of duration types (e.g., 'min', 'Season')
df['duration_type'].value_counts() # Check types of duration (minutes or seasons)

# Standardize plural form to singular ('Seasons' → 'Season')
df['duration_type'] = df['duration_type'].replace('Seasons', 'Season') # Normalize to singular
df['duration_type'].value_counts()  # Confirm correction

duration_type
min       6128
Season    2676
Name: count, dtype: int64

In [25]:
# Check for missing values in 'duration_int'
df['duration_int'].isnull().sum()  # Check how many null durations remain
# Keep only rows where 'duration_int' is not null
df = df[df['duration_int'].notna()]  # Filter out rows without valid duration
# Inspect rows where duration was missing
df[df['duration_int'].isna()][['duration']]


Unnamed: 0,duration


In [26]:
# Remove duplicated rows based on 'show_id', 'title', and 'release_year'
df = df.drop_duplicates(subset=['show_id', 'title', 'release_year'])  # Remove duplicates
print("Number of rows after removing duplicates:", df.shape[0])  # Check number of rows after deduplication

Number of rows after removing duplicates: 8804


In [27]:
# Confirm that duplicates are removed
duplicates = df.duplicated(subset=['show_id', 'title', 'release_year']).sum()  # Count remaining duplicates
print("Remaining duplicates:", duplicates) 

Remaining duplicates: 0


In [28]:
# Check missing values again per column (after cleaning)
df.isnull().sum().sort_values(ascending=False)

director         2634
country           831
cast              825
date_added         98
rating              4
show_id             0
type                0
title               0
release_year        0
duration            0
listed_in           0
description         0
duration_int        0
duration_type       0
genres              0
dtype: int64

In [29]:
# Fill missing values with appropriate defaults
# 1. Replace missing 'director' with "Unknown"
df['director'] = df['director'].fillna("Unknown")
# 2. Replace missing 'cast' with "Unknown"
df['cast'] = df['cast'].fillna("Unknown")
# 3. Replace missing 'country' with "Unknown"
df['country'] = df['country'].fillna("Unknown")
# 4. Drop rows where 'date_added' is missing
df = df[df['date_added'].notna()]
# 5. Replace missing 'rating' with "Not Rated"
df['rating'] = df['rating'].fillna("Not Rated")

In [30]:
# Confirm again that no missing values remain after filling
df.isnull().sum().sort_values(ascending=False)

show_id          0
type             0
title            0
director         0
cast             0
country          0
date_added       0
release_year     0
rating           0
duration         0
listed_in        0
description      0
duration_int     0
duration_type    0
genres           0
dtype: int64

In [31]:
# Define a function to extract the first country if there are multiple listed
def extract_first_country(country):
    if isinstance(country, str):
        return country.split(',')[0]  # Keep only the first country
    return country

# Apply the function to create a cleaned 'country' column
df['country'] = df['country'].apply(extract_first_country)

In [32]:
# Quick check of distributions on key columns
print(df['duration_type'].value_counts())  # Check distribution of 'duration_type'
print(df['type'].value_counts())           # Check distribution of 'type'
print(df['rating'].value_counts())         # Check distribution of 'rating'
print(df['country'].value_counts().head(20))  # Check top 20 most common countries

duration_type
min       6128
Season    2578
Name: count, dtype: int64
type
Movie      6128
TV Show    2578
Name: count, dtype: int64
rating
TV-MA        3183
TV-14        2133
TV-PG         838
R             799
PG-13         490
TV-Y7         330
TV-Y          300
PG            287
TV-G          212
NR             78
G              41
TV-Y7-FV        5
Not Rated       4
NC-17           3
UR              3
Name: count, dtype: int64
country
United States     3166
India             1007
Unknown            827
United Kingdom     611
Canada             259
Japan              255
France             209
South Korea        206
Spain              177
Mexico             134
Australia          115
Egypt              112
Turkey             111
Germany            103
China              100
Nigeria             96
Indonesia           85
Taiwan              85
Brazil              84
Philippines         80
Name: count, dtype: int64


In [33]:
# Normalize inconsistent 'rating' labels into unified categories
df['rating'] = df['rating'].replace({
    'NR': 'Unrated',
    'UR': 'Unrated',
    'Not Rated': 'Unrated',
    'UNRATED': 'Unrated',
    'TV-Y7-FV': 'TV-Y7'  # Merge variant of TV-Y7
})

In [67]:
# Standardize 'duration_type' to lowercase for consistency
df['duration_type'] = df['duration_type'].str.lower()

In [35]:
# Check the format and range of 'date_added' column
print("Date format:", df['date_added'].dtype)  # Confirm 'date_added' is datetime
print("Earliest date:", df['date_added'].min())  # Check earliest date
print("Latest date:", df['date_added'].max())    # Check latest date
df['date_added'].sample(10, random_state=1)     # Preview 10 random samples

Date format: datetime64[ns]
Earliest date: 2008-01-01 00:00:00
Latest date: 2021-09-25 00:00:00


1600   2020-12-03
1829   2020-10-19
2567   2020-05-02
311    2021-08-04
1933   2020-09-30
8528   2019-06-06
2385   2020-06-15
5756   2016-10-12
397    2021-07-23
4697   2018-08-17
Name: date_added, dtype: datetime64[ns]

In [36]:
# Calculate the average duration by 'duration_type' (e.g., average movie length)
avg_duration = df.groupby('duration_type')['duration_int'].mean().reset_index()
print("Average duration per type:")
print(avg_duration)

Average duration per type:
  duration_type  duration_int
0           min     99.577187
1        season      1.688518


In [37]:
# Count number of titles per country (using the cleaned 'country' column)
country_counts = df['country'].value_counts().reset_index()
country_counts.columns = ['country', 'count']
print("\nNumber of titles by country:")
print(country_counts.head(10))


Number of titles by country:
          country  count
0   United States   3166
1           India   1007
2         Unknown    827
3  United Kingdom    611
4          Canada    259
5           Japan    255
6          France    209
7     South Korea    206
8           Spain    177
9          Mexico    134


In [38]:
# Explode the 'genres' column and count most common genres
df_exploded = df.explode('genres')  # Separate multiple genres into individual rows
genre_counts = df_exploded['genres'].value_counts().reset_index()  # Count occurrences of each genre
genre_counts.columns = ['genre', 'count']  # Rename columns for clarity
print("\nMost common genres:")
print(genre_counts.head(10))  # Show top 10 genres


Most common genres:
                      genre  count
0      International Movies   2752
1                    Dramas   2427
2                  Comedies   1674
3    International TV Shows   1328
4             Documentaries    869
5        Action & Adventure    859
6        Independent Movies    756
7                 TV Dramas    739
8  Children & Family Movies    641
9           Romantic Movies    616


In [39]:
# Count number of titles by release year
titles_per_year = df['release_year'].value_counts().sort_index().reset_index()
titles_per_year.columns = ['release_year', 'count']
print("\n🔹 Titles by release year:")
print(titles_per_year.tail(10))  # Show most recent years


🔹 Titles by release year:
    release_year  count
64          2012    229
65          2013    282
66          2014    343
67          2015    548
68          2016    878
69          2017   1015
70          2018   1140
71          2019   1030
72          2020    953
73          2021    592


In [40]:
#Export cleaned DataFrame to CSV for SQL import or future use
df.to_csv('../data/clean/cleaned_netflix_data.csv', index=False) #Save