In [3]:
# Load YAML configuration file
import yaml

with open("../config.yaml", "r") as file:
    config = yaml.safe_load(file)

# Load dataset with pandas
import pandas as pd
df = pd.read_csv(config['input_data']['file'])  # Load dataset based on YAML path
df.head()  # Show the first 5 rows of the dataset

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,duration_int,duration_type,genres
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,2021-09-25,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",90.0,min,['Documentaries']
1,s2,TV Show,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",2.0,season,"['International TV Shows', 'TV Dramas', 'TV My..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Unknown,2021-09-24,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,1.0,season,"['Crime TV Shows', 'International TV Shows', '..."
3,s4,TV Show,Jailbirds New Orleans,Unknown,Unknown,Unknown,2021-09-24,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",1.0,season,"['Docuseries', 'Reality TV']"
4,s5,TV Show,Kota Factory,Unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,2.0,season,"['International TV Shows', 'Romantic TV Shows'..."


In [26]:
# Explore structure and metadata
df.shape         # Check number of rows and columns
df.columns       # List all column names
df.info()        # Get data types and non-null counts per column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8706 entries, 0 to 8705
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   show_id        8706 non-null   object 
 1   type           8706 non-null   object 
 2   title          8706 non-null   object 
 3   director       8706 non-null   object 
 4   cast           8706 non-null   object 
 5   country        8704 non-null   object 
 6   date_added     8706 non-null   object 
 7   release_year   8706 non-null   int64  
 8   rating         8706 non-null   object 
 9   duration       8706 non-null   object 
 10  listed_in      8706 non-null   object 
 11  description    8706 non-null   object 
 12  duration_int   8706 non-null   float64
 13  duration_type  8706 non-null   object 
 14  genres         8706 non-null   object 
dtypes: float64(1), int64(1), object(13)
memory usage: 1020.4+ KB


In [28]:
# Check missing values in descending order
df.isnull().sum().sort_values(ascending=False)

country          2
show_id          0
type             0
title            0
director         0
cast             0
date_added       0
release_year     0
rating           0
duration         0
listed_in        0
description      0
duration_int     0
duration_type    0
genres           0
dtype: int64

In [30]:
# Number of unique values per column
df.nunique().sort_values()

# Distribution of key categorical columns
df['type'].value_counts(normalize=True)  # Distribution of 'Movie' vs 'TV Show'
df['country'].value_counts().head(10)    # Top 10 countries
df['rating'].value_counts().head(10)     # Most common ratings

rating
TV-MA      3183
TV-14      2133
TV-PG       838
R           799
PG-13       490
TV-Y7       335
TV-Y        300
PG          287
TV-G        212
Unrated      85
Name: count, dtype: int64

In [32]:
# View a sample of the genres column
df['listed_in'].sample(10)

8494                  International TV Shows, TV Dramas
6799                       Dramas, International Movies
1009                           Children & Family Movies
2490    Comedies, International Movies, Romantic Movies
1617                                           Comedies
1184           Action & Adventure, International Movies
5799                       British TV Shows, Docuseries
935                  Children & Family Movies, Comedies
310                          Crime TV Shows, Docuseries
6810                       Action & Adventure, Comedies
Name: listed_in, dtype: object

In [34]:
# Check unique countries
df['country'].unique()[:20]

array(['United States', 'South Africa', 'Unknown', 'India',
       'United Kingdom', 'Germany', 'Mexico', 'Turkey', 'Australia',
       'Finland', 'China', 'Nigeria', 'Japan', 'Spain', 'France',
       'Belgium', 'South Korea', 'Argentina', 'Russia', 'Canada'],
      dtype=object)

In [36]:
# Split 'duration' column into duration value and type (e.g., "90 min" → 90 + 'min')
df[['duration_int', 'duration_type']] = df['duration'].str.extract(r'(\d+)\s*(\w+)')
df['duration_int'] = pd.to_numeric(df['duration_int'], errors='coerce')  # Convert to numeric
df[['duration', 'duration_int', 'duration_type']].head()  # Preview new columns

Unnamed: 0,duration,duration_int,duration_type
0,90 min,90,min
1,2 Seasons,2,Seasons
2,1 Season,1,Season
3,1 Season,1,Season
4,2 Seasons,2,Seasons


In [38]:
# Convert 'date_added' column to datetime format
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce') # Convert to datetime
df['date_added'].dtype  # Confirm conversion

dtype('<M8[ns]')

In [40]:
# Split genres into lists (for multi-label classification)
df['genres'] = df['listed_in'].str.split(', ') # Split genre strings into lists
df[['listed_in', 'genres']].sample(5)  # Check results

Unnamed: 0,listed_in,genres
6184,"Documentaries, Sports Movies","[Documentaries, Sports Movies]"
7806,Anime Series,[Anime Series]
4381,"Romantic TV Shows, Spanish-Language TV Shows, ...","[Romantic TV Shows, Spanish-Language TV Shows,..."
1939,"Comedies, Dramas, International Movies","[Comedies, Dramas, International Movies]"
6426,"Comedies, Dramas, International Movies","[Comedies, Dramas, International Movies]"


In [42]:
# Frequency of duration types (e.g., 'min', 'Season')
df['duration_type'].value_counts() # Check types of duration (minutes or seasons)

# Standardize plural form to singular ('Seasons' → 'Season')
df['duration_type'] = df['duration_type'].replace('Seasons', 'Season') # Normalize to singular
df['duration_type'].value_counts()  # Confirm correction

duration_type
min       6128
Season    2578
Name: count, dtype: int64

In [44]:
# Check for missing values in 'duration_int'
df['duration_int'].isnull().sum()  # Check how many null durations remain
# Keep only rows where 'duration_int' is not null
df = df[df['duration_int'].notna()]  # Filter out rows without valid duration
# Inspect rows where duration was missing
df[df['duration_int'].isna()][['duration']]


Unnamed: 0,duration


In [46]:
# Remove duplicated rows based on 'show_id', 'title', and 'release_year'
df = df.drop_duplicates(subset=['show_id', 'title', 'release_year'])  # Remove duplicates
print("Number of rows after removing duplicates:", df.shape[0])  # Check number of rows after deduplication

Number of rows after removing duplicates: 8706


In [48]:
# Confirm that duplicates are removed
duplicates = df.duplicated(subset=['show_id', 'title', 'release_year']).sum()  # Count remaining duplicates
print("Remaining duplicates:", duplicates) 

Remaining duplicates: 0


In [50]:
# Check missing values again per column (after cleaning)
df.isnull().sum().sort_values(ascending=False)

country          2
show_id          0
type             0
title            0
director         0
cast             0
date_added       0
release_year     0
rating           0
duration         0
listed_in        0
description      0
duration_int     0
duration_type    0
genres           0
dtype: int64

In [52]:
# Fill missing values with appropriate defaults
# 1. Replace missing 'director' with "Unknown"
df['director'] = df['director'].fillna("Unknown")
# 2. Replace missing 'cast' with "Unknown"
df['cast'] = df['cast'].fillna("Unknown")
# 3. Replace missing 'country' with "Unknown"
df['country'] = df['country'].fillna("Unknown")
# 4. Drop rows where 'date_added' is missing
df = df[df['date_added'].notna()]
# 5. Replace missing 'rating' with "Not Rated"
df['rating'] = df['rating'].fillna("Not Rated")

In [54]:
# Confirm again that no missing values remain after filling
df.isnull().sum().sort_values(ascending=False)

show_id          0
type             0
title            0
director         0
cast             0
country          0
date_added       0
release_year     0
rating           0
duration         0
listed_in        0
description      0
duration_int     0
duration_type    0
genres           0
dtype: int64

In [56]:
# Define a function to extract the first country if there are multiple listed
def extract_first_country(country):
    if isinstance(country, str):
        return country.split(',')[0]  # Keep only the first country
    return country

# Apply the function to create a cleaned 'country' column
df['country'] = df['country'].apply(extract_first_country)

In [58]:
# Quick check of distributions on key columns
print(df['duration_type'].value_counts())  # Check distribution of 'duration_type'
print(df['type'].value_counts())           # Check distribution of 'type'
print(df['rating'].value_counts())         # Check distribution of 'rating'
print(df['country'].value_counts().head(20))  # Check top 20 most common countries

duration_type
min       6128
Season    2578
Name: count, dtype: int64
type
Movie      6128
TV Show    2578
Name: count, dtype: int64
rating
TV-MA      3183
TV-14      2133
TV-PG       838
R           799
PG-13       490
TV-Y7       335
TV-Y        300
PG          287
TV-G        212
Unrated      85
G            41
NC-17         3
Name: count, dtype: int64
country
United States     3166
India             1007
Unknown            829
United Kingdom     611
Canada             259
Japan              255
France             209
South Korea        206
Spain              177
Mexico             134
Australia          115
Egypt              112
Turkey             111
Germany            103
China              100
Nigeria             96
Indonesia           85
Taiwan              85
Brazil              84
Philippines         80
Name: count, dtype: int64


In [60]:
# Normalize inconsistent 'rating' labels into unified categories
df['rating'] = df['rating'].replace({
    'NR': 'Unrated',
    'UR': 'Unrated',
    'Not Rated': 'Unrated',
    'UNRATED': 'Unrated',
    'TV-Y7-FV': 'TV-Y7'  # Merge variant of TV-Y7
})

In [62]:
# Standardize 'duration_type' to lowercase for consistency
df['duration_type'] = df['duration_type'].str.lower()

In [64]:
# Check the format and range of 'date_added' column
print("Date format:", df['date_added'].dtype)  # Confirm 'date_added' is datetime
print("Earliest date:", df['date_added'].min())  # Check earliest date
print("Latest date:", df['date_added'].max())    # Check latest date
df['date_added'].sample(10, random_state=1)     # Preview 10 random samples

Date format: datetime64[ns]
Earliest date: 2008-01-01 00:00:00
Latest date: 2021-09-25 00:00:00


1600   2020-12-03
1829   2020-10-19
2567   2020-05-02
311    2021-08-04
1933   2020-09-30
8432   2019-06-06
2385   2020-06-15
5755   2016-10-12
397    2021-07-23
4697   2018-08-17
Name: date_added, dtype: datetime64[ns]

In [66]:
# Calculate the average duration by 'duration_type' (e.g., average movie length)
avg_duration = df.groupby('duration_type')['duration_int'].mean().reset_index()
print("Average duration per type:")
print(avg_duration)

Average duration per type:
  duration_type  duration_int
0           min     99.577187
1        season      1.688518


In [68]:
# Count number of titles per country (using the cleaned 'country' column)
country_counts = df['country'].value_counts().reset_index()
country_counts.columns = ['country', 'count']
print("\nNumber of titles by country:")
print(country_counts.head(10))


Number of titles by country:
          country  count
0   United States   3166
1           India   1007
2         Unknown    829
3  United Kingdom    611
4          Canada    259
5           Japan    255
6          France    209
7     South Korea    206
8           Spain    177
9          Mexico    134


In [70]:
# Explode the 'genres' column and count most common genres
df_exploded = df.explode('genres')  # Separate multiple genres into individual rows
genre_counts = df_exploded['genres'].value_counts().reset_index()  # Count occurrences of each genre
genre_counts.columns = ['genre', 'count']  # Rename columns for clarity
print("\nMost common genres:")
print(genre_counts.head(10))  # Show top 10 genres


Most common genres:
                      genre  count
0      International Movies   2752
1                    Dramas   2427
2                  Comedies   1674
3    International TV Shows   1328
4             Documentaries    869
5        Action & Adventure    859
6        Independent Movies    756
7                 TV Dramas    739
8  Children & Family Movies    641
9           Romantic Movies    616


In [72]:
# Count number of titles by release year
titles_per_year = df['release_year'].value_counts().sort_index().reset_index()
titles_per_year.columns = ['release_year', 'count']
print("\n🔹 Titles by release year:")
print(titles_per_year.tail(10))  # Show most recent years


🔹 Titles by release year:
    release_year  count
64          2012    229
65          2013    282
66          2014    343
67          2015    548
68          2016    878
69          2017   1015
70          2018   1140
71          2019   1030
72          2020    953
73          2021    592


In [81]:
#Export cleaned DataFrame to CSV for SQL import or future use
df['platform'] = 'Netflix'
df.to_csv('../data/clean/cleaned_netflix_data_with_platform.csv', index=False) #Save


In [83]:
print(df.columns)
print(df[['show_id', 'title', 'platform']].head())

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description',
       'duration_int', 'duration_type', 'genres', 'platform'],
      dtype='object')
  show_id                  title platform
0      s1   Dick Johnson Is Dead  Netflix
1      s2          Blood & Water  Netflix
2      s3              Ganglands  Netflix
3      s4  Jailbirds New Orleans  Netflix
4      s5           Kota Factory  Netflix


In [76]:
import os
print("File updated successfully:", os.path.getmtime('../data/clean/cleaned_netflix_data_with_platform.csv'))


File updated successfully: 1745323920.0718448
