In [75]:
# import important libraries
import pandas as pd
from warnings import filterwarnings
filterwarnings(action='ignore')

In [76]:
# Load data from data/processed data
df = pd.read_csv('../data/processed/loaded_data.csv')

In [77]:
# Find number of row in dataframe
print(f"The dataset contain {len(df)} data")

The dataset contain 8807 data


In [78]:
# find null in dataset
df.isnull().sum()

Unnamed: 0         0
show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [79]:
# Impute missing director, cast, and country with 'Unknown'
df['director'].fillna('Unknown', inplace=True)
df['cast'].fillna('Unknown', inplace=True)
df['country'].fillna('Not Specified', inplace=True)

# Impute missing 'rating' with the mode (most frequent value)
df['rating'].fillna(df['rating'].mode()[0], inplace=True)

# Impute missing 'duration' with the median
df['duration'].fillna(df['duration'].mode()[0], inplace=True)

# Impute missing 'date_added' with a placeholder (e.g., 'Not Added')
df['date_added'].fillna(df['date_added'].ffill(), inplace=True)


In [80]:
# find null in dataset
df.isnull().sum()

Unnamed: 0      0
show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64

In [81]:
# Drop duplicates in show_id if there is
df.drop_duplicates(subset=('show_id'), inplace=True)

In [82]:
# change date_added and year released column to datetime
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')
df['release_year'] = pd.to_datetime(df['release_year'], errors='coerce')

In [83]:
# drop unnamed column because has no use and its duplicate of index
df.drop(columns=(['Unnamed: 0']), inplace=True)

In [84]:
# Handling categorical data

# Remove rows where the 'rating' contains 'min' (duration entries)
df = df[~df['rating'].str.contains('min', na=False)]



In [85]:
df['rating'].unique()

array(['PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y', 'TV-Y7', 'R',
       'TV-G', 'G', 'NC-17', 'NR', 'TV-Y7-FV', 'UR'], dtype=object)

In [86]:
# Since movie and tv show have different duration we divide df then

In [87]:
df['cast'] = df['cast'].fillna('Unknown').str.split(',').apply(lambda x: [i.strip() for i in x])
df['director'] = df['director'].fillna('Unknown').str.split(',').apply(lambda x: [i.strip() for i in x])


In [91]:
# Set 'show_id' as the index of the DataFrame
df.set_index('show_id', inplace=True)

df.head()

Unnamed: 0_level_0,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
s1,Movie,Dick Johnson Is Dead,[Kirsten Johnson],[Unknown],United States,2021-09-25,1970-01-01 00:00:00.000002020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
s2,TV Show,Blood & Water,[Unknown],"[Ama Qamata, Khosi Ngema, Gail Mabalane, Thaba...",South Africa,2021-09-24,1970-01-01 00:00:00.000002021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
s3,TV Show,Ganglands,[Julien Leclercq],"[Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nab...",Not Specified,2021-09-24,1970-01-01 00:00:00.000002021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
s4,TV Show,Jailbirds New Orleans,[Unknown],[Unknown],Not Specified,2021-09-24,1970-01-01 00:00:00.000002021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
s5,TV Show,Kota Factory,[Unknown],"[Mayur More, Jitendra Kumar, Ranjan Raj, Alam ...",India,2021-09-24,1970-01-01 00:00:00.000002021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [94]:
movie_df = df[df['type'] == 'Movie'].drop(columns=(['type']))
show_df = df[df['type'] == 'TV Show'].drop(columns=(['type']))
print(f"The length of movie_df is {len(movie_df)} and length of show_df is {len(show_df)}")

The length of movie_df is 6128 and length of show_df is 2676


In [95]:
# saving processed dataset
df.to_csv('../data/processed/full_cleaned.csv')
movie_df.to_csv('../data/processed/movie_cleaned.csv')
show_df.to_csv('../data/processed/show_cleaned.csv')