In [1]:
# Load YAML configuration file
import yaml

with open("../amazon_config.yaml", "r") as file:
    config = yaml.safe_load(file)

# Load dataset with pandas
import pandas as pd
df = pd.read_csv(config['input_data']['file'])  # Load dataset based on YAML path
df.head()  # Show the first 5 rows of the dataset

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,The Grand Seduction,Don McKellar,"Brendan Gleeson, Taylor Kitsch, Gordon Pinsent",Canada,"March 30, 2021",2014,,113 min,"Comedy, Drama",A small fishing village must procure a local d...
1,s2,Movie,Take Care Good Night,Girish Joshi,"Mahesh Manjrekar, Abhay Mahajan, Sachin Khedekar",India,"March 30, 2021",2018,13+,110 min,"Drama, International",A Metro Family decides to fight a Cyber Crimin...
2,s3,Movie,Secrets of Deception,Josh Webber,"Tom Sizemore, Lorenzo Lamas, Robert LaSardo, R...",United States,"March 30, 2021",2017,,74 min,"Action, Drama, Suspense",After a man discovers his wife is cheating on ...
3,s4,Movie,Pink: Staying True,Sonia Anderson,"Interviews with: Pink, Adele, Beyoncé, Britney...",United States,"March 30, 2021",2014,,69 min,Documentary,"Pink breaks the mold once again, bringing her ..."
4,s5,Movie,Monster Maker,Giles Foster,"Harry Dean Stanton, Kieran O'Brien, George Cos...",United Kingdom,"March 30, 2021",1989,,45 min,"Drama, Fantasy",Teenage Matt Banting wants to work with a famo...


In [3]:
# Explore structure and metadata
df.shape         # Check number of rows and columns
df.columns       # List all column names
df.info()        # Get data types and non-null counts per column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9668 entries, 0 to 9667
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       9668 non-null   object
 1   type          9668 non-null   object
 2   title         9668 non-null   object
 3   director      7585 non-null   object
 4   cast          8435 non-null   object
 5   country       672 non-null    object
 6   date_added    155 non-null    object
 7   release_year  9668 non-null   int64 
 8   rating        9331 non-null   object
 9   duration      9668 non-null   object
 10  listed_in     9668 non-null   object
 11  description   9668 non-null   object
dtypes: int64(1), object(11)
memory usage: 906.5+ KB


In [5]:
# Check missing values in descending order
df.isnull().sum().sort_values(ascending=False)

date_added      9513
country         8996
director        2083
cast            1233
rating           337
show_id            0
type               0
title              0
release_year       0
duration           0
listed_in          0
description        0
dtype: int64

In [7]:
# Number of unique values per column
df.nunique().sort_values()

# Distribution of key categorical columns
df['type'].value_counts(normalize=True)  # Distribution of 'Movie' vs 'TV Show'
df['country'].value_counts().head(10)    # Top 10 countries
df['rating'].value_counts().head(10)     # Most common ratings

rating
13+      2117
16+      1547
ALL      1268
18+      1243
R        1010
PG-13     393
7+        385
PG        253
NR        223
TV-14     208
Name: count, dtype: int64

In [9]:
# View a sample of the genres column
df['listed_in'].sample(10)

5282                                        Comedy, Drama
2478                                             TV Shows
2008                                       Drama, Western
7384                                    Suspense, Western
8044                                     Horror, Suspense
6618                        Documentary, Special Interest
1686    Arts, Entertainment, and Culture, Comedy, Spec...
2168                                          Documentary
6597                                                Drama
46                                       Special Interest
Name: listed_in, dtype: object

In [11]:
# Check unique countries
df['country'].unique()[:20]

array(['Canada', 'India', 'United States', 'United Kingdom', 'France',
       'Spain', nan, 'Italy', 'United Kingdom, France',
       'United States, Italy', 'United States, India',
       'United Kingdom, United States',
       'United States, United Kingdom, Germany', 'Japan',
       'China, United States, United Kingdom',
       'Denmark, United Kingdom, Czech Republic, Netherlands',
       'United States, Ireland', 'United States, United Kingdom, Canada',
       'United Kingdom, United States, India', 'United Kingdom, India'],
      dtype=object)

In [13]:
# Split 'duration' column into duration value and type (e.g., "90 min" → 90 + 'min')
df[['duration_int', 'duration_type']] = df['duration'].str.extract(r'(\d+)\s*(\w+)')
df['duration_int'] = pd.to_numeric(df['duration_int'], errors='coerce')  # Convert to numeric
df[['duration', 'duration_int', 'duration_type']].head()  # Preview new columns

Unnamed: 0,duration,duration_int,duration_type
0,113 min,113,min
1,110 min,110,min
2,74 min,74,min
3,69 min,69,min
4,45 min,45,min


In [15]:
# Convert 'date_added' column to datetime format
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce') # Convert to datetime
df['date_added'].dtype  # Confirm conversion

dtype('<M8[ns]')

In [17]:
# Split genres into lists (for multi-label classification)
df['genres'] = df['listed_in'].str.split(', ') # Split genre strings into lists
df[['listed_in', 'genres']].sample(5)  # Check results

Unnamed: 0,listed_in,genres
6441,"Drama, Suspense","[Drama, Suspense]"
5626,"Documentary, Special Interest","[Documentary, Special Interest]"
5083,"Animation, Kids","[Animation, Kids]"
5962,"Arts, Entertainment, and Culture, Comedy, Spec...","[Arts, Entertainment, and Culture, Comedy, Spe..."
7759,Comedy,[Comedy]


In [19]:
# Frequency of duration types (e.g., 'min', 'Season')
df['duration_type'].value_counts() # Check types of duration (minutes or seasons)

# Standardize plural form to singular ('Seasons' → 'Season')
df['duration_type'] = df['duration_type'].replace('Seasons', 'Season') # Normalize to singular
df['duration_type'].value_counts()  # Confirm correction

duration_type
min       7814
Season    1854
Name: count, dtype: int64

In [21]:
# Check for missing values in 'duration_int'
df['duration_int'].isnull().sum()  # Check how many null durations remain
# Keep only rows where 'duration_int' is not null
df = df[df['duration_int'].notna()]  # Filter out rows without valid duration
# Inspect rows where duration was missing
df[df['duration_int'].isna()][['duration']]


Unnamed: 0,duration


In [23]:
# Remove duplicated rows based on 'show_id', 'title', and 'release_year'
df = df.drop_duplicates(subset=['show_id', 'title', 'release_year'])  # Remove duplicates
print("Number of rows after removing duplicates:", df.shape[0])  # Check number of rows after deduplication

Number of rows after removing duplicates: 9668


In [25]:
# Confirm that duplicates are removed
duplicates = df.duplicated(subset=['show_id', 'title', 'release_year']).sum()  # Count remaining duplicates
print("Remaining duplicates:", duplicates) 

Remaining duplicates: 0


In [27]:
# Check missing values again per column (after cleaning)
df.isnull().sum().sort_values(ascending=False)

date_added       9513
country          8996
director         2083
cast             1233
rating            337
show_id             0
type                0
title               0
release_year        0
duration            0
listed_in           0
description         0
duration_int        0
duration_type       0
genres              0
dtype: int64

In [29]:
import numpy as np  # Asegúrate de importar numpy para poder usar np.nan

# Fill missing values with appropriate defaults

# 1. Replace missing 'director' with "Unknown"
df['director'] = df['director'].fillna("Unknown")

# 2. Replace missing 'cast' with "Unknown"
df['cast'] = df['cast'].fillna("Unknown")

# 3. Clean the 'country' column: replace empty strings or spaces with NaN, then fill with "Unknown"
df['country'] = df['country'].str.strip()  # Remove extra spaces
df['country'] = df['country'].replace(['', ' '], np.nan)  # Replace empty or space-only cells with NaN

# Check how many null values there are now in 'cou ntry'
print("Number of missing values in 'country' after cleaning:", df['country'].isnull().sum())

# Fill missing 'country' with "Unknown"
df['country'] = df['country'].fillna("Unknown")

# 4. Drop rows where 'date_added' is missing
df = df[df['date_added'].notna()]

# 5. Replace missing 'rating' with "Not Rated"
df['rating'] = df['rating'].fillna("Not Rated")

# Final check: confirm there are no missing values left
print("\nMissing values per column after filling:")
print(df.isnull().sum().sort_values(ascending=False))


Number of missing values in 'country' after cleaning: 8996

Missing values per column after filling:
show_id          0
type             0
title            0
director         0
cast             0
country          0
date_added       0
release_year     0
rating           0
duration         0
listed_in        0
description      0
duration_int     0
duration_type    0
genres           0
dtype: int64


In [31]:
# Confirm again that no missing values remain after filling
df.isnull().sum().sort_values(ascending=False)

show_id          0
type             0
title            0
director         0
cast             0
country          0
date_added       0
release_year     0
rating           0
duration         0
listed_in        0
description      0
duration_int     0
duration_type    0
genres           0
dtype: int64

In [64]:
# Define a function to extract the first country if there are multiple listed
def extract_first_country(country):
    if isinstance(country, str):
        return country.split(',')[0]  # Keep only the first country
    return country

# Apply the function to create a cleaned 'country' column
df['country'] = df['country'].apply(extract_first_country)

In [35]:
# Quick check of distributions on key columns
print(df['duration_type'].value_counts())  # Check distribution of 'duration_type'
print(df['type'].value_counts())           # Check distribution of 'type'
print(df['rating'].value_counts())         # Check distribution of 'rating'
print(df['country'].value_counts().head(20))  # Check top 20 most common countries

duration_type
Season    139
min        16
Name: count, dtype: int64
type
TV Show    139
Movie       16
Name: count, dtype: int64
rating
ALL          35
13+          20
Not Rated    15
16+          15
TV-G         13
TV-Y         10
TV-14        10
18+           8
7+            7
TV-NR         5
TV-MA         5
TV-Y7         5
TV-PG         4
NR            3
Name: count, dtype: int64
country
Unknown           129
United States      14
Canada              3
United Kingdom      3
Spain               3
India               2
France              1
Name: count, dtype: int64


In [37]:
# Normalize inconsistent 'rating' labels into unified categories
df['rating'] = df['rating'].replace({
    'NR': 'Unrated',
    'UR': 'Unrated',
    'Not Rated': 'Unrated',
    'UNRATED': 'Unrated',
    'TV-Y7-FV': 'TV-Y7'  # Merge variant of TV-Y7
})

In [39]:
# Standardize 'duration_type' to lowercase for consistency
df['duration_type'] = df['duration_type'].str.lower()

In [41]:
# Check the format and range of 'date_added' column
print("Date format:", df['date_added'].dtype)  # Confirm 'date_added' is datetime
print("Earliest date:", df['date_added'].min())  # Check earliest date
print("Latest date:", df['date_added'].max())    # Check latest date
df['date_added'].sample(10, random_state=1)     # Preview 10 random samples

Date format: datetime64[ns]
Earliest date: 2021-03-30 00:00:00
Latest date: 2021-10-10 00:00:00


8696   2021-09-17
6568   2021-07-17
5855   2021-06-15
4783   2021-05-01
5041   2021-05-12
5379   2021-05-24
14     2021-05-02
8437   2021-09-09
5920   2021-06-18
9401   2021-10-05
Name: date_added, dtype: datetime64[ns]

In [43]:
# Calculate the average duration by 'duration_type' (e.g., average movie length)
avg_duration = df.groupby('duration_type')['duration_int'].mean().reset_index()
print("Average duration per type:")
print(avg_duration)

Average duration per type:
  duration_type  duration_int
0           min     85.125000
1        season      3.071942


In [45]:
# Count number of titles per country (using the cleaned 'country' column)
country_counts = df['country'].value_counts().reset_index()
country_counts.columns = ['country', 'count']
print("\nNumber of titles by country:")
print(country_counts.head(10))


Number of titles by country:
          country  count
0         Unknown    129
1   United States     14
2          Canada      3
3  United Kingdom      3
4           Spain      3
5           India      2
6          France      1


In [47]:
# Explode the 'genres' column and count most common genres
df_exploded = df.explode('genres')  # Separate multiple genres into individual rows
genre_counts = df_exploded['genres'].value_counts().reset_index()  # Count occurrences of each genre
genre_counts.columns = ['genre', 'count']  # Rename columns for clarity
print("\nMost common genres:")
print(genre_counts.head(10))  # Show top 10 genres


Most common genres:
              genre  count
0              Kids     59
1         Animation     44
2            Comedy     30
3             Drama     30
4       Documentary     24
5            Action     18
6  Special Interest     17
7            Sports     13
8          Suspense     13
9        Unscripted     11


In [68]:
# Count number of titles by release year
titles_per_year = df['release_year'].value_counts().sort_index().reset_index()
titles_per_year.columns = ['release_year', 'count']
print("\n🔹 Titles by release year:")
print(titles_per_year.tail(10))  # Show most recent years


🔹 Titles by release year:
    release_year  count
15          2012      3
16          2013      4
17          2014      6
18          2015      5
19          2016      9
20          2017     10
21          2018     14
22          2019     12
23          2020     12
24          2021     53


In [74]:
#Export cleaned DataFrame to CSV for SQL import or future use
df['platform'] = 'Prime'
df.to_csv('../data/clean/cleaned_prime_data.csv', index=False) #Save
import pandas as pd
df = pd.read_csv('../data/clean/cleaned_prime_data.csv')
print(df['platform'].value_counts())
print(df.shape)

platform
Prime    155
Name: count, dtype: int64
(155, 16)


In [53]:
print(df.columns)
print(df[['show_id', 'title', 'platform']].head())

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description',
       'duration_int', 'duration_type', 'genres', 'platform'],
      dtype='object')
  show_id                 title platform
0      s1   The Grand Seduction    Prime
1      s2  Take Care Good Night    Prime
2      s3  Secrets of Deception    Prime
3      s4    Pink: Staying True    Prime
4      s5         Monster Maker    Prime


In [61]:
import os
print("File updated successfully:", os.path.getmtime('../data/clean/cleaned_prime_data.csv'))


File updated successfully: 1745333935.8988771
