# Data Cleaning Pipeline

This notebook executes the data cleaning and merging process using the `data_cleaning` module.

In [69]:
import os 
import pandas as pd

import pathlib
import numpy as np

from tqdm import tqdm

In [70]:
from data_cleaning.process_charts import process_all_charts
from data_cleaning.merge import merge_data
from data_cleaning.clean_songs import (
    list_weekly_chart_files,
    extract_dates_from_filenames,
    summarize_weekly_date_gaps,
    create_song_dict,
    update_song_rows_with_dict,
    fill_with_proxy_dict_compat,
    fill_missing_from_dfs,
)

DATA_DIR = "data"

weekly_charts_path = os.path.join(DATA_DIR, "bronze", "data")
tracks_path = os.path.join(DATA_DIR, "bronze", "tracks.csv")
songs_path = os.path.join(DATA_DIR, "bronze", "combined_songs.csv")
output_path = os.path.join(DATA_DIR, "silver", "songs_with_features.csv")

print("Starting data processing...")
process_all_charts(weekly_charts_path, songs_path)

print("Data processing complete.")

print("Starting data merging...")
merge_data(tracks_path, songs_path, output_path)
print("Data merging complete.")

if os.path.exists(output_path):
    songs = pd.read_csv(output_path)
    print("Songs loaded successfully. ({:_} rows)".format(songs.shape[0]))
else:
    raise FileNotFoundError("Error: Output path does not exist.")

Starting data processing...
✓ File saved as: data/bronze/combined_songs.csv
Data processing complete.
Starting data merging...
✓ Merged data saved to: data/silver/songs_with_features.csv
Data merging complete.
Songs loaded successfully. (41_995 rows)


## Verification
Check if no week was skipped during the webscraping.

In [71]:
# Example usage of verification helpers from data_cleaning.clean_songs
files = list_weekly_chart_files(weekly_charts_path)
dates = extract_dates_from_filenames(files)
summarize_weekly_date_gaps(dates)

First date: 2016-12-29
Last date: 2020-12-31
Total files: 210
Expected weeks: 210

Missing weeks:

Unexpected extra dates:


In [72]:
if "name" in songs.columns:
    # flag where name not NaN and track_name is NaN
    print("Rows where name is not included in track_name: {}/{:_}".format(songs[songs["name"].notna() & songs["track_name"].isna()].shape[0], songs.shape[0]))
    
    # Drop the "name" column as it is included in "track_name"
    songs.drop(columns=["name"], inplace=True)
    print("Column 'name' dropped successfully.")

Rows where name is not included in track_name: 0/41_995
Column 'name' dropped successfully.


## Clean songs

In [73]:
songs.columns, songs.shape

(Index(['track_id', 'artist_names', 'track_name', 'source', 'week_date',
        'popularity', 'duration_ms', 'explicit', 'artists', 'id_artists',
        'release_date', 'danceability', 'energy', 'key', 'loudness', 'mode',
        'speechiness', 'acousticness', 'instrumentalness', 'liveness',
        'valence', 'tempo', 'time_signature'],
       dtype='object'),
 (41995, 23))

In [74]:
# Create a dictionary of canonical song IDs using helper from data_cleaning.clean_songs
song_dict = create_song_dict(songs)
print(song_dict[("The Weeknd", "Blinding Lights")])

Processing rows:   0%|          | 0/41995 [00:00<?, ?it/s]

Processing rows: 100%|██████████| 41995/41995 [00:00<00:00, 59077.00it/s]

['0VjIjW4GlUZAMYd2vXMi3b', 'Republic Records', Timestamp('2020-03-20 00:00:00')]





In [75]:
# Apply the update function from data_cleaning.clean_songs
songs = update_song_rows_with_dict(songs, song_dict)

Updating songs: 100%|██████████| 41995/41995 [00:05<00:00, 7387.32it/s]

Number of songs updated: 6_458/41_995





In [76]:
# Fill missing values in columns of interest using helpers from data_cleaning.clean_songs
columns_to_fill = [
    'artist_names', 'track_name', 'source', 'duration_ms', 'explicit', 
    'popularity', 'artists', 'id_artists', 'release_date', 'danceability', 'energy', 
    'key', 'loudness', 'mode', 'speechiness', 'acousticness', 
    'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature'
]

songs = fill_with_proxy_dict_compat(songs, columns_to_fill)


Number of rows filled: 11_069


In [77]:
print("There are still {:_} rows with NaN values".format(songs[songs.isna().any(axis=1)].shape[0]))

There are still 7_404 rows with NaN values


## Enrich still missing values

In [78]:
df_enrichment2_path = os.path.join(DATA_DIR, "bronze", "spotify_top_songs_audio_features.csv")
df_enrichment2 = pd.read_csv(df_enrichment2_path)
if "id" in df_enrichment2.columns:
    # Replace id with track_id
    df_enrichment2.rename(columns={"id": "track_id"}, inplace=True)
df_enrichment2.shape, df_enrichment2.columns 

((6513, 19),
 Index(['track_id', 'artist_names', 'track_name', 'source', 'key', 'mode',
        'time_signature', 'danceability', 'energy', 'speechiness',
        'acousticness', 'instrumentalness', 'liveness', 'valence', 'loudness',
        'tempo', 'duration_ms', 'weeks_on_chart', 'streams'],
       dtype='object'))

In [79]:
# List all dataframes in a kaggle_enrichment3_dir, then add them together to it is one big dataframe
kaggle_enrichment3_dir = os.path.join(DATA_DIR, "bronze", "kaggle_enrichment3")
import glob

# List all CSV files in the kaggle_enrichment3_dir
csv_files = glob.glob(os.path.join(kaggle_enrichment3_dir, "*.csv"))

# Read each CSV file into a DataFrame and collect them in a list
df_list = [pd.read_csv(f) for f in csv_files]

# Concatenate all DataFrames into a single big DataFrame
df_enrichment3 = pd.concat(df_list, ignore_index=True)

# Show shape and columns to confirm final structure
df_enrichment3.shape, df_enrichment3.columns


((247035, 17),
 Index(['artist_name', 'track_id', 'track_name', 'acousticness', 'danceability',
        'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness',
        'loudness', 'mode', 'speechiness', 'tempo', 'time_signature', 'valence',
        'popularity'],
       dtype='object'))

In [80]:
set(df_enrichment2.columns).difference(set(columns_to_fill)), set(df_enrichment3.columns).difference(set(columns_to_fill))

({'streams', 'track_id', 'weeks_on_chart'}, {'artist_name', 'track_id'})

In [81]:
# Enrich missing values from external enrichment DataFrames using helper from data_cleaning.clean_songs
songs_gold = fill_missing_from_dfs(songs, columns_to_fill, "track_id", df_enrichment2, df_enrichment3)

Total missing values *before* processing DF: 125_826
Available columns: ['artist_names', 'track_name', 'source', 'duration_ms', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']
Size of lookup dictionary : 6513


Enriching songs: 100%|██████████| 41995/41995 [00:11<00:00, 3533.69it/s]


Available columns: ['track_name', 'duration_ms', 'popularity', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']
Size of lookup dictionary : 130989


Enriching songs: 100%|██████████| 41995/41995 [00:03<00:00, 12609.75it/s]

Total missing values *after* processing DF: 33_889





In [82]:
# save songs_gold
if "popularity" in songs_gold.columns:
    songs_gold.drop(columns=["popularity"], inplace=True)
songs_gold.to_csv(os.path.join(DATA_DIR, "gold", "songs_with_features.csv"), index=False)

In [83]:
# Show rows with NaN values in songs_gold
songs_gold[songs_gold.isna().any(axis=1)]

Unnamed: 0,track_id,artist_names,track_name,source,week_date,duration_ms,explicit,artists,id_artists,release_date,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
5,25sgk305KZfyuqVBQIahim,Ava Max,Sweet but Psycho,Atlantic Records,2019-01-17,187436.0,,,,,...,C#/Db,-4.724,Major,0.0476,0.0691,0.000000,0.1660,0.628,133.002,4 beats
11,7wFybC8jBH3zE139OpCtpG,"Gesaffelstein, The Weeknd",Lost in the Fire (feat. The Weeknd),Columbia,2019-01-17,202093.0,,,,,...,D,-12.159,Major,0.0359,0.0863,0.001330,0.1170,0.176,101.004,4 beats
56,0jAfdqv18goRTUxm3ilRjb,"A Boogie Wit da Hoodie, Tyga, Offset",Startender (feat. Offset and Tyga),Highbridge the Label / Atlantic Records,2019-01-17,192779.0,,,,,...,F#/Gb,-4.653,Minor,0.1330,0.0235,0.000000,0.1510,0.506,191.971,4 beats
57,13hvHEstJ4sNbzdroPrPI3,"Dua Lipa, BLACKPINK",Kiss and Make Up,Warner Records,2019-01-17,190560.0,0.0,"['Dua Lipa', 'BLACKPINK']","['6M2wZ9GZgrQXHCFfjv46we', '41MozSoPIsD1dJM0CL...",,...,8.0,-4.383,1.0,0.1460,0.0557,0.000000,0.1890,0.630,99.986,4.0
62,2FUNBaa5DwItJtYEBgAblU,21 Savage,monster,"Slaughter Gang, LLC/Epic Records",2019-01-17,233040.0,,,,,...,A,-6.916,Minor,0.1240,0.1580,0.000228,0.1180,0.224,134.022,4 beats
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41943,7K7MUBCnzgBAvMVW2RTWNs,"Loud Luxury, Brando",Body,Armada Music,2019-06-13,163216.0,,,,,...,C#/Db,-4.399,Major,0.0380,0.0476,0.000094,0.0543,0.582,121.958,4 beats
41951,6fxVffaTuwjgEk5h9QyRjy,Ed Sheeran,Photograph,Atlantic Records UK,2019-06-13,258987.0,0.0,['Ed Sheeran'],['6eUKZXaKkcviH0Ku9w2n3V'],,...,4.0,-10.480,1.0,0.0476,0.6070,0.000464,0.0986,0.201,107.989,4.0
41978,5s8LepdwU0THzpd0M7nLsa,Ozuna,Te Soñé de Nuevo,Aura Music Corp.,2019-06-13,199813.0,,,,,...,G,-2.867,Minor,0.1070,0.0533,0.000000,0.0936,0.773,168.040,4 beats
41986,4Sokm1cWK36H2WctWWRGf1,Ufo361,Irina Shayk,Stay High,2019-06-13,147452.0,,,,,...,D,-9.504,Minor,0.0696,0.1660,0.020900,0.1700,0.201,74.869,4 beats
