# Data Cleaning Pipeline

This notebook executes the data cleaning and merging process using the `data_cleaning` module.

In [3]:
import os 
import pandas as pd

import pathlib
import numpy as np

from tqdm import tqdm

In [4]:
from data_cleaning.process_charts import process_all_charts
from data_cleaning.merge import merge_data
from data_cleaning.clean_songs import (
    list_weekly_chart_files,
    extract_dates_from_filenames,
    summarize_weekly_date_gaps,
    create_song_dict,
    update_song_rows_with_dict,
    fill_with_proxy_dict_compat,
    fill_missing_from_dfs,
)

DATA_DIR = "data"

weekly_charts_path = os.path.join(DATA_DIR, "bronze", "data")
tracks_path = os.path.join(DATA_DIR, "bronze", "tracks.csv")
songs_path = os.path.join(DATA_DIR, "silver", "combined_songs.csv")
output_path = os.path.join(DATA_DIR, "silver", "songs_with_features.csv")

print("Starting data processing...")
process_all_charts(weekly_charts_path, songs_path)

silver_songs = pd.read_csv(songs_path)
print(silver_songs.isna().sum())
print(silver_songs.shape)

print("Data processing complete.")

print("Starting data merging...")
merge_data(tracks_path, songs_path, output_path)
print("Data merging complete.")

silver_songs_features = pd.read_csv(output_path)
print(silver_songs_features.isna().sum())
print(silver_songs_features.shape)

if os.path.exists(output_path):
    songs = pd.read_csv(output_path)
    print("Songs loaded successfully. ({:_} rows)".format(songs.shape[0]))
else:
    raise FileNotFoundError("Error: Output path does not exist.")

Starting data processing...
✓ File saved as: data/silver/combined_songs.csv
track_id        0
artist_names    0
track_name      0
source          0
streams         0
week_date       0
dtype: int64
(41995, 6)
Data processing complete.
Starting data merging...
✓ Merged data saved to: data/silver/songs_with_features.csv
Data merging complete.
track_id                0
artist_names            0
track_name              0
source                  0
streams                 0
week_date               0
name                10631
popularity          10631
duration_ms         10631
explicit            10631
artists             10631
id_artists          10631
release_date        10631
danceability        10631
energy              10631
key                 10631
loudness            10631
mode                10631
speechiness         10631
acousticness        10631
instrumentalness    10631
liveness            10631
valence             10631
tempo               10631
time_signature      10631
dtype: i

In [5]:
tracks_df = pd.read_csv(tracks_path)
print(tracks_df.isna().sum())
print(tracks_df.shape)


id                   0
name                71
popularity           0
duration_ms          0
explicit             0
artists              0
id_artists           0
release_date         0
danceability         0
energy               0
key                  0
loudness             0
mode                 0
speechiness          0
acousticness         0
instrumentalness     0
liveness             0
valence              0
tempo                0
time_signature       0
dtype: int64
(586672, 20)


In [6]:
# get number of "track_id" in songs that are not in "id" column of tracks_df
print(songs[~songs["track_id"].isin(tracks_df["id"])].shape[0])

10631


In [7]:
print(songs.isna().sum())


track_id                0
artist_names            0
track_name              0
source                  0
streams                 0
week_date               0
name                10631
popularity          10631
duration_ms         10631
explicit            10631
artists             10631
id_artists          10631
release_date        10631
danceability        10631
energy              10631
key                 10631
loudness            10631
mode                10631
speechiness         10631
acousticness        10631
instrumentalness    10631
liveness            10631
valence             10631
tempo               10631
time_signature      10631
dtype: int64


## Verification
Check if no week was skipped during the webscraping.

In [8]:
# Example usage of verification helpers from data_cleaning.clean_songs
files = list_weekly_chart_files(weekly_charts_path)
dates = extract_dates_from_filenames(files)
summarize_weekly_date_gaps(dates)

First date: 2016-12-29
Last date: 2020-12-31
Total files: 210
Expected weeks: 210

Missing weeks:

Unexpected extra dates:


In [9]:
if "name" in songs.columns:
    # flag where name not NaN and track_name is NaN
    print("Rows where name is not included in track_name: {}/{:_}".format(songs[songs["name"].notna() & songs["track_name"].isna()].shape[0], songs.shape[0]))
    
    # Drop the "name" column as it is included in "track_name"
    songs.drop(columns=["name"], inplace=True)
    print("Column 'name' dropped successfully.")

Rows where name is not included in track_name: 0/41_995
Column 'name' dropped successfully.


## Clean songs

In [10]:
songs.columns, songs.shape

(Index(['track_id', 'artist_names', 'track_name', 'source', 'streams',
        'week_date', 'popularity', 'duration_ms', 'explicit', 'artists',
        'id_artists', 'release_date', 'danceability', 'energy', 'key',
        'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
        'liveness', 'valence', 'tempo', 'time_signature'],
       dtype='object'),
 (41995, 24))

In [11]:
# Create a dictionary of canonical song IDs using helper from data_cleaning.clean_songs
song_dict = create_song_dict(songs)
print(song_dict[("The Weeknd", "Blinding Lights")])

Processing rows: 100%|██████████| 41995/41995 [00:00<00:00, 61281.36it/s]

['0VjIjW4GlUZAMYd2vXMi3b', 'Republic Records', Timestamp('2020-03-20 00:00:00')]





In [12]:
# Apply the update function from data_cleaning.clean_songs
songs = update_song_rows_with_dict(songs, song_dict)

Updating songs: 100%|██████████| 41995/41995 [00:04<00:00, 9227.33it/s]

Number of songs updated: 6_458/41_995





In [13]:
# Fill missing values in columns of interest using helpers from data_cleaning.clean_songs
columns_to_fill = [
    'artist_names', 'track_name', 'source', 'duration_ms', 'explicit', 
    'popularity', 'artists', 'id_artists', 'release_date', 'danceability', 'energy', 
    'key', 'loudness', 'mode', 'speechiness', 'acousticness', 
    'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature'
]

songs = fill_with_proxy_dict_compat(songs, columns_to_fill)


Number of rows filled: 11_069


In [14]:
print("There are still {:_} rows with NaN values".format(songs[songs.isna().any(axis=1)].shape[0]))

There are still 7_404 rows with NaN values


## Enrich still missing values

In [15]:
df_enrichment2_path = os.path.join(DATA_DIR, "bronze", "spotify_top_songs_audio_features.csv")
df_enrichment2 = pd.read_csv(df_enrichment2_path)
if "id" in df_enrichment2.columns:
    # Replace id with track_id
    df_enrichment2.rename(columns={"id": "track_id"}, inplace=True)
df_enrichment2.shape, df_enrichment2.columns 

((6513, 19),
 Index(['track_id', 'artist_names', 'track_name', 'source', 'key', 'mode',
        'time_signature', 'danceability', 'energy', 'speechiness',
        'acousticness', 'instrumentalness', 'liveness', 'valence', 'loudness',
        'tempo', 'duration_ms', 'weeks_on_chart', 'streams'],
       dtype='object'))

In [16]:
# List all dataframes in a kaggle_enrichment3_dir, then add them together to it is one big dataframe
kaggle_enrichment3_dir = os.path.join(DATA_DIR, "bronze", "kaggle_enrichment3")
import glob

# List all CSV files in the kaggle_enrichment3_dir
csv_files = glob.glob(os.path.join(kaggle_enrichment3_dir, "*.csv"))

# Read each CSV file into a DataFrame and collect them in a list
df_list = [pd.read_csv(f) for f in csv_files]

# Concatenate all DataFrames into a single big DataFrame
df_enrichment3 = pd.concat(df_list, ignore_index=True)

# Show shape and columns to confirm final structure
df_enrichment3.shape, df_enrichment3.columns


((247035, 17),
 Index(['artist_name', 'track_id', 'track_name', 'acousticness', 'danceability',
        'duration_ms', 'energy', 'instrumentalness', 'key', 'liveness',
        'loudness', 'mode', 'speechiness', 'tempo', 'time_signature', 'valence',
        'popularity'],
       dtype='object'))

In [17]:
set(df_enrichment2.columns).difference(set(columns_to_fill)), set(df_enrichment3.columns).difference(set(columns_to_fill))

({'streams', 'track_id', 'weeks_on_chart'}, {'artist_name', 'track_id'})

In [18]:
# Enrich missing values from external enrichment DataFrames using helper from data_cleaning.clean_songs
songs_gold = fill_missing_from_dfs(songs, columns_to_fill, "track_id", df_enrichment2, df_enrichment3)

Total missing values *before* processing DF: 125_826
Available columns: ['artist_names', 'track_name', 'source', 'duration_ms', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']
Size of lookup dictionary : 6513


Enriching songs: 100%|██████████| 41995/41995 [00:11<00:00, 3508.20it/s]


Available columns: ['track_name', 'duration_ms', 'popularity', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']
Size of lookup dictionary : 130989


Enriching songs: 100%|██████████| 41995/41995 [00:03<00:00, 12599.29it/s]

Total missing values *after* processing DF: 33_889





In [19]:
# save songs_gold
if "popularity" in songs_gold.columns:
    songs_gold.drop(columns=["popularity"], inplace=True)


In [20]:
# Show rows with NaN values in songs_gold
songs_gold[songs_gold.isna().any(axis=1)]

Unnamed: 0,track_id,artist_names,track_name,source,streams,week_date,duration_ms,explicit,artists,id_artists,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
5,25sgk305KZfyuqVBQIahim,Ava Max,Sweet but Psycho,Atlantic Records,22400542,2019-01-17,187436.0,,,,...,C#/Db,-4.724,Major,0.0476,0.0691,0.000000,0.1660,0.628,133.002,4 beats
11,7wFybC8jBH3zE139OpCtpG,"Gesaffelstein, The Weeknd",Lost in the Fire (feat. The Weeknd),Columbia,18491016,2019-01-17,202093.0,,,,...,D,-12.159,Major,0.0359,0.0863,0.001330,0.1170,0.176,101.004,4 beats
56,0jAfdqv18goRTUxm3ilRjb,"A Boogie Wit da Hoodie, Tyga, Offset",Startender (feat. Offset and Tyga),Highbridge the Label / Atlantic Records,8247911,2019-01-17,192779.0,,,,...,F#/Gb,-4.653,Minor,0.1330,0.0235,0.000000,0.1510,0.506,191.971,4 beats
57,13hvHEstJ4sNbzdroPrPI3,"Dua Lipa, BLACKPINK",Kiss and Make Up,Warner Records,8240958,2019-01-17,190560.0,0.0,"['Dua Lipa', 'BLACKPINK']","['6M2wZ9GZgrQXHCFfjv46we', '41MozSoPIsD1dJM0CL...",...,8.0,-4.383,1.0,0.1460,0.0557,0.000000,0.1890,0.630,99.986,4.0
62,2FUNBaa5DwItJtYEBgAblU,21 Savage,monster,"Slaughter Gang, LLC/Epic Records",8027341,2019-01-17,233040.0,,,,...,A,-6.916,Minor,0.1240,0.1580,0.000228,0.1180,0.224,134.022,4 beats
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41943,7K7MUBCnzgBAvMVW2RTWNs,"Loud Luxury, Brando",Body,Armada Music,5231950,2019-06-13,163216.0,,,,...,C#/Db,-4.399,Major,0.0380,0.0476,0.000094,0.0543,0.582,121.958,4 beats
41951,6fxVffaTuwjgEk5h9QyRjy,Ed Sheeran,Photograph,Atlantic Records UK,5021070,2019-06-13,258987.0,0.0,['Ed Sheeran'],['6eUKZXaKkcviH0Ku9w2n3V'],...,4.0,-10.480,1.0,0.0476,0.6070,0.000464,0.0986,0.201,107.989,4.0
41978,5s8LepdwU0THzpd0M7nLsa,Ozuna,Te Soñé de Nuevo,Aura Music Corp.,4667591,2019-06-13,199813.0,,,,...,G,-2.867,Minor,0.1070,0.0533,0.000000,0.0936,0.773,168.040,4 beats
41986,4Sokm1cWK36H2WctWWRGf1,Ufo361,Irina Shayk,Stay High,4567010,2019-06-13,147452.0,,,,...,D,-9.504,Minor,0.0696,0.1660,0.020900,0.1700,0.201,74.869,4 beats


In [21]:
# Display the number of NaN values for each column in songs_gold
songs_gold_witouht_nan = songs_gold.dropna()

print(songs_gold.isna().sum())
print(songs_gold.shape[0] - songs_gold_witouht_nan.shape[0])
print("{}/{}".format(songs_gold_witouht_nan.shape[0], songs_gold.shape[0]))


track_id               0
artist_names           0
track_name             0
source                 0
streams                0
week_date              0
duration_ms            0
explicit            6966
artists             6966
id_artists          6966
release_date        7404
danceability           0
energy                 0
key                    0
loudness               0
mode                   0
speechiness            0
acousticness           0
instrumentalness       0
liveness               0
valence                0
tempo                  0
time_signature         0
dtype: int64
7404
34591/41995


In [22]:
# Get rows with NaN values in any column for songs_gold
songs_gold_with_nan = songs_gold[songs_gold.isna().any(axis=1)][["track_id", "week_date", "explicit", "artists", "id_artists", "release_date"]]
songs_gold_with_nan


Unnamed: 0,track_id,week_date,explicit,artists,id_artists,release_date
5,25sgk305KZfyuqVBQIahim,2019-01-17,,,,
11,7wFybC8jBH3zE139OpCtpG,2019-01-17,,,,
56,0jAfdqv18goRTUxm3ilRjb,2019-01-17,,,,
57,13hvHEstJ4sNbzdroPrPI3,2019-01-17,0.0,"['Dua Lipa', 'BLACKPINK']","['6M2wZ9GZgrQXHCFfjv46we', '41MozSoPIsD1dJM0CL...",
62,2FUNBaa5DwItJtYEBgAblU,2019-01-17,,,,
...,...,...,...,...,...,...
41943,7K7MUBCnzgBAvMVW2RTWNs,2019-06-13,,,,
41951,6fxVffaTuwjgEk5h9QyRjy,2019-06-13,0.0,['Ed Sheeran'],['6eUKZXaKkcviH0Ku9w2n3V'],
41978,5s8LepdwU0THzpd0M7nLsa,2019-06-13,,,,
41986,4Sokm1cWK36H2WctWWRGf1,2019-06-13,,,,


## Save new gold

In [23]:
# We decide to drop the rows without release date as they are the same rows as teh ones without explicit, artists, id_artists and release_date

In [24]:
print(songs_gold_witouht_nan.isna().sum())

track_id            0
artist_names        0
track_name          0
source              0
streams             0
week_date           0
duration_ms         0
explicit            0
artists             0
id_artists          0
release_date        0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
dtype: int64


In [25]:
songs_gold_witouht_nan.to_csv(os.path.join(DATA_DIR, "gold", "songs_with_features.csv"), index=False)

In [26]:
def get_unique_ids_from_column(df, column):
    """Aggregate all ids in a specified df[column] into a set."""
    all_ids = set()
    for ids in df[column]:
        if isinstance(ids, str):
            try:
                parsed_ids = eval(ids)
    
                if isinstance(parsed_ids, (list, set, tuple)):
                    all_ids.update(parsed_ids)
                else:
                    all_ids.add(parsed_ids)
            except Exception:
                all_ids.add(ids)
    return all_ids

# Example usage:
all_id_artists = get_unique_ids_from_column(songs, 'id_artists')
print(len(all_id_artists))

# Example usage:
all_names_artists = get_unique_ids_from_column(songs, 'artists')
print(len(all_names_artists))

986
986


In [27]:
import itertools
import math

def get_all_combinations(all_names_artists, digits = '0123456789'):
    n = 1
    while math.perm(len(digits), n) < len(all_names_artists):
        n += 1
        if n > 10:
            raise ValueError("Too many combinations")

    
    combs = list(itertools.permutations(digits, n))
    
    return combs

def get_artist_to_id(all_names_artists):
    combs = get_all_combinations(all_names_artists)
    return dict(zip(all_names_artists, combs[:len(all_names_artists)]))

def update_id_artists_with_mapping(songs_gold):
    """
    Updates the 'id_artists' column in songs_gold by mapping artist names (in 'artists' column)
    to their ids using the artist_to_id_dict. The result will be a list of ids for each row.
    """
    
    all_names_artists = get_unique_ids_from_column(songs_gold, 'artists')
    artist_to_id_dict = get_artist_to_id(all_names_artists)
    
    def map_artists_to_ids(artists_entry):
        # handle if artists_entry is a string representation of a list or just a string
        if isinstance(artists_entry, str):
            try:
                parsed = eval(artists_entry)
                if isinstance(parsed, (list, tuple)):
                    return [artist_to_id_dict.get(a, None) for a in parsed]
                else:
                    return [artist_to_id_dict.get(parsed, None)]
            except Exception:
                # fallback: single name as string
                return [artist_to_id_dict.get(artists_entry, None)]
        elif isinstance(artists_entry, (list, tuple)):
            return [artist_to_id_dict.get(a, None) for a in artists_entry]
        else:
            return [artist_to_id_dict.get(artists_entry, None)]
    
    songs_gold = songs_gold.copy()
    songs_gold['id_artists'] = songs_gold['artists'].apply(map_artists_to_ids)
    return songs_gold









In [28]:
# Generate "artists" column by splitting "artist_names" at ',' and stripping whitespace
songs_gold_2 = songs_gold.copy()
songs_gold_2['artists'] = songs_gold_2['artist_names'].apply(
    lambda x: [artist.strip() for artist in x.split(',')] if isinstance(x, str) else []
)

In [29]:
songs_gold_2 = update_id_artists_with_mapping(songs_gold_2)

print(songs_gold_2.isna().sum())

track_id               0
artist_names           0
track_name             0
source                 0
streams                0
week_date              0
duration_ms            0
explicit            6966
artists                0
id_artists             0
release_date        7404
danceability           0
energy                 0
key                    0
loudness               0
mode                   0
speechiness            0
acousticness           0
instrumentalness       0
liveness               0
valence                0
tempo                  0
time_signature         0
dtype: int64


In [None]:
import os
import json
import time
import numpy as np

from google.api_core.exceptions import ResourceExhausted
import google.generativeai as genai  # make sure you have the Gemini API python SDK setup

def gemini_check_if_explicit(artist_names, track_name, retry=3):
    """Queries Gemini API if the song is explicit (18+ or with insults and so on) and expects json {'thinking': ..., 'value': (0|1)}."""
    prompt = (
        f"Given the song by these artists: {artist_names} and with title: {track_name}, "
        "does it contain explicit (age=18+, due to insult, bad words, violence or sexual content) material/lyrics? "
        "Answer in this json format: {{'thinking':'<brief reason>', 'value':0 or 1}}. "
        "Only 'value':1 means explicit, 0 means clean."
    )
    for attempt in range(retry):
        try:
            response = genai.generate_text(prompt=prompt, model="gemini-2.5-flash-lite")
            try:
                answer = response.text.strip()
                if answer.startswith('```'):
                    answer = answer.strip('`')
                # Try to load as JSON
                result = json.loads(answer.replace("'", '"'))
                value = result.get('value', None)
                if value in (0, 1):
                    return value, result
            except Exception:
                pass
        except ResourceExhausted:
            time.sleep(3)
        except Exception:
            pass
        time.sleep(1 + attempt)
    return None, None

def enrich_explicit_via_gemini(
    df,
    explicit_col="explicit",
    artist_col="artist_names",
    track_name_col="track_name",
    track_id_col="track_id",
    previous_dict_path=None,
    save_dict_path=None,
    save_every=50
):
    import pandas as pd
    from tqdm import tqdm

    # Load cache if present
    if previous_dict_path and os.path.exists(previous_dict_path):
        with open(previous_dict_path, "r") as f:
            explicit_map = json.load(f)
    else:
        explicit_map = {}

    answered_ids = set(explicit_map.keys())
    value_by_id = dict(explicit_map)  # for in-loop dynamic dp-style cache

    total = df.shape[0]
    updated_count = 0

    for idx, row in tqdm(df.iterrows(), total=total, desc="Enriching explicit via Gemini"):
        track_id = str(row[track_id_col])

        # Skip if explicit is NOT nan (i.e., is filled already)
        val = row[explicit_col]
        if pd.notna(val) and not (isinstance(val, float) and np.isnan(val)):
            # Already has answer, skip
            continue

        # Check if already in dict (either from loaded or from live answers this run)
        if track_id in value_by_id:
            continue

        # Otherwise, need to call Gemini
        artist_names_val = row[artist_col]
        track_name_val = row[track_name_col]
        print("Calling Gemini for {}".format(track_id))
        value, result_obj = gemini_check_if_explicit(artist_names_val, track_name_val)
        value_by_id[track_id] = value  # dynamic programming: remember answer right away

        updated_count += 1
        # Only update explicit_map when it's actually not None (to not overwrite previous ones with None)
        explicit_map[track_id] = value

        # Save progress every so many updates
        if save_dict_path and updated_count % save_every == 0:
            with open(save_dict_path, "w") as f:
                json.dump(explicit_map, f, indent=2)

    # Save again when all is done
    if save_dict_path:
        with open(save_dict_path, "w") as f:
            json.dump(explicit_map, f, indent=2)
    return explicit_map

# Example use:
explicit_dict = enrich_explicit_via_gemini(
    songs_gold_2,
    previous_dict_path="data/silver/explicit_cache.json",
    save_dict_path="data/silver/explicit_cache.json"
)

Enriching explicit via Gemini:   0%|          | 1/41995 [00:00<2:25:53,  4.80it/s]

Calling Gemini for 25sgk305KZfyuqVBQIahim


Enriching explicit via Gemini:   0%|          | 6/41995 [00:06<12:37:03,  1.08s/it]

Calling Gemini for 7wFybC8jBH3zE139OpCtpG


Enriching explicit via Gemini:   0%|          | 12/41995 [00:12<12:04:38,  1.04s/it]

Calling Gemini for 0jAfdqv18goRTUxm3ilRjb


Enriching explicit via Gemini:   0%|          | 57/41995 [00:18<2:58:27,  3.92it/s] 

Calling Gemini for 2FUNBaa5DwItJtYEBgAblU


Enriching explicit via Gemini:   0%|          | 63/41995 [00:24<4:12:26,  2.77it/s]

Calling Gemini for 7K7MUBCnzgBAvMVW2RTWNs


Enriching explicit via Gemini:   0%|          | 65/41995 [00:30<6:09:15,  1.89it/s]

Calling Gemini for 4NdXQlDTzxbOMkzJGWFtz3


Enriching explicit via Gemini:   0%|          | 66/41995 [00:36<8:55:00,  1.31it/s]

Calling Gemini for 6Gy7rXB6Ku5vIWC7WGWsl3


Enriching explicit via Gemini:   0%|          | 76/41995 [00:38<5:52:23,  1.98it/s]


KeyboardInterrupt: 

In [None]:
songs_gold_2["explicit"].unique()

array([ 0.,  1., nan])