# Data Cleaning Pipeline

This notebook executes the data cleaning and merging process using the `data_cleaning` module.

In [1]:
from data_cleaning.process_charts import process_all_charts
from data_cleaning.merge import merge_data

# All paths are relative, so they resolve against the kernel's current working directory.
weekly_charts_path = "data/raw/data"  # folder holding the weekly chart CSV files
tracks_path = "data/raw/tracks.csv"
# NOTE(review): the recorded output of this cell shows process_all_charts saving
# combined_songs.csv under data/processed/, while this path points at data/raw/.
# Confirm the merge reads the freshly generated file and not a stale copy.
songs_path = "data/raw/combined_songs.csv"
output_path = "data/processed/songs_with_features.csv"

# Step 1: process the weekly chart files found under weekly_charts_path.
print("Starting data processing...")
process_all_charts(weekly_charts_path)

print("Data processing complete.")

# Step 2: merge the tracks file with the songs file; per the recorded output,
# merge_data writes the merged result to output_path.
print("Starting data merging...")
merge_data(tracks_path, songs_path, output_path)
print("Data merging complete.")

Starting data processing...
✓ File saved as: /Users/arthurmrv/Library/Mobile Documents/com~apple~CloudDocs/Documents/School/AIDAMS/S5/data/project/spotify_charts_project/data/processed/combined_songs.csv
Data processing complete.
Starting data merging...
✓ Merged data saved to: data/processed/songs_with_features.csv
Data merging complete.


## Verification
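Check that no week was skipped during web scraping by comparing the dates in the weekly chart filenames against the expected weekly (Thursday) sequence.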

In [2]:
import os
import pandas as pd

# Verify that the scraped weekly chart files form an unbroken weekly series.
# Reads `weekly_charts_path`, which must already be defined in the kernel.

# 1. List all CSV filenames (sorted so any "Skipping" messages are reproducible;
#    os.listdir returns entries in arbitrary order).
files = sorted(f for f in os.listdir(weekly_charts_path) if f.endswith(".csv"))

# 2. Extract dates from filenames
dates = []
for f in files:
    date_str = f.replace("regional-global-weekly-", "").replace(".csv", "")
    try:
        parsed = pd.to_datetime(date_str)
    except (ValueError, OverflowError):
        # pandas signals an unparseable or out-of-range date string with a
        # ValueError subclass; treat it like any other invalid filename.
        parsed = pd.NaT

    # pd.to_datetime does not raise for empty / null-like strings ("", "nan",
    # "NaT"); it silently returns NaT, which would corrupt the sort order and
    # fail on the .date() calls below. Skip those files explicitly.
    if pd.isna(parsed):
        print("Skipping invalid filename:", f)
        continue

    dates.append(parsed)

# Sort dates
dates = sorted(dates)

# 3. Create expected weekly date range
if dates:
    start = dates[0]
    end = dates[-1]
    # "W-THU" anchors the range on Thursdays. Any file dated on another weekday
    # is therefore reported under "Unexpected extra dates".
    expected = pd.date_range(start=start, end=end, freq="W-THU")

    # 4. Check differences
    missing = expected.difference(dates)  # expected weeks with no file
    extra = set(dates) - set(expected)    # file dates outside the weekly grid

    print("First date:", start.date())
    print("Last date:", end.date())
    print("Total files:", len(dates))
    print("Expected weeks:", len(expected))

    print("\nMissing weeks:")
    for missing_week in missing:
        print(missing_week.date())

    print("\nUnexpected extra dates:")
    for extra_date in sorted(extra):
        print(extra_date.date())
else:
    print("No data files found.")

First date: 2016-12-29
Last date: 2020-12-31
Total files: 210
Expected weeks: 210

Missing weeks:

Unexpected extra dates:
