# Data Cleaning Pipeline

This notebook executes the data cleaning and merging process using the `data_cleaning` module.

In [None]:
from data_cleaning.process_charts import process_all_charts
from data_cleaning.merge import merge_data

# Run the pipeline stages in order, announcing the start and the completion
# of each one. Every entry is (start message, stage callable, done message).
stages = (
    ("Starting data processing...", process_all_charts, "Data processing complete."),
    ("Starting data merging...", merge_data, "Data merging complete."),
)

for banner, run_stage, done_message in stages:
    print(banner)
    run_stage()
    print(done_message)

## Verification
Check that no week was skipped during web scraping.

In [None]:
import os
import pandas as pd

folder = "data/raw"


def parse_chart_dates(filenames):
    """Return the sorted dates encoded in weekly chart CSV filenames.

    The text "regional-global-weekly-" and ".csv" is removed from each name
    and the remainder is parsed with ``pd.to_datetime``. A name is reported
    and skipped when the remainder cannot be parsed, or when it parses to
    ``NaT`` (pandas returns ``NaT`` for an empty string instead of raising),
    so the returned list only ever contains real timestamps.

    Args:
        filenames: Iterable of file names (not full paths).

    Returns:
        A sorted list of ``pd.Timestamp`` objects, one per valid file name.
    """
    parsed = []
    for name in filenames:
        date_str = name.replace("regional-global-weekly-", "").replace(".csv", "")
        try:
            stamp = pd.to_datetime(date_str)
        except (ValueError, OverflowError):
            # Parser errors and out-of-bounds dates are ValueError subclasses.
            stamp = pd.NaT
        if pd.isna(stamp):
            print("Skipping invalid filename:", name)
            continue
        parsed.append(stamp)
    return sorted(parsed)


def find_week_gaps(dates, freq="W-THU"):
    """Compare sorted chart dates against a regular weekly schedule.

    Args:
        dates: Non-empty, sorted list of ``pd.Timestamp`` objects.
        freq: Pandas frequency string of the expected schedule.

    Returns:
        A tuple ``(expected, missing, extra)``: the expected
        ``DatetimeIndex`` between the first and last date, the expected
        dates that are absent from ``dates``, and the set of dates in
        ``dates`` that do not fall on the expected schedule.
    """
    expected = pd.date_range(start=dates[0], end=dates[-1], freq=freq)
    missing = expected.difference(dates)
    extra = set(dates) - set(expected)
    return expected, missing, extra


# 1. List all CSV filenames (a missing folder is treated as having no files,
#    so the cell reports "No data files found." instead of crashing)
if os.path.isdir(folder):
    files = [f for f in os.listdir(folder) if f.endswith(".csv")]
else:
    files = []

# 2. Extract the dates from the filenames, sorted
dates = parse_chart_dates(files)

# 3. Create expected weekly date range
if dates:
    start = dates[0]
    end = dates[-1]

    # 4. Check differences
    expected, missing, extra = find_week_gaps(dates)

    print("First date:", start.date())
    print("Last date:", end.date())
    print("Total files:", len(dates))
    print("Expected weeks:", len(expected))

    print("\nMissing weeks:")
    for m in missing:
        print(m.date())

    print("\nUnexpected extra dates:")
    for d in sorted(extra):
        print(d.date())
else:
    print("No data files found.")