# DEFRA Dataset Assesment


1) I'll be start adding my main paths and modules I will be using in this notebook below.

In [79]:
# possible python modules i will be using below
from curses import meta
import os
import pandas as pd
from pathlib import Path
import csv

# for parse pdf uk pollutant limitations to csv
import re
# pdfplumber for pdf parsing
import pdfplumber

# function 5. chi-square test
from scipy import stats

#define base path  without hardcoding
base_dir = Path.home() / "Desktop" / "data science projects" / "air-pollution-levels" / "data" / "defra" / "optimised"
#metadata file for pollutant name, location and site names
metadata_path = Path.home() / "Desktop" / "data science projects" / "air-pollution-levels" / "data" / "defra" /"test"/"std_london_sites_pollutant.csv"

# output path for saving statistics 1. function
#the first analyse dataset created without inclitiong nan optimased files, and cross referencing that's why changed the name to dataset_statistics-noNAN-incl.csv
os.makedirs(base_dir / "report", exist_ok=True)
stats_output_path = base_dir/"report"/ "defra_stats.csv"

# output paths for saving all the pollutant distribution and nan value analysis.
pollutant_distrubution_path = base_dir / "report" / "pollutant_distribution.csv"
nan_val_pollutant_split_path = base_dir / "report" / "nan_values_by_pollutant.csv"
nan_val_stationPollutant_path = Path.home() / "Desktop" / "data science projects" / "air-pollution-levels" / "data" /"defra" / "report" / "nan_values_by_station_pollutant.csv"


# log file from nan replacement process
nan_log_path = Path.home() / "Desktop" / "data science projects" / "air-pollution-levels" / "data" / "defra" / "logs" / "NaN_values_record.csv"

# function for uk pollutant regulations pdf to parse csv file path
pdf_path = Path.home() / "Desktop" / "data science projects" / "air-pollution-levels" / "data" / "defra" / "capabilities" / "Air_Quality_Objectives_Update.pdf"
csv_output_path = Path.home() / "Desktop" / "data science projects" / "air-pollution-levels" / "data" / "defra" / "capabilities" / "uk_pollutant_limits.csv"


# data quality metrics report output path

quality_output = Path.home() / "Desktop" / "data science projects" / "air-pollution-levels" / "data" / "defra"/ "report" / "quality_metrics_validation.csv"
quality_output.parent.mkdir(parents=True, exist_ok=True)

#chi-square test output path func 5
chi_square_output1 = Path.home() / "Desktop" / "data science projects" / "air-pollution-levels" / "data" / "defra" / "report" / "chi_square_tests1.csv"
chi_square_output = Path.home() / "Desktop" / "data science projects" / "air-pollution-levels" / "data" / "defra" / "report" / "chi_square_tests.csv"

## 1) Initial Dataset Assessment: Raw Numbers

Before conducting quality checks, I need to establish the baseline characteristics of the DEFRA dataset. This section calculates comprehensive statistics about the data collection effort, including file counts, measurement records, station coverage, and pollutant distribution.

### Purpose
- Document the scale and scope of data collection.
- Establish baseline metrics for comparison with LAQN.
- Provide context for subsequent quality analysis.

### Methodology
The function `get_defra_dataset_statistics()` performs the following:
1. Loads standardised metadata to identify unique stations and pollutants.
2. Counts files across all three yearly directories (2023, 2024, 2025).
3. Calculates total measurement records by reading all CSV files.
4. Determines spatial coverage from unique coordinate pairs.
5. Documents temporal coverage (35 months: January 2023 to November 2025).

### Notes
- File counting is fast (scans directory structure only).
- Record counting can be slow (reads every CSV file).
- Results are saved to csv.

In [22]:
def get_defra_dataset_statistics(base_dir, metadata_path, nan_log_path):
    """
    Calculate statistics at DEFRA dataset.
    This function walks through the monthly data directories 2023, 2024, 2025 and calculates key metrics needed for reporting.
    
    Parameters:
        base_dir : Path
            Base directory containing defra data folders.
        metadata_path : Path
            Path to the standardised metadata csv file.
        nan_log_path : Path
            Path to the NaN values log file after notice data flags, changed them to NaN.
            
    Returns:
        dict : Dictionary containing all calculated statistics.
    """
    
    stats = {}
    
    # read metadata to get station and pollutant info
    print("\nloading metadata from std_london_sites_pollutant.csv...")
    metadata = pd.read_csv(metadata_path, encoding="utf-8")
    
    # calculate metadata statistics
    stats['unique_stations'] = metadata['station_name'].nunique()
    stats['total_combinations'] = len(metadata)
    stats['unique_pollutants'] = metadata['pollutant_std'].nunique()
    
    # get pollutant breakdown
    pollutant_counts = metadata['pollutant_std'].value_counts()
    stats['pollutant_distribution'] = pollutant_counts.to_dict()
    
    # create set of expected station/pollutant pairs from metadata
    expected_pairs = set(
        zip(metadata['station_name'], metadata['pollutant_std'])
    )
    stats['expected_pairs'] = len(expected_pairs)
    print(f"  expected station/pollutant pairs from metadata: {len(expected_pairs)}")
    
    # count unique coordinates for spatial coverage, i will be use this for laqn dataset as well
    # group by lat/lon and count unique locations, instead of station names and will do the validation afterwards
    unique_coords = metadata[['latitude', 'longitude']].drop_duplicates()
    stats['unique_locations'] = len(unique_coords)
    
    # count files in monthly data directories
    total_files = 0
    files_by_year = {}
    
    # loop through each years measurement directory
    print("\nscanning optimised directory for collected data...")
    for year in ['2023', '2024', '2025']:
        year_dir = Path(base_dir) / f'{year}measurements'
        
        if year_dir.exists():
            # count all CSV files in this years directory and subdirectories
            year_files = list(year_dir.rglob('*.csv'))
            files_by_year[year] = len(year_files)
            total_files += len(year_files)
            print(f"  {year}: {len(year_files)} files")
        else:
            files_by_year[year] = 0
            print(f"  {year}: directory not found")
    
    stats['total_files'] = total_files
    stats['files_by_year'] = files_by_year
    
    # calculate total measurement records, this requires reading all csv files and counting rows
    total_records = 0
    records_by_year = {}
    total_missing = 0
    missing_by_year = {}
    
    # concatenate all CSVs for missing value breakdown
    all_csvs = []
    
    print("\nreading all CSV files to calculate statistics...")
    for year in ['2023', '2024', '2025']:
        year_dir = Path(base_dir) / f'{year}measurements'
        year_records = 0
        year_missing = 0
        
        if year_dir.exists():
            # read each csv, count rows and missing values
            for csv_file in year_dir.rglob('*.csv'):
                try:
                    df = pd.read_csv(csv_file)
                    year_records += len(df)
                    
                    # count missing NaN or empty string values in value column
                    # calculation: missing values in value column only
                    if 'value' in df.columns:
                        missing_in_file = df['value'].isna().sum() + (df['value'] == "").sum()
                        year_missing += missing_in_file
                    
                    # store dataframe for later aggregation
                    all_csvs.append(df)
                    
                except Exception as e:
                    print(f"  warning: could not read {csv_file.name}: {e}")
            
            records_by_year[year] = year_records
            missing_by_year[year] = year_missing
            total_records += year_records
            total_missing += year_missing
            print(f"  {year}: {year_records:,} records, {year_missing:,} missing ({(year_missing/year_records*100):.2f}%)")
        else:
            records_by_year[year] = 0
            missing_by_year[year] = 0
    
    stats['total_records'] = total_records
    stats['records_by_year'] = records_by_year
    stats['missing_by_year'] = missing_by_year
    stats['total_missing'] = total_missing
    stats['overall_completeness'] = ((total_records - total_missing) / total_records * 100) if total_records > 0 else 0
    
    # cross-reference metadata with collected data
    print("\ncross-referencing collected data with metadata...")
    
    if all_csvs:
        all_data = pd.concat(all_csvs, ignore_index=True)
        
        # check if required columns exist in csv files
        # file structure: timestamp,value,timeseries_id,station_name,pollutant_name,pollutant_std,latitude,longitude
        if 'station_name' in all_data.columns and 'pollutant_std' in all_data.columns:
            # identify actual station/pollutant pairs in collected data
            collected_pairs = set(
                zip(all_data['station_name'], all_data['pollutant_std'])
            )
            stats['collected_pairs'] = len(collected_pairs)
            
            # find missing pairs (in metadata but not in collected data)
            missing_pairs = expected_pairs - collected_pairs
            stats['missing_pairs'] = list(missing_pairs)
            stats['missing_pairs_count'] = len(missing_pairs)
            
            # find extra pairs (in collected data but not in metadata)
            extra_pairs = collected_pairs - expected_pairs
            stats['extra_pairs'] = list(extra_pairs)
            stats['extra_pairs_count'] = len(extra_pairs)
            
            print(f"  expected pairs from metadata: {len(expected_pairs)}")
            print(f"  actually collected pairs: {len(collected_pairs)}")
            print(f"  missing pairs (in metadata but not collected): {len(missing_pairs)}")
            print(f"  extra pairs (collected but not in metadata): {len(extra_pairs)}")
            
            # group by station and pollutant_std, count missing values
            # calculation: (100 * missing value cell number) / (total number of row value col)
            missing_breakdown = {}
            
            for (station, pollutant), group in all_data.groupby(['station_name', 'pollutant_std']):
                total_rows = len(group)
                # count missing in value column
                if 'value' in group.columns:
                    missing_rows = group['value'].isna().sum() + (group['value'] == "").sum()
                else:
                    missing_rows = 0
                
                missing_breakdown[(station, pollutant)] = (int(missing_rows), int(total_rows))
            
            stats['missing_by_station_pollutant'] = missing_breakdown
        else:
            print("  warning: station_name or pollutant_std columns not found")
            stats['missing_by_station_pollutant'] = {}
            stats['collected_pairs'] = 0
            stats['missing_pairs'] = []
            stats['missing_pairs_count'] = 0
            stats['extra_pairs'] = []
            stats['extra_pairs_count'] = 0
    else:
        stats['missing_by_station_pollutant'] = {}
        stats['collected_pairs'] = 0
        stats['missing_pairs'] = list(expected_pairs)
        stats['missing_pairs_count'] = len(expected_pairs)
        stats['extra_pairs'] = []
        stats['extra_pairs_count'] = 0
    
    # distribution of nan by pollutant over time
    if stats['missing_by_station_pollutant']:
        pollutant_missing_summary = {}
        
        for (station, pollutant), (missing, total) in stats['missing_by_station_pollutant'].items():
            if pollutant not in pollutant_missing_summary:
                pollutant_missing_summary[pollutant] = {'total_missing': 0, 'total_records': 0}
            
            pollutant_missing_summary[pollutant]['total_missing'] += missing
            pollutant_missing_summary[pollutant]['total_records'] += total
        
        # calculate percentages
        for pollutant in pollutant_missing_summary:
            total_missing = pollutant_missing_summary[pollutant]['total_missing']
            total_records = pollutant_missing_summary[pollutant]['total_records']
            percentage = (total_missing / total_records * 100) if total_records > 0 else 0
            pollutant_missing_summary[pollutant]['percentage_missing'] = percentage
        
        stats['missing_by_pollutant_type'] = pollutant_missing_summary
    else:
        stats['missing_by_pollutant_type'] = {}
    
    # log file created during data cleaning process
    if Path(nan_log_path).exists():
        nan_log = pd.read_csv(nan_log_path)
        
        # calculate replacement statistics per year
        replacements_by_year = nan_log.groupby('year_folder')['invalid_flags_replaced'].sum().to_dict()
        stats['nan_replacements_by_year'] = replacements_by_year
        stats['total_nan_replacements'] = nan_log['invalid_flags_replaced'].sum()
        
        # get mean percentage of invalid flags
        stats['mean_invalid_percentage'] = nan_log['percentage_invalid'].mean()
        stats['max_invalid_percentage'] = nan_log['percentage_invalid'].max()
        
    else:
        stats['nan_replacements_by_year'] = {}
        stats['total_nan_replacements'] = 0
        stats['mean_invalid_percentage'] = 0
        stats['max_invalid_percentage'] = 0
    
    # calculate temporal coverage based on the files collected, understands which months have data
    stats['temporal_coverage'] = {
        'start_date': '2023-01-01',
        'end_date': '2025-11-19',  
        'total_months': 35
    }
    
    return stats

In [27]:
def print_dataset_statistics(stats):
    """
    Print dataset statistics
    
    Parameters:
        stats : dict
            returned by get_defra_dataset_statistics().
    """
    
    print("\n" + "="*40)
    print("Defra dataset statistics: initial assessment")
    print("="*40)
    
    print("\nScale and scope:")
    print(f"Total files collected: {stats['total_files']:,}")
    print(f"Total measurement records: {stats['total_records']:,}")
    print(f"Total missing values (nan): {stats['total_missing']:,}")
    print(f"Overall completeness: {stats['overall_completeness']:.2f}%")
    print(f"Unique monitoring stations: {stats['unique_stations']}")
    print(f"Total station-pollutant combinations: {stats['total_combinations']}")
    print(f"Unique pollutant types: {stats['unique_pollutants']}")
    print(f"Unique geographic locations: {stats['unique_locations']}")
    
    # data collection coverage
    print("\nData collection coverage:")
    print(f"Expected pairs (from metadata): {stats.get('expected_pairs', 0)}")
    print(f"Actually collected pairs: {stats.get('collected_pairs', 0)}")
    print(f"Missing pairs (not collected): {stats.get('missing_pairs_count', 0)}")
    print(f"Extra pairs (not in metadata): {stats.get('extra_pairs_count', 0)}")
    
    if stats.get('missing_pairs_count', 0) > 0:
        print(f"\nwarning: {stats['missing_pairs_count']} station/pollutant pairs from metadata were not found in collected data.")
        print("first 10 missing pairs:")
        for i, (station, pollutant) in enumerate(stats['missing_pairs'][:10], 1):
            print(f"  {i}. {station} - {pollutant}")
    
    if stats.get('extra_pairs_count', 0) > 0:
        print(f"\nNote: {stats['extra_pairs_count']} station/pollutant pairs in collected data are not in metadata.")
    
    print("\nfiles by year:")
    for year, count in stats['files_by_year'].items():
        print(f"  {year}: {count:,} files")
    
    print("\nrecords by year:")
    for year, count in stats['records_by_year'].items():
        missing = stats['missing_by_year'].get(year, 0)
        missing_pct = (missing / count * 100) if count > 0 else 0
        print(f"  {year}: {count:,} records, {missing:,} missing ({missing_pct:.2f}%)")
    
    # adding nan value summary below
    print("\nnan replacement summary:")
    print(f"Total invalid flags replaced: {stats['total_nan_replacements']:,}")
    print(f"Mean invalid percentage per file: {stats['mean_invalid_percentage']:.2f}%")
    print(f"Max invalid percentage: {stats['max_invalid_percentage']:.2f}%")
    
    # count of replacements by year
    if stats['nan_replacements_by_year']:
        print("\nreplacements by year:")
        for year_folder, count in stats['nan_replacements_by_year'].items():
            print(f"  {year_folder}: {count:,} flags replaced")
    
    print("\ntemporal coverage:")
    print(f"start date: {stats['temporal_coverage']['start_date']}")
    print(f"end date: {stats['temporal_coverage']['end_date']}")
    print(f"total months: {stats['temporal_coverage']['total_months']}")
    
    print("\npollutant distribution:")
    print("station/pollutant combinations by type:")
    for pollutant, count in sorted(stats['pollutant_distribution'].items(), 
                                   key=lambda x: x[1], reverse=True):
        percentage = (count / stats['total_combinations']) * 100
        print(f"  {pollutant}: {count} ({percentage:.1f}%)")
    
    # missing value distribution by pollutant type
    print("\nMissing value distribution by pollutant type:")
    if stats.get('missing_by_pollutant_type'):
        # sort by percentage missing (highest first)
        sorted_pollutants = sorted(
            stats['missing_by_pollutant_type'].items(),
            key=lambda x: x[1]['percentage_missing'],
            reverse=True
        )
        
        print(f"{'pollutant':<20} {'total records':>15} {'missing':>12} {'% missing':>12}")
        print("-" * 60)
        for pollutant, data in sorted_pollutants:
            print(f"{pollutant:<20} {data['total_records']:>15,} {data['total_missing']:>12,} {data['percentage_missing']:>11.2f}%")
    else:
        print("  no missing value distribution available.")
    
    # print missing values by station/pollutant breakdown with row_number column
    print("\nMissing values by station/pollutant:")
    if stats.get('missing_by_station_pollutant'):
        # prepare a sorted list by missing percentage descending
        breakdown = []
        for (station, pollutant), (missing, total) in stats['missing_by_station_pollutant'].items():
            percent = (missing / total * 100) if total > 0 else 0
            breakdown.append((station, pollutant, missing, total, percent))
        # sort by percentage descending and take top 20
        breakdown.sort(key=lambda x: x[4], reverse=True)
        breakdown = breakdown[:20]
        print(f"{'station':<30} {'pollutant':<20} {'missing':>10} {'total_row':>12} {'% missing':>12}")
        print("-" * 40)
        
        for station, pollutant, missing, total, percent in breakdown:
            print(f"{station:<30} {pollutant:<20} {missing:>10,} {total:>12,} {percent:>11.2f}%")
    else:
        print(" No missing value breakdown available.")

In [None]:
# run the analysis
stats = get_defra_dataset_statistics(base_dir, metadata_path, nan_log_path)
print_dataset_statistics(stats)

# # Save statistics for later use as csv
# save statistics for later use as csv
# prepare flat data structure for csv
stats_rows = []
stats_rows.append(["metric", "value"])
stats_rows.append(["total_files", stats['total_files']])
stats_rows.append(["total_records", stats['total_records']])
stats_rows.append(["total_missing", stats['total_missing']])
stats_rows.append(["overall_completeness_pct", f"{stats['overall_completeness']:.2f}"])
stats_rows.append(["unique_stations", stats['unique_stations']])
stats_rows.append(["total_combinations", stats['total_combinations']])
stats_rows.append(["unique_pollutants", stats['unique_pollutants']])
stats_rows.append(["unique_locations", stats['unique_locations']])
stats_rows.append(["expected_pairs", stats.get('expected_pairs', 0)])
stats_rows.append(["collected_pairs", stats.get('collected_pairs', 0)])
stats_rows.append(["missing_pairs_count", stats.get('missing_pairs_count', 0)])
stats_rows.append(["extra_pairs_count", stats.get('extra_pairs_count', 0)])
stats_rows.append(["total_nan_replacements", stats['total_nan_replacements']])
stats_rows.append(["mean_invalid_pct", f"{stats['mean_invalid_percentage']:.2f}"])
stats_rows.append(["max_invalid_pct", f"{stats['max_invalid_percentage']:.2f}"])

# add year-specific metrics
for year in ['2023', '2024', '2025']:
    stats_rows.append([f"files_{year}", stats['files_by_year'].get(year, 0)])
    stats_rows.append([f"records_{year}", stats['records_by_year'].get(year, 0)])
    stats_rows.append([f"missing_{year}", stats['missing_by_year'].get(year, 0)])
    year_key = f'{year}measurements'
    stats_rows.append([f"replacements_{year}", stats['nan_replacements_by_year'].get(year_key, 0)])

# # save to csv stats report save func below (commented out for now to overwrite previous report)
# pd.DataFrame(stats_rows[1:], columns=stats_rows[0]).to_csv(stats_output_path, index=False)
# print(f"\nstatistics saved to: {stats_output_path}")

# # save pollutant distribution to csv describe the path on top pollutant_distrubution_path
# total_combinations = stats['total_combinations']
# pollutant_distribution_df = pd.DataFrame(
#     [
#         {
#             'pollutant': k,
#             'count': v,
#             'percentage': round((v / total_combinations) * 100, 2) if total_combinations > 0 else 0
#         }
#         for k, v in stats['pollutant_distribution'].items()
#     ]
# )
# pollutant_distribution_df.to_csv(pollutant_distrubution_path, index=False)
# print(f"Pollutant distribution saved to: {pollutant_distrubution_path}")

# # Save missing value distribution by pollutant type to path described the path on top nan_val_pollutant_split_path
# if stats.get('missing_by_pollutant_type'):
#     missing_by_pollutant_df = pd.DataFrame([
#         {
#             'pollutant': k,
#             'total_records': v['total_records'],
#             'total_missing': v['total_missing'],
#             'percentage_missing': v['percentage_missing']
#         }
#         for k, v in stats['missing_by_pollutant_type'].items()
#     ])
#     missing_by_pollutant_df.to_csv(nan_val_pollutant_split_path, index=False)
#     print(f"Missing value distribution by pollutant type saved to: {nan_val_pollutant_split_path}")

# # save missing values by station/pollutant to csv path on top nan_val_stationPollutant_path
# if stats.get('missing_by_station_pollutant'):
#     missing_by_station_pollutant_df = pd.DataFrame([
#         {
#             'station': k[0],
#             'pollutant': k[1],
#             'missing': v[0],
#             'total_row': v[1],
#             'percentage_missing': (v[0] / v[1] * 100) if v[1] > 0 else 0
#         }
#         for k, v in stats['missing_by_station_pollutant'].items()
#     ])
#     missing_by_station_pollutant_df.to_csv(nan_val_stationPollutant_path, index=False)
#     print(f"Missing values by station/pollutant saved to: {nan_val_stationPollutant_path}")



loading metadata from std_london_sites_pollutant.csv...
  expected station/pollutant pairs from metadata: 141

scanning optimised directory for collected data...
  2023: 1431 files
  2024: 1193 files
  2025: 939 files

reading all CSV files to calculate statistics...
  2023: 1,000,126 records, 90,161 missing (9.01%)
  2024: 868,320 records, 101,256 missing (11.66%)
  2025: 657,545 records, 30,750 missing (4.68%)

cross-referencing collected data with metadata...
  expected pairs from metadata: 141
  actually collected pairs: 141
  missing pairs (in metadata but not collected): 0
  extra pairs (collected but not in metadata): 0

Defra dataset statistics: initial assessment

Scale and scope:
Total files collected: 3,563
Total measurement records: 2,525,991
Total missing values (nan): 222,167
Overall completeness: 91.20%
Unique monitoring stations: 18
Total station-pollutant combinations: 144
Unique pollutant types: 37
Unique geographic locations: 20

Data collection coverage:
Expected p

    loading metadata from std_london_sites_pollutant.csv...
    expected station/pollutant pairs from metadata: 141

    scanning optimised directory for collected data...
    2023: 1431 files
    2024: 1193 files
    2025: 939 files

    reading all CSV files to calculate statistics...
    2023: 1,000,126 records, 90,161 missing (9.01%)
    2024: 868,320 records, 101,256 missing (11.66%)
    2025: 657,545 records, 30,750 missing (4.68%)

    cross-referencing collected data with metadata...
    expected pairs from metadata: 141
    actually collected pairs: 141
    missing pairs (in metadata but not collected): 0
    extra pairs (collected but not in metadata): 0

    ========================================
    Defra dataset statistics: initial assessment
    ========================================

    Scale and scope:
    Total files collected: 3,563
    Total measurement records: 2,525,991
    Total missing values (nan): 222,167
    Overall completeness: 91.20%
    Unique monitoring stations: 18
    Total station-pollutant combinations: 144
    Unique pollutant types: 37
    Unique geographic locations: 20

    Data collection coverage:
    Expected pairs (from metadata): 141
    Actually collected pairs: 141
    Missing pairs (not collected): 0
    Extra pairs (not in metadata): 0

    files by year:
    2023: 1,431 files
    2024: 1,193 files
    2025: 939 files

    records by year:
    2023: 1,000,126 records, 90,161 missing (9.01%)
    2024: 868,320 records, 101,256 missing (11.66%)
    2025: 657,545 records, 30,750 missing (4.68%)

    nan replacement summary:
    Total invalid flags replaced: 222,167
    Mean invalid percentage per file: 9.61%
    Max invalid percentage: 100.00%

    replacements by year:
    2023measurements: 90,161 flags replaced
    2024measurements: 101,256 flags replaced
    2025measurements: 30,750 flags replaced

    temporal coverage:
    start date: 2023-01-01
    end date: 2025-11-19
    total months: 35

    pollutant distribution:
    station/pollutant combinations by type:
    PM10: 15 (10.4%)
    PM2.5: 15 (10.4%)
    NO2: 14 (9.7%)
    NOx: 14 (9.7%)
    NO: 14 (9.7%)
    O3: 9 (6.2%)
    SO2: 3 (2.1%)
    n-Pentane: 2 (1.4%)
    m,p-Xylene: 2 (1.4%)
    n-Butane: 2 (1.4%)
    n-Heptane: 2 (1.4%)
    n-Hexane: 2 (1.4%)
    n-Octane: 2 (1.4%)
    Propene: 2 (1.4%)
    o-Xylene: 2 (1.4%)
    Propane: 2 (1.4%)
    i-Pentane: 2 (1.4%)
    Toluene: 2 (1.4%)
    trans-2-Butene: 2 (1.4%)
    trans-2-Pentene: 2 (1.4%)
    Isoprene: 2 (1.4%)
    Ethyne: 2 (1.4%)
    i-Octane: 2 (1.4%)
    i-Hexane: 2 (1.4%)
    i-Butane: 2 (1.4%)
    Ethylbenzene: 2 (1.4%)
    Ethene: 2 (1.4%)
    Ethane: 2 (1.4%)
    cis-2-Butene: 2 (1.4%)
    Benzene: 2 (1.4%)
    1-Pentene: 2 (1.4%)
    1-Butene: 2 (1.4%)
    1,3-Butadiene: 2 (1.4%)
    1,3,5-TMB: 2 (1.4%)
    1,2,4-TMB: 2 (1.4%)
    1,2,3-TMB: 2 (1.4%)
    CO: 2 (1.4%)

    Missing value distribution by pollutant type:
    pollutant              total records      missing    % missing
    ------------------------------------------------------------
    PM10                         227,142       37,580       16.54%
    O3                           194,333       27,184       13.99%
    PM2.5                        234,748       29,623       12.62%
    SO2                           72,928        7,181        9.85%
    NO                           326,061       25,444        7.80%
    NO2                          326,072       25,429        7.80%
    NOx                          325,387       24,964        7.67%
    n-Octane                      26,649        1,764        6.62%
    CO                            48,578        3,078        6.34%
    m,p-Xylene                    25,503        1,612        6.32%
    1,3,5-TMB                     26,649        1,641        6.16%
    Toluene                       26,649        1,640        6.15%
    i-Octane                      26,649        1,624        6.09%
    n-Heptane                     26,649        1,622        6.09%
    1,2,4-TMB                     26,649        1,610        6.04%
    Ethylbenzene                  26,649        1,592        5.97%
    Benzene                       26,649        1,586        5.95%
    o-Xylene                      26,649        1,568        5.88%
    1,2,3-TMB                     26,649        1,560        5.85%
    1-Pentene                     26,572        1,381        5.20%
    cis-2-Butene                  26,599        1,378        5.18%
    trans-2-Pentene               26,599        1,366        5.14%
    Isoprene                      26,618        1,341        5.04%
    Ethyne                        26,529        1,328        5.01%
    1,3-Butadiene                 26,568        1,320        4.97%
    i-Hexane                      26,599        1,321        4.97%
    trans-2-Butene                26,599        1,321        4.97%
    n-Hexane                      26,580        1,320        4.97%
    Propane                       26,618        1,316        4.94%
    Ethane                        26,599        1,315        4.94%
    Ethene                        26,618        1,312        4.93%
    Propene                       26,618        1,312        4.93%
    i-Butane                      26,599        1,308        4.92%
    1-Butene                      26,599        1,307        4.91%
    n-Butane                      26,599        1,307        4.91%
    i-Pentane                     26,618        1,306        4.91%
    n-Pentane                     26,618        1,306        4.91%

    Missing values by station/pollutant:
    station                        pollutant               missing    total_row    % missing
    ----------------------------------------
    London Eltham                  PM10                     16,337       16,826       97.09%
    London Eltham                  NO2                      13,187       16,840       78.31%
    London Eltham                  NO                       13,182       16,835       78.30%
    London Eltham                  NOx                      13,125       16,793       78.16%
    London Eltham                  O3                       12,537       16,842       74.44%
    London Teddington Bushy Park   PM10                     10,525       24,327       43.26%
    London Teddington Bushy Park   PM2.5                    20,820       48,656       42.79%
    London Haringey Priory Park South O3                        8,171       24,288       33.64%
    London Marylebone Road         PM10                        632        2,355       26.84%
    London Marylebone Road         PM2.5                       479        2,355       20.34%
    London Norbury Manor School    PM10                        936        5,258       17.80%
    London Norbury Manor School    PM2.5                       936        5,258       17.80%
    London Bexley                  PM10                      4,012       24,273       16.53%
    Southwark A2 Old Kent Road     PM10                        388        2,355       16.48%
    Haringey Roadside              NOx                       3,725       24,250       15.36%
    Haringey Roadside              NO2                       3,708       24,285       15.27%
    Haringey Roadside              NO                        3,708       24,287       15.27%
    London Westminster             PM2.5                     3,463       24,299       14.25%
    London Marylebone Road         SO2                       2,987       24,290       12.30%
    London Marylebone Road         CO                        2,729       24,293       11.23%
    Pollutant distribution saved to: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/defra/optimised/report/pollutant_distribution.csv

## 2) Spatial Coverage Analysis

 analysing spatial distribution patterns before accepting the dataset. I need to understand where defra stations are located, identify any geographic biases, and compare coverage to laqn.

### Purpose
- Create maps showing station locations across London.
- Analyse density by borough to identify coverage gaps
- Compare spatial distribution to laqn network
- Ensure no geographic areas are overrepresented or underrepresented

### Methodology
1. Load defra metadata with coordinates
2. Create interactive folium map showing all stations
3. Calculate station density by borough
4. Identify coverage gaps in london
5. Compare to laqn spatial distribution



sources: 
- https://python-visualization.github.io/folium/latest/getting_started.html
- https://pandas.pydata.org/docs/user_guide/groupby.html 
- plotting: https://geopandas.org/en/stable/docs/user_guide/data_structures.html#geoseries
    - general: https://geopandas.org/en/stable/getting_started.html

coordinates:
  -  https://www.ordnancesurvey.co.uk/
  - identifiers: https://www.ordnancesurvey.co.uk/products/search-for-os-products?category=387aa470-8f46-4b02-a4ea-b70d1835f812 
  - WGS84 coordinate system used for latitude/longitude.
  - london coordinates : 51.5072° N, 0.1276° W
  - Latitude and longitude coordinates are: 51.509865, -0.118092.

In [None]:
def analyse_spatial_coverage(metadata_path):
    """
    analyse the stations location on map 
    
    function validates coordinates, identifies  locations, and visulise the spatial distribution
    
    Parameters:
        metadata_path : 
             std metadata csv file.
            
    Returns:
        dictionary containing spatial statistics and coordinate data.
    
        *i got help for this section, sources folium tuttorials, plotting for geopandas and google. Also asked for my friend help as well which
        she works on geospatial data a lot for her phd research.

    """
    
    spatial_stats = {}
    
    # read metadata for coordinate information
    print("\nloading station coordinates from metadata...")
    metadata = pd.read_csv(metadata_path, encoding="utf-8")
    
    # check if coordinate columns exist
    if 'latitude' not in metadata.columns or 'longitude' not in metadata.columns:
        print("  error: latitude or longitude columns not found in metadata")
        return spatial_stats
    
    # validate coordinate completeness
    total_stations = len(metadata)
    missing_lat = metadata['latitude'].isna().sum()
    missing_lon = metadata['longitude'].isna().sum()
    missing_coords = metadata[['latitude', 'longitude']].isna().any(axis=1).sum()
    
    spatial_stats['total_stations'] = total_stations
    spatial_stats['missing_coordinates'] = missing_coords
    spatial_stats['missing_latitude'] = missing_lat
    spatial_stats['missing_longitude'] = missing_lon
    spatial_stats['coordinate_completeness'] = ((total_stations - missing_coords) / total_stations * 100) if total_stations > 0 else 0
    
    print(f"  total stations in metadata: {total_stations}")
    print(f"  missing coordinates: {missing_coords} ({(missing_coords/total_stations*100):.2f}%)")
    print(f"  coordinate completeness: {spatial_stats['coordinate_completeness']:.2f}%")
    
   

In [None]:
def print_spatial_statistics(spatial_stats):
    """
    Print spatial coverage statistics 
    
    Param:
        spatial_stats : 
            Dic returned by analyse_spatial_coverage()
    """
    
    

## 3) uk air quality standards framework

The UK has established legally binding air quality objectives, I'm missing in my dataset, so first i need to parse the pdf file to csv and std format to my dataset.
- chrome-extension://efaidnbmnnnibpcajpcglclefindmkaj/https://uk-air.defra.gov.uk/assets/documents/Air_Quality_Objectives_Update.pdf

In [29]:
def parse_defra_aq_objectives(pdf_path, csv_output_path, metadata_path):
    """
    Parse defra air quality objectives pdf and export to csv

    
    Output columns:
        pollutant: pollutant name from pdf
        pollutant_std: standardised pollutant code from metadata
        limit: numeric limit value extracted from objective column
        unit: unit of measurement (µg/m³, mg/m³, etc)
        objective: full objective text from pdf
        concentration measured as: averaging period (24 hour mean, annual mean, etc)
        applies: jurisdiction (uk only in this case)
    
    """
    
    print("\n" + "="*40)
    print("parsing defra air quality objectives pdf")
    print("="*40)
    print(f"pdf path: {pdf_path}")
    print(f"metadata path: {metadata_path}")
    print(f"output path: {csv_output_path}")
    
    # load metadata for pollutant mapping
    print("\nloading metadata for pollutant standardisation...")
    meta = pd.read_csv(metadata_path)
    print(f"loaded {len(meta)} metadata records")
    
    # build pollutant mapping dictionary
    pollutant_map = {}
    
    # first try direct pollutant column
    if 'pollutant' in meta.columns and 'pollutant_std' in meta.columns:
        meta_clean = meta[['pollutant', 'pollutant_std']].dropna().drop_duplicates()
        for _, row in meta_clean.iterrows():
            key = str(row['pollutant']).strip().lower()
            val = str(row['pollutant_std']).strip()
            pollutant_map[key] = val
    
    # then try pollutant_available column
    if 'pollutant_available' in meta.columns and 'pollutant_std' in meta.columns:
        meta_avail = meta[['pollutant_available', 'pollutant_std']].dropna().copy()
        
        for _, row in meta_avail.iterrows():
            pollutants = str(row['pollutant_available']).split(',')
            std_code = str(row['pollutant_std']).strip()
            
            for poll in pollutants:
                key = poll.strip().lower()
                if key and key != 'nan':
                    pollutant_map[key] = std_code
    
    print(f"built pollutant mapping with {len(pollutant_map)} entries")
    
    # extract tables from pdf using pdfplumber
    print("\nextracting tables from pdf...")
    all_rows = []
    
    with pdfplumber.open(pdf_path) as pdf:
        print(f"pdf has {len(pdf.pages)} pages")
        
        for page_num, page in enumerate(pdf.pages, 1):
            tables = page.extract_tables()
            
            if tables:
                print(f"page {page_num}: found {len(tables)} table(s)")
                
                for table in tables:
                    for row in table:
                        all_rows.append(row)
    
    if not all_rows:
        print("\nerror: no tables found in pdf")
        return None
    
    print(f"total rows extracted: {len(all_rows)}")
    
    # convert to dataframe
    df_raw = pd.DataFrame(all_rows)
    
    print("\nprocessing extracted data...")
    
    # remove completely empty rows
    df_raw = df_raw.replace(r'^\s*$', pd.NA, regex=True)
    df_raw = df_raw.dropna(how='all').reset_index(drop=True)
    
    # find header row
    header_idx = None
    for i in range(min(len(df_raw), 20)):
        row_text = ' '.join([str(x).lower() for x in df_raw.iloc[i].tolist() if pd.notna(x)])
        
        if 'pollutant' in row_text and 'applies' in row_text and 'objective' in row_text:
            header_idx = i
            print(f"found header at row {i}")
            break
    
    if header_idx is None:
        print("error: could not find header row")
        return None
    
    # use detected header
    header_row = [str(x).strip() if pd.notna(x) else '' for x in df_raw.iloc[header_idx].tolist()]
    df_raw.columns = header_row
    df_raw = df_raw.iloc[header_idx + 1:].reset_index(drop=True)
    
    print(f"original columns: {df_raw.columns.tolist()}")
    
    # find and map the concentration column (may be split or abbreviated)
    col_map = {}
    concentration_col = None
    
    for i, col in enumerate(df_raw.columns):
        col_lower = str(col).lower().strip()
        
        if col_lower == 'pollutant':
            col_map[col] = 'pollutant'
        elif col_lower == 'applies':
            col_map[col] = 'applies'
        elif col_lower == 'objective':
            col_map[col] = 'objective'
        elif 'concentration' in col_lower or col_lower == 'measured as':
            # this is the concentration measured as column
            concentration_col = col
            col_map[col] = 'concentration_measured_as'
    
    # if concentration column not found by name, try by position
    # typically it's the 4th column (index 3)
    if concentration_col is None:
        if len(df_raw.columns) > 3:
            concentration_col = df_raw.columns[3]
            col_map[concentration_col] = 'concentration_measured_as'
            print(f"using column position 3 for concentration: {concentration_col}")
    
    df_raw = df_raw.rename(columns=col_map)
    
    print(f"mapped columns: {list(col_map.values())}")
    
    # check required columns exist
    required_cols = ['pollutant', 'applies', 'objective', 'concentration_measured_as']
    missing_cols = [col for col in required_cols if col not in df_raw.columns]
    
    if missing_cols:
        print(f"\nerror: missing required columns: {missing_cols}")
        print(f"mapped columns: {df_raw.columns.tolist()}")
        
        # if only concentration is missing, check if we can merge columns
        if missing_cols == ['concentration_measured_as']:
            print("\nattempting to find concentration column by content...")
            
            # look for columns containing time period keywords
            for col in df_raw.columns:
                if col not in ['pollutant', 'applies', 'objective']:
                    # check if column contains time period text
                    sample_text = ' '.join(df_raw[col].dropna().astype(str).head(10).tolist()).lower()
                    if any(word in sample_text for word in ['hour', 'mean', 'annual', 'day', 'running']):
                        print(f"found concentration column by content: {col}")
                        df_raw = df_raw.rename(columns={col: 'concentration_measured_as'})
                        break
        
        # check again after attempted fix
        missing_cols = [col for col in required_cols if col not in df_raw.columns]
        if missing_cols:
            print(f"still missing: {missing_cols}")
            return None
    
    # select only needed columns
    df = df_raw[required_cols].copy()
    
    # clean text in all columns
    for col in df.columns:
        df[col] = df[col].astype(str).str.replace(r'\s+', ' ', regex=True).str.strip()
    
    # remove rows with missing critical data
    df = df.replace(['nan', 'None', '<NA>', ''], pd.NA)
    
    print(f"rows after cleanup: {len(df)}")
    
    # forward fill pollutant names
    df['pollutant'] = df['pollutant'].fillna(method='ffill')
    
    print("\nfiltering for uk only limits...")
    # filter for uk only
    df_uk = df[df['applies'].str.strip().str.upper() == 'UK'].copy()
    print(f"uk rows found: {len(df_uk)}")
    
    if len(df_uk) == 0:
        print("\nerror: no uk rows found after filtering")
        print("sample applies values found:")
        print(df['applies'].value_counts().head(10))
        return None
    
    print("\nextracting limit values and units from objectives...")
    
    # extract numeric limit from objective
    df_uk['limit'] = df_uk['objective'].str.extract(r'([\d,]+(?:\.\d+)?)', expand=False)
    df_uk['limit'] = df_uk['limit'].str.replace(',', '', regex=False)
    df_uk['limit'] = pd.to_numeric(df_uk['limit'], errors='coerce')
    
    # extract unit from objective
    df_uk['unit'] = df_uk['objective'].str.extract(r'[\d,]+(?:\.\d+)?\s*([^\s]+)', expand=False)
    
    # clean up unit extraction
    df_uk['unit'] = df_uk['unit'].str.extract(r'^([µμmng]+/m[²³3])', expand=False)
    
    # fallback for missing units
    mask_missing_unit = df_uk['unit'].isna()
    df_uk.loc[mask_missing_unit, 'unit'] = df_uk.loc[mask_missing_unit, 'objective'].str.extract(
        r'(µg/m³|μg/m³|mg/m³|ng/m³|ug/m3)', 
        expand=False
    )
    
    print(f"extracted limits for {df_uk['limit'].notna().sum()} rows")
    print(f"extracted units for {df_uk['unit'].notna().sum()} rows")
    
    # map pollutant names to standardised codes
    print("\nmapping pollutants to standardised codes...")
    
    df_uk['pollutant_std'] = df_uk['pollutant'].str.strip().str.lower().map(pollutant_map)
    
    # manual mappings for common pdf pollutant names
    manual_map = {
        'particles (pm10)': 'PM10',
        'particles (pm2.5)': 'PM2.5',
        'particles (pm2.5) exposure reduction': 'PM2.5',
        'pm10': 'PM10',
        'pm2.5': 'PM2.5',
        'nitrogen dioxide': 'NO2',
        'ozone': 'O3',
        'sulphur dioxide': 'SO2',
        'carbon monoxide': 'CO',
        'benzene': 'BENZENE',
        'lead': 'LEAD',
        '1,3-butadiene': 'BUTADIENE',
        'nitrogen oxides': 'NOX',
        'polycyclic aromatic hydrocarbons': 'PAH'
    }
    
    # apply manual mappings where metadata mapping failed
    mask_no_std = df_uk['pollutant_std'].isna()
    df_uk.loc[mask_no_std, 'pollutant_std'] = df_uk.loc[mask_no_std, 'pollutant'].str.strip().str.lower().map(manual_map)
    
    print(f"mapped {df_uk['pollutant_std'].notna().sum()} pollutants to standardised codes")
    
    # show pollutants that could not be mapped
    unmapped = df_uk[df_uk['pollutant_std'].isna()]
    if len(unmapped) > 0:
        print(f"\nwarning: {len(unmapped)} pollutants could not be mapped:")
        for poll in unmapped['pollutant'].unique():
            print(f"  {poll}")
    
    # rename column to match requirements
    df_uk = df_uk.rename(columns={'concentration_measured_as': 'concentration measured as'})
    
    # select final columns in specified order
    final_cols = [
        'pollutant',
        'pollutant_std', 
        'limit',
        'unit',
        'objective',
        'concentration measured as',
        'applies'
    ]
    
    df_final = df_uk[final_cols].copy()
    
    # warn about missing limits
    missing_limits = df_final['limit'].isna().sum()
    if missing_limits > 0:
        print(f"\nwarning: {missing_limits} rows have no numeric limit extracted")
    
    print(f"\nfinal dataset: {len(df_final)} rows")
    
    # show summary by pollutant
    print("\nsummary by pollutant:")
    print("-" * 40)
    summary = df_final.groupby('pollutant', dropna=False).agg({
        'limit': 'count',
        'pollutant_std': lambda x: x.mode()[0] if len(x.mode()) > 0 else None
    }).rename(columns={'limit': 'num_limits', 'pollutant_std': 'std_code'})
    
    for idx, row in summary.iterrows():
        std_code = row['std_code'] if pd.notna(row['std_code']) else 'unmapped'
        print(f"{idx}: {row['num_limits']} limit(s) [{std_code}]")
    
    # save to csv
    print(f"\nsaving to csv: {csv_output_path}")
    csv_output_path.parent.mkdir(parents=True, exist_ok=True)
    df_final.to_csv(csv_output_path, index=False, encoding='utf-8')
    
    print("done")
    print("="*40)
    
    return df_final

In [30]:
# run the parsing function
print("starting pdf parsing...")

result_df = parse_defra_aq_objectives(
    pdf_path=pdf_path,
    csv_output_path=csv_output_path,
    metadata_path=metadata_path
)

if result_df is not None:
    print("\n" + "="*40)
    print("preview of parsed data")
    print("="*40)
    print(result_df.head(20).to_string(index=False))
    
    print("\n" + "="*40)
    print("checking output file")
    print("="*40)
    
    if csv_output_path.exists():
        print(f"file created: {csv_output_path}")
        print(f"file size: {csv_output_path.stat().st_size / 1024:.2f} kb")
        
        verify_df = pd.read_csv(csv_output_path)
        print(f"csv readable: {len(verify_df)} rows")
        print(f"columns: {verify_df.columns.tolist()}")
        
        print("\nall pollutants found:")
        print("-" * 40)
        for poll in verify_df['pollutant'].unique():
            count = len(verify_df[verify_df['pollutant'] == poll])
            print(f"{poll}: {count} limit(s)")
    else:
        print("file was not created")
else:
    print("\nparsing failed, check error messages above")

starting pdf parsing...

parsing defra air quality objectives pdf
pdf path: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/defra/capabilities/Air_Quality_Objectives_Update.pdf
metadata path: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/defra/test/std_london_sites_pollutant.csv
output path: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/defra/capabilities/uk_pollutant_limits.csv

loading metadata for pollutant standardisation...
loaded 144 metadata records
built pollutant mapping with 185 entries

extracting tables from pdf...
pdf has 4 pages
page 1: found 1 table(s)
page 2: found 1 table(s)
page 3: found 1 table(s)
page 4: found 1 table(s)
total rows extracted: 120

processing extracted data...
found header at row 1
original columns: ['Pollutant', 'Applies', 'Objective', 'Concentration', '', 'Date to be', '', 'European Obligations', '', 'Date to be', '']
mapped columns: ['pollutant', 'a

starting pdf parsing...

parsing defra air quality objectives pdf
pdf path: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/defra/capabilities/Air_Quality_Objectives_Update.pdf
metadata path: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/defra/test/std_london_sites_pollutant.csv
output path: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/defra/capabilities/uk_pollutant_limits.csv

loading metadata for pollutant standardisation...
loaded 144 metadata records
built pollutant mapping with 185 entries

extracting tables from pdf...
pdf has 4 pages
page 1: found 1 table(s)
page 2: found 1 table(s)
page 3: found 1 table(s)
page 4: found 1 table(s)
total rows extracted: 120

processing extracted data...
found header at row 1
original columns: ['Pollutant', 'Applies', 'Objective', 'Concentration', '', 'Date to be', '', 'European Obligations', '', 'Date to be', '']
mapped columns: ['pollutant', 'a

  df['pollutant'] = df['pollutant'].fillna(method='ffill')


## 4) Data Quality validations:


A critical gap from the laqn report by applying formal statistical tests to validate data quality patterns. While descriptive statistics show 0% (before I notice the flags of the dataset) issue rate, I need statistical evidence that this pattern is real and not due to chance.


#### Purpuse:
 Checking data qualities if it is in the limits of eea, and make sence for general logic.
- Outlier detection in pollutant measurements.
- Data validity ranges based on WHO/EEA standards.
- Measurement consistency across time periods.
- Quality flags and suspicious patterns.

### methodology
 applies environmental data quality assessment standards:
1. Load aggregated measurement data from all csv files.
2. Calculate statistical distributions for each pollutant type.
3. Identify outliers using IQR method and domain knowledge.
4. Check values against established valid ranges.
5. Flag suspicious patterns constant values, extreme spikes.
6. Calculate quality scores for each station-pollutant combination.

#### air quality measurement standards

- Uk air quality objectives, limits and policy.
- https://uk-air.defra.gov.uk/air-pollution/uk-limits
- chrome-extension://efaidnbmnnnibpcajpcglclefindmkaj/https://uk-air.defra.gov.uk/assets/documents/Air_Quality_Objectives_Update_20230403.pdf

- DEFRA. (2023). *Air Pollution in the UK 2022*.
  - Source: https://uk-air.defra.gov.uk/library/annualreport/
  - Air Quality Objectives and limit values
  - Compliance assessment methodology

- UK Air Information Resource. (2024). *Air Pollution: UK Limits*.
  - Source: https://uk-air.defra.gov.uk/air-pollution/uk-limits
  - Current UK air quality objectives
  - Legal limit values and target dates
  - Measurement unit specifications (µg/m³)

  -  for the rest of the pollutants


- uk voc policy:
  - https://assets.publishing.service.gov.uk/media/5d7a2912ed915d522e4164a5/VO__statement_Final_12092019_CS__1_.pdf



- uk_pollutant_limit.css uk policy base logicl flaw:
    - data I fetched hourly measurements.
    - UK limits: different averaging periods annual mean, 24-hour mean, 8-hour mean...
    - I need to iterate my raw data according to uk_limit csv file format.



In [None]:
def calculate_quality_metrics(base_dir, csv_output_path):
    """
    Checks if measurements are realistic using uk_pollutant_limit.csv 3rd section parsed from uk air pollution policy pdf.
    function validates all measurements against official uk air quality objectives
    
    - Loads all measurement files.
    - Reads uk legal limits from parsed pdf csv.
    - For each pollutant, checks if values exceed uk limits.
    - Finds negative values.
    - Finds extreme values probably sensor err.
    - Calculates uk stD.
    
    Parameters:
        base_dir : 
        uk_limits_path : uk_pollutant_limits.csv from parsed pdf
            
    """
    if not Path(csv_output_path).exists():
        print(f"error: uk limits file not found at {csv_output_path}")
        return {}

    # load uk legal limits from parsed pdf
    uk_limits = pd.read_csv(csv_output_path, encoding="utf-8")
    
    # Create uk limits lookup dict structure = {pollutant_std: {limit_type: limit_value}}
    uk_limits_dict = {}
    
    for _, row in uk_limits.iterrows():
        poll_std = row['pollutant_std']
        limit_val = row['limit']
        conc_type = str(row['concentration measured as']).lower().strip()
        unit = row['unit']
        
        if pd.notna(poll_std) and pd.notna(limit_val):
            if poll_std not in uk_limits_dict:
                uk_limits_dict[poll_std] = []
            
        if pd.notna(poll_std) and pd.notna(limit_val):
            if poll_std not in uk_limits_dict:
                uk_limits_dict[poll_std] = []

        #  averaging period detection
        avg_period = 'unknown'
        if 'annual' in conc_type and 'running' in conc_type:
            avg_period = 'running_annual'
        elif 'running annual' in conc_type:
            avg_period = 'running_annual'
        elif 'annual' in conc_type:
            avg_period = 'annual'
        elif '24 hour' in conc_type or '24-hour' in conc_type:
            avg_period = '24hour'
        elif '8 hour' in conc_type or '8-hour' in conc_type:
            avg_period = '8hour'
        elif '1 hour' in conc_type or '1-hour' in conc_type or 'hour mean' in conc_type:
            avg_period = '1hour'
        elif 'maximum daily' in conc_type:
            avg_period = 'daily_max'

        uk_limits_dict[poll_std].append({
            'limit': float(limit_val),
            'type': conc_type,
            'unit': unit,
            'avg_period': avg_period
        })
    
    print(f"\nUK limits loaded for {len(uk_limits_dict)} pollutants:")
    for poll, limits in uk_limits_dict.items():
        period_info = ', '.join([f"{lim['avg_period']}: {lim['limit']}" for lim in limits])
        print(f"  {poll}: {period_info}")
    
    # load all measurement data with timestamp
    print("\nLoading measurement data.")
    all_data = []
    
    for year in ['2023', '2024', '2025']:
        year_dir = Path(base_dir) / f'{year}measurements'
        if year_dir.exists():
            for csv_file in year_dir.rglob('*.csv'):
                try:
                    df = pd.read_csv(csv_file)
                    if not df.empty and 'timestamp' in df.columns:
                        all_data.append(df)
                except Exception as e:
                    pass
    
    if not all_data:
        print("err no measurement data found")
        return {}
    
    df_all = pd.concat(all_data, ignore_index=True)
    print(f"loaded {len(df_all):,} total records")
    
    # filter valid values and parse timestamp
    df_valid = df_all[df_all['value'].notna()].copy()
    df_valid['value'] = pd.to_numeric(df_valid['value'], errors='coerce')
    df_valid = df_valid[df_valid['value'].notna()]
    
    # parse timestamp to datetime
    df_valid['timestamp'] = pd.to_datetime(df_valid['timestamp'], errors='coerce')
    df_valid = df_valid[df_valid['timestamp'].notna()]
    
    print(f"Analysing {len(df_valid):,} valid measurements with timestamps")
    
    # calculate quality metrics for each pollutant
    print("\nProcessing quality metrics by pollutant...")
    quality_results = {}
    
    for pollutant in df_valid['pollutant_std'].unique():
        if pd.isna(pollutant):
            continue
        
        print(f"\nprocessing {pollutant}...")
        
        poll_data = df_valid[df_valid['pollutant_std'] == pollutant].copy()
        
        if len(poll_data) == 0:
            continue
        
        # basic statistics on raw hourly data
        q_metrics = {
            'pollutant': pollutant,
            'count': int(len(poll_data)),
            'mean_hourly': float(poll_data['value'].mean()),
            'median_hourly': float(poll_data['value'].median()),
            'std_hourly': float(poll_data['value'].std()),
            'min': float(poll_data['value'].min()),
            'max': float(poll_data['value'].max()),
            'p95': float(poll_data['value'].quantile(0.95)),
            'p99': float(poll_data['value'].quantile(0.99))
        }
        
        # check for suspicious values
        negative_count = (poll_data['value'] < 0).sum()
        zero_count = (poll_data['value'] == 0).sum()
        
        q_metrics['negative_values'] = int(negative_count)
        q_metrics['negative_pct'] = float((negative_count / len(poll_data) * 100))
        q_metrics['zero_values'] = int(zero_count)
        q_metrics['zero_pct'] = float((zero_count / len(poll_data) * 100))
        
        # now check against uk limits with proper averaging
        if pollutant in uk_limits_dict:
            uk_poll_limits = uk_limits_dict[pollutant]
            
            for limit_info in uk_poll_limits:
                avg_period = limit_info['avg_period']
                limit_value = limit_info['limit']
                
                if avg_period == 'annual':
                    # calculate annual mean
                    poll_data['year'] = poll_data['timestamp'].dt.year
                    annual_means = poll_data.groupby('year')['value'].mean()
                    
                    q_metrics['uk_annual_limit'] = limit_value
                    q_metrics['mean_annual'] = float(annual_means.mean())
                    q_metrics['exceeds_uk_annual'] = q_metrics['mean_annual'] > limit_value
                    
                    print(f"  annual mean: {q_metrics['mean_annual']:.2f} vs limit {limit_value}")
                
                elif avg_period == '24hour':
                    # calculate daily means
                    poll_data['date'] = poll_data['timestamp'].dt.date
                    daily_means = poll_data.groupby('date')['value'].mean()
                    
                    exceedances = (daily_means > limit_value).sum()
                    
                    q_metrics['uk_24hour_limit'] = limit_value
                    q_metrics['daily_exceedances'] = int(exceedances)
                    q_metrics['daily_exceedances_pct'] = float((exceedances / len(daily_means) * 100))
                    
                    print(f"  24-hour: {exceedances} days exceed {limit_value}")
                
                elif avg_period == '8hour':
                    # calculate 8-hour rolling mean
                    poll_data_sorted = poll_data.sort_values('timestamp')
                    poll_data_sorted['rolling_8h'] = poll_data_sorted['value'].rolling(window=8, min_periods=6).mean()
                    
                    exceedances = (poll_data_sorted['rolling_8h'] > limit_value).sum()
                    
                    q_metrics['uk_8hour_limit'] = limit_value
                    q_metrics['8hour_exceedances'] = int(exceedances)
                    q_metrics['8hour_exceedances_pct'] = float((exceedances / len(poll_data_sorted) * 100))
                    
                    print(f"  8-hour: {exceedances} periods exceed {limit_value}")
                
                elif avg_period == '1hour':
                    # compare hourly values directly
                    exceedances = (poll_data['value'] > limit_value).sum()
                    
                    q_metrics['uk_1hour_limit'] = limit_value
                    q_metrics['hourly_exceedances'] = int(exceedances)
                    q_metrics['hourly_exceedances_pct'] = float((exceedances / len(poll_data) * 100))
                    
                    print(f"  1-hour: {exceedances} hours exceed {limit_value}")
                
                elif avg_period == 'running_annual':
                    # running annual mean (365-day rolling average)
                    poll_data_sorted = poll_data.sort_values('timestamp')
                    poll_data_sorted['rolling_annual'] = poll_data_sorted['value'].rolling(window=24*365, min_periods=24*300).mean()
                    
                    q_metrics['uk_running_annual_limit'] = limit_value
                    q_metrics['mean_running_annual'] = float(poll_data_sorted['rolling_annual'].mean())
                    q_metrics['exceeds_running_annual'] = q_metrics['mean_running_annual'] > limit_value
                    
                    print(f"  running annual: {q_metrics['mean_running_annual']:.2f} vs limit {limit_value}")
                
                elif avg_period == 'daily_max':
                    # maximum daily 8-hour running mean
                    poll_data_sorted = poll_data.sort_values('timestamp')
                    poll_data_sorted['date'] = poll_data_sorted['timestamp'].dt.date
                    poll_data_sorted['rolling_8h'] = poll_data_sorted['value'].rolling(window=8, min_periods=6).mean()
                    
                    daily_max = poll_data_sorted.groupby('date')['rolling_8h'].max()
                    exceedances = (daily_max > limit_value).sum()
                    
                    q_metrics['uk_daily_max_limit'] = limit_value
                    q_metrics['daily_max_exceedances'] = int(exceedances)
                    
                    print(f"  daily max 8h: {exceedances} days exceed {limit_value}")
            
            # overall assessment: use most restrictive limit for out of range check
            all_limits = [lim['limit'] for lim in uk_poll_limits]
            max_limit = max(all_limits)
            
            # define extreme threshold as 10x highest uk limit
            extreme_threshold = max_limit * 10
            out_of_range = (poll_data['value'] > extreme_threshold).sum()
            
            q_metrics['extreme_threshold'] = extreme_threshold
            q_metrics['out_of_range'] = int(out_of_range)
            q_metrics['out_of_range_pct'] = float((out_of_range / len(poll_data) * 100))
            
        else:
            # no uk limit defined for this pollutant
            print(f"  no uk limits defined")
            q_metrics['uk_annual_limit'] = None
            q_metrics['exceeds_uk_annual'] = False
            q_metrics['out_of_range'] = 0
            q_metrics['out_of_range_pct'] = 0.0
        
        quality_results[pollutant] = q_metrics
    
    return quality_results

In [68]:

def print_quality_metrics(quality_results):
    """
    Print comprehensive quality metrics report with uk compliance.
    
    Parameters:
        quality_metrics : dict
            Dictionary returned by calculate_quality_metrics_uk_limits
    """
    
    print("\n" + "="*40)
    print("Quality metrics report")
    print("="*40)
    
 
    for poll, metrics in quality_results.items():
        print(f"\n{poll}:")
        print(f"  total measurements: {metrics['count']:,}")
        print(f"  hourly mean: {metrics['mean_hourly']:.2f}")
        
        if 'mean_annual' in metrics:
            print(f"  annual mean: {metrics['mean_annual']:.2f} (limit: {metrics['uk_annual_limit']})")
            status = "exceeds" if metrics['exceeds_uk_annual'] else "compliant"
            print(f"    status: {status}")
        
        if 'daily_exceedances' in metrics:
            print(f"  24-hour exceedances: {metrics['daily_exceedances']} days")
        
        if 'hourly_exceedances' in metrics:
            print(f"  1-hour exceedances: {metrics['hourly_exceedances']} hours")
        
        if metrics['negative_values'] > 0:
            print(f"  warning: {metrics['negative_values']} negative values")
        
        if metrics['out_of_range'] > 0:
            print(f"  warning: {metrics['out_of_range']} extreme values")
    
    print("="*40)
    
    return quality_results

In [69]:
# run quality metrics with proper averaging periods
print("starting quality metrics calculation...")

# Calculate quality metrics
quality_results = calculate_quality_metrics(base_dir, csv_output_path)

print_quality_metrics =(quality_results)


if quality_results:
    # # save comprehensive report
    # print("\nsaving quality metrics report...")
    

    
    quality_rows = []
    for poll, metrics in quality_results.items():
        row = {
            'pollutant': metrics['pollutant'],
            'total_measurements': metrics['count'],
            'mean_hourly': f"{metrics['mean_hourly']:.2f}",
            'min': f"{metrics['min']:.2f}",
            'max': f"{metrics['max']:.2f}",
            'p95': f"{metrics['p95']:.2f}",
            'negative_values': metrics['negative_values'],
            'zero_values': metrics['zero_values'],
            'out_of_range': metrics['out_of_range']
        }
        
        # add uk limit compliance fields
        if 'uk_annual_limit' in metrics and metrics['uk_annual_limit']:
            row['uk_annual_limit'] = metrics['uk_annual_limit']
            row['mean_annual'] = f"{metrics['mean_annual']:.2f}" if 'mean_annual' in metrics else 'n/a'
            row['exceeds_annual'] = 'yes' if metrics.get('exceeds_uk_annual', False) else 'no'
        
        if 'daily_exceedances' in metrics:
            row['uk_24hour_limit'] = metrics['uk_24hour_limit']
            row['daily_exceedances'] = metrics['daily_exceedances']
        
        if 'hourly_exceedances' in metrics:
            row['uk_1hour_limit'] = metrics['uk_1hour_limit']
            row['hourly_exceedances'] = metrics['hourly_exceedances']
        
        quality_rows.append(row)
    
    pd.DataFrame(quality_rows).to_csv(quality_output, index=False)
    print(f"saved to: {quality_output}")
    print("done")
else:
    print("quality metrics calculation failed")

starting quality metrics calculation...

UK limits loaded for 11 pollutants:
  PM10: 24hour: 50.0, annual: 40.0
  PM2.5: annual: 20.0
  NO2: annual: 40.0
  O3: 8hour: 100.0
  SO2: 24hour: 125.0
  PAH: annual: 0.25
  Benzene: running_annual: 16.25
  BUTADIENE: running_annual: 2.25
  CO: daily_max: 10.0
  LEAD: annual: 0.5
  NOx: annual: 30.0

Loading measurement data.
loaded 2,525,991 total records
Analysing 2,303,824 valid measurements with timestamps

Processing quality metrics by pollutant...

processing Toluene...
  no uk limits defined

processing i-Butane...
  no uk limits defined

processing 1,2,3-TMB...
  no uk limits defined

processing Ethyne...
  no uk limits defined

processing 1-Butene...
  no uk limits defined

processing O3...
  8-hour: 2089 periods exceed 100.0

processing 1,2,4-TMB...
  no uk limits defined

processing cis-2-Butene...
  no uk limits defined

processing trans-2-Pentene...
  no uk limits defined

processing NOx...
  annual mean: 32.75 vs limit 30.0

proce

## 5) Chi-Square test
Uses statistical tests to mathematically prove that defra data collection process was consistent and reliable across time. 
It as a quality control check that ensures didn't accidentally collect more data in some months than others, which could bias defra analysis.

#### Why Chi-square test?
 - The chi-square test answers one simple question: Are my monthly file counts similar enough to trust, or are some months suspiciously different? And environmental dataset chi square test most common why, according to google.

- Air pollution varies by season
- Policy decisions need unbiased evidence
- Academic reviewers will question imbalanced datasets

### What Chi-Square Test Does

The chi-square test answers one simple question: Are my monthly file counts similar enough to trust, or are some months suspiciously different?

#### How It Works

1. What  observe: Count how many data files  have for each month.
2. What  expect: If data collection was perfect, each month should have roughly the same count.
3. The test: Measures how far observed counts are from the expected counts.
4. The result: Gives  a p-value that tells  if the differences are just random variation or a real problem.


### P-Value Meaning

The p-value tells the probability that  observed pattern happened by random chance:

| P-Value | Interpretation | What It Means for DEFRA Data |
|---------|---------------|----------------------------|
| p greater than or equal to 0.05 | Accept null hypothesis | Data is evenly distributed. Small differences between months are just normal variation.  data collection was consistent. |
| p less than 0.05 | Reject null hypothesis | Data is unevenly distributed. Some months have significantly more or less data than others.  should investigate why. |



### 1. Methodological Rigor
 data collection needs to be reliable

- Mathematical evidence not just visual inspection
- A standardized statistical measure p-value
- Reproducible results 


## Output

The test produces:

   - Test name: "Chi-square uniformity"
   - Chi-square statistic (χ²)
   - P-value
   - Interpretation (evenly/unevenly distributed)

2. Console output showing:
   - Null hypothesis statement
   - Test statistic value
   - P-value

---

## Expected Results for  Dataset

Based on  data collection using the DEFRA API:

- Expected p-value: greater than 0.05 (likely around 0.3-0.7)
- Why:  API calls were automated and systematic
- What this proves: Each month has 249 station-pollutant files (one per combination)

### If Get p less than 0.05

This would suggest:
1. Some months might have missing API data
2. New monitoring stations came online mid-year
3. Some stations stopped reporting in certain months


### Results Data Quality Section

Include the statistical test results as evidence that dataset is:
- Temporally balanced
- Methodologically sound
- Suitable for seasonal analysis


| Aspect | Details |
|--------|---------|
| Test Used | Chi-square test for uniformity |
| What It Tests | Whether monthly file counts are evenly distributed |
| Null Hypothesis | Data is uniformly distributed across months |
| Alternative Hypothesis | Data shows significant monthly imbalance |
| Acceptance Criterion | p-value greater than or equal to 0.05 |
| What p greater than or equal to 0.05 Means | Data collection was consistent and reliable |
| What p less than 0.05 Means | Some months have significantly different data volumes |
| Why This Matters | Proves  dataset is methodologically sound for thesis |

---


### If p greater than or equal to 0.05 (Expected Result)

1. Document result in thesis methodology
2. Include p-value in data quality section
3. Proceed with confidence to seasonal analysis

### If p less than 0.05 (Unexpected Result)

1. Review monthly counts to identify outliers
2. Check API logs for that month
3. Document known issues (e.g., "Station X offline in April 2024")
4. Consider excluding problematic months OR
5. Use weighted analysis to account for imbalance

---

In [82]:

def chi_square_tests(base_dir):
    """
    Run statistical tests to prove data collection was consistent.
    
    - Chi-square test: Checks if  similar amounts of data for each month
    - If p-value < 0.05 Data isn't evenly spreats problem!
    - If p-value > 0.05 Data is evenly spreats good!
    
    Parameters:
        base_dir : 
            
    """
    
    # Count files per month, 2025 only 19th of nov.
    yearly_data = {'2023': 0, '2024': 0, '2025': 0}
    year_months = {'2023': 12, '2024': 12, '2025': 11}
    
    for year in ['2023', '2024', '2025']:
        year_dir = Path(base_dir) / f'{year}measurements'
        if not year_dir.exists():
            continue
        pattern = f'*__{year}_*.csv'
        files = list(year_dir.rglob(pattern))
        yearly_data[year] = len(files)
    
    # prep for chi-square
    year_counts = [yearly_data[y] for y in ['2023', '2024', '2025']]
    total_files = sum(year_counts)
    total_months = 35  # 12 + 12 + 11
    
    expected_counts = [
        total_files * (year_months[year] / total_months)
        for year in ['2023', '2024', '2025']
    ]
    
    # run test
    chi2, p_value = stats.chisquare(
        f_obs=year_counts, 
        f_exp=expected_counts
    )
    for year, count, expected in zip(['2023', '2024', '2025'], 
                                      year_counts, expected_counts):
        print(f"  {year}: {count:5d} files (expected: {expected:7.1f})")
    
    print()
    print(f"Chi-square statistic: {chi2:.4f}")
    print(f"P-value: {p_value:.4f}")
    
    if p_value < 0.05:
        print(f"Result reject null hypothesis p < 0.05")
        print(f"Interpretation: Years NOT evenly distributed")
    else:
        print(f"Result: accept null hypothesis p >= 0.05")
        print(f"Interpretation: Years evenly distributed")
    
    return {
        'test': 'Chi-square year-wise',
        'chi2_statistic': chi2,
        'p_value': p_value,
        'year_counts': year_counts,
        'expected_counts': expected_counts
    }

In [83]:
# Run tests
test_results = chi_square_tests(base_dir)

# Save results

pd.DataFrame([{
    'test_name': test_results['test'],
    'statistic': f"{test_results['chi2_statistic']:.4f}",
    'p_value': f"{test_results['p_value']:.4f}",
    'interpretation': ('Evenly distributed' 
                      if test_results['p_value'] >= 0.05 
                      else 'Unevenly distributed')
}]).to_csv(chi_square_output, index=False)

print(f"\nStatistical test results saved to: {chi_square_output}")

  2023:  1431 files (expected:  1221.6)
  2024:  1193 files (expected:  1221.6)
  2025:   939 files (expected:  1119.8)

Chi-square statistic: 65.7553
P-value: 0.0000
Result reject null hypothesis p < 0.05
Interpretation: Years NOT evenly distributed

Statistical test results saved to: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/defra/report/chi_square_tests.csv


    2023:  1431 files (expected:  1221.6)
    2024:  1193 files (expected:  1221.6)
    2025:   939 files (expected:  1119.8)

    Chi-square statistic: 65.7553
    P-value: 0.0000
    Result reject null hypothesis p < 0.05
    Interpretation: Years NOT evenly distributed

    Statistical test results saved to: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/defra/report/chi_square_tests.csv

In [None]:
def analyse_year_difference(base_dir):
    """
    Find which stations/pollutants are missing in later years.
    """
    
    base_dir = Path(base_dir)
    
    # unique station-pollutant combinations per year
    year_files = {}
    
    for year in ['2023', '2024', '2025']:
        year_dir = base_dir / f'{year}measurements'
        if not year_dir.exists():
            continue
        
        files = list(year_dir.rglob(f'*__{year}_*.csv'))
        
        # Extract station/pollutant combinations
        combinations = set()
        for f in files:
            # File format station_pollutant_YY_MM
            parts = f.stem.split('__')
            if len(parts) >= 2:
                station = parts[0]
                pollutant = parts[1]
                combinations.add((station, pollutant))
        
        year_files[year] = combinations
    
    # Find whats in 23 but missing in 2024/2025
    lost_2024 = year_files['2023'] - year_files['2024']
    lost_2025 = year_files['2023'] - year_files['2025']
    
    print("\nStation-pollutant combinations lost over time:")
    print(f" 2023 total: {len(year_files['2023'])}")
    print(f"2024 total: {len(year_files['2024'])}")
    print(f"2025 total: {len(year_files['2025'])}")
    print()
    print(f"Lost in 2024 (vs 2023): {len(lost_2024)}")
    print(f"Lost in 2025 (vs 2023): {len(lost_2025)}")
    
    if lost_2024:
        print("\nExamples lost in 2024:")
        for station, pollutant in list(lost_2024)[:10]:
            print(f"    {station} - {pollutant}")
    
    if lost_2025:
        print("\nExamples lost in 2025:")
        for station, pollutant in list(lost_2025)[:10]:
            print(f"    {station} - {pollutant}")
    
    return {
        '2023_count': len(year_files['2023']),
        '2024_count': len(year_files['2024']),
        '2025_count': len(year_files['2025']),
        'lost_2024': lost_2024,
        'lost_2025': lost_2025
    }



In [85]:
# Run
analysis = analyse_year_difference(base_dir)


Station-pollutant combinations lost over time:
 2023 total: 444
2024 total: 444
2025 total: 370

Lost in 2024 (vs 2023): 444
Lost in 2025 (vs 2023): 444

Examples lost in 2024:
    NOx - 2023_09
    m,p-Xylene - 2023_02
    n-Heptane - 2023_03
    Isoprene - 2023_03
    i-Hexane - 2023_02
    O3 - 2023_08
    1,2,4-TMB - 2023_10
    CO - 2023_11
    SO2 - 2023_09
    Propene - 2023_10

Examples lost in 2025:
    NOx - 2023_09
    m,p-Xylene - 2023_02
    n-Heptane - 2023_03
    Isoprene - 2023_03
    i-Hexane - 2023_02
    O3 - 2023_08
    1,2,4-TMB - 2023_10
    CO - 2023_11
    SO2 - 2023_09
    Propene - 2023_10


    Station-pollutant combinations lost over time:
    2023 total: 444
    2024 total: 444
    2025 total: 370

    Lost in 2024 (vs 2023): 444
    Lost in 2025 (vs 2023): 444

    Examples lost in 2024:
        NOx - 2023_09
        m,p-Xylene - 2023_02
        n-Heptane - 2023_03
        Isoprene - 2023_03
        i-Hexane - 2023_02
        O3 - 2023_08
        1,2,4-TMB - 2023_10
        CO - 2023_11
        SO2 - 2023_09
        Propene - 2023_10

    Examples lost in 2025:
        NOx - 2023_09
        m,p-Xylene - 2023_02
        n-Heptane - 2023_03
        Isoprene - 2023_03
        i-Hexane - 2023_02
        O3 - 2023_08
        1,2,4-TMB - 2023_10
        CO - 2023_11
        SO2 - 2023_09
        Propene - 2023_10

#### adding monthly 2025 data completeness

In [86]:
def months_25 (base_dir):
    """
    See which 2025 months have data.
    """
    
    base_dir = Path(base_dir)
    year_dir = base_dir / '2025measurements'
    
    monthly_counts = {}
    
    for month in range(1, 12):  # Jan/Nov
        pattern = f'*__2025_{month:02d}.csv'
        files = list(year_dir.rglob(pattern))
        monthly_counts[f'2025-{month:02d}'] = len(files)
    
    print("\n2025 monthly file counts:")
    for month, count in monthly_counts.items():
        print(f"  {month}: {count:4d} files")
    
    # Check if recent months have less data
    avg_early = sum(list(monthly_counts.values())[:3]) / 3
    avg_late = sum(list(monthly_counts.values())[-3:]) / 3
    
    print(f"\n  Early 2025 avg (Jan-Mar): {avg_early:.0f} files/month")
    print(f"  Late 2025 avg (Sep-Nov):  {avg_late:.0f} files/month")
    
    if avg_late < avg_early * 0.9:
        print("Recent months have noticeably less data")

months_25 (base_dir)


2025 monthly file counts:
  2025-01:   95 files
  2025-02:    0 files
  2025-03:   92 files
  2025-04:   94 files
  2025-05:   94 files
  2025-06:   94 files
  2025-07:   94 files
  2025-08:   94 files
  2025-09:   94 files
  2025-10:   94 files
  2025-11:   94 files

  Early 2025 avg (Jan-Mar): 62 files/month
  Late 2025 avg (Sep-Nov):  94 files/month


  2025 monthly file counts:
    2025-01:   95 files
    2025-02:    0 files
    2025-03:   92 files
    2025-04:   94 files
    2025-05:   94 files
    2025-06:   94 files
    2025-07:   94 files
    2025-08:   94 files
    2025-09:   94 files
    2025-10:   94 files
    2025-11:   94 files

    Early 2025 avg (Jan-Mar): 62 files/month
    Late 2025 avg (Sep-Nov):  94 files/month