# LAQN Dataset Assesment


1) I'll be start adding my main paths and modules I will be using in this notebook below.

In [25]:
# possible python modules i will be using below
from curses import meta
import os
import pandas as pd
from pathlib import Path
import csv
from collections import defaultdict
#function 7 importing the full analysis function from pollution_analysis
import sys
sys.path.append('/mnt/user-data/outputs')

#last detailed anlasye and visualization imports
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Set visualisation style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 10

#findings 7 .func
# for parse pdf uk pollutant limitations to csv
import re
# pdfplumber for pdf parsing

# function 5. chi-square test
from scipy import stats

#define base path  without hardcoding
base_dir = Path.home() / "Desktop" / "data science projects" / "air-pollution-levels" / "data" / "laqn" / "optimased"
#metadata file for pollutant name, location and site names
metadata_path = Path.home() / "Desktop" / "data science projects" / "air-pollution-levels" / "data" /"laqn"/"optimased_siteSpecies.csv"

# output path for saving statistics 1. function
#the first analyse dataset created without inclitiong nan optimased files, and cross referencing that's why changed the name to dataset_statistics-noNAN-incl.csv
os.makedirs(base_dir / "report", exist_ok=True)
stats_output_path =  Path.home()/"Desktop" / "data science projects" / "air-pollution-levels" / "data" /"laqn"/"report"/ "laqn_stats.csv"

# output paths for saving all the pollutant distribution and nan value analysis.
pollutant_distrubution_path = Path.home() / "Desktop" / "data science projects" / "air-pollution-levels" / "data" /"laqn" / "report"/"pollutant_distribution.csv"
nan_val_pollutant_split_path = Path.home() / "Desktop" / "data science projects" / "air-pollution-levels" / "data" /"laqn" / "report" / "nan_values_by_pollutant.csv"
nan_val_stationPollutant_path = Path.home() / "Desktop" / "data science projects" / "air-pollution-levels" / "data" /"laqn" / "report" / "nan_values_by_station_pollutant.csv"


# log file from nan replacement process
nan_log_path = Path.home() / "Desktop" / "data science projects" / "air-pollution-levels" / "data" / "laqn" / "logs" / "NaN_values_record.csv"

# function for uk pollutant regulations pdf to parse csv file path
csv_output_path = Path.home() / "Desktop" / "data science projects" / "air-pollution-levels" / "data" / "defra" / "capabilities" / "uk_pollutant_limits.csv"


# data quality metrics report output path
quality_output = Path.home() / "Desktop" / "data science projects" / "air-pollution-levels" / "data" / "laqn"/ "report" / "quality_metrics_validation.csv"
quality_output.parent.mkdir(parents=True, exist_ok=True)

#chi-square test output path func 5
chi_square_output1 = Path.home() / "Desktop" / "data science projects" / "air-pollution-levels" / "data" / "laqn" / "report" / "chi_square_tests1.csv"
chi_square_output = Path.home() / "Desktop" / "data science projects" / "air-pollution-levels" / "data" / "laqn" / "report" / "chi_square_tests.csv"

# detailed last analysis and visualization output directory
report_dir = Path.home() / "Desktop" / "data science projects" / "air-pollution-levels" / "data" / "laqn" / "report" / "detailed_analysis"

report_dir.mkdir(parents=True, exist_ok=True)

## 1) Initial Dataset Assessment: Raw Numbers

Before conducting quality checks, I need to establish the baseline characteristics of the LAQN dataset. This section calculates comprehensive statistics about the data collection effort, including file counts, measurement records, station coverage, and pollutant distribution.

### Purpose
- Document the scale and scope of data collection.
- Establish baseline metrics for comparison with LAQN.
- Provide context for subsequent quality analysis.

### Methodology
The function `get_laqn_dataset_statistics()` performs the following:
1. Loads standardised metadata to identify unique stations and pollutants.
2. Counts files across all three yearly directories (2023, 2024, 2025).
3. Calculates total measurement records by reading all CSV files.
4. Determines spatial coverage from unique coordinate pairs.
5. Documents temporal coverage (35 months: January 2023 to November 2025).

### Notes
- File counting is fast (scans directory structure only).
- Record counting can be slow (reads every CSV file).
- Results are saved to csv.

In [26]:
def get_laqn_dataset_statistics(base_dir, metadata_path, nan_log_path):
    """
    Calculate statistics for the LAQN dataset using the new column structure.
    This function scans all CSV files recursively under base_dir and calculates key metrics needed for reporting.

    Parameters:
        base_dir : Path
            Base directory containing LAQN data folders.
        metadata_path : Path
            Path to the standardised metadata csv file.
        nan_log_path : Path
            Path to the NaN values log file after notice data flags, changed them to NaN.

    Returns:
        dict : Dictionary containing all calculated statistics.
    """
    stats = {}

    # read metadata to get station and pollutant info
    print("\nloading metadata...")
    metadata = pd.read_csv(metadata_path, encoding="utf-8")

    # calculate metadata statistics
    stats['unique_stations'] = metadata['SiteCode'].nunique()
    stats['total_combinations'] = len(metadata)
    stats['unique_pollutants'] = metadata['SpeciesCode'].nunique()

    # get pollutant breakdown
    pollutant_counts = metadata['SpeciesCode'].value_counts()
    stats['pollutant_distribution'] = pollutant_counts.to_dict()

    # create set of expected (SiteCode, SpeciesCode) pairs from metadata
    expected_pairs = set(
        zip(metadata['SiteCode'], metadata['SpeciesCode'])
    )
    stats['expected_pairs'] = len(expected_pairs)
    print(f"  expected SiteCode/SpeciesCode pairs from metadata: {len(expected_pairs)}")

    # count unique coordinates for spatial coverage
    unique_coords = metadata[['Latitude', 'Longitude']].drop_duplicates()
    stats['unique_locations'] = len(unique_coords)

    # Scan all CSVs in all subfolders under base_dir
    print("\nscanning optimased directory for collected data...")
    all_csv_files = list(Path(base_dir).rglob('*.csv'))
    total_files = len(all_csv_files)
    print(f"\nTotal CSV files found: {total_files}")
    stats['total_files'] = total_files

    # Count files, records, and missing values by period (e.g., "2023_apr")
    files_by_period = defaultdict(int)
    records_by_period = defaultdict(int)
    missing_by_period = defaultdict(int)

    all_csvs = []
    total_records = 0
    total_missing = 0

    print("\nReading all CSV files to calculate statistics...")
    for csv_file in all_csv_files:
        period = csv_file.parent.name
        try:
            df = pd.read_csv(csv_file)
            n_records = len(df)
            n_missing = df['@Value'].isna().sum() + (df['@Value'] == "").sum() if '@Value' in df.columns else 0
            all_csvs.append(df)
            files_by_period[period] += 1
            records_by_period[period] += n_records
            missing_by_period[period] += n_missing
            total_records += n_records
            total_missing += n_missing
        except Exception as e:
            print(f"  warning: could not read {csv_file.name}: {e}")

    stats['files_by_period'] = dict(files_by_period)
    stats['records_by_period'] = dict(records_by_period)
    stats['missing_by_period'] = dict(missing_by_period)
    stats['total_records'] = total_records
    stats['total_missing'] = total_missing
    stats['overall_completeness'] = ((total_records - total_missing) / total_records * 100) if total_records > 0 else 0

    for period in files_by_period:
        rec = records_by_period[period]
        miss = missing_by_period[period]
        miss_pct = (miss / rec * 100) if rec > 0 else 0
        print(f"  {period}: {files_by_period[period]} files, {rec:,} records, {miss:,} missing ({miss_pct:.2f}%)")

    # cross-reference metadata with collected data
    print("\ncross-referencing collected data with metadata...")

    if all_csvs:
        all_data = pd.concat(all_csvs, ignore_index=True)

        # check if required columns exist in csv files
        if 'SiteCode' in all_data.columns and 'SpeciesCode' in all_data.columns:
            # identify actual (SiteCode, SpeciesCode) pairs in collected data
            collected_pairs = set(
                zip(all_data['SiteCode'], all_data['SpeciesCode'])
            )
            stats['collected_pairs'] = len(collected_pairs)

            # find missing pairs (in metadata but not in collected data)
            missing_pairs = expected_pairs - collected_pairs
            stats['missing_pairs'] = list(missing_pairs)
            stats['missing_pairs_count'] = len(missing_pairs)

            # find extra pairs (in collected data but not in metadata)
            extra_pairs = collected_pairs - expected_pairs
            stats['extra_pairs'] = list(extra_pairs)
            stats['extra_pairs_count'] = len(extra_pairs)

            print(f"  expected pairs from metadata: {len(expected_pairs)}")
            print(f"  actually collected pairs: {len(collected_pairs)}")
            print(f"  missing pairs (in metadata but not collected): {len(missing_pairs)}")
            print(f"  extra pairs (collected but not in metadata): {len(extra_pairs)}")

            # group by SiteCode and SpeciesCode, count missing values
            missing_breakdown = {}
            for (site, species), group in all_data.groupby(['SiteCode', 'SpeciesCode']):
                total_rows = len(group)
                if '@Value' in group.columns:
                    missing_rows = group['@Value'].isna().sum() + (group['@Value'] == "").sum()
                else:
                    missing_rows = 0
                missing_breakdown[(site, species)] = (int(missing_rows), int(total_rows))
            stats['missing_by_station_pollutant'] = missing_breakdown
        else:
            print("  warning: SiteCode or SpeciesCode columns not found")
            stats['missing_by_station_pollutant'] = {}
            stats['collected_pairs'] = 0
            stats['missing_pairs'] = []
            stats['missing_pairs_count'] = 0
            stats['extra_pairs'] = []
            stats['extra_pairs_count'] = 0
    else:
        stats['missing_by_station_pollutant'] = {}
        stats['collected_pairs'] = 0
        stats['missing_pairs'] = list(expected_pairs)
        stats['missing_pairs_count'] = len(expected_pairs)
        stats['extra_pairs'] = []
        stats['extra_pairs_count'] = 0

    # distribution of nan by pollutant over time
    if stats['missing_by_station_pollutant']:
        pollutant_missing_summary = {}
        for (site, species), (missing, total) in stats['missing_by_station_pollutant'].items():
            if species not in pollutant_missing_summary:
                pollutant_missing_summary[species] = {'total_missing': 0, 'total_records': 0}
            pollutant_missing_summary[species]['total_missing'] += missing
            pollutant_missing_summary[species]['total_records'] += total
        for species in pollutant_missing_summary:
            total_missing = pollutant_missing_summary[species]['total_missing']
            total_records = pollutant_missing_summary[species]['total_records']
            percentage = (total_missing / total_records * 100) if total_records > 0 else 0
            pollutant_missing_summary[species]['percentage_missing'] = percentage
        stats['missing_by_pollutant_type'] = pollutant_missing_summary
    else:
        stats['missing_by_pollutant_type'] = {}

    # log file created during data cleaning process
    if Path(nan_log_path).exists():
        nan_log = pd.read_csv(nan_log_path)
        replacements_by_year = nan_log.groupby('year_folder')['invalid_flags_replaced'].sum().to_dict()
        stats['nan_replacements_by_year'] = replacements_by_year
        stats['total_nan_replacements'] = nan_log['invalid_flags_replaced'].sum()
        stats['mean_invalid_percentage'] = nan_log['percentage_invalid'].mean()
        stats['max_invalid_percentage'] = nan_log['percentage_invalid'].max()
    else:
        stats['nan_replacements_by_year'] = {}
        stats['total_nan_replacements'] = 0
        stats['mean_invalid_percentage'] = 0
        stats['max_invalid_percentage'] = 0

    # calculate temporal coverage based on the files collected
    stats['temporal_coverage'] = {
        'start_date': '2023-01-01',
        'end_date': '2025-11-19',
        'total_months': 35
    }


    def extract_year(period):
        # period is usually like '2023_apr' or '2024_jan'
        return str(period)[:4] if len(str(period)) >= 4 and str(period)[:4].isdigit() else 'unknown'

    files_by_year = defaultdict(int)
    records_by_year = defaultdict(int)
    missing_by_year = defaultdict(int)
    for period in files_by_period:
        year = extract_year(period)
        files_by_year[year] += files_by_period[period]
        records_by_year[year] += records_by_period[period]
        missing_by_year[year] += missing_by_period[period]
    stats['files_by_year'] = dict(files_by_year)
    stats['records_by_year'] = dict(records_by_year)
    stats['missing_by_year'] = dict(missing_by_year)

    return stats

In [27]:
def print_dataset_statistics(stats):
    """
    Print dataset statistics for LAQN using new column structure.

    Parameters:
        stats : dict
            returned by get_laqn_dataset_statistics().
    """

    print("\n" + "="*40)
    print("LAQN dataset statistics: initial assessment")
    print("="*40)

    print("\nScale and scope:")
    print(f"Total files collected: {stats['total_files']:,}")
    print(f"Total measurement records: {stats['total_records']:,}")
    print(f"Total missing values (@Value): {stats['total_missing']:,}")
    print(f"Overall completeness: {stats['overall_completeness']:.2f}%")
    print(f"Unique monitoring sites (SiteCode): {stats['unique_stations']}")
    print(f"Total site-species combinations: {stats['total_combinations']}")
    print(f"Unique pollutant types (SpeciesCode): {stats['unique_pollutants']}")
    print(f"Unique geographic locations: {stats['unique_locations']}")

    # data collection coverage
    print("\nData collection coverage:")
    print(f"Expected SiteCode/SpeciesCode pairs (from metadata): {stats.get('expected_pairs', 0)}")
    print(f"Actually collected pairs: {stats.get('collected_pairs', 0)}")
    print(f"Missing pairs (not collected): {stats.get('missing_pairs_count', 0)}")
    print(f"Extra pairs (not in metadata): {stats.get('extra_pairs_count', 0)}")

    if stats.get('missing_pairs_count', 0) > 0:
        print(f"\nWarning: {stats['missing_pairs_count']} SiteCode/SpeciesCode pairs from metadata were not found in collected data.")
        print("First 10 missing pairs:")
        for i, (site, species) in enumerate(stats['missing_pairs'][:10], 1):
            print(f"  {i}. {site} - {species}")

    if stats.get('extra_pairs_count', 0) > 0:
        print(f"\nNote: {stats['extra_pairs_count']} SiteCode/SpeciesCode pairs in collected data are not in metadata.")

    print("\nFiles by year:")
    for year, count in stats['files_by_year'].items():
        print(f"  {year}: {count:,} files")

    print("\nRecords by year:")
    for year, count in stats['records_by_year'].items():
        missing = stats['missing_by_year'].get(year, 0)
        missing_pct = (missing / count * 100) if count > 0 else 0
        print(f"  {year}: {count:,} records, {missing:,} missing ({missing_pct:.2f}%)")

    # adding nan value summary below
    print("\nNaN replacement summary:")
    print(f"Total invalid flags replaced: {stats['total_nan_replacements']:,}")
    print(f"Mean invalid percentage per file: {stats['mean_invalid_percentage']:.2f}%")
    print(f"Max invalid percentage: {stats['max_invalid_percentage']:.2f}%")

    # count of replacements by year
    if stats['nan_replacements_by_year']:
        print("\nReplacements by year:")
        for year_folder, count in stats['nan_replacements_by_year'].items():
            print(f"  {year_folder}: {count:,} flags replaced")

    print("\nTemporal coverage:")
    print(f"Start date: {stats['temporal_coverage']['start_date']}")
    print(f"End date: {stats['temporal_coverage']['end_date']}")
    print(f"Total months: {stats['temporal_coverage']['total_months']}")

    print("\nPollutant (SpeciesCode) distribution:")
    print("Site/species combinations by type:")
    for species, count in sorted(stats['pollutant_distribution'].items(),
                                 key=lambda x: x[1], reverse=True):
        percentage = (count / stats['total_combinations']) * 100
        print(f"  {species}: {count} ({percentage:.1f}%)")

    # missing value distribution by pollutant type
    print("\nMissing value distribution by pollutant type (SpeciesCode):")
    if stats.get('missing_by_pollutant_type'):
        # sort by percentage missing (highest first)
        sorted_species = sorted(
            stats['missing_by_pollutant_type'].items(),
            key=lambda x: x[1]['percentage_missing'],
            reverse=True
        )
        print(f"{'SpeciesCode':<20} {'total records':>15} {'missing':>12} {'% missing':>12}")
        print("-" * 60)
        for species, data in sorted_species:
            print(f"{species:<20} {data['total_records']:>15,} {data['total_missing']:>12,} {data['percentage_missing']:>11.2f}%")
    else:
        print("  No missing value distribution available.")

    # print missing values by site/species breakdown with row_number column
    print("\nMissing values by site/species (SiteCode/SpeciesCode):")
    if stats.get('missing_by_station_pollutant'):
        # prepare a sorted list by missing percentage descending
        breakdown = []
        for (site, species), (missing, total) in stats['missing_by_station_pollutant'].items():
            percent = (missing / total * 100) if total > 0 else 0
            breakdown.append((site, species, missing, total, percent))
        # sort by percentage descending and take top 20
        breakdown.sort(key=lambda x: x[4], reverse=True)
        breakdown = breakdown[:20]
        print(f"{'SiteCode':<20} {'SpeciesCode':<20} {'missing':>10} {'total_row':>12} {'% missing':>12}")
        print("-" * 60)
        for site, species, missing, total, percent in breakdown:
            print(f"{site:<20} {species:<20} {missing:>10,} {total:>12,} {percent:>11.2f}%")
    else:
        print("  No missing value breakdown available.")

In [28]:
# Run the analysis
stats = get_laqn_dataset_statistics(base_dir, metadata_path, nan_log_path)
print_dataset_statistics(stats)

# Save statistics for later use as csv
# Prepare flat data structure for csv
stats_rows = []
stats_rows.append(["metric", "value"])
stats_rows.append(["total_files", stats['total_files']])
stats_rows.append(["total_records", stats['total_records']])
stats_rows.append(["total_missing", stats['total_missing']])
stats_rows.append(["overall_completeness_pct", f"{stats['overall_completeness']:.2f}"])
stats_rows.append(["unique_sites", stats['unique_stations']])
stats_rows.append(["total_site_species_combinations", stats['total_combinations']])
stats_rows.append(["unique_species", stats['unique_pollutants']])
stats_rows.append(["unique_locations", stats['unique_locations']])
stats_rows.append(["expected_site_species_pairs", stats.get('expected_pairs', 0)])
stats_rows.append(["collected_site_species_pairs", stats.get('collected_pairs', 0)])
stats_rows.append(["missing_site_species_pairs_count", stats.get('missing_pairs_count', 0)])
stats_rows.append(["extra_site_species_pairs_count", stats.get('extra_pairs_count', 0)])
stats_rows.append(["total_nan_replacements", stats['total_nan_replacements']])
stats_rows.append(["mean_invalid_pct", f"{stats['mean_invalid_percentage']:.2f}"])
stats_rows.append(["max_invalid_pct", f"{stats['max_invalid_percentage']:.2f}"])

# Add year-specific metrics
for year in ['2023', '2024', '2025']:
    stats_rows.append([f"files_{year}", stats['files_by_year'].get(year, 0)])
    stats_rows.append([f"records_{year}", stats['records_by_year'].get(year, 0)])
    stats_rows.append([f"missing_{year}", stats['missing_by_year'].get(year, 0)])
    year_key = f'{year}measurements'
    stats_rows.append([f"replacements_{year}", stats['nan_replacements_by_year'].get(year_key, 0)])

# Save to csv stats report
pd.DataFrame(stats_rows[1:], columns=stats_rows[0]).to_csv(stats_output_path, index=False)
print(f"\nStatistics saved to: {stats_output_path}")

# Save species (pollutant) distribution to csv
total_combinations = stats['total_combinations']
species_distribution_df = pd.DataFrame(
    [
        {
            'SpeciesCode': k,
            'count': v,
            'percentage': round((v / total_combinations) * 100, 2) if total_combinations > 0 else 0
        }
        for k, v in stats['pollutant_distribution'].items()
    ]
)
species_distribution_df.to_csv(pollutant_distrubution_path, index=False)
print(f"Species (pollutant) distribution saved to: {pollutant_distrubution_path}")

# Save missing value distribution by species to csv
if stats.get('missing_by_pollutant_type'):
    missing_by_species_df = pd.DataFrame([
        {
            'SpeciesCode': k,
            'total_records': v['total_records'],
            'total_missing': v['total_missing'],
            'percentage_missing': v['percentage_missing']
        }
        for k, v in stats['missing_by_pollutant_type'].items()
    ])
    missing_by_species_df.to_csv(nan_val_pollutant_split_path, index=False)
    print(f"Missing value distribution by species saved to: {nan_val_pollutant_split_path}")

# Save missing values by site/species to csv
if stats.get('missing_by_station_pollutant'):
    missing_by_site_species_df = pd.DataFrame([
        {
            'SiteCode': k[0],
            'SpeciesCode': k[1],
            'missing': v[0],
            'total_row': v[1],
            'percentage_missing': (v[0] / v[1] * 100) if v[1] > 0 else 0
        }
        for k, v in stats['missing_by_station_pollutant'].items()
    ])
    missing_by_site_species_df.to_csv(nan_val_stationPollutant_path, index=False)
    print(f"Missing values by site/species saved to: {nan_val_stationPollutant_path}")


loading metadata...
  expected SiteCode/SpeciesCode pairs from metadata: 170

scanning optimased directory for collected data...

Total CSV files found: 4932

Reading all CSV files to calculate statistics...
  2023_mar: 141 files, 101,520 records, 14,883 missing (14.66%)
  2025_feb: 141 files, 91,368 records, 12,034 missing (13.17%)
  2024_feb: 141 files, 94,752 records, 9,582 missing (10.11%)
  2025_aug: 141 files, 101,520 records, 16,123 missing (15.88%)
  2024_aug: 141 files, 101,520 records, 19,425 missing (19.13%)
  2025_mar: 141 files, 101,520 records, 15,384 missing (15.15%)
  2023_feb: 141 files, 91,368 records, 12,838 missing (14.05%)
  2024_mar: 141 files, 101,520 records, 11,279 missing (11.11%)
  2023_aug: 141 files, 101,520 records, 11,360 missing (11.19%)
  2024_jul: 141 files, 101,520 records, 12,934 missing (12.74%)
  2025_jul: 141 files, 101,520 records, 16,506 missing (16.26%)
  2024_oct: 141 files, 101,520 records, 11,079 missing (10.91%)
  2023_sep: 141 files, 98,1

    loading metadata...
    expected SiteCode/SpeciesCode pairs from metadata: 170

    scanning optimased directory for collected data...

    Total CSV files found: 4932

    Reading all CSV files to calculate statistics...
    2023_mar: 141 files, 101,520 records, 14,883 missing (14.66%)
    2025_feb: 141 files, 91,368 records, 12,034 missing (13.17%)
    2024_feb: 141 files, 94,752 records, 9,582 missing (10.11%)
    2025_aug: 141 files, 101,520 records, 16,123 missing (15.88%)
    2024_aug: 141 files, 101,520 records, 19,425 missing (19.13%)
    2025_mar: 141 files, 101,520 records, 15,384 missing (15.15%)
    2023_feb: 141 files, 91,368 records, 12,838 missing (14.05%)
    2024_mar: 141 files, 101,520 records, 11,279 missing (11.11%)
    2023_aug: 141 files, 101,520 records, 11,360 missing (11.19%)
    2024_jul: 141 files, 101,520 records, 12,934 missing (12.74%)
    2025_jul: 141 files, 101,520 records, 16,506 missing (16.26%)
    2024_oct: 141 files, 101,520 records, 11,079 missing (10.91%)
    2023_sep: 141 files, 98,136 records, 11,727 missing (11.95%)
    2025_oct: 141 files, 101,520 records, 12,342 missing (12.16%)
    2023_jan: 141 files, 101,520 records, 17,911 missing (17.64%)
    2023_jul: 141 files, 101,520 records, 11,160 missing (10.99%)
    2024_jan: 141 files, 101,520 records, 10,375 missing (10.22%)
    2025_sep: 141 files, 98,136 records, 18,344 missing (18.69%)
    2024_sep: 141 files, 98,136 records, 15,469 missing (15.76%)
    2023_oct: 141 files, 101,520 records, 13,964 missing (13.75%)
    2025_jan: 141 files, 101,520 records, 11,198 missing (11.03%)
    2024_dec: 141 files, 101,520 records, 9,421 missing (9.28%)
    2024_apr: 141 files, 98,136 records, 11,208 missing (11.42%)
    2024_nov: 141 files, 98,136 records, 9,063 missing (9.24%)
    2023_may: 141 files, 101,520 records, 12,641 missing (12.45%)
    2025_nov: 141 files, 60,912 records, 8,613 missing (14.14%)
    2025_apr: 141 files, 98,136 records, 10,606 missing (10.81%)
    2024_may: 141 files, 101,520 records, 12,839 missing (12.65%)
    2025_may: 141 files, 101,520 records, 10,843 missing (10.68%)
    2023_nov: 138 files, 96,048 records, 10,851 missing (11.30%)
    2023_apr: 141 files, 98,136 records, 10,992 missing (11.20%)
    2023_dec: 141 files, 101,520 records, 14,882 missing (14.66%)
    2025_jun: 141 files, 98,136 records, 14,018 missing (14.28%)
    2024_jun: 141 files, 98,136 records, 9,661 missing (9.84%)
    2023_jun: 141 files, 98,136 records, 12,103 missing (12.33%)

    cross-referencing collected data with metadata...
    expected pairs from metadata: 170
    actually collected pairs: 141
    missing pairs (in metadata but not collected): 53
    extra pairs (collected but not in metadata): 24

    ========================================
    LAQN dataset statistics: initial assessment
    ========================================

    Scale and scope:
    Total files collected: 4,932
    Total measurement records: 3,446,208
    Total missing values (@Value): 443,658
    Overall completeness: 87.13%
    Unique monitoring sites (SiteCode): 78
    Total site-species combinations: 173
    Unique pollutant types (SpeciesCode): 6
    Unique geographic locations: 76

    Data collection coverage:
    Expected SiteCode/SpeciesCode pairs (from metadata): 170
    Actually collected pairs: 141
    Missing pairs (not collected): 53
    Extra pairs (not in metadata): 24

    Warning: 53 SiteCode/SpeciesCode pairs from metadata were not found in collected data.
    First 10 missing pairs:
    1. BL0 - PM25
    2. TH4 - PM25
    3. BT6 - PM25
    4. MEB - PM25
    5. GN6 - PM25
    6. GR8 - PM25
    7. GN3 - PM25
    8. TL6 - PM25
    9. GT1 - PM25
    10. CE3 - PM25

    Note: 24 SiteCode/SpeciesCode pairs in collected data are not in metadata.

    Files by year:
    2023: 1,689 files
    2025: 1,551 files
    2024: 1,692 files

    Records by year:
    2023: 1,192,464 records, 155,312 missing (13.02%)
    2025: 1,055,808 records, 146,011 missing (13.83%)
    2024: 1,197,936 records, 142,335 missing (11.88%)

    NaN replacement summary:
    Total invalid flags replaced: 0
    Mean invalid percentage per file: 0.00%
    Max invalid percentage: 0.00%

    Temporal coverage:
    Start date: 2023-01-01
    End date: 2025-11-19
    Total months: 35

    Pollutant (SpeciesCode) distribution:
    Site/species combinations by type:
    NO2: 60 (34.7%)
    PM25: 53 (30.6%)
    PM10: 43 (24.9%)
    O3: 11 (6.4%)
    SO2: 4 (2.3%)
    CO: 2 (1.2%)

    Missing value distribution by pollutant type (SpeciesCode):
    SpeciesCode            total records      missing    % missing
    ------------------------------------------------------------
    O3                           268,320       47,056       17.54%
    PM2.5                        586,944      100,755       17.17%
    SO2                           97,824       15,803       16.15%
    PM10                       1,026,456      126,749       12.35%
    NO2                        1,417,752      148,803       10.50%
    CO                            48,912        4,492        9.18%

    Missing values by site/species (SiteCode/SpeciesCode):
    SiteCode             SpeciesCode             missing    total_row    % missing
    ------------------------------------------------------------
    WM6                  PM10                     15,357       24,456       62.79%
    CE3                  NO2                      11,394       24,456       46.59%
    TL4                  NO2                       9,869       24,456       40.35%
    RI2                  O3                        9,732       24,456       39.79%
    WA7                  NO2                       9,236       24,456       37.77%
    BG1                  SO2                       8,373       24,456       34.24%
    TH4                  PM2.5                     8,042       24,456       32.88%
    CD1                  PM10                      7,711       24,456       31.53%
    WAA                  NO2                       7,697       24,456       31.47%
    CE3                  PM10                      7,652       24,456       31.29%
    CE3                  PM2.5                     7,652       24,456       31.29%
    TH4                  PM10                      7,457       24,456       30.49%
    CD1                  PM2.5                     7,339       24,456       30.01%
    GN6                  PM2.5                     7,240       24,456       29.60%
    CR8                  PM2.5                     7,207       24,456       29.47%
    GN0                  PM2.5                     7,099       24,456       29.03%
    HG4                  O3                        7,092       24,456       29.00%
    CD1                  NO2                       6,984       24,456       28.56%
    BT5                  PM2.5                     6,783       24,456       27.74%
    MY1                  O3                        6,634       24,456       27.13%

    Statistics saved to: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/report/laqn_stats.csv
    Species (pollutant) distribution saved to: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/report/pollutant_distribution.csv
    Missing value distribution by species saved to: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/report/nan_values_by_pollutant.csv
    Missing values by site/species saved to: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/report/nan_values_by_station_pollutant.csv