# LAQN dataset Find Missing Parts

- I will identify the missing values and data gaps in the LAQN dataset and decide how to address them.
- I’ll start by importing the relevant modules and displaying the initial file paths.

In [60]:
import pandas as pd
from pathlib import Path
import logging
from typing import Dict

# Notebook-wide logging setup and the file-system locations used by the cells below.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

# Use absolute path to avoid confusion
base_dir = Path("/Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels")
processed_dir = base_dir / "data" / "laqn" / "processed"
optimased_dir = base_dir / "data" / "laqn" / "optimased"
metadata_path = base_dir / "data" / "laqn" / "actv_sites_species.csv"
# Month folders of the standardised data. iterdir() raises FileNotFoundError on
# a missing directory, so fall back to an empty list when the optimased folder
# has not been created yet.
month_dirs = (
    sorted(d for d in optimased_dir.iterdir() if d.is_dir())
    if optimased_dir.is_dir()
    else []
)
year_2023_dir = base_dir / "data" / "laqn" / "year_2023"

# Extra path pointing at the raw LAQN data: the KF1 and MR8 PM25 files do not
# seem to be in the optimased folder, so they are picked up from here.
raw_filepath = base_dir / 'data'/'laqn'/'monthly_data'
missing_sites = ["KF1", "MR8"]

# Path of the updated metadata file.
updtd_metadata_path = base_dir / "data" / "laqn" / "updated_actv_siteSpecies.csv"

# Output directory for the missing-value reports: data/laqn/missing
output_dir = base_dir / "data" / "laqn" / "missing"
output_dir.mkdir(parents=True, exist_ok=True)

# Paths of the missing-value logs (same folder as output_dir).
missing_dir = base_dir / "data" / "laqn" / "missing"
input_file = missing_dir / "logs_missin_value.csv"
output_100_file = missing_dir / "missing_all_values.csv"
output_filtered_file = missing_dir / "missing_all_values_filtered.csv"

## 1) Standardisation of LAQN files

The function below adds extra columns to the LAQN files:
- Renames pollutant_std to SpeciesCode.
- Extracts SiteCode from the filename and takes the other columns from actv_sites_species.csv.
- Saves the new standardised csv files to data/laqn/optimased/.
- Final column structure:
        @MeasurementDateGMT, @Value, SpeciesCode, SiteCode, SpeciesName, SiteName, SiteType, Latitude, Longitude

In [2]:
def std_laqn_files():
    """
    Standardise every processed LAQN csv file and store the result per month.

    For each csv under processed_dir/<month>/ the function:
    - reads SiteCode and SpeciesCode from the filename
      (SiteCode_SpeciesCode_timestamp.csv)
    - renames the pollutant_std column to SpeciesCode, or fills SpeciesCode
      from the filename when the file has neither column
    - copies SpeciesName, SiteName, SiteType, Latitude and Longitude from the
      matching row of actv_sites_species.csv (left empty when nothing matches)
    - writes the file to optimased_dir/<month>/ under the same name

    Output columns, in this order (columns a file does not have are skipped):
    @MeasurementDateGMT, @Value, SpeciesCode, SiteCode, SpeciesName, SiteName,
    SiteType, Latitude, Longitude

    Returns the stats dict with total_files, successfully_processed, failed,
    failed_files and pm25_normalised.
    """
    column_order = [
        '@MeasurementDateGMT', '@Value', 'SpeciesCode', 'SiteCode',
        'SpeciesName', 'SiteName', 'SiteType', 'Latitude', 'Longitude'
    ]
    # columns copied from the metadata row of the site/species pair
    metadata_columns = ['SpeciesName', 'SiteName', 'SiteType', 'Latitude', 'Longitude']

    # make sure the output root is there before anything is written
    optimased_dir.mkdir(parents=True, exist_ok=True)
    metadata = pd.read_csv(metadata_path)

    print("\n" + "="*120)
    print("standardising laqn files")
    print("="*120)
    print(f"\nprocessed directory: {processed_dir}")
    print(f"output directory: {optimased_dir}")
    print(f"metadata file: {metadata_path}")
    print(f"metadata rows: {len(metadata)}")

    # one sub-directory per month
    month_dirs = sorted(entry for entry in processed_dir.iterdir() if entry.is_dir())
    print(f"\nfound {len(month_dirs)} month directories")

    # running counters returned to the caller
    stats = {
        'total_files': 0,
        'successfully_processed': 0,
        'failed': 0,
        'failed_files': [],
        'pm25_normalised': 0,
    }

    def record_failure(name):
        # count the file as failed and remember its name for the summary
        stats['failed'] += 1
        stats['failed_files'].append(name)

    for month_dir in month_dirs:
        month_name = month_dir.name
        print(f"\n{'-'*120}")
        print(f"Processing month: {month_name}")
        print(f"{'-'*120}")

        # mirror the month folder on the output side
        target_dir = optimased_dir / month_name
        target_dir.mkdir(parents=True, exist_ok=True)

        csv_files = sorted(month_dir.glob("*.csv"))
        print(f"files in {month_name}: {len(csv_files)}")

        for csv_file in csv_files:
            stats['total_files'] += 1
            filename = csv_file.name

            try:
                # filename layout: SiteCode_SpeciesCode_timestamp.csv
                parts = filename.replace('.csv', '').split('_')
                site_code, species_code = (parts + [None])[:2]

                if not (site_code and species_code):
                    print(f"  warning: could not parse filename: {filename}")
                    record_failure(filename)
                    continue

                frame = pd.read_csv(str(csv_file))

                if frame.empty:
                    print(f"  warning: empty file: {filename}")
                    record_failure(filename)
                    continue

                # pollutant_std, when present, becomes SpeciesCode
                if 'pollutant_std' in frame.columns:
                    frame.rename(columns={'pollutant_std': 'SpeciesCode'}, inplace=True)

                frame['SiteCode'] = site_code

                # fall back to the species taken from the filename
                if 'SpeciesCode' not in frame.columns:
                    frame['SpeciesCode'] = species_code

                # the metadata file spells the pollutant without the dot
                species_lookup = species_code.replace("PM2.5", "PM25")

                same_site = metadata['SiteCode'] == site_code
                same_species = metadata['SpeciesCode'] == species_lookup
                matches = metadata[same_site & same_species]

                if matches.empty:
                    print(f"  warning: no metadata found for {site_code}_{species_code}")
                    # keep the column structure even without metadata
                    for column in metadata_columns:
                        frame[column] = None
                else:
                    # the first matching row supplies the values
                    site_meta = matches.iloc[0]
                    for column in metadata_columns:
                        frame[column] = site_meta[column]
                    if species_lookup != species_code:
                        stats["pm25_normalised"] += 1

                # required order, restricted to the columns the file really has
                frame = frame[[col for col in column_order if col in frame.columns]]

                frame.to_csv(target_dir / filename, index=False)

                stats['successfully_processed'] += 1

                if stats['successfully_processed'] % 100 == 0:
                    print(f"  processed {stats['successfully_processed']} files...")

            except Exception as e:
                print(f"  error processing {filename}: {str(e)}")
                record_failure(filename)

    # final summary
    print("\n" + "="*120)
    print("Standardisation complete")
    print("="*120)
    print(f"\ntotal files: {stats['total_files']}")
    print(f"Successfully processed: {stats['successfully_processed']}")
    print(f"failed: {stats['failed']}")

    if stats['failed'] > 0:
        print(f"\nfailed files (first 10):")
        for f in stats['failed_files'][:10]:
            print(f"  - {f}")
        if len(stats['failed_files']) > 10:
            print(f"  ... and {len(stats['failed_files']) - 10} more")

    print(f"\noutput directory: {optimased_dir}")
    print("="*120 + "\n")

    return stats

In [3]:
# step 1: standardise files
# runs std_laqn_files() and keeps the stats dict it returns
print(" Standardising LAQN files.")
standardisation_stats = std_laqn_files()

 Standardising LAQN files.

standardising laqn files

processed directory: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/processed
output directory: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased
metadata file: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/actv_sites_species.csv
metadata rows: 252

found 35 month directories

------------------------------------------------------------------------------------------------------------------------
Processing month: 2023_apr
------------------------------------------------------------------------------------------------------------------------
files in 2023_apr: 249
  processed 100 files...
  processed 200 files...

------------------------------------------------------------------------------------------------------------------------
Processing month: 2023_aug
-------------------------------------------------------

#### 2) Optimise the files that have not been optimised yet:
- The KF1 and MR8 PM2.5 files are under the data/laqn/monthly_data path.
- Variable holding that path: raw_filepath.
- The standardisation function used earlier is copied and adapted for these files.
- After optimisation I will re-run the data_quality function.

In [57]:
# manually check the 2 stations whose PM25 files are missing
missing_sites = ["KF1", "MR8"]
found_files = {}
raw_filepath = base_dir / 'data'/'laqn'/'monthly_data'

for site in missing_sites:
    # the pollutant shows up in filenames both as PM25 and as PM2.5
    files_pm25 = list(Path(raw_filepath).rglob(f"{site}_PM25_*.csv"))
    files_pm25dot = list(Path(raw_filepath).rglob(f"{site}_PM2.5_*.csv"))
    all_files = [*files_pm25, *files_pm25dot]
    found_files[site] = all_files
    if not all_files:
        print(f"No PM25 or PM2.5 files found for {site}")
        continue
    print(f"Files found for {site} (PM25 or PM2.5):")
    for file in all_files:
        print(f"  {file}")

# number of files per site
print("\nSummary:")
for site, files in found_files.items():
    print(f"{site}: {len(files)} file(s) found")


Files found for KF1 (PM25 or PM2.5):
  /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/monthly_data/2023_mar/KF1_PM25_2023-03-01_2023-03-31.csv
  /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/monthly_data/2025_feb/KF1_PM25_2025-02-01_2025-02-28.csv
  /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/monthly_data/2024_feb/KF1_PM25_2024-02-01_2024-02-29.csv
  /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/monthly_data/2025_aug/KF1_PM25_2025-08-01_2025-08-31.csv
  /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/monthly_data/2024_aug/KF1_PM25_2024-08-01_2024-08-31.csv
  /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/monthly_data/2025_mar/KF1_PM25_2025-03-01_2025-03-31.csv
  /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/monthly_data

#### 3) optimased KF1, MR8 - PM25 file function below:


In [61]:
def optimise_specific_files(file_list, metadata_path, optimased_dir):
    """
    Standardise only the given list of files and save to optimased_dir.

    Each file is expected to be named SiteCode_SpeciesCode_timestamp.csv. The
    function renames pollutant_std to SpeciesCode (or fills SpeciesCode from
    the filename), adds SiteCode, copies SpeciesName, SiteName, SiteType,
    Latitude and Longitude from the matching metadata row (left empty when no
    row matches), keeps the standard columns in the standard order and writes
    the result to optimased_dir/<name of the file's parent folder>/<filename>.

    Files that cannot be parsed, are empty, or raise while being processed are
    reported with a printed message and skipped.

    Args:
        file_list: iterable of csv paths (Path objects or plain strings).
        metadata_path: csv with SiteCode, SpeciesCode and the metadata columns.
        optimased_dir: output root (Path or plain string).

    Returns:
        None
    """
    metadata = pd.read_csv(metadata_path)
    # plain strings are accepted too: "/" below only works on Path objects
    optimased_dir = Path(optimased_dir)

    column_order = [
        '@MeasurementDateGMT', '@Value', 'SpeciesCode', 'SiteCode',
        'SpeciesName', 'SiteName', 'SiteType', 'Latitude', 'Longitude'
    ]
    metadata_columns = ['SpeciesName', 'SiteName', 'SiteType', 'Latitude', 'Longitude']

    for csv_file in file_list:
        csv_file = Path(csv_file)
        filename = csv_file.name
        # filename layout: SiteCode_SpeciesCode_timestamp.csv
        parts = filename.replace('.csv', '').split('_')
        site_code = parts[0]
        species_code = parts[1] if len(parts) > 1 else None
        if not site_code or not species_code:
            print(f"  warning: could not parse filename: {filename}")
            continue

        try:
            df = pd.read_csv(str(csv_file))
            if df.empty:
                print(f"  warning: empty file: {filename}")
                continue

            if 'pollutant_std' in df.columns:
                df.rename(columns={'pollutant_std': 'SpeciesCode'}, inplace=True)
            df['SiteCode'] = site_code
            if 'SpeciesCode' not in df.columns:
                df['SpeciesCode'] = species_code

            # the metadata file spells the pollutant without the dot
            species_lookup = species_code.replace("PM2.5", "PM25")
            metadata_match = metadata[
                (metadata['SiteCode'] == site_code) &
                (metadata['SpeciesCode'] == species_lookup)
            ]
            if not metadata_match.empty:
                # the first matching row supplies the values
                meta_row = metadata_match.iloc[0]
                for column in metadata_columns:
                    df[column] = meta_row[column]
            else:
                print(f"  warning: no metadata found for {site_code}_{species_code}")
                # keep the column structure even without metadata
                for column in metadata_columns:
                    df[column] = None

            # standard order, restricted to the columns the file really has
            df = df[[col for col in column_order if col in df.columns]]

            # Output directory by month
            month_dir = csv_file.parent.name
            output_month_dir = optimased_dir / month_dir
            output_month_dir.mkdir(parents=True, exist_ok=True)
            output_file = output_month_dir / filename
            df.to_csv(output_file, index=False)
            print(f"Optimised file saved: {output_file}")
        except Exception as e:
            # best effort: report the file and carry on with the next one
            print(f"  error processing {filename}: {str(e)}")
    


In [62]:
# gather the raw PM25 / PM2.5 files of the missing sites, then standardise them
all_files = []
for site in missing_sites:
    # same order as before: the PM25 spelling first, then PM2.5
    for pattern in (f"{site}_PM25_*.csv", f"{site}_PM2.5_*.csv"):
        all_files.extend(Path(raw_filepath).rglob(pattern))

optimise_specific_files(all_files, metadata_path, optimased_dir)

Optimised file saved: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2023_mar/KF1_PM25_2023-03-01_2023-03-31.csv
Optimised file saved: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2025_feb/KF1_PM25_2025-02-01_2025-02-28.csv
Optimised file saved: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2024_feb/KF1_PM25_2024-02-01_2024-02-29.csv
Optimised file saved: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2025_aug/KF1_PM25_2025-08-01_2025-08-31.csv
Optimised file saved: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2024_aug/KF1_PM25_2024-08-01_2024-08-31.csv
Optimised file saved: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2025_mar/KF1_PM25_2025-03-01_2025-03-31.csv
Optimised file saved: /Users/burdz

- Validation of the optimised files, below:


In [63]:
# list the optimised PM25 / PM2.5 files of both sites to confirm they were written
optimased_dir = Path("/Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased")
for site in ("KF1", "MR8"):
    for pattern in (f"{site}_PM25_*.csv", f"{site}_PM2.5_*.csv"):
        for file in optimased_dir.rglob(pattern):
            print(file)

/Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2023_mar/KF1_PM25_2023-03-01_2023-03-31.csv
/Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2025_feb/KF1_PM25_2025-02-01_2025-02-28.csv
/Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2024_feb/KF1_PM25_2024-02-01_2024-02-29.csv
/Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2025_aug/KF1_PM25_2025-08-01_2025-08-31.csv
/Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2024_aug/KF1_PM25_2024-08-01_2024-08-31.csv
/Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2025_mar/KF1_PM25_2025-03-01_2025-03-31.csv
/Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2023_feb/KF1_PM25_2023-02-01_2023-02-28.csv
/Users/burdzhuchaglayan/Des

## 2) Functions below:

### 1.1) Functions that discover and check data quality metrics before cleaning, below.

#### 1) Data quality function, what it does:
- Counts total rows in dataset
- Identifies missing values per column (count + percentage)
- Counts duplicate rows based on timestamp
- Detects negative values in measurements
- Checks timestamp format issues

In [64]:
def data_quality(optimased_dir):
    """
    Check data quality metrics of every csv file before cleaning starts.

    Searches optimased_dir recursively for csv files, prints the findings of
    each file followed by an overall summary, and returns the per-file results.
    Files that raise while being read or checked are reported and left out.

    Args:
        optimased_dir: directory (Path or str) searched recursively for csv files.

    Returns:
        list of dicts, one per file that could be read, with the keys:
        - filename
        - total_rows
        - missing_values: {column: {'count', 'percentage'}} for columns with nulls
        - duplicate_count: rows repeating an earlier @MeasurementDateGMT
        - negative_values: rows with @Value below 0
        - timestamp_issues: True when @MeasurementDateGMT has object dtype
    """
    results = []
    total_files = 0
    total_rows = 0
    total_duplicates = 0
    total_negatives = 0
    files_with_timestamp_issues = 0
    files_with_missing = 0

    for csv_file in Path(optimased_dir).rglob("*.csv"):
        try:
            df = pd.read_csv(csv_file)
            filename = csv_file.name
            row_count = len(df)
            total_files += 1
            total_rows += row_count
            assessment = {
                'filename': filename,
                'total_rows': row_count,
                'missing_values': {},
                'duplicate_count': 0,
                'negative_values': 0,
                'timestamp_issues': False
            }
            print(f"Total rows: {row_count}")

            # missing values, per column
            for col in df.columns:
                # plain int so the result does not carry numpy scalars
                missing = int(df[col].isnull().sum())
                if missing > 0:
                    percentage = round(missing / row_count * 100, 2)
                    assessment['missing_values'][col] = {
                        'count': missing,
                        'percentage': percentage
                    }
                    print(f"Missing in '{col}': {missing} ({percentage}%)")
            if assessment['missing_values']:
                files_with_missing += 1

            # duplicates
            if '@MeasurementDateGMT' in df.columns:
                assessment['duplicate_count'] = int(
                    df.duplicated(subset=['@MeasurementDateGMT']).sum()
                )
                print(f"Duplicate timestamps: {assessment['duplicate_count']}")
                total_duplicates += assessment['duplicate_count']

            # negative values
            if '@Value' in df.columns:
                assessment['negative_values'] = int((df['@Value'] < 0).sum())
                print(f"Negative @Value entries: {assessment['negative_values']}")
                total_negatives += assessment['negative_values']

            # timestamp format
            if '@MeasurementDateGMT' in df.columns:
                assessment['timestamp_issues'] = bool(
                    df['@MeasurementDateGMT'].dtype == 'object'
                )
                print(f"Timestamp column is object type: {assessment['timestamp_issues']}")
                # count the file only when the issue was actually detected
                if assessment['timestamp_issues']:
                    files_with_timestamp_issues += 1

            results.append(assessment)
        except Exception as e:
            # best effort: report the file and carry on with the next one
            print(f"Error processing {csv_file.name}: {str(e)}")

    # Summary of assessment
    print("\n" + "="*40)
    print("Summary of Data Quality Assessment")
    print("="*40)
    print(f"Total files checked: {total_files}")
    print(f"Total rows across all files: {total_rows}")
    print(f"Files with missing values: {files_with_missing}")
    print(f"Total duplicate timestamps: {total_duplicates}")
    print(f"Total negative @Value entries: {total_negatives}")
    print(f"Files with timestamp format issues: {files_with_timestamp_issues}")
    return results

In [65]:
# run the quality checks on the optimased folder; per-file details and a
# summary are printed, the list returned by data_quality is kept in results
results = data_quality(optimased_dir)

Total rows: 720
Duplicate timestamps: 0
Negative @Value entries: 0
Timestamp column is object type: True
Total rows: 720
Missing in '@Value': 720 (100.0%)
Duplicate timestamps: 0
Negative @Value entries: 0
Timestamp column is object type: True
Total rows: 720
Missing in '@Value': 720 (100.0%)
Duplicate timestamps: 0
Negative @Value entries: 0
Timestamp column is object type: True
Total rows: 720
Missing in '@Value': 5 (0.69%)
Duplicate timestamps: 0
Negative @Value entries: 0
Timestamp column is object type: True
Total rows: 720
Duplicate timestamps: 0
Negative @Value entries: 0
Timestamp column is object type: True
Total rows: 720
Missing in '@Value': 64 (8.89%)
Duplicate timestamps: 0
Negative @Value entries: 0
Timestamp column is object type: True
Total rows: 720
Missing in '@Value': 3 (0.42%)
Duplicate timestamps: 0
Negative @Value entries: 0
Timestamp column is object type: True
Total rows: 720
Missing in '@Value': 720 (100.0%)
Duplicate timestamps: 0
Negative @Value entries: 0
Ti

##### Summary of assessment:
        ========================================
        Summary of Data Quality Assessment
        ========================================
        Total files checked: 7379
        Total rows across all files: 5156016
        Files with missing values: 6488
        Total duplicate timestamps: 0
        Total negative @Value entries: 23311
        Files with timestamp format issues: 7379

After adding the PM25 files of 2 more stations (MR8 and KF1), the summary is:

        ========================================
        Summary of Data Quality Assessment
        ========================================
        Total files checked: 7449
        Total rows across all files: 5204928
        Files with missing values: 6558
        Total duplicate timestamps: 0
        Total negative @Value entries: 23313
        Files with timestamp format issues: 7449

#####  Summary of Data Analysis and API Findings After Adding KF1 and MR8 PM25 Files

##### Data Quality Metrics (Before vs After Addition)

| Metric                        | Before Addition | After Addition | Increase |
|-------------------------------|-----------------|---------------|----------|
| Total files checked           | 7,379           | 7,449         | +70      |
| Total rows across all files   | 5,156,016       | 5,204,928     | +48,912  |
| Files with missing values     | 6,488           | 6,558         | +70      |
| Total duplicate timestamps    | 0               | 0             | 0        |
| Total negative @Value entries | 23,311          | 23,313        | +2       |
| Files with timestamp issues   | 7,379           | 7,449         | +70      |

**Insights:**
- Adding the missing KF1 and MR8 PM25 files resulted in exactly 70 more files, matching the number of new files found and processed.
- The total number of rows increased by 48,912, indicating substantial new data coverage for these sites and pollutant.
- Each new file contained missing values and timestamp format issues, as reflected by the increase of 70 in both those categories.
- There was a very small increase in negative @Value entries (+2), suggesting only a couple of new negative values were added.
- No duplicate timestamps were added.
- The missing data for KF1 and MR8 PM25 has now been successfully filled, closing previous gaps in the dataset.

---

##### API/Postman Findings

- Checked the ERG API endpoint for these sites and pollutants:
  ```
  https://api.erg.ic.ac.uk/AirQuality/Data/SiteSpecies/SiteCode={{SiteCode}}/SpeciesCode={{SpeciesCode}}/StartDate={{StartDate}}/EndDate={{EndDate}}/csv
  ```
- Example for MR8 - PM25 and KF1 - PM25:
  - Headers: `MeasurementDateGMT,Marylebone Road - BAM: PM2.5 Particulate (ug/m3)`
  - Data: All values are missing (empty after the timestamp), e.g.:
    ```
    1999-10-01 00:00,
    2009-11-01 22:00,
    2009-11-01 23:00,
    2009-11-02 00:00,
    ```
- This confirms that these sites and pollutants are no longer active, which explains the 100% missing values in the files.

---

**Conclusion:**  
The addition of the missing KF1 and MR8 PM25 files is reflected in the data quality metrics, and the API check confirms that these pollutants are no longer active at these sites, resulting in files with only missing values. The dataset is now complete and accurately reflects the available data from the source.

### 2) Function for finding missing data gaps, below:
- Checks which site/species combinations are missing in the monthly/year folders.

In [66]:
def analyse_missing_data():
    """
    Report which site/pollutant pairs from the updated metadata have no file.

    The expected pairs come from the SiteCode and SpeciesCode columns of
    updtd_metadata_path. The pairs found are taken from the csv filenames
    (SiteCode_SpeciesCode_...) under optimased_dir, with "PM2.5" normalised to
    "PM25"; the name of a file's parent folder is used as its month. A missing
    optimased_dir is treated as an empty one, so every expected pair is then
    reported as missing.

    Returns:
        dict with three sections:
        - "monthly_data": by_location {site: {pollutant: [months]}},
          by_pollutant {pollutant: {site: [months]}},
          by_month {month: [(site, pollutant)]}, missing_combinations
        - "all_data": by_location {site: [pollutants]},
          by_pollutant {pollutant: [sites]}, missing_combinations
        - "summary": expected_combinations, monthly_data_found,
          all_data_found, total_missing
    """
    metadata = pd.read_csv(updtd_metadata_path, encoding="utf-8")
    expected = set(zip(metadata["SiteCode"], metadata["SpeciesCode"]))

    out = {
        "monthly_data": {"by_location": {}, "by_pollutant": {}, "by_month": {}, "missing_combinations": []},
        "all_data": {"by_location": {}, "by_pollutant": {}, "missing_combinations": []},
        "summary": {"expected_combinations": len(expected), "monthly_data_found": 0, "all_data_found": 0, "total_missing": 0},
    }
    monthly = out["monthly_data"]
    all_data = out["all_data"]

    if optimased_dir.exists():
        csv_paths = optimased_dir.rglob("*.csv")
    else:
        print(f"warning: directory not found, nothing can be matched: {optimased_dir}")
        csv_paths = ()

    # NOTE(review): the monthly and the "all data" sections both read
    # optimased_dir, so one pass fills both - confirm whether "all data" was
    # meant to point at a different folder.
    found = set()
    for fp in csv_paths:
        parts = fp.name.split("_")
        if len(parts) < 2:
            continue
        site = parts[0]
        species_lookup = parts[1].replace("PM2.5", "PM25")
        month = fp.parent.name
        found.add((site, species_lookup))
        monthly["by_location"].setdefault(site, {}).setdefault(species_lookup, []).append(month)
        monthly["by_pollutant"].setdefault(species_lookup, {}).setdefault(site, []).append(month)
        monthly["by_month"].setdefault(month, []).append((site, species_lookup))
        all_data["by_location"].setdefault(site, []).append(species_lookup)
        all_data["by_pollutant"].setdefault(species_lookup, []).append(site)

    missing = sorted(expected - found)

    # Monthly data
    print("\nMissing combinations in monthly data:")
    for s, p in missing:
        print(f"  Site: {s}, Pollutant: {p}")
        monthly["missing_combinations"].append({"site": s, "pollutant": p, "data_source": "monthly_data"})
    out["summary"]["monthly_data_found"] = len(found)

    # all data
    for s, p in missing:
        print(f"  Site: {s}, Pollutant: {p}")
        all_data["missing_combinations"].append({"site": s, "pollutant": p, "data_source": "all_data"})
    out["summary"]["all_data_found"] = len(found)
    out["summary"]["total_missing"] = len(missing)

    print(f"\nTotal expected: {len(expected)}")
    print(f"Found: {len(found)}")
    print(f"Missing: {len(missing)}")
    return out

In [67]:
# compare the expected site/pollutant pairs with the files on disk; the
# returned report dict is kept in out
out = analyse_missing_data()


Missing combinations in monthly data:

Total expected: 213
Found: 213
Missing: 0


Missing combinations in monthly data:

Total expected: 213
Found: 213
Missing: 0

## 3) Comprehensive test to find ALL missing and problematic data in standardized files.

#### 1) testing functions for all the data checks.


What the testing function below does:
- Scans all month directories of the standardised data (data/laqn/optimased)
- Checks each CSV file for 7 types of issues:
    - Empty files: No rows
    - Column errors: Missing required columns (timestamp, SiteCode, SpeciesCode, value)
    - Duplicate timestamps: Multiple measurements at same time
    - Missing SiteCode: Null values in SiteCode column
    - Missing SpeciesCode: Null values in SpeciesCode column
    - High missing values: >20% of value column is null
    - Format errors: Cannot read file
- Provides detailed reporting with:
    - Total files processed
    - Issue statistics
    - Examples of each problem type
    - Severity analysis

#### 2) a comprehensive function that checks all columns for missing values, validates data types, calculates missing rates, prints paths, and logs files with >20% missing values.

In [73]:
def detailed_data_quality_analysis(month_directories=None, log_path=None):
    """
    Scan every CSV file in each month directory and report data quality issues.

    Checks performed per file:
    - Empty files (no rows, or a zero-byte file with nothing to parse)
    - Missing required columns
    - Duplicate timestamps
    - Missing timestamps / SiteCode / SpeciesCode
    - Missing values for all remaining columns
    - Data type validation
    - High missing values (>20% @Value)
    - Format errors (file could not be read or analysed)
    - File paths printed
    - Files with >20% missing @Value logged to CSV

    Args:
        month_directories: Iterable of directories that hold the CSV files.
            Defaults to the notebook-level ``month_dirs``.
        log_path: Destination CSV for the >20% missing log. Defaults to
            ``output_dir / "logs_nan_values.csv"``.

    Returns:
        dict mapping each issue category to a list of per-file records.
    """
    # Fall back to the notebook-level globals so existing no-argument calls
    # keep working exactly as before.
    if month_directories is None:
        month_directories = month_dirs
    month_directories = list(month_directories)
    if log_path is None:
        log_path = output_dir / "logs_nan_values.csv"
    log_path = Path(log_path)

    required_columns = ['@MeasurementDateGMT', '@Value', 'SiteCode', 'SpeciesCode']
    # These columns get their own dedicated report line, so the generic
    # per-column loop skips them to avoid printing the same count twice.
    reported_separately = {'@Value', 'SiteCode', 'SpeciesCode'}
    high_missing_threshold = 20

    print("\n" + "="*120)
    print("Data Quality Analysis Report, checks all columns")
    print("="*120)
    print(f"\nFound {len(month_directories)} month directories\n")

    # Initialize tracking
    missing_values_log = []
    all_issues = {
        'empty_files': [],
        'duplicate_timestamps': [],
        'missing_sitecode': [],
        'missing_speciescode': [],
        'high_missing_values': [],
        'column_errors': [],
        'format_errors': []
    }
    total_stats = {
        'total_files': 0,
        'files_processed': 0,
        'files_with_high_missing': 0,
        'total_rows': 0,
        'empty_files': 0,
        'files_with_missing_timestamps': 0,
        'files_with_missing_sitecodes': 0
    }

    for month_dir in month_directories:
        month_name = month_dir.name
        # Directory names look like "2023_apr"; the part before "_" is the year.
        year = month_name.split('_')[0]
        print(f"\n{'='*120}")
        print(f"Month: {month_name} | Year: {year}")
        print(f"{'='*120}\n")

        csv_files = sorted(month_dir.glob('*.csv'))
        print(f"Total files in {month_name}: {len(csv_files)}\n")

        if not csv_files:
            print(f"no csv files found in {month_name}")
            continue

        for filepath in csv_files:
            total_stats['total_files'] += 1
            filename = filepath.name
            file_path_str = str(filepath)

            # File names start with "<site>_<pollutant>_..."
            parts = filename.replace('.csv', '').split('_')
            site_code = parts[0] if len(parts) > 0 else "UNKNOWN"
            pollutant = parts[1] if len(parts) > 1 else "UNKNOWN"

            try:
                try:
                    df = pd.read_csv(file_path_str)
                except pd.errors.EmptyDataError:
                    # A zero-byte file has nothing to parse; report it as an
                    # empty file rather than as a format error.
                    df = pd.DataFrame()

                if df.empty:
                    print(f" Empty file: {filename}")
                    print(f" Path: {file_path_str}\n")
                    all_issues['empty_files'].append({'month': month_name, 'file': filename, 'path': file_path_str})
                    total_stats['empty_files'] += 1
                    continue

                row_count = len(df)
                total_stats['files_processed'] += 1
                total_stats['total_rows'] += row_count

                # Print file header
                print(f"\n FILE: {filename}")
                print(f"Path: {file_path_str}")
                print(f"Site: {site_code} | Pollutant: {pollutant}")
                print(f"Rows: {row_count} | Columns: {len(df.columns)}")
                print(f"     {'-'*110}")

                # Required columns: the remaining checks index them directly,
                # so a file without them is recorded and skipped.
                missing_cols = [col for col in required_columns if col not in df.columns]
                if missing_cols:
                    print(f"  warning: missing required columns: {missing_cols}")
                    all_issues['column_errors'].append({'month': month_name, 'file': filename, 'path': file_path_str, 'missing_columns': missing_cols, 'actual_columns': list(df.columns)})
                    continue

                # Duplicate timestamps
                duplicate_timestamps = df['@MeasurementDateGMT'].duplicated().sum()
                if duplicate_timestamps > 0:
                    print(f"  duplicate timestamps: {duplicate_timestamps}/{row_count}")
                    all_issues['duplicate_timestamps'].append({'month': month_name, 'file': filename, 'path': file_path_str, 'duplicate_count': int(duplicate_timestamps), 'total_rows': row_count})

                # Missing timestamps: only the summary counter is updated here;
                # the per-file line is printed by the generic column loop below.
                if df['@MeasurementDateGMT'].isna().any():
                    total_stats['files_with_missing_timestamps'] += 1

                # Missing SiteCode/SpeciesCode
                missing_sitecodes = df['SiteCode'].isna().sum()
                if missing_sitecodes > 0:
                    sitecode_pct = (100 * missing_sitecodes / row_count)
                    print(f"  missing sitecodes: {missing_sitecodes}/{row_count} ({sitecode_pct:.2f}%)")
                    all_issues['missing_sitecode'].append({'month': month_name, 'file': filename, 'path': file_path_str, 'missing_count': int(missing_sitecodes), 'total_rows': row_count})
                    total_stats['files_with_missing_sitecodes'] += 1

                missing_speciescodes = df['SpeciesCode'].isna().sum()
                if missing_speciescodes > 0:
                    speciescode_pct = (100 * missing_speciescodes / row_count)
                    print(f"  missing speciescodes: {missing_speciescodes}/{row_count} ({speciescode_pct:.2f}%)")
                    all_issues['missing_speciescode'].append({'month': month_name, 'file': filename, 'path': file_path_str, 'missing_count': int(missing_speciescodes), 'total_rows': row_count})

                # Data type validation
                for col in df.columns:
                    data_type = str(df[col].dtype)
                    if col == '@MeasurementDateGMT' and data_type not in ['datetime64[ns]', 'object']:
                        print(f"Type error: {col} expected datetime, got {data_type}")
                    elif col in ['@Value', 'SiteCode', 'SpeciesCode'] and data_type not in ['float64', 'int64', 'object']:
                        print(f"Type warning: {col} should be numeric or string, got {data_type}")

                # Missing values for every column that has no dedicated line
                for col in df.columns:
                    if col in reported_separately:
                        continue
                    missing_count = df[col].isna().sum()
                    if missing_count > 0:
                        missing_pct = missing_count / row_count * 100
                        print(f"  missing {col}: {missing_count}/{row_count} ({missing_pct:.2f}%)")

                # Missing @Value percentage (always printed, even when zero)
                missing_values = df['@Value'].isna().sum()
                empty_value_percentage = 100 * missing_values / row_count
                print(f"  missing @Value: {missing_values}/{row_count} ({empty_value_percentage:.2f}%)")

                # Log files with >20% missing values
                if empty_value_percentage > high_missing_threshold:
                    total_stats['files_with_high_missing'] += 1

                    missing_values_log.append({
                        'filename': filename,
                        'path': file_path_str,
                        'siteCode': site_code,
                        'SpeciesCode': pollutant,
                        'year': year,
                        'month': month_name,
                        'EmptyValuePercentage': round(empty_value_percentage, 2)
                    })
                    # Mirror the finding in the returned dict so callers see it
                    # without having to re-read the CSV log.
                    all_issues['high_missing_values'].append({
                        'month': month_name,
                        'file': filename,
                        'path': file_path_str,
                        'missing_count': int(missing_values),
                        'total_rows': row_count,
                        'missing_percentage': round(empty_value_percentage, 2)
                    })

                    print(f"  flagged: >20% missing @Value ({empty_value_percentage:.2f}%)")
                    print(f"  will be logged to {log_path.name}")

            except Exception as e:
                # Per-file boundary: one unreadable file must not stop the scan.
                print(f"\n  error reading file: {filename}")
                print(f"  path: {file_path_str}")
                print(f"  error: {str(e)}")
                all_issues['format_errors'].append({'month': month_name, 'file': filename, 'path': file_path_str, 'error': str(e)})

    _print_quality_summary(total_stats)
    _save_high_missing_log(missing_values_log, log_path)

    print("\n" + "="*120 + "\n")

    return all_issues


def _print_quality_summary(total_stats):
    """Print the run-wide counters collected by detailed_data_quality_analysis."""
    print("\n" + "="*120)
    print("Final Summary of Data Quality Analysis")
    print("="*120)
    print(f"\nTotal Files Processed: {total_stats['files_processed']}")
    print(f"empty files: {total_stats['empty_files']}")
    print(f"files with missing timestamps: {total_stats['files_with_missing_timestamps']}")
    print(f"files with missing sitecodes: {total_stats['files_with_missing_sitecodes']}")
    print(f"files with >20% missing @Value: {total_stats['files_with_high_missing']}")
    print(f"total rows analyzed: {total_stats['total_rows']:,}")

    if total_stats['files_processed'] > 0:
        issue_rate = (total_stats['files_with_high_missing'] / total_stats['files_processed'] * 100)
        print(f"issue rate: {issue_rate:.1f}%")


def _save_high_missing_log(missing_values_log, log_path):
    """Write the >20% missing @Value records to log_path and print a preview."""
    if not missing_values_log:
        print(f"\nno files with >20% missing @Value found.")
        return

    df_log = pd.DataFrame(missing_values_log)
    df_log.to_csv(log_path, index=False)

    print(f"\nmissing files log saved to: {log_path}")
    print(f"files logged: {len(missing_values_log)}")
    print(f"\ncolumns in output csv:")
    print(f"  {', '.join(df_log.columns.tolist())}")
    print(f"\nfirst 5 entries:")
    print(df_log.head().to_string(index=False))

In [74]:
# Run the detailed data quality analysis.
# The return value is the function's per-category issue dictionary (all_issues),
# keyed by issue type.
missing_log = detailed_data_quality_analysis()


Data Quality Analysis Report, checks all columns

Found 35 month directories


Month: 2023_apr | Year: 2023

Total files in 2023_apr: 213


 FILE: BG1_NO2_2023-04-01_2023-04-30.csv
Path: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2023_apr/BG1_NO2_2023-04-01_2023-04-30.csv
Site: BG1 | Pollutant: NO2
Rows: 696 | Columns: 9
     --------------------------------------------------------------------------------------------------------------
  missing @Value: 0/696 (0.00%)

 FILE: BG1_SO2_2023-04-01_2023-04-30.csv
Path: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2023_apr/BG1_SO2_2023-04-01_2023-04-30.csv
Site: BG1 | Pollutant: SO2
Rows: 696 | Columns: 9
     --------------------------------------------------------------------------------------------------------------
  missing @Value: 42/696 (6.03%)
  missing @Value: 42/696 (6.03%)

 FILE: BG2_NO2_2023-04-01_2023-04-30.csv
Path: /Users/bur

result of the detailed test function run:

         FILE: WME_PM2.5_2025-09-01_2025-09-30.csv
            Path: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2025_sep/WME_PM2.5_2025-09-01_2025-09-30.csv
            Site: WME | Pollutant: PM2.5
            Rows: 696 | Columns: 9
                --------------------------------------------------------------------------------------------------------------
            missing @Value: 261/696 (37.50%)
            missing @Value: 261/696 (37.50%)
            flagged: >20% missing @Value (37.50%)
            will be logged to logs_missin_value.csv

            ========================================================================================================================
            Final Summary of Data Quality Analysis
            ========================================================================================================================

            Total Files Processed: 7449
            empty files: 0
            files with missing timestamps: 0
            files with missing sitecodes: 0
            files with >20% missing @Value: 2876
            total rows analyzed: 5,204,928
            issue rate: 38.6%

            missing files log saved to: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/missing/logs_nan_values.csv
            files logged: 2876

            columns in output csv:
            filename, path, siteCode, SpeciesCode, year, month, EmptyValuePercentage

#### 3) filter the logs_missin_value.csv file for 100 percent missing values.
- keeps the rows whose EmptyValuePercentage is 100 and saves them to missing_all_values.csv.
- then keeps the rows where siteCode differs from SpeciesCode and saves them to missing_all_values_filtered.csv, to find out which sites don't have that species on their system.

In [23]:
    
def filter_missing_pollutants(input_path=None, output_100_path=None, output_filtered_path=None):
    """
    Split the missing-value log into two CSV files.

    1. Rows with 100% EmptyValuePercentage.
    2. From those, rows where siteCode != SpeciesCode.

    Args:
        input_path: CSV log to read. Defaults to the notebook-level
            ``input_file`` (logs_missin_value.csv).
        output_100_path: Destination for the 100%-missing rows. Defaults to
            the notebook-level ``output_100_file``.
        output_filtered_path: Destination for the further filtered rows.
            Defaults to the notebook-level ``output_filtered_file``.

    Returns:
        DataFrame with the rows written to ``output_filtered_path``.
    """
    # Fall back to the notebook-level paths so existing no-argument calls keep
    # working exactly as before.
    # NOTE(review): the default input is logs_missin_value.csv, while the
    # analysis step writes logs_nan_values.csv - confirm which log is intended.
    input_path = Path(input_file if input_path is None else input_path)
    output_100_path = Path(output_100_file if output_100_path is None else output_100_path)
    output_filtered_path = Path(output_filtered_file if output_filtered_path is None else output_filtered_path)

    print("="*100)
    print(f"Filtering missing value logs from: {input_path}")
    print("="*100)

    # Load the file
    df = pd.read_csv(input_path, encoding='utf-8')
    print(f"Loaded {len(df)} rows from {input_path.name}")

    # 1. Keep only the files whose @Value column was entirely missing
    df_100 = df[df['EmptyValuePercentage'] == 100]
    df_100.to_csv(output_100_path, index=False)
    print(f"Saved {len(df_100)} rows with 100% missing values to: {output_100_path}")

    # 2. Keep rows where the site code differs from the species code.
    # NOTE(review): in the recorded run this kept every row (3401 -> 3401);
    # confirm whether de-duplicating site/species pairs was the intent.
    df_filtered = df_100[df_100['siteCode'] != df_100['SpeciesCode']]
    df_filtered.to_csv(output_filtered_path, index=False)
    print(f"Saved {len(df_filtered)} rows (siteCode != SpeciesCode) to: {output_filtered_path}")

    print("="*100)
    print("Filtering complete.\n")

    return df_filtered


In [24]:
# Build the 100%-missing subsets; the returned DataFrame is the further
# filtered subset (df_filtered).
missing_filter = filter_missing_pollutants()

Filtering missing value logs from: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/missing/logs_missin_value.csv
Loaded 4136 rows from logs_missin_value.csv
Saved 3401 rows with 100% missing values to: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/missing/missing_all_values.csv
Saved 3401 rows (siteCode != SpeciesCode) to: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/missing/missing_all_values_filtered.csv
Filtering complete.

