# LAQN Dataset: Finding Missing Data

- I will identify the missing values and data gaps in the LAQN dataset and decide how to address them.
- I’ll start by importing the relevant modules and displaying the initial file paths.

In [None]:
import logging
import os
from pathlib import Path

import pandas as pd

# Configure logging once for the whole notebook (INFO and above is shown).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

# Project root. An absolute path is used on purpose to avoid confusion about the
# working directory; set the LAQN_BASE_DIR environment variable to run the notebook
# from another checkout without editing this cell.
base_dir = Path(
    os.environ.get("LAQN_BASE_DIR")
    or "/Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels"
)

# All LAQN inputs and outputs live under <base_dir>/data/laqn.
_laqn_root = base_dir / "data" / "laqn"
# std_dir and processed_dir intentionally point at the same folder: the analysis
# functions read from std_dir, the standardisation function reads from processed_dir.
std_dir = _laqn_root / "processed"
processed_dir = _laqn_root / "processed"
# Destination of the standardised files written by std_laqn_files().
optimased_dir = _laqn_root / "optimased"
# Site/species lookup table used to enrich the files and to derive expected combinations.
metadata_path = _laqn_root / "actv_sites_species.csv"

SyntaxError: unterminated string literal (detected at line 16) (4266846513.py, line 16)

## 1) Standardisation of the LAQN files

The function below adds extra columns to the LAQN files:
- Renames `pollutant_std` to `SpeciesCode`.
- Extracts `SiteCode` from the filename and looks up the remaining columns in actv_sites_species.csv.
- Saves the new standardised CSV files to data/laqn/optimased/.
- Final column structure:
        @MeasurementDateGMT, @Value, SpeciesCode, SiteCode, SpeciesName, SiteName, SiteType, Latitude, Longitude

In [None]:
def std_laqn_files():
    """
    Standardise the LAQN csv files found in the month folders of ``processed_dir``.

    For every ``<SiteCode>_<SpeciesCode>_<timestamp>.csv`` file this:
    1. renames pollutant_std to SpeciesCode
    2. extracts SiteCode (and a fallback SpeciesCode) from the filename
    3. adds metadata columns from actv_sites_species.csv
    4. saves the result to data/laqn/optimased/<month>/<same filename>

    final columns: @MeasurementDateGMT, @Value, SpeciesCode, SiteCode, SpeciesName, SiteName, SiteType, Latitude, Longitude
    Columns of that list which are absent from an input file are skipped; any
    other input column is dropped.

    Returns:
        dict with the counters 'total_files', 'successfully_processed' and 'failed',
        the list 'failed_files' (filenames that could not be parsed, were empty or
        raised an error) and the list 'files_missing_measurement_columns'
        (filenames that were written without @MeasurementDateGMT and/or @Value).
    """
    measurement_columns = ['@MeasurementDateGMT', '@Value']
    metadata_columns = ['SpeciesName', 'SiteName', 'SiteType', 'Latitude', 'Longitude']
    # required output structure, in order
    column_order = measurement_columns + ['SpeciesCode', 'SiteCode'] + metadata_columns

    # create output directory if it does not exist
    optimased_dir.mkdir(parents=True, exist_ok=True)

    # load metadata
    metadata = pd.read_csv(metadata_path)

    print("\n" + "="*120)
    print("standardising laqn files")
    print("="*120)
    print(f"\nprocessed directory: {processed_dir}")
    print(f"output directory: {optimased_dir}")
    print(f"metadata file: {metadata_path}")
    print(f"metadata rows: {len(metadata)}")

    # get all month directories
    month_dirs = sorted([d for d in processed_dir.iterdir() if d.is_dir()])
    print(f"\nfound {len(month_dirs)} month directories")

    # track statistics
    stats = {
        'total_files': 0,
        'successfully_processed': 0,
        'failed': 0,
        'failed_files': [],
        'files_missing_measurement_columns': []
    }

    def record_failure(name):
        # single place for the failure bookkeeping used by every failure path
        stats['failed'] += 1
        stats['failed_files'].append(name)

    # process each month directory
    for month_dir in month_dirs:
        month_name = month_dir.name
        print(f"\n{'-'*120}")
        print(f"Processing month: {month_name}")
        print(f"{'-'*120}")

        # create corresponding output directory
        output_month_dir = optimased_dir / month_name
        output_month_dir.mkdir(parents=True, exist_ok=True)

        # get all csv files in this month
        csv_files = sorted(month_dir.glob("*.csv"))
        print(f"files in {month_name}: {len(csv_files)}")

        for csv_file in csv_files:
            stats['total_files'] += 1
            filename = csv_file.name

            try:
                # extract site code and species code from filename
                # format of each csv file SiteCode_SpeciesCode_timestamp.csv
                # Path.stem removes only the final extension; str.replace('.csv', '')
                # would also delete a ".csv" that appears inside the name.
                parts = csv_file.stem.split('_')
                site_code = parts[0]
                species_code = parts[1] if len(parts) > 1 else None

                if not site_code or not species_code:
                    print(f"  warning: could not parse filename: {filename}")
                    record_failure(filename)
                    continue

                # read the csv file
                df = pd.read_csv(str(csv_file))

                # check if file is empty
                if df.empty:
                    print(f"  warning: empty file: {filename}")
                    record_failure(filename)
                    continue

                # rename pollutant_std to SpeciesCode if it exists
                if 'pollutant_std' in df.columns:
                    df = df.rename(columns={'pollutant_std': 'SpeciesCode'})

                # add SiteCode column
                df['SiteCode'] = site_code

                # if SpeciesCode column does not exist, add it
                if 'SpeciesCode' not in df.columns:
                    df['SpeciesCode'] = species_code

                # match with metadata to get additional columns
                metadata_match = metadata[
                    (metadata['SiteCode'] == site_code) &
                    (metadata['SpeciesCode'] == species_code)
                ]

                if not metadata_match.empty:
                    # get first matching row
                    meta_row = metadata_match.iloc[0]
                    for col in metadata_columns:
                        df[col] = meta_row[col]
                else:
                    print(f"  warning: no metadata found for {site_code}_{species_code}")
                    # add empty columns to maintain structure
                    for col in metadata_columns:
                        df[col] = None

                # The column filter below silently skips columns that do not exist, so a
                # file without the measurement columns would be written with no data and
                # no notice. Record such files so the summary can report them.
                if any(col not in df.columns for col in measurement_columns):
                    stats['files_missing_measurement_columns'].append(filename)

                # reorder columns to match required structure; only keep columns that exist
                final_columns = [col for col in column_order if col in df.columns]
                df = df[final_columns]

                # save to output directory
                output_file = output_month_dir / filename
                df.to_csv(output_file, index=False)

                stats['successfully_processed'] += 1

                if stats['successfully_processed'] % 100 == 0:
                    print(f"  processed {stats['successfully_processed']} files...")

            except Exception as e:
                # best effort: one unreadable file must not stop the whole run
                print(f"  error processing {filename}: {str(e)}")
                record_failure(filename)

    # print final summary
    print("\n" + "="*120)
    print("Standardisation complete")
    print("="*120)
    print(f"\ntotal files: {stats['total_files']}")
    print(f"Successfully processed: {stats['successfully_processed']}")
    print(f"failed: {stats['failed']}")

    if stats['failed'] > 0:
        print(f"\nfailed files (first 10):")
        for f in stats['failed_files'][:10]:
            print(f"  - {f}")
        if len(stats['failed_files']) > 10:
            print(f"  ... and {len(stats['failed_files']) - 10} more")

    incomplete = stats['files_missing_measurement_columns']
    if incomplete:
        print(f"\nwarning: {len(incomplete)} file(s) written without "
              f"{' and/or '.join(measurement_columns)} (first 10):")
        for f in incomplete[:10]:
            print(f"  - {f}")
        if len(incomplete) > 10:
            print(f"  ... and {len(incomplete) - 10} more")

    print(f"\noutput directory: {optimased_dir}")
    print("="*120 + "\n")

    return stats

In [None]:
# Step 1: standardise every processed LAQN file.
# The returned counters dict is kept in `standardization_stats` for later inspection.
print(": Standardizing LAQN files.")
standardization_stats = std_laqn_files()

## 2) Functions below:

### 2.1) Functions that discover and check data quality metrics before cleaning

#### 1) Data quality function, what it does:
- Counts total rows in dataset
- Identifies missing values per column (count + percentage)
- Counts duplicate rows based on timestamp
- Detects negative values in measurements
- Checks timestamp format issues

In [None]:
from typing import Dict

def data_quality(df: pd.DataFrame, filename: str) -> Dict:
    """
    Check data quality metrics of one dataframe before cleaning starts.

    Args:
        df: the data to assess.
        filename: label stored in the result so the assessment can be traced back.

    Returns dict with:
    - filename
    - total_rows
    - missing_values: {column: {'count', 'percentage'}} for columns with nulls only
    - duplicate_count: rows whose '@MeasurementDateGMT' repeats an earlier row
    - negative_values: number of '@Value' entries below zero
    - timestamp_issues: True when '@MeasurementDateGMT' is still stored as text

    All counts are plain Python ints/floats/bools so the result can be serialised.
    Checks on columns that are not present keep their default (0 / False).
    """
    total_rows = len(df)
    assessment = {
        'filename': filename,
        'total_rows': total_rows,
        'missing_values': {},
        'duplicate_count': 0,
        'negative_values': 0,
        'timestamp_issues': False
    }

    # missing values (a column can only have nulls when there is at least one row,
    # so the division below never sees total_rows == 0)
    for col, missing in df.isnull().sum().items():
        if missing > 0:
            assessment['missing_values'][col] = {
                'count': int(missing),
                'percentage': round(float(missing) / total_rows * 100, 2)
            }

    if '@MeasurementDateGMT' in df.columns:
        timestamps = df['@MeasurementDateGMT']

        # duplicates
        assessment['duplicate_count'] = int(
            df.duplicated(subset=['@MeasurementDateGMT']).sum()
        )

        # timestamp format: text (object or pandas string dtype) means it was not parsed
        assessment['timestamp_issues'] = bool(
            pd.api.types.is_object_dtype(timestamps)
            or pd.api.types.is_string_dtype(timestamps)
        )

    # negative values; coerce first so text columns do not raise on the comparison
    # (unparseable entries become NaN and are therefore not counted as negative)
    if '@Value' in df.columns:
        numeric_values = pd.to_numeric(df['@Value'], errors='coerce')
        assessment['negative_values'] = int((numeric_values < 0).sum())

    return assessment

#### 2) Function for missing data gaps:
- Checks which site/species combinations are missing from the monthly_data/ and year_2023/ folders.

In [None]:
def analyse_missing_data():
    """
    Compare the site/species combinations listed in actv_sites_species.csv with the
    csv files present under data/laqn/monthly_data/ and data/laqn/year_2023/.

    Site and pollutant are taken from the first two "_"-separated parts of each
    filename (extension removed); files with fewer than two parts are ignored.

    Returns:
        dict with three sections:
        - "monthly_data": "by_location" {site: {pollutant: [months]}},
          "by_pollutant" {pollutant: {site: [months]}}, "by_month" {month: [(site, pollutant)]}
          and "missing_combinations" (expected pairs with no file in any month).
        - "year_2023": "by_location" {site: [pollutants]}, "by_pollutant" {pollutant: [sites]}
          and "missing_combinations".
        - "summary": expected/found counts and the total number of missing combinations.
        A section whose folder does not exist is left empty and contributes no
        missing combinations.
    """
    laqn_dir = base_dir / "data" / "laqn"
    monthly_data_dir = laqn_dir / "monthly_data"
    year_2023_dir = laqn_dir / "year_2023"
    metadata_file = laqn_dir / "actv_sites_species.csv"

    metadata = pd.read_csv(metadata_file)
    expected = set(zip(metadata["SiteCode"], metadata["SpeciesCode"]))

    def site_and_pollutant(csv_path):
        # Use the stem so a two-part name such as "SITE_NO2.csv" yields "NO2",
        # not "NO2.csv" (same parsing rule as the standardisation step).
        parts = csv_path.stem.split("_")
        return (parts[0], parts[1]) if len(parts) >= 2 else None

    def missing_records(found, data_source):
        return [
            {"site": s, "pollutant": p, "data_source": data_source}
            for s, p in sorted(expected - found)
        ]

    out = {
        "monthly_data": {"by_location": {}, "by_pollutant": {}, "by_month": {}, "missing_combinations": []},
        "year_2023": {"by_location": {}, "by_pollutant": {}, "missing_combinations": []},
        "summary": {"expected_combinations": len(expected), "monthly_data_found": 0, "year_2023_found": 0, "total_missing": 0},
    }

    if monthly_data_dir.exists():
        monthly = out["monthly_data"]
        found_m = set()
        # sorted so the month lists do not depend on filesystem order
        for fp in sorted(monthly_data_dir.rglob("*.csv")):
            parsed = site_and_pollutant(fp)
            if parsed is None:
                continue
            site, pol = parsed
            month = fp.parent.name
            found_m.add(parsed)
            monthly["by_location"].setdefault(site, {}).setdefault(pol, []).append(month)
            monthly["by_pollutant"].setdefault(pol, {}).setdefault(site, []).append(month)
            monthly["by_month"].setdefault(month, []).append(parsed)
        monthly["missing_combinations"] = missing_records(found_m, "monthly_data")
        out["summary"]["monthly_data_found"] = len(found_m)

    if year_2023_dir.exists():
        yearly = out["year_2023"]
        found_y = set()
        for fp in sorted(year_2023_dir.glob("*.csv")):
            parsed = site_and_pollutant(fp)
            if parsed is None:
                continue
            site, pol = parsed
            found_y.add(parsed)
            yearly["by_location"].setdefault(site, []).append(pol)
            yearly["by_pollutant"].setdefault(pol, []).append(site)
        yearly["missing_combinations"] = missing_records(found_y, "year_2023")
        out["summary"]["year_2023_found"] = len(found_y)

    out["summary"]["total_missing"] = (
        len(out["monthly_data"]["missing_combinations"]) + len(out["year_2023"]["missing_combinations"])
    )
    return out

#### 3) Get missing month locations function below:
- For each month in monthly_data/, lists the available and missing pollutants per site.

In [None]:
def get_missing_month_location():
    """
    Build a per-month, per-site overview of available and missing pollutants.

    Every month folder under data/laqn/monthly_data/ is scanned; site and pollutant
    are read from the first two "_"-separated parts of each csv filename.

    Returns:
        {month: {site: {"available_pollutants": [...], "missing_pollutants": [...],
                        "count_available": int, "count_missing": int}}}
        covering every site listed in actv_sites_species.csv. The dict is empty when
        the monthly_data folder does not exist.
    """
    laqn_root = base_dir / "data" / "laqn"
    monthly_root = laqn_root / "monthly_data"

    site_species = pd.read_csv(laqn_root / "actv_sites_species.csv")
    pollutant_universe = set(site_species["SpeciesCode"].unique())
    site_universe = set(site_species["SiteCode"].unique())

    report = {}
    if not monthly_root.exists():
        return report

    month_folders = sorted(entry for entry in monthly_root.iterdir() if entry.is_dir())
    for folder in month_folders:
        # pollutants seen per site in this month
        seen = {}
        for csv_path in folder.glob("*.csv"):
            tokens = csv_path.name.split("_")
            if len(tokens) < 2:
                continue
            seen.setdefault(tokens[0], set()).add(tokens[1])

        month_report = {}
        for site in sorted(site_universe):
            present = seen.get(site, set())
            # NOTE(review): "missing" is measured against every pollutant in the
            # metadata, not only those listed for this site - confirm this is intended.
            absent = pollutant_universe - present
            if not (present or absent):
                continue
            month_report[site] = {
                "available_pollutants": sorted(present),
                "missing_pollutants": sorted(absent),
                "count_available": len(present),
                "count_missing": len(absent),
            }
        report[folder.name] = month_report

    return report

#### 4) Analyse missing values in the standardised files.
 - Finds the standardised files that contain the categorical columns added during standardisation and that also have missing values.

In [None]:
def analyze_categorical_with_missing():
    """
    Find csv files under ``std_dir`` that contain at least one categorical column
    (SiteName, SiteType, SpeciesName) and also have missing values in any column.

    Returns:
        {} when ``std_dir`` does not exist, otherwise a dict with
        - "files_with_categorical_and_missing": one entry per matching file holding
          month, file, path, total_rows, the categorical columns present and, per
          column with nulls, the missing count and percentage;
        - "summary": total_files_checked, files_with_categorical, files_with_both.
        Files that raise an error while being analysed are logged and skipped.
    """
    if not std_dir.exists():
        logger.error(f"Directory not found: {std_dir}")
        return {}

    categorical = ("SiteName", "SiteType", "SpeciesName")
    matches = []
    summary = {"total_files_checked": 0, "files_with_categorical": 0, "files_with_both": 0}
    results = {"files_with_categorical_and_missing": matches, "summary": summary}

    # one sub-directory per month, e.g. "2023_apr"
    for month_dir in sorted(p for p in std_dir.iterdir() if p.is_dir()):
        for csv_path in sorted(month_dir.glob("*.csv")):
            summary["total_files_checked"] += 1
            try:
                frame = pd.read_csv(str(csv_path))
                if frame.empty:
                    continue

                present_categoricals = [name for name in categorical if name in frame.columns]
                if not present_categoricals:
                    continue
                summary["files_with_categorical"] += 1

                if not frame.isnull().any().any():
                    continue

                row_count = len(frame)
                missing_details = {}
                for column in frame.columns:
                    null_count = frame[column].isna().sum()
                    if null_count > 0:
                        share = (null_count / row_count * 100) if row_count else 0
                        missing_details[column] = {
                            "missing_count": int(null_count),
                            "missing_pct": round(share, 2),
                        }

                matches.append({
                    "month": month_dir.name,
                    "file": csv_path.name,
                    "path": str(csv_path),
                    "total_rows": row_count,
                    "categorical_columns": present_categoricals,
                    "missing_details": missing_details,
                })
                summary["files_with_both"] += 1
            except Exception as err:
                logger.warning(f"Error analyzing {csv_path}: {err}")

    return results



#### 5) Function that checks what is categorically missing and creates a log entry to track the pattern.
- This extension is useful for understanding why data is missing whether due to system overload, a fetching error/bug, incorrect URL or endpoint requests, or non-responsive endpoints.
- While I was fetching the LAQN 2023 yearly data, I first fetched it quarterly using a parallel multiprocessing method. It took around 20–30 minutes to retrieve one year of data.
- After that, I decided to use the same method but fetch the data monthly for 2024 up to 19.11.2025, so I changed the folder structure. I then re-fetched the 2023 datasets monthly to keep everything consistent. However, I did not remove the first trial, so the data overlapped. That’s why I needed two different log files to check and compare.



In [None]:
def generate_categorical_missing_log():
    """
    Write data/laqn/missing/2missing_files_log.csv, one row per file reported by
    analyze_categorical_with_missing().

    Columns written: File, year, month, siteCode, SpeciesCode, path, total_rows,
    value ("<missing>/<total> rows (<pct>%)" for the measurement column) and
    categorical_columns (comma separated).

    - year comes from the month folder name (e.g. "2023_apr" -> "2023"), unless the
      analysis entry already carries a "year" key.
    - month is the second "-"-separated token of the third filename part when the
      filename has at least four "_"-separated parts, otherwise "".
    - the measurement column is looked up as "value" first, then "@Value".

    Returns:
        the path of the written log as a string, or None when there is nothing to log.
    """
    # Change output directory to data/laqn/missing
    output_dir = base_dir / "data" / "laqn" / "missing"
    output_dir.mkdir(parents=True, exist_ok=True)

    analysis = analyze_categorical_with_missing()
    # the analysis returns {} when its input directory is missing
    all_files = analysis.get("files_with_categorical_and_missing", [])
    if not all_files:
        logger.info("No files found with both categorical columns and missing values")
        return None

    rows = []
    for f in all_files:
        parts = Path(f["file"]).stem.split("_")
        site = parts[0] if len(parts) > 0 else ""
        species = parts[1] if len(parts) > 1 else ""

        # guard the date token: a third part without "-" must not raise IndexError
        date_tokens = parts[2].split("-") if len(parts) >= 4 else []
        month = date_tokens[1] if len(date_tokens) > 1 else ""

        # analysis entries are keyed by month folder ("2023_apr"), not by year
        year = f.get("year", f.get("month", "").split("_")[0])

        details = f["missing_details"]
        val_missing = details.get("value") or details.get("@Value") or {}
        miss_count = val_missing.get("missing_count", 0)
        miss_pct = val_missing.get("missing_pct", 0)

        rows.append({
            "File": f["file"],
            "year": year,
            "month": month,
            "siteCode": site,
            "SpeciesCode": species,
            "path": f["path"],
            "total_rows": f["total_rows"],
            "value": f"{miss_count}/{f['total_rows']} rows ({miss_pct}%)",
            "categorical_columns": ",".join(f["categorical_columns"]),
        })

    df_log = pd.DataFrame(rows)
    out_file = output_dir / "2missing_files_log.csv"
    df_log.to_csv(out_file, index=False)
    return str(out_file)

#### 6) Cross-check the missing logs function:
 - Compares the two log files (missing_files_log.csv and 2missing_files_log.csv) for overlap and differences.

In [None]:
def cross_check_logs():
    """
    Compare the two missing-file logs stored in data/laqn/missing/
    (missing_files_log.csv and 2missing_files_log.csv) by their "File" column.

    Returns:
        {} when either log is absent (the absence is logged as an error), otherwise
        a dict with the sorted filenames found only in log 1, only in log 2 and in
        both, a "statistics" block (totals, counts, overlap as a percentage of
        log 1) and per-log details (path, row count, rows per year, number of
        distinct sites and species).
    """
    missing_dir = base_dir / "data" / "laqn" / "missing"
    log1 = missing_dir / "missing_files_log.csv"
    log2 = missing_dir / "2missing_files_log.csv"

    for label, candidate in (("First", log1), ("Second", log2)):
        if not candidate.exists():
            logger.error(f"{label} log not found: {candidate}")
            return {}

    df1 = pd.read_csv(log1)
    df2 = pd.read_csv(log2)
    files1 = set(df1["File"].values)
    files2 = set(df2["File"].values)

    exclusive_to_first = files1.difference(files2)
    exclusive_to_second = files2.difference(files1)
    shared = files1.intersection(files2)

    if files1:
        overlap = len(shared) / len(files1) * 100
    else:
        overlap = 0

    statistics = {
        "log1_total": len(files1),
        "log2_total": len(files2),
        "only_in_log1_count": len(exclusive_to_first),
        "only_in_log2_count": len(exclusive_to_second),
        "in_both_count": len(shared),
        "overlap_percentage": round(overlap, 2),
    }

    first_details = {
        "path": str(log1),
        "total_files": len(df1),
        "by_year": df1["year"].value_counts().to_dict(),
        "unique_sites": df1["siteCode"].nunique(),
        "unique_species": df1["SpeciesCode"].nunique(),
    }

    # the second log's columns are only read when it has rows
    second_has_rows = len(df2) > 0
    second_details = {
        "path": str(log2),
        "total_files": len(df2),
        "by_year": df2["year"].value_counts().to_dict() if second_has_rows else {},
        "unique_sites": df2["siteCode"].nunique() if second_has_rows else 0,
        "unique_species": df2["SpeciesCode"].nunique() if second_has_rows else 0,
    }

    return {
        "only_in_log1": sorted(exclusive_to_first),
        "only_in_log2": sorted(exclusive_to_second),
        "in_both": sorted(shared),
        "statistics": statistics,
        "log1_details": first_details,
        "log2_details": second_details,
    }

## 3) Comprehensive test to find ALL missing and problematic data in standardized files.

#### 1) testing functions for all the data checks.


What the testing function below does:
- Scans all month directories in the standardised (std) folder
- Checks each CSV file for 7 types of issues:
    - Empty files: No rows
    - Column errors: Missing required columns (timestamp, SiteCode, SpeciesCode, value)
    - Duplicate timestamps: Multiple measurements at the same time
    - Missing SiteCode: Null values in the SiteCode column
    - Missing SpeciesCode: Null values in the SpeciesCode column
    - High missing values: >20% of the value column is null
    - Format errors: Cannot read the file
- Provides detailed reporting with:
    - Total files processed
    - Issue statistics
    - Examples of each problem type
    - Severity analysis

In [None]:
def _find_row_level_issues(df, high_missing_pct=20):
    """Return (category, details) pairs for the row-level problems of one non-empty file.

    Expects the columns 'timestamp', 'SiteCode', 'SpeciesCode' and 'value' to exist.
    """
    total_rows = len(df)
    findings = []

    def as_pct(count):
        return round(float(count) / total_rows * 100, 2)

    # Rule 2: duplicate timestamps
    duplicate_timestamps = int(df['timestamp'].duplicated().sum())
    if duplicate_timestamps > 0:
        findings.append(('duplicate_timestamps', {
            'duplicate_count': duplicate_timestamps,
            'total_rows': total_rows,
            'duplicate_pct': as_pct(duplicate_timestamps)
        }))

    # Rules 3 and 4: null site / species codes
    for column, category in (('SiteCode', 'missing_sitecode'), ('SpeciesCode', 'missing_speciescode')):
        missing_codes = int(df[column].isna().sum())
        if missing_codes > 0:
            findings.append((category, {
                'missing_count': missing_codes,
                'total_rows': total_rows,
                'missing_pct': as_pct(missing_codes)
            }))

    # Rule 5: value column (more than high_missing_pct percent missing is a problem)
    missing_values = int(df['value'].isna().sum())
    missing_pct = missing_values / total_rows * 100
    if missing_pct > high_missing_pct:
        findings.append(('high_missing_values', {
            'missing_count': missing_values,
            'total_rows': total_rows,
            'missing_pct': round(missing_pct, 2)
        }))

    return findings


def run_comprehensive_missing_data_analysis():
    """Comprehensive test to find all missing and problematic data in standardized files.

    Walks every month directory of ``std_dir`` and checks each csv file for: empty
    file, missing required columns, duplicate timestamps, null SiteCode, null
    SpeciesCode, more than 20% null values and unreadable files. A file with a
    column error is not checked further.

    A per-month summary and an overall summary are printed; nothing is returned.
    Each file counts once towards "Files with issues", however many issue types it has.
    """
    print("\n" + "="*100)
    print("Test result: comprehensive missing data analysis")
    print("Checking all files in: data/laqn/processed/")
    print("="*100)

    # Get all month directories
    month_dirs = sorted([d for d in std_dir.iterdir() if d.is_dir()])

    print(f"\nFound {len(month_dirs)} month directories")

    # NOTE(review): std_laqn_files() works with '@MeasurementDateGMT' / '@Value' while
    # this check requires 'timestamp' / 'value' - confirm which names the files use.
    required_columns = ['timestamp', 'SiteCode', 'SpeciesCode', 'value']

    # Initialize tracking
    all_issues = {
        'empty_files': [],
        'duplicate_timestamps': [],
        'missing_sitecode': [],
        'missing_speciescode': [],
        'high_missing_values': [],
        'column_errors': [],
        'format_errors': []
    }

    total_files = 0
    processed_files = 0
    files_with_issues = 0

    for month_dir in month_dirs:
        month_name = month_dir.name
        print(f"\n" + "-"*100)
        print(f"Month: {month_name}")
        print("-"*100)

        csv_files = sorted(list(month_dir.glob('*.csv')))
        print(f"Total files in {month_name}: {len(csv_files)}")

        # issue counters for this month, one per category, plus distinct affected files
        month_counts = dict.fromkeys(all_issues, 0)
        month_files_with_issues = 0

        for filepath in csv_files:
            total_files += 1
            filename = filepath.name
            findings = []

            try:
                # Convert Path to string and read file
                df = pd.read_csv(str(filepath))

                if df.empty:
                    findings.append(('empty_files', {}))
                else:
                    processed_files += 1

                    # Rule 1: Check for required columns
                    missing_cols = [col for col in required_columns if col not in df.columns]
                    if missing_cols:
                        findings.append(('column_errors', {
                            'missing_columns': missing_cols,
                            'actual_columns': list(df.columns)
                        }))
                    else:
                        findings.extend(_find_row_level_issues(df))

            except Exception as e:
                # unreadable / malformed file: report it instead of stopping the scan
                findings.append(('format_errors', {'error': str(e)}))

            for category, details in findings:
                all_issues[category].append({
                    'month': month_name,
                    'file': filename,
                    'path': str(filepath),
                    **details
                })
                month_counts[category] += 1

            if findings:
                month_files_with_issues += 1
                files_with_issues += 1

        # Month summary
        print(f"\nMonth {month_name} summary:")
        print(f"  Total files: {len(csv_files)}")
        print(f"  Files with issues: {month_files_with_issues}")
        print(f"    - Empty files: {month_counts['empty_files']}")
        print(f"    - Column errors: {month_counts['column_errors']}")
        print(f"    - Duplicate timestamps: {month_counts['duplicate_timestamps']}")
        print(f"    - Missing SiteCode: {month_counts['missing_sitecode']}")
        print(f"    - Missing SpeciesCode: {month_counts['missing_speciescode']}")
        print(f"    - High missing values (>20%): {month_counts['high_missing_values']}")
        print(f"    - Format errors: {month_counts['format_errors']}")

    # Overall summary
    print("\n" + "="*100)
    print("Overall summary")
    print("="*100)
    print(f"Total files: {total_files}")
    print(f"Files analysed (non-empty): {processed_files}")
    print(f"Files with at least one issue: {files_with_issues}")
    for category, items in all_issues.items():
        print(f"  - {category}: {len(items)}")

    # NOTE: the per-item detailed report mentioned in the original cell
    # ("rest stays the same", using item['month']) is not reproduced here.

In [None]:
# Run the comprehensive missing data analysis; the report is printed to the cell output.
run_comprehensive_missing_data_analysis()


Test result: comprehensive missing data analysis
Checking all files in: data/laqn/processed/

Found 35 month directories

----------------------------------------------------------------------------------------------------
Month: 2023_apr
----------------------------------------------------------------------------------------------------
Total files in 2023_apr: 249

Month 2023_apr summary:
  Total files: 249
  Files with issues: 0
    - Empty files: 0
    - Duplicate timestamps: 0
    - High missing values (>20%): 0

----------------------------------------------------------------------------------------------------
Month: 2023_aug
----------------------------------------------------------------------------------------------------
Total files in 2023_aug: 249

Month 2023_aug summary:
  Total files: 249
  Files with issues: 0
    - Empty files: 0
    - Duplicate timestamps: 0
    - High missing values (>20%): 0

-------------------------------------------------------------------------

#### 2) A comprehensive function that checks all columns for missing values, validates data types, calculates missing rates, prints file paths, and logs files with >20% missing values.

In [None]:
# Column names accepted for the timestamp and measurement columns. The processed
# LAQN files use the '@'-prefixed names; the plain names are kept so files that
# have already been renamed are validated as well.
_TIMESTAMP_COLUMNS = ('@MeasurementDateGMT', 'timestamp')
_VALUE_COLUMNS = ('@Value', 'value')


def _split_laqn_filename(filepath):
    """Return (site_code, pollutant) parsed from a '<site>_<pollutant>_...' file name."""
    parts = filepath.stem.split('_')
    site_code = parts[0] or "UNKNOWN"
    pollutant = parts[1] if len(parts) > 1 else "UNKNOWN"
    return site_code, pollutant


def _report_column_quality(df):
    """Print dtype and missing-value stats for every column; return the total missing-cell count."""
    n_rows = len(df)
    dtype_checks = pd.api.types
    missing_total = 0

    for col in df.columns:
        series = df[col]
        missing_count = int(series.isna().sum())
        missing_pct = (missing_count / n_rows * 100) if n_rows > 0 else 0
        data_type = str(series.dtype)

        # The label describes the column: "Missing" only when it really has gaps.
        status = "Missing" if missing_count > 0 else "No missing"
        print(f"     {status} Column: '{col}'")
        print(f"        Data Type: {data_type}")
        print(f"        Missing: {missing_count}/{n_rows} ({missing_pct:.2f}%)")

        missing_total += missing_count

        # Data type validation. Timestamps read from CSV arrive as text unless
        # parsed, so text dtypes are accepted alongside real datetimes.
        if col in _TIMESTAMP_COLUMNS:
            is_time_like = (
                dtype_checks.is_datetime64_any_dtype(series)
                or dtype_checks.is_object_dtype(series)
                or dtype_checks.is_string_dtype(series)
            )
            if not is_time_like:
                print(f"Type error: Expected datetime, got {data_type}")
        elif col in _VALUE_COLUMNS:
            if not dtype_checks.is_numeric_dtype(series):
                print(f"Type warning: {col} should be numeric, got {data_type}")

    return missing_total


def _value_missing_percentage(df, fallback_pct):
    """Missing rate (%) of the measurement column, or fallback_pct if no such column exists."""
    value_col = next((c for c in _VALUE_COLUMNS if c in df.columns), None)
    if value_col is None:
        return float(fallback_pct)
    return float(df[value_col].isna().mean() * 100)


def detailed_data_quality_analysis(data_dir=None, output_dir=None, missing_threshold=20.0):
    """
    Comprehensive analysis of all data quality metrics including:
    - Missing values for ALL columns
    - Data type validation
    - Missing percentage per file
    - File paths printed
    - Log files above the missing threshold to CSV

    Parameters
    ----------
    data_dir : str or Path, optional
        Directory holding one sub-directory per month, each containing CSV
        files. Defaults to the notebook-level ``std_dir``.
    output_dir : str or Path, optional
        Directory where ``logs_missing_file.csv`` is written. Defaults to
        ``base_dir / "data" / "laqn" / "missing"``.
    missing_threshold : float, default 20.0
        A file is logged when its overall missing-cell rate (missing cells
        over rows * columns, in percent) is strictly greater than this value.

    Returns
    -------
    list of dict
        One entry per logged file with the keys ``filename``, ``path``,
        ``siteCode``, ``pollutant``, ``missingPercentage``, ``year``,
        ``month``, ``totalRows``, ``totalColumns`` and ``missingCells``.
        ``missingPercentage`` is the missing rate of the measurement column
        (``@Value`` / ``value``) when present, otherwise the overall rate.
    """
    root_dir = Path(data_dir) if data_dir is not None else std_dir

    print("\n" + "="*120)
    print("Data Quality Analysis Report, checks all columns")
    print("="*120)

    # Get all month directories
    month_dirs = sorted(d for d in root_dir.iterdir() if d.is_dir())

    print(f"\nFound {len(month_dirs)} month directories\n")

    # Initialize tracking
    missing_values_log = []
    total_stats = {
        'total_files': 0,
        'files_processed': 0,
        'files_with_issues': 0,
        'total_rows': 0,
    }

    for month_dir in month_dirs:
        month_name = month_dir.name
        year = month_name.split('_')[0]

        print(f"\n{'='*120}")
        print(f"MONTH: {month_name} | YEAR: {year}")
        print(f"{'='*120}\n")

        csv_files = sorted(month_dir.glob('*.csv'))
        print(f"Total files in {month_name}: {len(csv_files)}\n")

        for filepath in csv_files:
            total_stats['total_files'] += 1
            filename = filepath.name
            file_path_str = str(filepath)
            site_code, pollutant = _split_laqn_filename(filepath)

            # Only the read is guarded: a failure here really is a read error.
            # The scan stays best-effort, so one bad file never aborts the run.
            try:
                df = pd.read_csv(file_path_str)
            except pd.errors.EmptyDataError:
                # Zero-byte file: report it as empty rather than as a read error.
                df = pd.DataFrame()
            except Exception as e:
                print(f"ERROR READING FILE: {filename}")
                print(f"Path: {file_path_str}")
                print(f"Error: {str(e)}\n")
                continue

            if df.empty:
                print(f"  ⚠ EMPTY FILE: {filename}")
                print(f"    Path: {file_path_str}\n")
                continue

            total_stats['files_processed'] += 1
            total_stats['total_rows'] += len(df)

            # Print file header
            print(f"\n FILE: {filename}")
            print(f"Path: {file_path_str}")
            print(f"Site: {site_code} | Pollutant: {pollutant}")
            print(f"Rows: {len(df)} | Columns: {len(df.columns)}")
            print(f"     {'-'*110}")

            # Check all columns for missing values and data types
            overall_missing_count = _report_column_quality(df)

            # Calculate overall file missing percentage
            total_cells = len(df) * len(df.columns)
            overall_missing_pct = (overall_missing_count / total_cells * 100) if total_cells > 0 else 0

            print(f"\n     Summary for files:")
            print(f"     Total Missing Cells: {overall_missing_count}/{total_cells}")
            print(f"     Overall Missing Rate: {overall_missing_pct:.2f}%\n")

            # NOTE: the threshold is applied to the cell-level rate across ALL
            # columns, so gaps in a single column are diluted by complete ones.
            if overall_missing_pct > missing_threshold:
                # Build the entry first so the counter and the log cannot drift apart.
                log_entry = {
                    'filename': filename,
                    'path': file_path_str,
                    'siteCode': site_code,
                    'pollutant': pollutant,
                    'missingPercentage': _value_missing_percentage(df, overall_missing_pct),
                    'year': year,
                    'month': month_name,
                    'totalRows': len(df),
                    'totalColumns': len(df.columns),
                    'missingCells': overall_missing_count
                }
                missing_values_log.append(log_entry)
                total_stats['files_with_issues'] += 1
                print(f"     File failed: >{missing_threshold:g}% missing values ({overall_missing_pct:.2f}%)")
                print(f"     Will be added to logs_missing_file.csv\n")

    # Print final summary
    files_processed = total_stats['files_processed']
    # Guard against an empty data directory (no files processed at all).
    issue_rate = (total_stats['files_with_issues'] / files_processed * 100) if files_processed else 0.0

    print("\n" + "="*120)
    print("Final Summary of Data Quality Analysis")
    print("="*120)
    print(f"\nTotal Files Processed: {files_processed}")
    print(f"Files with >{missing_threshold:g}% Missing: {total_stats['files_with_issues']}")
    print(f"Total Rows Analyzed: {total_stats['total_rows']:,}")
    print(f"Issue Rate: {issue_rate:.1f}%")

    # Save missing files log to CSV
    if missing_values_log:
        if output_dir is not None:
            log_dir = Path(output_dir)
        else:
            log_dir = base_dir / "data" / "laqn" / "missing"
        log_dir.mkdir(parents=True, exist_ok=True)

        df_log = pd.DataFrame(missing_values_log)
        log_file = log_dir / "logs_missing_file.csv"
        df_log.to_csv(log_file, index=False)

        print(f"\n Missing files log saved to:")
        print(f"  {log_file}")
        print(f"\nFiles logged: {len(missing_values_log)}")
        print(f"\nFirst few entries:")
        print(df_log.head().to_string())
    else:
        print(f"\n No files with >{missing_threshold:g}% missing values found!")

    print("\n" + "="*120 + "\n")

    return missing_values_log

In [None]:
# Run the detailed data quality analysis.
# The returned list (one dict per file logged for exceeding the missing-value
# threshold) is kept in `missing_log`.
missing_log = detailed_data_quality_analysis()


Data Quality Analysis Report, checks all columns

Found 35 month directories


MONTH: 2023_apr | YEAR: 2023

Total files in 2023_apr: 249


 FILE: BG1_NO2_2023-04-01_2023-04-30.csv
Path: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/processed/2023_apr/BG1_NO2_2023-04-01_2023-04-30.csv
Site: BG1 | Pollutant: NO2
Rows: 696 | Columns: 3
     --------------------------------------------------------------------------------------------------------------
     Missinf Column: '@MeasurementDateGMT'
        Data Type: object
        Missing: 0/696 (0.00%)
     Missinf Column: '@Value'
        Data Type: float64
        Missing: 0/696 (0.00%)
     Missinf Column: 'pollutant_std'
        Data Type: object
        Missing: 0/696 (0.00%)

     Summary for files:
     Total Missing Cells: 0/2088
     Overall Missing Rate: 0.00%


 FILE: BG1_SO2_2023-04-01_2023-04-30.csv
Path: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/p