# Cleaning LAQN Datasets
I will be removing the files has %100 missing values.

import and path statements below.

In [None]:
import pandas as pd
from pathlib import Path
import os
import re

# paths beloww
base_dir = Path("/Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels")
log_path = base_dir / "data" / "laqn" / "missing" / "logs_missin_value.csv"
optimased_dir = base_dir / "data" / "laqn" / "optimased"
processed_dir = base_dir / "data" / "laqn" / "processed"
month_dirs = sorted([d for d in optimased_dir.iterdir() if d.is_dir()])

# Change output directory to data/laqn/missing
output_dir = base_dir / "data" / "laqn" / "missing"
output_dir.mkdir(parents=True, exist_ok=True)

# function for value higher than 80%
input_file = base_dir / "data" / "laqn" / "missing" / "rm_100percnt_missingValues_log.csv"
output_file = base_dir / "data" / "laqn" / "missing" / "logs_emptyValue_higher80.csv"

# affected sites/species analysis function
nonactive_sites_input = base_dir / "data" / "laqn" / "missing" / "affected_sites_species_counts.csv"

### 1. Funtion for removes 100 percent missed data.

In [3]:
def remove_100_missing_files():
    """
    Remove CSV files in optimased/ that have 100% missing values,
    as listed in logs_missin_value.csv.
    """

    # Read log file
    df = pd.read_csv(log_path, encoding='utf-8')
    print(f"Loaded {len(df)} rows from {log_path}")

    # Filter for 100% missing
    df_100 = df[df['EmptyValuePercentage'] == 100]
    print(f"Found {len(df_100)} files with 100% missing values.")

    # Remove files
    removed = 0
    for idx, row in df_100.iterrows():
        file_path = Path(row['path'])
        if not file_path.is_absolute():
            file_path = optimased_dir / file_path
        if file_path.exists():
            os.remove(file_path)
            print(f"Removed: {file_path}")
            removed += 1
        else:
            print(f"File not found: {file_path}")

    print(f"Total files removed: {removed}")

In [4]:

# Example usage
remove_100_missing_files()

Loaded 4136 rows from /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/missing/logs_missin_value.csv
Found 3401 files with 100% missing values.
Removed: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2023_apr/BL0_CO_2023-04-01_2023-04-30.csv
Removed: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2023_apr/BT4_SO2_2023-04-01_2023-04-30.csv
Removed: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2023_apr/BT5_PM2.5_2023-04-01_2023-04-30.csv
Removed: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2023_apr/BT6_PM2.5_2023-04-01_2023-04-30.csv
Removed: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2023_apr/BT6_SO2_2023-04-01_2023-04-30.csv
Removed: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-level

### 2) Removed all the files has %100 missing value match.
- below i will be check data/laqn/optimased folder again and will create new log csv according to what I have.
- Copy and paste the code from laqn_check.ipynb file changing the name of the new log csv.

In [6]:
def detailed_data_quality_analysis_rm():
    """
    Comprehensive analysis of all data quality metrics for files in optimased/.
    Saves log as rm_100percnt_missingValues_log.csv.
    """
    print("\n" + "="*120)
    print("Data Quality Analysis Report, checks all columns")
    print("="*120)
    month_dirs = sorted([d for d in optimased_dir.iterdir() if d.is_dir()])
    print(f"\nFound {len(month_dirs)} month directories\n")
    
    missing_values_log = []
    all_issues = {
        'empty_files': [],
        'duplicate_timestamps': [],
        'missing_sitecode': [],
        'missing_speciescode': [],
        'high_missing_values': [],
        'column_errors': [],
        'format_errors': []
    }
    total_stats = {
        'total_files': 0,
        'files_processed': 0,
        'files_with_high_missing': 0,
        'total_rows': 0,
        'empty_files': 0,
        'files_with_missing_timestamps': 0,
        'files_with_missing_sitecodes': 0
    }
    
    for month_dir in month_dirs:
        month_name = month_dir.name
        year = month_name.split('_')[0]
        print(f"\n{'='*120}")
        print(f"Month: {month_name} | Year: {year}")
        print(f"{'='*120}\n")
        
        csv_files = sorted(list(month_dir.glob('*.csv')))
        print(f"Total files in {month_name}: {len(csv_files)}\n")

        if not csv_files:
            print(f"no csv files found in {month_name}")
            continue
        
        for filepath in csv_files:
            total_stats['total_files'] += 1
            filename = filepath.name
            file_path_str = str(filepath)
            parts = filename.replace('.csv', '').split('_')
            site_code = parts[0] if len(parts) > 0 else "UNKNOWN"
            pollutant = parts[1] if len(parts) > 1 else "UNKNOWN"
            
            try:
                df = pd.read_csv(file_path_str)
                if df.empty or len(df) == 0:
                    print(f" Empty file: {filename}")
                    print(f" Path: {file_path_str}\n")
                    all_issues['empty_files'].append({'month': month_name, 'file': filename, 'path': file_path_str})
                    total_stats['empty_files'] += 1
                    continue
                
                total_stats['files_processed'] += 1
                total_stats['total_rows'] += len(df)
                
                print(f"\n FILE: {filename}")
                print(f"Path: {file_path_str}")
                print(f"Site: {site_code} | Pollutant: {pollutant}")
                print(f"Rows: {len(df)} | Columns: {len(df.columns)}")
                print(f"     {'-'*110}")
                
                required_columns = ['@MeasurementDateGMT', '@Value', 'SiteCode', 'SpeciesCode']
                missing_cols = [col for col in required_columns if col not in df.columns]
                if missing_cols:
                    print(f"  warning: missing required columns: {missing_cols}")
                    all_issues['column_errors'].append({'month': month_name, 'file': filename, 'path': file_path_str, 'missing_columns': missing_cols, 'actual_columns': list(df.columns)})
                    continue

                if '@MeasurementDateGMT' in df.columns:
                    duplicate_timestamps = df['@MeasurementDateGMT'].duplicated().sum()
                    if duplicate_timestamps > 0:
                        print(f"  duplicate timestamps: {duplicate_timestamps}/{len(df)}")
                        all_issues['duplicate_timestamps'].append({'month': month_name, 'file': filename, 'path': file_path_str, 'duplicate_count': int(duplicate_timestamps), 'total_rows': len(df)})
                
                missing_sitecodes = df['SiteCode'].isna().sum()
                if missing_sitecodes > 0:
                    sitecode_pct = (100 * missing_sitecodes / len(df))
                    print(f"  missing sitecodes: {missing_sitecodes}/{len(df)} ({sitecode_pct:.2f}%)")
                    all_issues['missing_sitecode'].append({'month': month_name, 'file': filename, 'path': file_path_str, 'missing_count': int(missing_sitecodes), 'total_rows': len(df)})
                    total_stats['files_with_missing_sitecodes'] += 1

                missing_speciescodes = df['SpeciesCode'].isna().sum()
                if missing_speciescodes > 0:
                    speciescode_pct = (100 * missing_speciescodes / len(df))
                    print(f"  missing speciescodes: {missing_speciescodes}/{len(df)} ({speciescode_pct:.2f}%)")
                    all_issues['missing_speciescode'].append({'month': month_name, 'file': filename, 'path': file_path_str, 'missing_count': int(missing_speciescodes), 'total_rows': len(df)})

                for col in df.columns:
                    data_type = str(df[col].dtype)
                    if col == '@MeasurementDateGMT' and data_type not in ['datetime64[ns]', 'object']:
                        print(f"Type error: {col} expected datetime, got {data_type}")
                    elif col in ['@Value', 'SiteCode', 'SpeciesCode'] and data_type not in ['float64', 'int64', 'object']:
                        print(f"Type warning: {col} should be numeric or string, got {data_type}")

                for col in df.columns:
                    missing_count = df[col].isna().sum()
                    missing_pct = (missing_count / len(df) * 100) if len(df) > 0 else 0
                    if missing_count > 0:
                        print(f"  missing {col}: {missing_count}/{len(df)} ({missing_pct:.2f}%)")
                
                missing_values = df['@Value'].isna().sum()
                empty_value_percentage = (100 * missing_values / len(df)) if len(df) > 0 else 0
                print(f"  missing @Value: {missing_values}/{len(df)} ({empty_value_percentage:.2f}%)")
                
                if empty_value_percentage > 20:
                    total_stats['files_with_high_missing'] += 1
                    missing_values_log.append({
                        'filename': filename,
                        'path': file_path_str,
                        'siteCode': site_code,
                        'SpeciesCode': pollutant,
                        'year': year,
                        'month': month_name,
                        'EmptyValuePercentage': round(empty_value_percentage, 2)
                    })
                    print(f"  flagged: >20% missing @Value ({empty_value_percentage:.2f}%)")
                    print(f"  will be logged to rm_100percnt_missingValues_log.csv")
            
            except Exception as e:
                print(f"\n  error reading file: {filename}")
                print(f"  path: {file_path_str}")
                print(f"  error: {str(e)}")
                all_issues['format_errors'].append({'month': month_name, 'file': filename, 'path': file_path_str, 'error': str(e)})

    print("\n" + "="*120)
    print("Final Summary of Data Quality Analysis")
    print("="*120)
    print(f"\nTotal Files Processed: {total_stats['files_processed']}")
    print(f"empty files: {total_stats['empty_files']}")
    print(f"files with missing timestamps: {total_stats['files_with_missing_timestamps']}")
    print(f"files with missing sitecodes: {total_stats['files_with_missing_sitecodes']}")
    print(f"files with >20% missing @Value: {total_stats['files_with_high_missing']}")
    print(f"total rows analyzed: {total_stats['total_rows']:,}")
    
    if total_stats['files_processed'] > 0:
        issue_rate = (total_stats['files_with_high_missing'] / total_stats['files_processed'] * 100)
        print(f"issue rate: {issue_rate:.1f}%")
    
    # save missing files log to csv
    if missing_values_log:
        df_log = pd.DataFrame(missing_values_log)
        log_file = output_dir / "rm_100percnt_missingValues_log.csv"
        df_log.to_csv(log_file, index=False)
        print(f"\nmissing files log saved to: {log_file}")
        print(f"files logged: {len(missing_values_log)}")
        print(f"\ncolumns in output csv:")
        print(f"  {', '.join(df_log.columns.tolist())}")
        print(f"\nfirst 5 entries:")
        print(df_log.head().to_string(index=False))
    else:
        print(f"\nno files with >20% missing @Value found.")
    
    print("\n" + "="*120 + "\n")
    return all_issues

In [7]:
reanalyses_laqn_data_quality = detailed_data_quality_analysis_rm()


Data Quality Analysis Report, checks all columns

Found 35 month directories


Month: 2023_apr | Year: 2023

Total files in 2023_apr: 146


 FILE: BG1_NO2_2023-04-01_2023-04-30.csv
Path: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2023_apr/BG1_NO2_2023-04-01_2023-04-30.csv
Site: BG1 | Pollutant: NO2
Rows: 696 | Columns: 9
     --------------------------------------------------------------------------------------------------------------
  missing @Value: 0/696 (0.00%)

 FILE: BG1_SO2_2023-04-01_2023-04-30.csv
Path: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2023_apr/BG1_SO2_2023-04-01_2023-04-30.csv
Site: BG1 | Pollutant: SO2
Rows: 696 | Columns: 9
     --------------------------------------------------------------------------------------------------------------
  missing @Value: 42/696 (6.03%)
  missing @Value: 42/696 (6.03%)

 FILE: BG2_NO2_2023-04-01_2023-04-30.csv
Path: /Users/bur

In [9]:
def filter_empty_value_higher_80():
    
    df = pd.read_csv(input_file)
    filtered = df[df['EmptyValuePercentage'] >= 80]
    filtered.to_csv(output_file, index=False)
    print(f"Saved {len(filtered)} rows to {output_file}")


In [10]:
# Example usage:
filter_empty_value_higher_80()

Saved 148 rows to /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/missing/logs_emptyValue_higher80.csv


### 3) Document missing data below.

In [11]:
missing_log = pd.read_csv(log_path, encoding ='utf-8')

# Check which sites/species were most affected
grouped = missing_log.groupby(['siteCode', 'SpeciesCode', 'year', 'month']).size().reset_index(name='count')
print(grouped)

# Save to CSV
grouped.to_csv(log_path.parent /"affected_sites_species_counts.csv", index=False)


# Check temporal distribution
missing_log['month'] = missing_log['path'].str.extract(r'(\d{4}_\w{3})')
print(missing_log['month'].value_counts())

     siteCode SpeciesCode  year     month  count
0         BG1         NO2  2023  2023_jun      1
1         BG1         NO2  2023  2023_may      1
2         BG1         NO2  2024  2024_apr      1
3         BG1         NO2  2024  2024_aug      1
4         BG1         NO2  2024  2024_feb      1
...       ...         ...   ...       ...    ...
4131      WME       PM2.5  2025  2025_jul      1
4132      WME       PM2.5  2025  2025_jun      1
4133      WME       PM2.5  2025  2025_mar      1
4134      WME       PM2.5  2025  2025_may      1
4135      WME       PM2.5  2025  2025_sep      1

[4136 rows x 5 columns]
month
2024_aug    138
2025_mar    135
2024_may    133
2023_dec    131
2023_oct    131
2024_sep    129
2024_jul    128
2023_aug    127
2024_jan    127
2023_sep    126
2023_nov    125
2024_oct    124
2025_jan    124
2024_feb    123
2024_jun    123
2024_mar    123
2024_nov    123
2024_apr    123
2023_may    122
2024_dec    121
2023_mar    121
2025_feb    120
2023_jan    118
2023_jul    1

### 4) Function for whihc site-species are not monitored through to affecte_sites_species_counts.csv. 
 - If the station don't have value for all round year for that specie that's mean that that station not monitoring that species.
 - the funtion will returns tuples coveres monitored sitesm summary stats.
 

In [33]:
def extract_month_number(month_str):
    # expects format '2025_jul', returns 7 for 'jul'
    month_map = {'jan':1, 'feb':2, 'mar':3, 'apr':4, 'may':5, 'jun':6,
                 'jul':7, 'aug':8, 'sep':9, 'oct':10, 'nov':11, 'dec':12}
    m = re.match(r"\d{4}_(\w{3})", str(month_str).lower())
    if m:
        return month_map.get(m.group(1), None)
    return None

def analyze_affected_sites( input_csv_path: Path, output_directory: Path) -> pd.DataFrame:
    """
    Simple scan of affected sites across 2023-2025 (2025 up to Nov).
    Args:
        non-active_sites: Path to affected_sites_species_counts.csv
        output_dir: Where to save output
    Returns:
        Filtered DataFrame
    """
    
    # Load the CSV
    df = pd.read_csv(input_csv_path)

    # Extract month number
    df['month_number'] = df['month'].apply(extract_month_number)

    # Remove rows where month conversion failed (just in case)
    df = df[df['month_number'].notna()].copy()
    
    # Filter: 2023 (all months), 2024 (all months), 2025 (Jan-Nov only)
    filtered = df[
        (df['year'] == 2023) |
        (df['year'] == 2024) |
        ((df['year'] == 2025) & (df['month_number'] <= 11))
    ].copy()
    
    # Create output directory
    output_directory.mkdir(parents=True, exist_ok=True)
    
    # Save filtered result
    output_file_path = output_directory / "non_active_sites_species.csv"
    filtered.to_csv(output_file_path, index=False)
    
    # Print summary
    print("="*60)
    print("affected sites/species analysis (2023-2025)")
    print("="*60)
    print(f"\nTotal rows: {len(filtered)}")
    print(f"Unique sites: {filtered['siteCode'].nunique()}")
    print(f"Unique species: {filtered['SpeciesCode'].nunique()}")
    print(f"Total missing files: {filtered['count'].sum()}")
    
    print(f"\nBreakdown by year:")
    print(filtered.groupby('year')['count'].sum())
    
    print(f"\nTop 10 affected site-species:")
    top10 = filtered.groupby(['siteCode', 'SpeciesCode'])['count'].sum().sort_values(ascending=False).head(10)
    print(top10)
    
    print(f"\n Saved to: {output_file_path}")
    
    return filtered

In [34]:
result_df = analyze_affected_sites(
    input_csv_path=nonactive_sites_input,
    output_directory=output_dir
)

print("\n" + "="*60)
print("First 10 row of filtered data:")
print("="*60)
print(result_df.head(10))

affected sites/species analysis (2023-2025)

Total rows: 4136
Unique sites: 81
Unique species: 6
Total missing files: 4136

Breakdown by year:
year
2023    1449
2024    1515
2025    1172
Name: count, dtype: int64

Top 10 affected site-species:
siteCode  SpeciesCode
HK6       O3             35
HV3       SO2            35
EN4       PM10           35
BX1       PM10           35
LB4       SO2            35
KF1       PM2.5          35
          PM10           35
KC1       PM10           35
BY7       CO             35
IS2       CO             35
Name: count, dtype: int64

 Saved to: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/missing/non_active_sites_species.csv

First 10 row of filtered data:
  siteCode SpeciesCode  year     month  count  month_number
0      BG1         NO2  2023  2023_jun      1             6
1      BG1         NO2  2023  2023_may      1             5
2      BG1         NO2  2024  2024_apr      1             4
3      BG1         NO2

In [38]:
df = pd.read_csv("/Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/missing/non_active_sites_species.csv")

# Group by siteCode, SpeciesCode, and year, then sum counts
summary = df.groupby(['siteCode', 'SpeciesCode', 'year'])['count'].sum().reset_index()

# Pivot to see years as columns (optional, for a wide view)
pivot = summary.pivot_table(index=['siteCode', 'SpeciesCode'], columns='year', values='count', fill_value=0)
pivot.to_csv("/Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/missing/nonActive_siteSpecies.csv")

print(pivot)

year                  2023  2024  2025
siteCode SpeciesCode                  
BG1      NO2           2.0   5.0   5.0
         SO2           3.0   4.0   8.0
BG2      NO2           1.0   2.0   6.0
         PM10          1.0   2.0   6.0
BL0      CO           12.0  12.0  11.0
...                    ...   ...   ...
WM6      PM10          0.0  11.0  11.0
         PM2.5        12.0   1.0   0.0
WME      NO2          12.0  12.0   9.0
         O3           12.0  12.0   7.0
         PM2.5        12.0  12.0   8.0

[233 rows x 3 columns]


In [39]:
pivot = pd.read_csv(
    "/Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/missing/nonActive_siteSpecies.csv",
    index_col=[0, 1]
)

# Ensure columns are integers (years)
pivot.columns = pivot.columns.astype(str)
required_years = ['2023', '2024', '2025']

# Only keep rows where all required years are present
pivot = pivot[[col for col in required_years if col in pivot.columns]]

# Filter for exact month counts: 2023 (12), 2024 (12), 2025 (11)
filtered = pivot[
    (pivot['2023'] == 12) &
    (pivot['2024'] == 12) &
    (pivot['2025'] == 11)
]

# Save the filtered DataFrame back to CSV
filtered.to_csv(
    "/Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/missing/nonActive_siteSpecies.csv"
)

print(f"Filtered site/species combos saved. Shape: {filtered.shape}")

Filtered site/species combos saved. Shape: (38, 3)


### 5) function to remove non-active site/species from active metadata csv file.
    - what does function:
        - removes rows from actv_sites_species.csv where (SiteName, SpeciesCode) matches any (siteCode, SpeciesCode) in nonActive_siteSpecies.csv
        - result is saved as updated_actv_siteSpecies.csv in the same directory

    - paths:
        - actv_sites_species.csv: base_dir / data / laqn / actv_sites_species.csv
        - nonActive_siteSpecies.csv: base_dir / data / laqn / missing / nonActive_siteSpecies.csv
        - updated_actv_siteSpecies.csv: base_dir / data / laqn / updated_actv_siteSpecies.csv

    - new csv file save as:updated_actv_siteSpecies.csv



In [None]:
def remove_nonactive_from_active(active_path, nonactive_path, output_path):
    """
    Remove non-active site/species combinations from active dataset.
    Args:
        active_path: Path to active site/species CSV
        nonactive_path: Path to non-active site/species CSV
        output_path: Where to save filtered active CSV
    """
    active_df = pd.read_csv(active_path)
    nonactive_df = pd.read_csv(nonactive_path)

    # Create a set of (siteCode, SpeciesCode) tuples for non-active
    nonactive_set = set(zip(nonactive_df['siteCode'], nonactive_df['SpeciesCode']))

    # Filter active_df to exclude non-active combinations
    filtered_active = active_df[
        ~active_df.apply(lambda row: (row['SiteCode'], row['SpeciesCode']) in nonactive_set, axis=1)
    ]

    # Save the filtered DataFrame
    filtered_active.to_csv(output_path, index=False)
    print(f"Filtered active data saved to {output_path}. Shape: {filtered_active.shape}")

In [None]:
remove_nonactive_from_active(
    active_path=base_dir / "data" / "laqn" / "actv_sites_species.csv",
    nonactive_path=base_dir / "data" / "laqn" / "missing" / "nonActive_siteSpecies.csv",
    output_path=base_dir / "data" / "laqn" / "updated_actv_siteSpecies.csv"
)

KeyError: 'siteCode'