# Cleaning LAQN Datasets
I will be removing the files that have 100% missing values.

Import and path statements are below.

In [11]:
import pandas as pd
from pathlib import Path
import os
import re

# Paths used throughout this notebook.
# Everything hangs off the project root; the two helper folders below are
# shared by most of the constants that follow.
base_dir = Path("/Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels")
_laqn_root = base_dir / "data" / "laqn"
_missing_root = _laqn_root / "missing"

nonActive_path = _missing_root / "nonActive_siteSpecies.csv"
optimased_dir = _laqn_root / "optimased"
processed_dir = _laqn_root / "processed"
# One sub-folder per month inside optimased/, in sorted order.
month_dirs = sorted(filter(Path.is_dir, optimased_dir.iterdir()))

# Outputs of this notebook go to data/laqn/missing (created if absent).
output_dir = _missing_root
output_dir.mkdir(parents=True, exist_ok=True)

# Input/output for the "more than 80% empty values" step.
input_file = _missing_root / "rm_100percnt_missingValues_log.csv"
output_file = _missing_root / "logs_emptyValue_higher80.csv"

# Input for the affected sites/species analysis function.
nonactive_sites_input = _missing_root / "affected_sites_species_counts.csv"

# Files for removing non-active sites/species from the active metadata CSV
# that the get_laqn fetching script uses.
active_path = _laqn_root / "actv_sites_species.csv"
# Same file as nonActive_path above, kept under its second spelling.
nonactive_path = _missing_root / "nonActive_siteSpecies.csv"
output_path = _laqn_root / "updated_actv_siteSpecies.csv"

### 1. Function that removes files with 100 percent missing data.

In [6]:
def rm_nonActiveSiteSpecies(log_path=None, data_dir=None):
    """
    Delete the monthly CSV files of every site/species pair listed in a log.

    For each distinct (siteCode, SpeciesCode) pair in the log CSV, every file
    named ``<site>_<species>_*.csv`` found directly inside a sub-folder of
    the data directory is deleted from disk. Progress is printed; nothing is
    returned.

    Args:
        log_path: CSV file with ``siteCode`` and ``SpeciesCode`` columns.
            Defaults to the module-level ``nonActive_path``.
        data_dir: Directory whose immediate sub-folders are searched.
            Defaults to the module-level ``optimased_dir``.

    Raises:
        FileNotFoundError: If the log file or the data directory is missing.
        KeyError: If the log lacks the ``siteCode``/``SpeciesCode`` columns.
    """
    if log_path is None:
        log_path = nonActive_path
    if data_dir is None:
        data_dir = optimased_dir
    data_dir = Path(data_dir)

    # Read log file
    df = pd.read_csv(log_path, encoding='utf-8')
    print(f"Loaded {len(df)} rows from {log_path}")

    # Each pair only needs to be searched once, however often it is listed.
    pairs = list(
        df[['siteCode', 'SpeciesCode']]
        .astype(str)
        .drop_duplicates()
        .itertuples(index=False, name=None)
    )

    # List the month folders once instead of once per log row.
    month_folders = [d for d in data_dir.iterdir() if d.is_dir()] if pairs else []

    # Remove files
    removed = 0
    for site, species in pairs:
        pattern = f"{site}_{species}_*.csv"
        for month_dir in month_folders:
            # Materialise the matches first: the folder is modified below.
            for file in list(month_dir.glob(pattern)):
                try:
                    os.remove(file)
                except FileNotFoundError:
                    # Already gone (e.g. removed by another process): skip it.
                    continue
                print(f"Removed: {file}")
                removed += 1
    print(f"Total files removed: {removed}")

In [8]:

# Run the clean-up. NOTE: this permanently deletes the matching CSV files
# from the optimased/ month folders and prints how many were removed.
rm_nonActiveSiteSpecies()

Loaded 38 rows from /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/missing/nonActive_siteSpecies.csv
Total files removed: 0


### 3) Document missing data below.

In [None]:
# Load the log of missing files.
# NOTE(review): this cell needs 'siteCode', 'SpeciesCode', 'year', 'month'
# and 'path' columns, but later cells overwrite nonActive_path with a
# per-year pivot table - confirm which log file should be read here.
missing_log = pd.read_csv(nonActive_path, encoding='utf-8')

# Count missing files per site / species / year / month.
group_keys = ['siteCode', 'SpeciesCode', 'year', 'month']
grouped = missing_log.groupby(group_keys).size().reset_index(name='count')
print(grouped)

# Store the counts next to the log file.
counts_csv = nonActive_path.parent / "affected_sites_species_counts.csv"
grouped.to_csv(counts_csv, index=False)

# Temporal distribution: take the '<year>_<mon>' label from each file path
# and count how many missing files fall in each month.
month_labels = missing_log['path'].str.extract(r'(\d{4}_\w{3})')
missing_log['month'] = month_labels
print(missing_log['month'].value_counts())

     siteCode SpeciesCode  year     month  count
0         BG1         NO2  2023  2023_jun      1
1         BG1         NO2  2023  2023_may      1
2         BG1         NO2  2024  2024_apr      1
3         BG1         NO2  2024  2024_aug      1
4         BG1         NO2  2024  2024_feb      1
...       ...         ...   ...       ...    ...
4131      WME       PM2.5  2025  2025_jul      1
4132      WME       PM2.5  2025  2025_jun      1
4133      WME       PM2.5  2025  2025_mar      1
4134      WME       PM2.5  2025  2025_may      1
4135      WME       PM2.5  2025  2025_sep      1

[4136 rows x 5 columns]
month
2024_aug    138
2025_mar    135
2024_may    133
2023_dec    131
2023_oct    131
2024_sep    129
2024_jul    128
2023_aug    127
2024_jan    127
2023_sep    126
2023_nov    125
2024_oct    124
2025_jan    124
2024_feb    123
2024_jun    123
2024_mar    123
2024_nov    123
2024_apr    123
2023_may    122
2024_dec    121
2023_mar    121
2025_feb    120
2023_jan    118
2023_jul    1

### 4) Function to find which site-species pairs are not monitored, using affected_sites_species_counts.csv.
 - If a station has no values for a species all year round, it means that the station is not monitoring that species.
 - The function returns the filtered rows and prints summary statistics for the affected sites.
 

In [33]:
def extract_month_number(month_str):
    """
    Convert a '<year>_<mon>' label such as '2025_jul' into a month number.

    The input is stringified and lower-cased, so the match is
    case-insensitive. Returns 1-12 for a recognised three-letter month
    abbreviation, or None when the label does not start with four digits,
    an underscore and a known abbreviation.
    """
    abbreviations = ('jan', 'feb', 'mar', 'apr', 'may', 'jun',
                     'jul', 'aug', 'sep', 'oct', 'nov', 'dec')
    label = str(month_str).lower()
    match = re.match(r"\d{4}_(\w{3})", label)
    if match is None:
        return None
    abbr = match.group(1)
    if abbr not in abbreviations:
        return None
    # Position in the tuple is zero-based; months are one-based.
    return abbreviations.index(abbr) + 1

def analyze_affected_sites(input_csv_path: Path, output_directory: Path) -> pd.DataFrame:
    """
    Filter the affected site/species counts to 2023, 2024 and Jan-Nov 2025.

    Reads the counts CSV, derives a numeric 'month_number' from the 'month'
    label, keeps only the rows inside the study window, writes them to
    'non_active_sites_species.csv' in the output directory and prints a
    short summary.

    Args:
        input_csv_path: CSV with 'siteCode', 'SpeciesCode', 'year', 'month'
            and 'count' columns.
        output_directory: Folder for the filtered CSV; created if missing.
    Returns:
        The filtered DataFrame, including the added 'month_number' column.
    """
    records = pd.read_csv(input_csv_path)

    # Turn '2025_jul'-style labels into month numbers; drop rows that fail.
    records['month_number'] = records['month'].apply(extract_month_number)
    records = records.loc[records['month_number'].notna()].copy()

    # Study window: all of 2023 and 2024, plus 2025 up to and including Nov.
    year = records['year']
    full_years = (year == 2023) | (year == 2024)
    partial_2025 = (year == 2025) & (records['month_number'] <= 11)
    filtered = records[full_years | partial_2025].copy()

    # Persist the filtered rows.
    output_directory.mkdir(parents=True, exist_ok=True)
    output_file_path = output_directory / "non_active_sites_species.csv"
    filtered.to_csv(output_file_path, index=False)

    # Console summary.
    rule = "=" * 60
    print(rule)
    print("affected sites/species analysis (2023-2025)")
    print(rule)

    site_count = filtered['siteCode'].nunique()
    species_count = filtered['SpeciesCode'].nunique()
    missing_total = filtered['count'].sum()
    print(f"\nTotal rows: {len(filtered)}")
    print(f"Unique sites: {site_count}")
    print(f"Unique species: {species_count}")
    print(f"Total missing files: {missing_total}")

    per_year = filtered.groupby('year')['count'].sum()
    print("\nBreakdown by year:")
    print(per_year)

    per_pair = filtered.groupby(['siteCode', 'SpeciesCode'])['count'].sum()
    top10 = per_pair.sort_values(ascending=False).head(10)
    print("\nTop 10 affected site-species:")
    print(top10)

    print(f"\n Saved to: {output_file_path}")

    return filtered

### 5) Function to remove non-active site/species pairs from the active metadata CSV file.
- What the function does:
    - Removes rows from actv_sites_species.csv where (SiteName, SpeciesCode) matches any (siteCode, SpeciesCode) in nonActive_siteSpecies.csv.
    - The result is saved as updated_actv_siteSpecies.csv in the same directory.

- paths:
    - actv_sites_species.csv: base_dir / data / laqn / actv_sites_species.csv
    - nonActive_siteSpecies.csv: base_dir / data / laqn / missing / nonActive_siteSpecies.csv
    - updated_actv_siteSpecies.csv: base_dir / data / laqn / updated_actv_siteSpecies.csv

- The new CSV file is saved as: updated_actv_siteSpecies.csv



In [34]:
# Filter the affected site/species counts and write the result to output_dir.
result_df = analyze_affected_sites(input_csv_path=nonactive_sites_input, output_directory=output_dir)

# Preview the first rows of the filtered table under a banner.
banner = "=" * 60
print("\n" + banner)
print("First 10 row of filtered data:")
print(banner)
preview = result_df.head(10)
print(preview)

affected sites/species analysis (2023-2025)

Total rows: 4136
Unique sites: 81
Unique species: 6
Total missing files: 4136

Breakdown by year:
year
2023    1449
2024    1515
2025    1172
Name: count, dtype: int64

Top 10 affected site-species:
siteCode  SpeciesCode
HK6       O3             35
HV3       SO2            35
EN4       PM10           35
BX1       PM10           35
LB4       SO2            35
KF1       PM2.5          35
          PM10           35
KC1       PM10           35
BY7       CO             35
IS2       CO             35
Name: count, dtype: int64

 Saved to: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/missing/non_active_sites_species.csv

First 10 row of filtered data:
  siteCode SpeciesCode  year     month  count  month_number
0      BG1         NO2  2023  2023_jun      1             6
1      BG1         NO2  2023  2023_may      1             5
2      BG1         NO2  2024  2024_apr      1             4
3      BG1         NO2

In [38]:
# Load the filtered per-month rows that analyze_affected_sites() wrote to
# output_dir (paths come from the constants defined at the top of the file
# rather than from repeated absolute path strings).
df = pd.read_csv(output_dir / "non_active_sites_species.csv")

# Group by siteCode, SpeciesCode, and year, then sum counts
summary = df.groupby(['siteCode', 'SpeciesCode', 'year'])['count'].sum().reset_index()

# Pivot to a wide view: one row per site/species, one column per year.
# pivot_table's default aggfunc is the mean; after the groupby each cell holds
# exactly one value, so the mean equals the sum (values come out as floats).
pivot = summary.pivot_table(index=['siteCode', 'SpeciesCode'], columns='year', values='count', fill_value=0)

# NOTE: this overwrites nonActive_siteSpecies.csv with the pivoted table.
pivot.to_csv(nonActive_path)

print(pivot)

year                  2023  2024  2025
siteCode SpeciesCode                  
BG1      NO2           2.0   5.0   5.0
         SO2           3.0   4.0   8.0
BG2      NO2           1.0   2.0   6.0
         PM10          1.0   2.0   6.0
BL0      CO           12.0  12.0  11.0
...                    ...   ...   ...
WM6      PM10          0.0  11.0  11.0
         PM2.5        12.0   1.0   0.0
WME      NO2          12.0  12.0   9.0
         O3           12.0  12.0   7.0
         PM2.5        12.0  12.0   8.0

[233 rows x 3 columns]


In [39]:
# Re-load the per-year pivot; the first two CSV columns (siteCode,
# SpeciesCode) form the row index.
pivot = pd.read_csv(nonActive_path, index_col=[0, 1])

# Make sure the year column labels are strings so they can be looked up
# as '2023', '2024', '2025'.
pivot.columns = pivot.columns.astype(str)
required_years = ['2023', '2024', '2025']

# Fail early with a clear message if a year column is absent, instead of an
# opaque KeyError from the comparisons below.
missing_years = [year for year in required_years if year not in pivot.columns]
if missing_years:
    raise KeyError(f"Missing year column(s) in {nonActive_path}: {missing_years}")

# Keep only the required year columns, in a fixed order.
pivot = pivot[required_years]

# Keep site/species pairs with no data for the whole study period:
# 12 missing months in 2023, 12 in 2024 and 11 in 2025.
filtered = pivot[
    (pivot['2023'] == 12) &
    (pivot['2024'] == 12) &
    (pivot['2025'] == 11)
]

# Save the filtered DataFrame back to the same CSV (overwrites the pivot).
filtered.to_csv(nonActive_path)

print(f"Filtered site/species combos saved. Shape: {filtered.shape}")

Filtered site/species combos saved. Shape: (38, 3)
