# Cleaning LAQN Datasets
I will be removing the files has %100 missing values.

import and path statements below.

In [8]:
import pandas as pd
from pathlib import Path
import os
import re

# paths beloww
base_dir = Path("/Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels")

# filter logs_missin_value.csv value=100% missing value function paths below.
input_all_missing_file = base_dir / "data" / "laqn" / "missing" / "logs_missin_value.csv"
output_filtered_value_file = base_dir / "data" / "laqn" / "missing" / "value_100filtered_missing.csv"

# analyse the sites/speciest not have any value so they're not active site/species/  analyse_affected_sites function
output_notActive_site_species = base_dir / "data"/ "laqn"/ "missing"/ "notActive_site_species.csv"



#### 1) filter logs_missin_value.csv file according to 100 percent missing values.
- filters based on 100 value column and than creates another missing_files csv.
- filters siteCode and speciesCode columns different to find out that what site's don't have that species on their system.

In [6]:
    
def filter_missing_pollutants():
    """
    Filter the logs_missin_value.csv to create:
   value_100filtered_missing.csv - rows with 100% EmptyValuePercentage and siteCode != SpeciesCode
    
    """
    print("="*40)
    print(f"Filtering missing value logs from: {input_all_missing_file}")
    print("="*40)

    # Load the file
    df = pd.read_csv(input_all_missing_file, encoding='utf-8')
    print(f"Loaded {len(df)} rows from logs_missin_value.csv")

    # 1. Filter for 100% EmptyValuePercentage
    df_100 = df[df['EmptyValuePercentage'] == 100]
    df_100.to_csv(output_filtered_value_file, index=False)
    print(f"Saved {len(df_100)} rows with 100% missing values to: {output_filtered_value_file}")

    # 2. Filter where siteCode != SpeciesCode
    df_100 = df_100[df_100['siteCode'] != df_100['SpeciesCode']]
    df_100.to_csv(output_filtered_value_file, index=False)
    print(f"Saved {len(df_100)} rows (siteCode != SpeciesCode) to: {output_filtered_value_file}")

    print("="*100)
    print("Filtering complete.\n")

    return df_100


In [7]:

# Example usage
filter_missing_pollutants()

Filtering missing value logs from: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/missing/logs_missin_value.csv
Loaded 4136 rows from logs_missin_value.csv
Saved 3401 rows with 100% missing values to: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/missing/value_100filtered_missing.csv
Saved 3401 rows (siteCode != SpeciesCode) to: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/missing/value_100filtered_missing.csv
Filtering complete.



Unnamed: 0,filename,path,siteCode,SpeciesCode,year,month,EmptyValuePercentage
0,BL0_CO_2023-04-01_2023-04-30.csv,/Users/burdzhuchaglayan/Desktop/data science p...,BL0,CO,2023,2023_apr,100.0
2,BT4_SO2_2023-04-01_2023-04-30.csv,/Users/burdzhuchaglayan/Desktop/data science p...,BT4,SO2,2023,2023_apr,100.0
3,BT5_PM2.5_2023-04-01_2023-04-30.csv,/Users/burdzhuchaglayan/Desktop/data science p...,BT5,PM2.5,2023,2023_apr,100.0
4,BT6_PM2.5_2023-04-01_2023-04-30.csv,/Users/burdzhuchaglayan/Desktop/data science p...,BT6,PM2.5,2023,2023_apr,100.0
5,BT6_SO2_2023-04-01_2023-04-30.csv,/Users/burdzhuchaglayan/Desktop/data science p...,BT6,SO2,2023,2023_apr,100.0
...,...,...,...,...,...,...,...
4129,WM0_O3_2025-09-01_2025-09-30.csv,/Users/burdzhuchaglayan/Desktop/data science p...,WM0,O3,2025,2025_sep,100.0
4130,WM0_PM10_2025-09-01_2025-09-30.csv,/Users/burdzhuchaglayan/Desktop/data science p...,WM0,PM10,2025,2025_sep,100.0
4131,WM0_PM2.5_2025-09-01_2025-09-30.csv,/Users/burdzhuchaglayan/Desktop/data science p...,WM0,PM2.5,2025,2025_sep,100.0
4132,WM0_SO2_2025-09-01_2025-09-30.csv,/Users/burdzhuchaglayan/Desktop/data science p...,WM0,SO2,2025,2025_sep,100.0


### 2) Function for identifying site-species combinations not monitored throughout the year

- This function analyses the value_100filtered_missing.csv file to find site-species pairs that have 100% missing values for every month in a year.
- If a station has no valid values for a given species across all months (2023: 12 months, 2024: 12 months, 2025: 11 months), it is considered as not monitoring that species.
- The function groups and pivots the data to summarise which site-species pairs are consistently missing, providing a clear list of non-active monitoring combinations.
- The output includes summary statistics and a filtered table of site-species pairs that are not monitored, which can be used for further reporting or to update active site/species metadata.

In [11]:
def extract_month_number(month_str):
    # expects format 2025_jul returns 7 for 'jul
    month_map = {'jan':1, 'feb':2, 'mar':3, 'apr':4, 'may':5, 'jun':6,
                 'jul':7, 'aug':8, 'sep':9, 'oct':10, 'nov':11, 'dec':12}
    m = re.match(r"\d{4}_(\w{3})", str(month_str).lower())
    if m:
        return month_map.get(m.group(1), None)
    return None

def analyse_affected_sites(output_filtered_value_file, output_notActive_site_species):
    """
    Scan for site-species combinations with 100% missing values for all months in a year.
    Prints summary and returns the grouped DataFrame.
    """
    # Load the CSV
    df = pd.read_csv(output_filtered_value_file, encoding='utf-8')
    print("CSV structure (columns):", df.columns.tolist())
    print("First 5 rows:\n", df.head())

    # Extract month number from 'month' column
    df['month_number'] = df['month'].apply(extract_month_number)

    # Remove rows where month conversion failed
    df = df[df['month_number'].notna()].copy()

    # Count files per site, species, year, and month
    df['count'] = 1
    summary = df.groupby(['siteCode', 'SpeciesCode', 'year', 'month']).size().reset_index(name='count')
    print("\nGrouped summary (site, species, year, month):")
    print(summary.head())

    # Pivot for wide view: months per year
    pivot = summary.pivot_table(index=['siteCode', 'SpeciesCode', 'year'], values='count', aggfunc='sum', fill_value=0)
    print("\nPivot table (site, species, year):")
    print(pivot.head())

    # Pivot to see years as columns (site/species as index)
    summary_year = df.groupby(['siteCode', 'SpeciesCode', 'year'])['count'].sum().reset_index()
    pivot_year = summary_year.pivot_table(index=['siteCode', 'SpeciesCode'], columns='year', values='count', fill_value=0)
    print("\nPivot table (site/species x year):")
    print(pivot_year.head())

    # Ensure columns are strings for year
    pivot_year.columns = pivot_year.columns.astype(str)
    required_years = ['2023', '2024', '2025']
    pivot_year = pivot_year[[col for col in required_years if col in pivot_year.columns]]

    # Filter for exact month counts: 2023 (12), 2024 (12), 2025 (11)
    filtered = pivot_year[
        (pivot_year.get('2023', 0) == 12) &
        (pivot_year.get('2024', 0) == 12) &
        (pivot_year.get('2025', 0) == 11)
    ]
    print("\nFiltered site/species with all months missing (2023:12, 2024:12, 2025:11):")
    print(filtered)

    # Save the filtered DataFrame to CSV (commented out for now)
    filtered.to_csv(output_notActive_site_species, index=True, encoding='utf-8')
    print(f"\nFiltered site/species combos saved to: {output_notActive_site_species}")

    return filtered

In [12]:
# use the function to analyse affected sites/species
filtered = analyse_affected_sites(input_all_missing_file, output_notActive_site_species)

CSV structure (columns): ['filename', 'path', 'siteCode', 'SpeciesCode', 'year', 'month', 'EmptyValuePercentage']
First 5 rows:
                               filename  \
0     BL0_CO_2023-04-01_2023-04-30.csv   
1     BT4_O3_2023-04-01_2023-04-30.csv   
2    BT4_SO2_2023-04-01_2023-04-30.csv   
3  BT5_PM2.5_2023-04-01_2023-04-30.csv   
4  BT6_PM2.5_2023-04-01_2023-04-30.csv   

                                                path siteCode SpeciesCode  \
0  /Users/burdzhuchaglayan/Desktop/data science p...      BL0          CO   
1  /Users/burdzhuchaglayan/Desktop/data science p...      BT4          O3   
2  /Users/burdzhuchaglayan/Desktop/data science p...      BT4         SO2   
3  /Users/burdzhuchaglayan/Desktop/data science p...      BT5       PM2.5   
4  /Users/burdzhuchaglayan/Desktop/data science p...      BT6       PM2.5   

   year     month  EmptyValuePercentage  
0  2023  2023_apr                100.00  
1  2023  2023_apr                 52.87  
2  2023  2023_apr            

### 5) function to remove non-active site/species from active metadata csv file.
- what does function:
    - removes rows from actv_sites_species.csv where (SiteName, SpeciesCode) matches any (siteCode, SpeciesCode) in nonActive_siteSpecies.csv
    - result is saved as updated_actv_siteSpecies.csv in the same directory

- paths:
    - actv_sites_species.csv: base_dir / data / laqn / actv_sites_species.csv
    - nonActive_siteSpecies.csv: base_dir / data / laqn / missing / nonActive_siteSpecies.csv
    - updated_actv_siteSpecies.csv: base_dir / data / laqn / updated_actv_siteSpecies.csv

- new csv file save as:updated_actv_siteSpecies.csv

