# LAQN Updated Active sites/species File Function:
- The `laqn_remove` notebook became slow, so the update function is moved here.
- Start with paths and modules to import.

In [1]:
import os
import re
from pathlib import Path

import pandas as pd

# Paths below.
# Project root. Defaults to the original local checkout; set the LAQN_BASE_DIR
# environment variable to run the notebook from another location without
# editing this cell. An empty value falls back to the default.
base_dir = Path(
    os.environ.get("LAQN_BASE_DIR")
    or "/Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels"
)
# Every file referenced in this cell lives in the same "missing" folder.
missing_dir = base_dir / "data" / "laqn" / "missing"

# Log of missing (NaN) values; passed as the input of analyse_affected_sites_2023.
nanValue_path = missing_dir / "logs_nan_value.csv"

# Month abbreviation list for reference/use in functions
month_list = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']

# Paths for analyse_affected_sites_2023.
# NOTE(review): check_nonActive_path ("notActive_siteSpecies.csv") and
# existing_nonactive_path ("notActive_site_species.csv") differ only in how the
# file name is spelled, and the run cell passes existing_nonactive_path.
# Confirm whether both files really exist or one of these is a leftover.
check_nonActive_path = missing_dir / "notActive_siteSpecies.csv"
output_notActive_siteSpecies_2023 = missing_dir / "notActive_siteSpecies_2023.csv"

# Existing list of non-active site/species combinations that new results are compared against.
existing_nonactive_path = missing_dir / "notActive_site_species.csv"



## 1) analyse_affected_sites_2023 function to identify site/species with all months missing in 2023
    What the function does:

- scans the missing value log for site/species combinations that have 100% missing values for all 12 months in 2023
- if an existing non-active site/species list is provided, only new combinations not already in that list are included
- prints the results and saves them as a CSV file for further review

    paths:

- nanValue_path: base_dir / data / laqn / missing / logs_nan_value.csv
- output_notActive_siteSpecies_2023: base_dir / data / laqn / missing / notActive_siteSpecies_2023.csv
- existing_nonactive_path (optional): base_dir / data / laqn / missing / notActive_site_species.csv
- new CSV file is saved as: notActive_siteSpecies_2023.csv

In [10]:
def analyse_affected_sites_2023(
    nanValue_path,  # CSV log of missing values (needs siteCode, SpeciesCode, year, month columns)
    output_notActive_site_species_2023,  # CSV path the resulting combos are written to
    check_nonActive_path=None  # optional CSV of already-known non-active combos
):
    """
    Find site/species combos that are logged as missing in all 12 months of 2023.

    Steps:
    1. Reads the log at nanValue_path and keeps the rows whose 'month' value starts with
       '<4 digits>_<3-letter month abbreviation from month_list>' and whose 'year' is 2023.
    2. Keeps the (siteCode, SpeciesCode) pairs that have 12 distinct months in those rows.
    3. If check_nonActive_path is given, drops the pairs already listed in that CSV
       (matched on siteCode and SpeciesCode).
    4. Prints the result and saves it to output_notActive_site_species_2023.

    NOTE(review): no missing-percentage column is checked here; the function presumably
    relies on the log only containing 100%-missing months. Confirm against whatever
    produces the log.

    Parameters
    ----------
    nanValue_path : path-like
        CSV with at least the columns siteCode, SpeciesCode, year and month.
    output_notActive_site_species_2023 : path-like
        Destination CSV; written without the index.
    check_nonActive_path : path-like, optional
        CSV with siteCode and SpeciesCode columns. When omitted, all combos found in
        step 2 are returned and saved.

    Returns
    -------
    pandas.DataFrame
        Columns siteCode, SpeciesCode, month_number (the distinct-month count, always 12).
        Row labels are those of the grouped summary, so they are not contiguous.

    Notes
    -----
    If the comparison file cannot be read or used, a warning is printed, the unfiltered
    combos are returned and nothing is written, so an earlier filtered output file is not
    replaced by unfiltered data. Errors while reading the log or writing the output are
    not caught.
    """
    key_cols = ['siteCode', 'SpeciesCode']

    # Built once here instead of once per row inside the helper.
    month_map = {abbr: i + 1 for i, abbr in enumerate(month_list)}

    def extract_month_number(month_str):
        # Expected form is e.g. "2023_jan"; anything else maps to None and is dropped below.
        m = re.match(r"\d{4}_(\w{3})", str(month_str).lower())
        if m:
            return month_map.get(m.group(1), None)
        return None

    # Load the log and keep the 2023 rows that carry a recognisable month
    df = pd.read_csv(nanValue_path, encoding='utf-8')
    df['month_number'] = df['month'].apply(extract_month_number)
    df = df[df['month_number'].notna()].copy()
    df_2023 = df[df['year'] == 2023]
    summary_2023 = df_2023.groupby(key_cols)['month_number'].nunique().reset_index()
    affected_2023 = summary_2023[summary_2023['month_number'] == 12].copy()

    print(f"\nTotal site/species combos with 100% missing in 2023: {len(affected_2023)}")
    print(affected_2023)

    if check_nonActive_path is None:
        print("\nNo notActive_site_species.csv provided for comparison.")
        result = affected_2023
    else:
        # Only the comparison is best-effort; saving happens outside this try so that a
        # write failure is not misreported as a filtering problem.
        try:
            existing = pd.read_csv(check_nonActive_path, encoding='utf-8')
            existing_set = set(zip(existing['siteCode'], existing['SpeciesCode']))
            # Explicit boolean Series: stays a valid row mask even when there are no rows,
            # unlike a row-wise DataFrame.apply, which returns a DataFrame for empty input.
            is_new = pd.Series(
                [
                    pair not in existing_set
                    for pair in zip(affected_2023['siteCode'], affected_2023['SpeciesCode'])
                ],
                index=affected_2023.index,
                dtype=bool,
            )
            result = affected_2023[is_new]
        except Exception as e:
            print(f"Warning: Could not filter by notActive_site_species.csv: {e}")
            return affected_2023
        print(f"\nNew site/species combos NOT in notActive_site_species.csv: {len(result)}")
        print(result)

    # Save whichever result is returned, so the file on disk always matches it.
    result.to_csv(output_notActive_site_species_2023, index=False, encoding='utf-8')
    print(f"\nSaved new combos to: {output_notActive_site_species_2023}")
    return result


In [11]:
# Look for the 2023 site/species combos in the missing-value log (nanValue_path),
# compare them against the existing non-active list (existing_nonactive_path),
# and write the outcome to output_notActive_siteSpecies_2023.
affected_2023 = analyse_affected_sites_2023(
    nanValue_path=nanValue_path,
    output_notActive_site_species_2023=output_notActive_siteSpecies_2023,
    check_nonActive_path=existing_nonactive_path,
)



Total site/species combos with 100% missing in 2023: 41
    siteCode SpeciesCode  month_number
40       EN5       PM2.5            12
61       GT1         NO2            12
62       GT1        PM10            12
63       GT1       PM2.5            12
65       HG1       PM2.5            12
80       IS2       PM2.5            12
83       IS6       PM2.5            12
86       KF1        PM25            12
98       ME2         NO2            12
99       ME2        PM10            12
100      ME2       PM2.5            12
101      MEA         NO2            12
102      MEA       PM2.5            12
103      MEB        PM10            12
104      MEB       PM2.5            12
105      MR8        PM25            12
110      RHI         NO2            12
111      RHI        PM10            12
112      RHI       PM2.5            12
117      RI2       PM2.5            12
121      TH2       PM2.5            12
124      TH5         NO2            12
125      TH5        PM10            12
126    