# LAQN Updated Active sites/species File Function:
- laqn_remove notebook got slower so i will move the update function here.
- Start with paths and modules to import.

In [37]:
import pandas as pd
from pathlib import Path
import re
import glob

# paths beloww
base_dir = Path("/Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels")
# the metadata file for nan @value 
nanValue_path = base_dir / "data" / "laqn" / "missing" / "logs_nan_value.csv"

# Month abbreviation list for reference/use in functions
month_list = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']

# analyse_affected_sites_2023 path below.
check_nonActive_path = base_dir / "data" / "laqn" / "missing" / "notActive_siteSpecies.csv"
output_notActive_siteSpecies_2023 = base_dir / "data" / "laqn" / "missing" / "notActive_siteSpecies_2023.csv"

# checks the removed site-species combinations against active list and removes them
existing_nonactive_path = base_dir / "data" / "laqn" / "missing" / "notActive_site_species.csv"

# analyse_affected_sites_2024 path below.
output_notActive_siteSpecies_2024 = base_dir / "data" / "laqn" / "missing" / "notActive_siteSpecies_2024.csv"

# analyse_affected_sites_2025 path below.
output_notActive_siteSpecies_2025 = base_dir / "data" / "laqn" / "missing" / "notActive_siteSpecies_2025.csv"

#calculating new issue rate, so I will be count all files in optimased folder and than recalculate the issue rate taking 
# out the notAcvtive_2024/2023 files from output_notActive_siteSpecies_2023, output_notActive_siteSpecies_2024 csv's
optimased_root = base_dir / "data" / "laqn" / "optimased"
log_file = base_dir / "data" / "laqn" / "missing" / "logs_missin_value.csv"

# Collect all CSV files in the optimased_root directory
all_csv_files = list(Path(optimased_root).glob("*.csv"))


## 1) analyse_affected_sites_2023 function to identify site/species with all months missing in 2023
    what does function:

- scans the missing value log for site/species combinations that have 100% missing values for all 12 months in 2023
- if an existing non-active site/species list is provided, only new combinations not already in that list are included
- prints the results and saves them as a CSV file for further review

    paths:

- nanValue_path: base_dir / data / laqn / missing / logs_nan_value.csv
- output_notActive_siteSpecies_2023: base_dir / data / laqn / missing / notActive_siteSpecies_2023.csv
- existing_nonactive_path (optional): base_dir / data / laqn / missing / notActive_site_species.csv
- new csv file save as: notActive_siteSpecies_2023.csv

In [40]:
def analyse_affected_sites_year(
    nanValue_path,  # value_100filtered_missing.csv
    output_notActive_site_species,  # recommended output path
    check_nonActive_path=None,  # notActive_site_species.csv
    year=2025,
    optimased_dir=None  # Optional: base dir for constructing file paths
):
    """
    1. Checks value_100filtered_missing.csv for site/species combos with 100% missing values for all months in the given year.
    2. Compares to notActive_site_species.csv and finds new combos not already listed.
    3. Adds actual filenames (and optionally full paths) for each combo from logs_nan_value.csv.
    4. Prints and saves these new combos to output_notActive_site_species.
    """

    def extract_month_number(month_str):
        month_map = {abbr: i+1 for i, abbr in enumerate(month_list)}
        m = re.match(r"\d{4}_(\w{3})", str(month_str).lower())
        if m:
            return month_map.get(m.group(1), None)
        return None

    # Load and filter for year, 100% missing
    df = pd.read_csv(nanValue_path, encoding='utf-8')
    df['month_number'] = df['month'].apply(extract_month_number)
    df = df[df['month_number'].notna()].copy()
    df_year = df[df['year'] == year]
    summary_year = df_year.groupby(['siteCode', 'SpeciesCode'])['month_number'].nunique().reset_index()
    affected_year = summary_year[summary_year['month_number'] == 12].copy()

    print(f"\nTotal site/species combos with 100% missing in {year}: {len(affected_year)}")
    print(affected_year)

    # Merge to get actual filenames and (optionally) full paths from nanValue_path
    merged = pd.merge(
        affected_year,
        df_year[['siteCode', 'SpeciesCode', 'filename', 'path']].drop_duplicates(),
        on=['siteCode', 'SpeciesCode'],
        how='left'
    )

    # Add expected filename prefix for reference
    def make_filename(row):
        return f"{str(row['siteCode']).lower()}_{str(row['SpeciesCode']).lower()}_"
    merged['expected_filename_prefix'] = merged.apply(make_filename, axis=1)
    if optimased_dir is not None:
        merged['expected_path_prefix'] = merged['expected_filename_prefix'].apply(
            lambda x: str(Path(optimased_dir) / x)
        )

    #  Compare to notActive_site_species.csv 
    if check_nonActive_path is not None:
        try:
            existing = pd.read_csv(check_nonActive_path, encoding='utf-8')
            existing_set = set(zip(existing['siteCode'], existing['SpeciesCode']))
            affected_year_set = set(zip(merged['siteCode'], merged['SpeciesCode']))
            new_combos = affected_year_set - existing_set
            new_affected_year = merged[
                merged.apply(lambda row: (row['siteCode'], row['SpeciesCode']) in new_combos, axis=1)
            ]
            print(f"\nNew site/species combos NOT in notActive_site_species.csv: {len(new_affected_year)}")
            print(new_affected_year)
            # Save to output
            new_affected_year.to_csv(output_notActive_site_species, index=False, encoding='utf-8')
            print(f"\nSaved new combos to: {output_notActive_site_species}")
            return new_affected_year
        except Exception as e:
            print(f"Warning: Could not filter by notActive_site_species.csv: {e}")
    else:
        print(f"\nNo notActive_site_species.csv provided for comparison.")
    merged.to_csv(output_notActive_site_species, index=False, encoding='utf-8')
    return merged


In [41]:
# Run the function for 2023
affected_2023 = analyse_affected_sites_year(
    nanValue_path,
    output_notActive_siteSpecies_2023,
    check_nonActive_path=existing_nonactive_path,
    year=2023
)

# Run the function for 2024
affected_2024 = analyse_affected_sites_year(
    nanValue_path,
    output_notActive_siteSpecies_2024,
    check_nonActive_path=existing_nonactive_path,
    year=2024
)

# Run the function for 2025
affected_2025 = analyse_affected_sites_year(
    nanValue_path,
    output_notActive_siteSpecies_2025,
    check_nonActive_path=existing_nonactive_path,
    year=2025
)



Total site/species combos with 100% missing in 2023: 41
    siteCode SpeciesCode  month_number
40       EN5       PM2.5            12
61       GT1         NO2            12
62       GT1        PM10            12
63       GT1       PM2.5            12
65       HG1       PM2.5            12
80       IS2       PM2.5            12
83       IS6       PM2.5            12
86       KF1        PM25            12
98       ME2         NO2            12
99       ME2        PM10            12
100      ME2       PM2.5            12
101      MEA         NO2            12
102      MEA       PM2.5            12
103      MEB        PM10            12
104      MEB       PM2.5            12
105      MR8        PM25            12
110      RHI         NO2            12
111      RHI        PM10            12
112      RHI       PM2.5            12
117      RI2       PM2.5            12
121      TH2       PM2.5            12
124      TH5         NO2            12
125      TH5        PM10            12
126    

## 2) calculate_adjusted_issue_rate
    This function calculates the adjusted issue rate of files with more than 20% missing @Value in the optimased directory, after excluding files associated with site/species combinations listed in the notActive_siteSpecies_2023.csv and notActive_siteSpecies_2024.csv files.

- How it works:

    - Reads the log file (logs_missin_value.csv) to count the number of files with >20% missing @Value.
    - Recursively counts all CSV files in the optimased directory.
    - Excludes from the total any files that match site/species pairs found in the not-active lists for 2023 and 2024.
    - Calculates the issue rate as: issue rate = ( Number of files with >20% missing @Value / total number of files) Ã— 100


- Purpose:

    - To provide a more accurate data quality metric by removing files that are already flagged as not active for 2023 and 2024, so the issue rate reflects only the remaining, relevant files.

In [49]:
def calculate_adjusted_issue_rate(optimased_root, log_file, notactive_files, all_csv_files):
    """
    Calculate the adjusted issue rate of files with >20% missing values,
    excluding files associated with not-active site/species combinations.
    Now excludes files by matching the 'filename' column in notActive_siteSpecies_... CSVs (using full paths, no lowercasing).
    """
    # Read log file (files with >20% missing @Value)
    df_log = pd.read_csv(log_file)
    # Use full path as filename (no lowercasing)
    df_log['filename'] = df_log['filename'].apply(lambda x: str(x).strip())
    all_csv_files = set(Path(f).name for f in all_csv_files)

    # Read notActive_siteSpecies_2023/2024 and collect filenames to exclude (by filename column if present)
    exclude_filenames = set()
    for naf in notactive_files:
        if Path(naf).exists():
            df_exclude = pd.read_csv(naf)
            if 'filename' in df_exclude.columns:
                # Use the filename column directly (full path, no lowercasing)
                exclude_filenames.update(df_exclude['filename'].dropna().astype(str).str.strip())
            else:
                # Fallback to old pattern-based exclusion
                for _, row in df_exclude.iterrows():
                    site = str(row['siteCode'])
                    species = str(row['SpeciesCode'])
                    pattern = f"{site}_{species}_"
                    matches = [fname for fname in all_csv_files if Path(fname).name.startswith(pattern)]
                    exclude_filenames.update(matches)

    # Remove excluded files from total
    adjusted_total_files = len([f for f in all_csv_files if str(f) not in exclude_filenames])

    # Filter log to only include files not in exclude_filenames
    df_log_filtered = df_log[~df_log['filename'].isin(exclude_filenames)]
    files_with_high_missing = df_log_filtered['filename'].nunique()

    print("Total CSV files in optimased:", len(all_csv_files))
    print("Total files after exclusion:", adjusted_total_files)
    print("Files with >20% missing after exclusion:", files_with_high_missing)
    print("Sample exclude_filenames:", list(exclude_filenames)[:5])
    print("Sample log filenames:", df_log['filename'].head())
    print("Sample all_csv_filenames:", list(all_csv_files)[:5])

    if adjusted_total_files == 0:
        return 0.0
    issue_rate = (files_with_high_missing / adjusted_total_files) * 100
    return issue_rate


In [50]:
# Example usage:
notactive_files = [output_notActive_siteSpecies_2023, output_notActive_siteSpecies_2024]
adjusted_issue_rate = calculate_adjusted_issue_rate(
    optimased_root, log_file, notactive_files, all_csv_files
)
print("Adjusted issue rate: {:.2f}%".format(adjusted_issue_rate))

Total CSV files in optimased: 0
Total files after exclusion: 0
Files with >20% missing after exclusion: 2936
Sample exclude_filenames: ['TH2_PM2.5_2023-03-01_2023-03-31.csv', 'TH6_O3_2023-02-01_2023-02-28.csv', 'GR4_NO2_2024-05-01_2024-05-31.csv', 'BL0_PM10_2024-12-01_2024-12-31.csv', 'TH6_O3_2024-08-01_2024-08-31.csv']
Sample log filenames: 0       BL0_CO_2023-04-01_2023-04-30.csv
1       BT4_O3_2023-04-01_2023-04-30.csv
2      BT4_SO2_2023-04-01_2023-04-30.csv
3    BT5_PM2.5_2023-04-01_2023-04-30.csv
4    BT6_PM2.5_2023-04-01_2023-04-30.csv
Name: filename, dtype: object
Sample all_csv_filenames: []
Adjusted issue rate: 0.00%


In [None]:
def detailed_issue_rate_excluding_notactive(optimased_dir, notactive_files):
    """
    Checks all files in optimased_dir for >20% missing @Value, and calculates the issue rate
    after excluding files listed in notActive_siteSpecies_2023/2024 (by filename).
    Returns: (issue_rate, total_files_checked, files_with_high_missing, missing_values_log)
    """
    from pathlib import Path
    import pandas as pd

    # Gather all CSV files recursively
    all_csv_files = list(Path(optimased_dir).rglob("*.csv"))
    all_csv_filenames = set(f.name for f in all_csv_files)

    # Collect filenames to exclude from notactive_files
    exclude_filenames = set()
    for naf in notactive_files:
        if Path(naf).exists():
            df_exclude = pd.read_csv(naf)
            if 'filename' in df_exclude.columns:
                exclude_filenames.update(df_exclude['filename'].dropna().astype(str).str.strip())
            else:
                for _, row in df_exclude.iterrows():
                    site = str(row['siteCode'])
                    species = str(row['SpeciesCode'])
                    pattern = f"{site}_{species}_"
                    matches = [fname for fname in all_csv_filenames if fname.startswith(pattern)]
                    exclude_filenames.update(matches)

    # Filter files to check (exclude notactive)
    files_to_check = [f for f in all_csv_files if f.name not in exclude_filenames]
    total_files_checked = len(files_to_check)
    files_with_high_missing = 0
    missing_values_log = []

    for filepath in files_to_check:
        try:
            df = pd.read_csv(filepath)
            if '@Value' in df.columns and len(df) > 0:
                missing_values = df['@Value'].isna().sum()
                empty_value_percentage = (100 * missing_values / len(df))
                if empty_value_percentage > 20:
                    files_with_high_missing += 1
                    missing_values_log.append({
                        'filename': filepath.name,
                        'path': str(filepath),
                        'EmptyValuePercentage': round(empty_value_percentage, 2)
                    })
        except Exception as e:
            print(f"Error reading {filepath}: {e}")

    issue_rate = (files_with_high_missing / total_files_checked * 100) if total_files_checked > 0 else 0.0
    print(f"Total files checked: {total_files_checked}")
    print(f"Files with >20% missing @Value: {files_with_high_missing}")
    print(f"Issue rate (excluding not-active): {issue_rate:.2f}%")
    print(f"Files excluded (not-active): {len(exclude_filenames)}")
    return issue_rate, total_files_checked, files_with_high_missing, missing_values_log

# Example usage:
notactive_files = [output_notActive_siteSpecies_2023, output_notActive_siteSpecies_2024]
issue_rate, total_files_checked, files_with_high_missing, missing_values_log = detailed_issue_rate_excluding_notactive(
    optimased_root, notactive_files
)
print(f"\nFinal issue rate (excluding not-active): {issue_rate:.2f}%")
