# LAQN Updated Active sites/species File Function:
- laqn_remove notebook got slower so i will move the update function here.
- Start with paths and modules to import.

In [3]:
import pandas as pd
from pathlib import Path
import re
import glob

# paths beloww
base_dir = Path("/Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels")
# the metadata file for nan @value 
nanValue_path = base_dir / "data" / "laqn" / "missing" / "logs_nan_value.csv"

# Month abbreviation list for reference/use in functions
month_list = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']

# analyse_affected_sites_2023 path below.
check_nonActive_path = base_dir / "data" / "laqn" / "missing" / "notActive_siteSpecies.csv"
output_notActive_siteSpecies_2023 = base_dir / "data" / "laqn" / "missing" / "notActive_siteSpecies_2023.csv"

# checks the removed site-species combinations against active list and removes them
existing_nonactive_path = base_dir / "data" / "laqn" / "missing" / "notActive_site_species.csv"

# analyse_affected_sites_2024 path below.
output_notActive_siteSpecies_2024 = base_dir / "data" / "laqn" / "missing" / "notActive_siteSpecies_2024.csv"

# analyse_affected_sites_2025 path below.
output_notActive_siteSpecies_2025 = base_dir / "data" / "laqn" / "missing" / "notActive_siteSpecies_2025.csv"

#calculating new issue rate, so I will be count all files in optimased folder and than recalculate the issue rate taking 
# out the notAcvtive_2024/2023 files from output_notActive_siteSpecies_2023, output_notActive_siteSpecies_2024 csv's
optimased_root = base_dir / "data" / "laqn" / "optimased"
log_file = base_dir / "data" / "laqn" / "missing" / "logs_missin_value.csv"

# Collect all CSV files in the optimased_root directory
all_csv_files = list(Path(optimased_root).rglob("*.csv"))
all_csv_filenames = set(f.name for f in all_csv_files)

# logs of removed files
log_csv_path = base_dir / "data" / "laqn" / "missing" / "logs_rm_notActive_23_24.csv"

## 1) analyse_affected_sites_2023 function to identify site/species with all months missing in 2023
    what does function:

- scans the missing value log for site/species combinations that have 100% missing values for all 12 months in 2023
- if an existing non-active site/species list is provided, only new combinations not already in that list are included
- prints the results and saves them as a CSV file for further review

    paths:

- nanValue_path: base_dir / data / laqn / missing / logs_nan_value.csv
- output_notActive_siteSpecies_2023: base_dir / data / laqn / missing / notActive_siteSpecies_2023.csv
- existing_nonactive_path (optional): base_dir / data / laqn / missing / notActive_site_species.csv
- new csv file save as: notActive_siteSpecies_2023.csv

In [40]:
def analyse_affected_sites_year(
    nanValue_path,  # value_100filtered_missing.csv
    output_notActive_site_species,  # recommended output path
    check_nonActive_path=None,  # notActive_site_species.csv
    year=2025,
    optimased_dir=None  # Optional: base dir for constructing file paths
):
    """
    1. Checks value_100filtered_missing.csv for site/species combos with 100% missing values for all months in the given year.
    2. Compares to notActive_site_species.csv and finds new combos not already listed.
    3. Adds actual filenames (and optionally full paths) for each combo from logs_nan_value.csv.
    4. Prints and saves these new combos to output_notActive_site_species.
    """

    def extract_month_number(month_str):
        month_map = {abbr: i+1 for i, abbr in enumerate(month_list)}
        m = re.match(r"\d{4}_(\w{3})", str(month_str).lower())
        if m:
            return month_map.get(m.group(1), None)
        return None

    # Load and filter for year, 100% missing
    df = pd.read_csv(nanValue_path, encoding='utf-8')
    df['month_number'] = df['month'].apply(extract_month_number)
    df = df[df['month_number'].notna()].copy()
    df_year = df[df['year'] == year]
    summary_year = df_year.groupby(['siteCode', 'SpeciesCode'])['month_number'].nunique().reset_index()
    affected_year = summary_year[summary_year['month_number'] == 12].copy()

    print(f"\nTotal site/species combos with 100% missing in {year}: {len(affected_year)}")
    print(affected_year)

    # Merge to get actual filenames and (optionally) full paths from nanValue_path
    merged = pd.merge(
        affected_year,
        df_year[['siteCode', 'SpeciesCode', 'filename', 'path']].drop_duplicates(),
        on=['siteCode', 'SpeciesCode'],
        how='left'
    )

    # Add expected filename prefix for reference
    def make_filename(row):
        return f"{str(row['siteCode']).lower()}_{str(row['SpeciesCode']).lower()}_"
    merged['expected_filename_prefix'] = merged.apply(make_filename, axis=1)
    if optimased_dir is not None:
        merged['expected_path_prefix'] = merged['expected_filename_prefix'].apply(
            lambda x: str(Path(optimased_dir) / x)
        )

    #  Compare to notActive_site_species.csv 
    if check_nonActive_path is not None:
        try:
            existing = pd.read_csv(check_nonActive_path, encoding='utf-8')
            existing_set = set(zip(existing['siteCode'], existing['SpeciesCode']))
            affected_year_set = set(zip(merged['siteCode'], merged['SpeciesCode']))
            new_combos = affected_year_set - existing_set
            new_affected_year = merged[
                merged.apply(lambda row: (row['siteCode'], row['SpeciesCode']) in new_combos, axis=1)
            ]
            print(f"\nNew site/species combos NOT in notActive_site_species.csv: {len(new_affected_year)}")
            print(new_affected_year)
            # Save to output
            new_affected_year.to_csv(output_notActive_site_species, index=False, encoding='utf-8')
            print(f"\nSaved new combos to: {output_notActive_site_species}")
            return new_affected_year
        except Exception as e:
            print(f"Warning: Could not filter by notActive_site_species.csv: {e}")
    else:
        print(f"\nNo notActive_site_species.csv provided for comparison.")
    merged.to_csv(output_notActive_site_species, index=False, encoding='utf-8')
    return merged


In [41]:
# Run the function for 2023
affected_2023 = analyse_affected_sites_year(
    nanValue_path,
    output_notActive_siteSpecies_2023,
    check_nonActive_path=existing_nonactive_path,
    year=2023
)

# Run the function for 2024
affected_2024 = analyse_affected_sites_year(
    nanValue_path,
    output_notActive_siteSpecies_2024,
    check_nonActive_path=existing_nonactive_path,
    year=2024
)

# Run the function for 2025
affected_2025 = analyse_affected_sites_year(
    nanValue_path,
    output_notActive_siteSpecies_2025,
    check_nonActive_path=existing_nonactive_path,
    year=2025
)



Total site/species combos with 100% missing in 2023: 41
    siteCode SpeciesCode  month_number
40       EN5       PM2.5            12
61       GT1         NO2            12
62       GT1        PM10            12
63       GT1       PM2.5            12
65       HG1       PM2.5            12
80       IS2       PM2.5            12
83       IS6       PM2.5            12
86       KF1        PM25            12
98       ME2         NO2            12
99       ME2        PM10            12
100      ME2       PM2.5            12
101      MEA         NO2            12
102      MEA       PM2.5            12
103      MEB        PM10            12
104      MEB       PM2.5            12
105      MR8        PM25            12
110      RHI         NO2            12
111      RHI        PM10            12
112      RHI       PM2.5            12
117      RI2       PM2.5            12
121      TH2       PM2.5            12
124      TH5         NO2            12
125      TH5        PM10            12
126    

## 2) calculate_adjusted_issue_rate
    This function calculates the adjusted issue rate of files with more than 20% missing @Value in the optimased directory, after excluding files associated with site/species combinations listed in the notActive_siteSpecies_2023.csv and notActive_siteSpecies_2024.csv files.

- How it works:

    - Reads the log file (logs_missin_value.csv) to count the number of files with >20% missing @Value.
    - Recursively counts all CSV files in the optimased directory.
    - Excludes from the total any files that match site/species pairs found in the not-active lists for 2023 and 2024.
    - Calculates the issue rate as: issue rate = ( Number of files with >20% missing @Value / total number of files) Ã— 100


- Purpose:

    - To provide a more accurate data quality metric by removing files that are already flagged as not active for 2023 and 2024, so the issue rate reflects only the remaining, relevant files.

In [52]:
def detailed_issue_rate_excluding_notactive(optimased_dir, notactive_files):
    """
    Checks all files in optimased_dir for >20% missing @Value, and calculates the issue rate
    after excluding files listed in notActive_siteSpecies_2023/2024 (by filename).
    Returns: (issue_rate, total_files_checked, files_with_high_missing, missing_values_log)
    """

    # Collect filenames to exclude from notactive_files
    exclude_filenames = set()
    for naf in notactive_files:
        if Path(naf).exists():
            df_exclude = pd.read_csv(naf)
            if 'filename' in df_exclude.columns:
                exclude_filenames.update(df_exclude['filename'].dropna().astype(str).str.strip())
            else:
                for _, row in df_exclude.iterrows():
                    site = str(row['siteCode'])
                    species = str(row['SpeciesCode'])
                    pattern = f"{site}_{species}_"
                    matches = [fname for fname in all_csv_filenames if fname.startswith(pattern)]
                    exclude_filenames.update(matches)

    # Filter files to check (exclude notactive)
    files_to_check = [f for f in all_csv_files if f.name not in exclude_filenames]
    total_files_checked = len(files_to_check)
    files_with_high_missing = 0
    missing_values_log = []

    for filepath in files_to_check:
        try:
            df = pd.read_csv(filepath)
            if '@Value' in df.columns and len(df) > 0:
                missing_values = df['@Value'].isna().sum()
                empty_value_percentage = (100 * missing_values / len(df))
                if empty_value_percentage > 20:
                    files_with_high_missing += 1
                    missing_values_log.append({
                        'filename': filepath.name,
                        'path': str(filepath),
                        'EmptyValuePercentage': round(empty_value_percentage, 2)
                    })
        except Exception as e:
            print(f"Error reading {filepath}: {e}")

    issue_rate = (files_with_high_missing / total_files_checked * 100) if total_files_checked > 0 else 0.0
    print(f"Total files checked: {total_files_checked}")
    print(f"Files with >20% missing @Value: {files_with_high_missing}")
    print(f"Issue rate (excluding not-active): {issue_rate:.2f}%")
    print(f"Files excluded (not-active): {len(exclude_filenames)}")
    return issue_rate, total_files_checked, files_with_high_missing, missing_values_log




In [53]:
# Example usage:
notactive_files = [output_notActive_siteSpecies_2023, output_notActive_siteSpecies_2024]
issue_rate, total_files_checked, files_with_high_missing, missing_values_log = detailed_issue_rate_excluding_notactive(
    optimased_root, notactive_files
)
print(f"\nFinal issue rate (excluding not-active): {issue_rate:.2f}%")

Total files checked: 6201
Files with >20% missing @Value: 1628
Issue rate (excluding not-active): 26.25%
Files excluded (not-active): 1248

Final issue rate (excluding not-active): 26.25%


## 3) remove the notActive_siteSpecies_2023 and 2024 files from optimased dataset.
    - Total files checked: 6201
    - Files with >20% missing @Value: 1628
    - Issue rate (excluding not-active): 26.25%
    - Files excluded (not-active): 1248

    - Final issue rate (excluding not-active): 26.25%

In [None]:
# Remove files using the exclusion logic from detailed_issue_rate_excluding_notactive and log removals

def rm_files_notactive(optimased_root, notactive_files, dry_run=True, log_csv_path=None):
    """
    Remove (or list for removal) all files in optimased_dir that would be excluded by detailed_issue_rate_excluding_notactive.
    If dry_run is True, only print the files that would be removed. If False, actually delete them.
    If log_csv_path is provided, save a CSV log of removed files with columns:
    siteCode,SpeciesCode,month_number,filename,path,expected_filename_prefix
    Returns the list of files removed (or would be removed).
    """
    exclude_filenames = set()
    file_info = {}
    # Build exclusion set and info from notactive_files
    for naf in notactive_files:
        if Path(naf).exists():
            df_exclude = pd.read_csv(naf)
            if 'filename' in df_exclude.columns:
                for _, row in df_exclude.iterrows():
                    fname = str(row['filename']).strip()
                    exclude_filenames.add(fname)
                    file_info[fname] = {
                        'siteCode': row.get('siteCode', ''),
                        'SpeciesCode': row.get('SpeciesCode', ''),
                        'month_number': row.get('month_number', ''),
                        'filename': fname,
                        'path': row.get('path', ''),
                        'expected_filename_prefix': row.get('expected_filename_prefix', '')
                    }
            else:
                for _, row in df_exclude.iterrows():
                    site = str(row['siteCode'])
                    species = str(row['SpeciesCode'])
                    pattern = f"{site}_{species}_"
                    for f in all_csv_files:
                        if f.name.startswith(pattern):
                            exclude_filenames.add(f.name)
                            file_info[f.name] = {
                                'siteCode': site,
                                'SpeciesCode': species,
                                'month_number': row.get('month_number', ''),
                                'filename': f.name,
                                'path': str(f),
                                'expected_filename_prefix': pattern
                            }
    files_to_remove = [f for f in all_csv_files if f.name in exclude_filenames]
    print(f"Found {len(files_to_remove)} files to remove from {optimased_root}.")
    log_rows = []
    if dry_run:
        print("Dry run: No files will be deleted. Files that would be removed:")
        for f in files_to_remove:
            print(f)
            if f.name in file_info:
                log_rows.append(file_info[f.name])
            else:
                log_rows.append({
                    'siteCode': '', 'SpeciesCode': '', 'month_number': '',
                    'filename': f.name, 'path': str(f), 'expected_filename_prefix': ''
                })
    else:
        for f in files_to_remove:
            try:
                f.unlink()
                print(f"Deleted: {f}")
            except Exception as e:
                print(f"Error deleting {f}: {e}")
            if f.name in file_info:
                log_rows.append(file_info[f.name])
            else:
                log_rows.append({
                    'siteCode': '', 'SpeciesCode': '', 'month_number': '',
                    'filename': f.name, 'path': str(f), 'expected_filename_prefix': ''
                })
    # adding to missing log
    if log_csv_path and log_rows:
        df_log = pd.DataFrame(log_rows)
        # df_log = df_log[['siteCode','SpeciesCode','month_number','filename','path','expected_filename_prefix']]
        # df_log.to_csv(log_csv_path, index=False, encoding='utf-8')
        print(f"Log written to {log_csv_path}")
    return files_to_remove




In [None]:
# Example usage (dry run):
notactive_files = [output_notActive_siteSpecies_2023, output_notActive_siteSpecies_2024]
#for removal dry_run= False 
files_to_remove = rm_files_notactive(optimased_root, notactive_files, dry_run=True, log_csv_path=log_csv_path)
print(f"Total files that would be removed: {len(files_to_remove)}")

Found 1248 files to remove from /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased.
Error deleting /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2023_mar/MEA_PM2.5_2023-03-01_2023-03-31.csv: [Errno 2] No such file or directory: '/Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2023_mar/MEA_PM2.5_2023-03-01_2023-03-31.csv'
Error deleting /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2023_mar/IS6_PM2.5_2023-03-01_2023-03-31.csv: [Errno 2] No such file or directory: '/Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2023_mar/IS6_PM2.5_2023-03-01_2023-03-31.csv'
Error deleting /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2023_mar/WA9_NO2_2023-03-01_2023-03-31.csv: [Errno 2] No such file or directory: '/Users/bur

        Log written to /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/missing/logs_rm_notActive_23_24.csv
        Total files that would be removed: 1248

### 4)  remove all KF1 and MR8 PM2.5/PM25 files from the optimased directory:
 -  noticed that those files still at optimased path.
 - Below the function for remove them all.


In [10]:
def remove_kf1_mr8_pm25_files(optimased_root, dry_run=False):
    """
    Remove all KF1 and MR8 PM2.5/PM25 files from the optimased directory (all months/years).
    If dry_run is True, only print the files that would be removed. If False, actually delete them.
    """

    patterns = [
        "KF1_PM2.5_*.csv", "KF1_PM25_*.csv",
        "MR8_PM2.5_*.csv", "MR8_PM25_*.csv"
    ]
    files_to_rm = []
    for pattern in patterns:
        files_to_rm.extend(Path(optimased_root).rglob(pattern))
    print(f"Found {len(files_to_rm)} KF1/MR8 PM2.5/PM25 files to remove in {optimased_root}.")
    if dry_run:
        print("Dry run: No files will be deleted. Files that would be removed:")
        for f in files_to_rm:
            print(f)
    else:
        for f in files_to_rm:
            try:
                f.unlink()
                print(f"Deleted: {f}")
            except Exception as e:
                print(f"Error deleting {f}: {e}")
    print(f"Total files processed: {len(files_to_rm)}")
    return files_to_rm
    

In [11]:
removed_files = remove_kf1_mr8_pm25_files(optimased_root, dry_run=False)

# Log the removed files to the same log as not-active removals
if removed_files:
    import pandas as pd
    log_rows = []
    for f in removed_files:
        log_rows.append({
            'siteCode': '',
            'SpeciesCode': '',
            'month_number': '',
            'filename': f.name,
            'path': str(f),
            'expected_filename_prefix': f.name[:f.name.find('_', 5)] if '_' in f.name[5:] else ''
        })
    try:
        # If the log file exists, append; otherwise, create
        try:
            df_log = pd.read_csv(log_csv_path)
            df_new = pd.DataFrame(log_rows)
            df_log = pd.concat([df_log, df_new], ignore_index=True)
        except FileNotFoundError:
            df_log = pd.DataFrame(log_rows)
        df_log.to_csv(log_csv_path, index=False, encoding='utf-8')
        print(f"Appended {len(log_rows)} entries to log: {log_csv_path}")
    except Exception as e:
        print(f"Error writing to log: {e}")
else:
    print("No files were removed.")



Found 22 KF1/MR8 PM2.5/PM25 files to remove in /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased.
Deleted: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2025_feb/KF1_PM25_2025-02-01_2025-02-28.csv
Deleted: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2025_aug/KF1_PM25_2025-08-01_2025-08-31.csv
Deleted: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2025_mar/KF1_PM25_2025-03-01_2025-03-31.csv
Deleted: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2025_jul/KF1_PM25_2025-07-01_2025-07-31.csv
Deleted: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2025_oct/KF1_PM25_2025-10-01_2025-10-31.csv
Deleted: /Users/burdzhuchaglayan/Desktop/data science projects/air-pollution-levels/data/laqn/optimased/2025_sep/KF1_

## 4) Analysis of Site-Species Overlaps(2023/2024),  Issue Rate Calculation

This function checks  which site/species combo appear in both the non-active lists. (2023 and 2024) and the optimised dataset. The goal is to understand which files can be 
removed and what the new issue rate will be after exclusion.
Because I realised that, the non active site/species not balanced in each year and trying to do machine learning won't be work, in this unbalanced data. Soo I need to balanced the data. 

    - Steps:
        - 1. Load the non-active site/species lists for 2023 and 2024
        - 2. Identify which combinations exist in the optimised folder
        - 3. Calculate overlap between non-active lists and actual files
        - 4. Determine new metadata after removal
        - 5. Calculate updated issue rate excluding non-active combinations