# LAQN dataset: finding missing parts

- I will identify the missing values and data gaps in the LAQN dataset and decide how to address them.
- I’ll start by importing the relevant modules and displaying the initial file paths.

In [13]:
import pandas as pd
from pathlib import Path
import logging

# Configure logging once for the whole notebook: INFO and above, with the
# timestamp, logger name and level printed in front of every message.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

# All paths are anchored at the current working directory, so the notebook
# presumably has to be started from the project root — TODO confirm.
base_dir = Path.cwd().resolve()
# Folder scanned by the "std" analysis functions below (one sub-folder per year).
std_dir = base_dir / "data" / "laqn" / "std"

## 1) Functions below:

### 1.1) Functions that discover and check data quality metrics before cleaning

#### 1) Data quality function, what it does:
- Counts total rows in dataset
- Identifies missing values per column (count + percentage)
- Counts duplicate rows based on timestamp
- Detects negative values in measurements
- Checks timestamp format issues

In [11]:
def data_quality(self, df: pd.DataFrame, filename: str) -> dict:
    """
    Check data quality metrics before cleaning starts.

    ``self`` is accepted for call compatibility but is not used.

    Args:
        df: Raw measurements. The checks look at the columns
            '@MeasurementDateGMT' and '@Value'; a check is skipped when
            its column is absent.
        filename: Name stored in the result under 'filename'.

    Returns:
        dict with:
        - filename
        - total_rows
        - missing_values: {column: {'count', 'percentage'}} for every
          column that has at least one null
        - duplicate_count: rows whose '@MeasurementDateGMT' repeats an
          earlier row
        - negative_values: rows whose '@Value' is below zero
        - timestamp_issues: True when '@MeasurementDateGMT' is still held
          as text/object instead of a parsed type
    """
    total_rows = len(df)
    assessment = {
        'filename': filename,
        'total_rows': total_rows,
        'missing_values': {},
        'duplicate_count': 0,
        'negative_values': 0,
        'timestamp_issues': False
    }

    # missing values (a non-zero count implies total_rows > 0, so the
    # division below is safe)
    for col, missing in df.isnull().sum().items():
        if missing > 0:
            assessment['missing_values'][col] = {
                'count': int(missing),
                'percentage': round(int(missing) / total_rows * 100, 2)
            }

    if '@MeasurementDateGMT' in df.columns:
        timestamps = df['@MeasurementDateGMT']

        # duplicates -- cast to a plain int so the result is JSON-serialisable
        assessment['duplicate_count'] = int(timestamps.duplicated().sum())

        # timestamp format: text (object or pandas string dtype) means the
        # column has not been parsed yet
        assessment['timestamp_issues'] = bool(
            pd.api.types.is_object_dtype(timestamps.dtype)
            or pd.api.types.is_string_dtype(timestamps.dtype)
        )

    # negative values -- coerce first so a text column does not raise on
    # the comparison; unparseable entries become NaN and are not counted
    if '@Value' in df.columns:
        values = pd.to_numeric(df['@Value'], errors='coerce')
        assessment['negative_values'] = int((values < 0).sum())

    return assessment

#### 2) Function for missing data gaps:
- Checks which site/species combinations are missing in the monthly and yearly folders.

In [12]:
def analyse_missing_data(base_dir: Path = base_dir):
    """
    Compare the downloaded CSV files against the expected site/species list.

    The expected (SiteCode, SpeciesCode) pairs are read from
    ``data/laqn/actv_sites_species.csv``. Files are matched purely by
    name: the first two underscore-separated fields of the file stem are
    taken as site and pollutant.

    Args:
        base_dir: Project root containing ``data/laqn``.

    Returns:
        dict with three sections:
        - 'monthly_data': 'by_location', 'by_pollutant', 'by_month' indexes
          of what was found under ``monthly_data/`` (searched recursively;
          the parent folder name is used as the month) plus
          'missing_combinations'.
        - 'year_2023': 'by_location', 'by_pollutant' indexes of the files
          directly inside ``year_2023/`` plus 'missing_combinations'.
        - 'summary': expected/found counts and 'total_missing'.
        A data folder that does not exist leaves its section empty and
        adds nothing to 'total_missing'.
    """
    laqn_dir = base_dir / "data" / "laqn"
    monthly_data_dir = laqn_dir / "monthly_data"
    year_2023_dir = laqn_dir / "year_2023"
    metadata_file = laqn_dir / "actv_sites_species.csv"

    metadata = pd.read_csv(metadata_file)
    expected = set(zip(metadata["SiteCode"], metadata["SpeciesCode"]))

    out = {
        "monthly_data": {"by_location": {}, "by_pollutant": {}, "by_month": {}, "missing_combinations": []},
        "year_2023": {"by_location": {}, "by_pollutant": {}, "missing_combinations": []},
        "summary": {"expected_combinations": len(expected), "monthly_data_found": 0, "year_2023_found": 0, "total_missing": 0},
    }

    if monthly_data_dir.exists():
        monthly = out["monthly_data"]
        found_m = set()
        # sorted() keeps the month lists in a reproducible order
        for fp in sorted(monthly_data_dir.rglob("*.csv")):
            # Split the stem, not the name, so "SITE_POL.csv" yields "POL"
            # rather than "POL.csv".
            parts = fp.stem.split("_")
            if len(parts) < 2:
                continue
            site, pol = parts[0], parts[1]
            month = fp.parent.name
            found_m.add((site, pol))
            monthly["by_location"].setdefault(site, {}).setdefault(pol, []).append(month)
            monthly["by_pollutant"].setdefault(pol, {}).setdefault(site, []).append(month)
            monthly["by_month"].setdefault(month, []).append((site, pol))
        monthly["missing_combinations"].extend(
            {"site": s, "pollutant": p, "data_source": "monthly_data"}
            for s, p in sorted(expected - found_m)
        )
        out["summary"]["monthly_data_found"] = len(found_m)

    if year_2023_dir.exists():
        yearly = out["year_2023"]
        found_y = set()
        for fp in sorted(year_2023_dir.glob("*.csv")):
            parts = fp.stem.split("_")
            if len(parts) < 2:
                continue
            site, pol = parts[0], parts[1]
            found_y.add((site, pol))
            yearly["by_location"].setdefault(site, []).append(pol)
            yearly["by_pollutant"].setdefault(pol, []).append(site)
        yearly["missing_combinations"].extend(
            {"site": s, "pollutant": p, "data_source": "year_2023"}
            for s, p in sorted(expected - found_y)
        )
        out["summary"]["year_2023_found"] = len(found_y)

    out["summary"]["total_missing"] = (
        len(out["monthly_data"]["missing_combinations"]) + len(out["year_2023"]["missing_combinations"])
    )
    return out

#### 3) Get missing month locations function below:
- for each month in monthly_data/ lists available/missing pollutants per site.

In [None]:
def get_missing_month_location(base_dir: Path = base_dir):
    """
    List, for every month folder, which pollutants each site has and lacks.

    Sites and pollutants come from ``data/laqn/actv_sites_species.csv``.
    Files directly inside each ``monthly_data/<month>/`` folder are matched
    by name: the first two underscore-separated fields of the file stem
    are taken as site and pollutant.

    NOTE(review): "missing" is computed against every pollutant in the
    metadata file, not only those listed for that site, so a site is
    reported as missing pollutants it may never measure — confirm this is
    the intended meaning.

    Args:
        base_dir: Project root containing ``data/laqn``.

    Returns:
        {month: {site: {'available_pollutants', 'missing_pollutants',
        'count_available', 'count_missing'}}}; empty when
        ``monthly_data/`` does not exist.
    """
    laqn_dir = base_dir / "data" / "laqn"
    monthly_data_dir = laqn_dir / "monthly_data"
    metadata_file = laqn_dir / "actv_sites_species.csv"

    md = pd.read_csv(metadata_file)
    all_pol = set(md["SpeciesCode"].unique())
    all_sites = set(md["SiteCode"].unique())
    result = {}

    if not monthly_data_dir.exists():
        return result

    for month_dir in sorted(d for d in monthly_data_dir.iterdir() if d.is_dir()):
        found = {}
        for fp in month_dir.glob("*.csv"):
            # Split the stem, not the name, so "SITE_POL.csv" yields "POL"
            # rather than "POL.csv".
            parts = fp.stem.split("_")
            if len(parts) >= 2:
                found.setdefault(parts[0], set()).add(parts[1])

        month_report = {}
        for site in sorted(all_sites):
            available = found.get(site, set())
            missing = all_pol - available
            # Only skipped when the metadata lists no pollutants at all.
            if available or missing:
                month_report[site] = {
                    "available_pollutants": sorted(available),
                    "missing_pollutants": sorted(missing),
                    "count_available": len(available),
                    "count_missing": len(missing),
                }
        result[month_dir.name] = month_report
    return result

#### 4) Analyse missing values in the standardised files.
 - Finds std files that contain the categorical columns added during standardisation and that also have missing values.

In [None]:
def analyze_categorical_with_missing(std_dir: Path = std_dir):
    """
    Find standardised CSV files that have categorical columns and nulls.

    Every ``*.csv`` directly inside each all-digit sub-folder of
    ``std_dir`` is read. A file is reported when it contains at least one
    of SiteName / SiteType / SpeciesName and at least one null anywhere.

    Args:
        std_dir: Folder holding one sub-folder per year.

    Returns:
        {} when ``std_dir`` does not exist (an error is logged), otherwise
        a dict with 'files_with_categorical_and_missing' (one entry per
        reported file, with per-column missing counts and percentages) and
        'summary' counters. Empty files only count towards
        'total_files_checked'; files that raise while being analysed are
        logged as warnings and skipped.
    """
    if not std_dir.exists():
        logger.error(f"Directory not found: {std_dir}")
        return {}

    cat_cols = ["SiteName", "SiteType", "SpeciesName"]
    flagged_files = []
    summary = {"total_files_checked": 0, "files_with_categorical": 0, "files_with_both": 0}
    results = {
        "files_with_categorical_and_missing": flagged_files,
        "summary": summary,
    }

    year_dirs = [entry for entry in std_dir.iterdir() if entry.is_dir() and entry.name.isdigit()]
    year_dirs.sort()

    for year_dir in year_dirs:
        for csv_path in sorted(year_dir.glob("*.csv")):
            summary["total_files_checked"] += 1
            try:
                frame = pd.read_csv(csv_path)
                if frame.empty:
                    continue

                present_cats = [c for c in cat_cols if c in frame.columns]
                if not present_cats:
                    continue
                summary["files_with_categorical"] += 1

                if not frame.isnull().any().any():
                    continue

                # frame is non-empty here, so row_count is never zero
                row_count = len(frame)
                details = {}
                for column in frame.columns:
                    null_count = frame[column].isna().sum()
                    if null_count > 0:
                        details[column] = {
                            "missing_count": int(null_count),
                            "missing_pct": round(null_count / row_count * 100, 2),
                        }

                flagged_files.append({
                    "year": year_dir.name,
                    "file": csv_path.name,
                    "path": str(csv_path),
                    "total_rows": row_count,
                    "categorical_columns": present_cats,
                    "missing_details": details,
                })
                summary["files_with_both"] += 1
            except Exception as e:
                logger.warning(f"Error analyzing {csv_path}: {e}")
    return results



#### 5) Function that checks what is categorically missing and creates a log entry to track the pattern.
- This extension is useful for understanding why data is missing whether due to system overload, a fetching error/bug, incorrect URL or endpoint requests, or non-responsive endpoints.



In [None]:
def generate_categorical_missing_log(base_dir: Path = base_dir):
    """
    Write a CSV log of std files that have categorical columns and nulls.

    Runs ``analyze_categorical_with_missing`` on ``data/laqn/std`` and
    writes one row per reported file to
    ``data/laqn/std/missing/2missing_files_log.csv``. Site and species are
    taken from the first two underscore-separated fields of the file
    name; the month is the second hyphen-separated part of the third
    field when the name has at least four fields, otherwise ''. The
    'value' column of the log summarises the nulls of the data column
    named "value".

    Args:
        base_dir: Project root containing ``data/laqn``.

    Returns:
        Path of the written log as a string, or None when there is nothing
        to log. The output folder is created in both cases.
    """
    output_dir = base_dir / "data" / "laqn" / "std" / "missing"
    output_dir.mkdir(parents=True, exist_ok=True)

    analysis = analyze_categorical_with_missing(base_dir / "data" / "laqn" / "std")
    # .get(): the analysis returns {} when the std folder does not exist
    all_files = analysis.get("files_with_categorical_and_missing", [])
    if not all_files:
        logger.info("No files found with both categorical columns and missing values")
        return None

    rows = []
    for f in all_files:
        parts = Path(f["file"]).stem.split("_")
        site = parts[0]
        species = parts[1] if len(parts) > 1 else ""

        # A third field without a hyphen (unexpected naming) must not abort
        # the whole log; leave the month blank for that file instead.
        month = ""
        if len(parts) >= 4:
            date_bits = parts[2].split("-")
            if len(date_bits) > 1:
                month = date_bits[1]

        val_missing = f["missing_details"].get("value", {})
        miss_count = val_missing.get("missing_count", 0)
        miss_pct = val_missing.get("missing_pct", 0)
        rows.append({
            "File": f["file"],
            "year": f["year"],
            "month": month,
            "siteCode": site,
            "SpeciesCode": species,
            "path": f["path"],
            "total_rows": f["total_rows"],
            "value": f"{miss_count}/{f['total_rows']} rows ({miss_pct}%)",
            "categorical_columns": ",".join(f["categorical_columns"]),
        })

    df_log = pd.DataFrame(rows)
    out_file = output_dir / "2missing_files_log.csv"
    df_log.to_csv(out_file, index=False)
    return str(out_file)

### 2) Comprehensive test to find ALL missing and problematic data in standardized files.

What the testing function does below:
- Scans all year directories in std
- Checks each CSV file for 7 types of issues:
    - Empty files: No rows
    - Column errors: Missing required columns (timestamp, SiteCode, SpeciesCode, value)
    - Duplicate timestamps: Multiple measurements at same time
    - Missing SiteCode: Null values in SiteCode column
    - Missing SpeciesCode: Null values in SpeciesCode column
    - High missing values: >20% of value column is null
    - Format errors: Cannot read file
- Provides detailed reporting with:
    - Total files processed
    - Issue statistics
    - Examples of each problem type
    - Severity analysis

In [8]:
def test_comprehensive_missing_data_analysis(self):
    """Scan every standardized CSV under ``self.std_dir`` and print a quality report.

    Each ``*.csv`` directly inside every all-digit sub-folder of
    ``self.std_dir`` is read and checked for: no rows, missing required
    columns, duplicate timestamps, null SiteCode / SpeciesCode, more than
    20% null ``value`` entries, and read/analysis failures. A file with
    several findings is counted once in "Files with issues"; the per-type
    totals in the final summary still count every finding.

    SiteCode / SpeciesCode values that disagree with the file name, or
    that are not unique within a file, are listed in a separate warnings
    section and are not included in the issue counts.

    The function only prints; it returns nothing and asserts nothing.
    """
    banner = "=" * 100
    rule = "-" * 100
    required_columns = ['timestamp', 'SiteCode', 'SpeciesCode', 'value']
    # Percentage of null 'value' rows above which a file is flagged.
    high_missing_threshold = 20

    all_issues = {
        'duplicate_timestamps': [],
        'missing_sitecode': [],
        'missing_speciescode': [],
        'high_missing_values': [],
        'empty_files': [],
        'column_errors': [],
        'format_errors': []
    }
    consistency_warnings = []

    def check_file(filepath, year):
        """Apply every rule to one CSV, record findings, return True if any was recorded."""
        filename = filepath.name
        entry = {'year': year, 'file': filename, 'path': str(filepath)}
        df = pd.read_csv(filepath)

        if df.empty:
            all_issues['empty_files'].append(entry)
            return True

        total_rows = len(df)

        # rule 1: required columns -- the later rules need them, so stop here
        missing_cols = [col for col in required_columns if col not in df.columns]
        if missing_cols:
            all_issues['column_errors'].append({
                **entry,
                'missing_columns': missing_cols,
                'actual_columns': list(df.columns)
            })
            return True

        has_issue = False

        # rule 2: duplicate timestamps
        duplicate_count = int(df['timestamp'].duplicated().sum())
        if duplicate_count > 0:
            all_issues['duplicate_timestamps'].append({
                **entry,
                'duplicate_count': duplicate_count,
                'total_rows': total_rows,
                'duplicate_pct': round(duplicate_count / total_rows * 100, 2)
            })
            has_issue = True

        # rules 3 and 4: SiteCode / SpeciesCode nulls and agreement with the
        # codes encoded in the file name (<site>_<species>_...)
        parts = filepath.stem.split('_')
        expected_codes = {
            'SiteCode': parts[0],
            'SpeciesCode': parts[1] if len(parts) > 1 else None
        }
        notes = []
        for column, issue_key, label in (
            ('SiteCode', 'missing_sitecode', 'sitecode'),
            ('SpeciesCode', 'missing_speciescode', 'speciescode'),
        ):
            missing_count = int(df[column].isna().sum())
            if missing_count > 0:
                all_issues[issue_key].append({
                    **entry,
                    'missing_count': missing_count,
                    'total_rows': total_rows,
                    'missing_pct': round(missing_count / total_rows * 100, 2)
                })
                has_issue = True

            # Drop nulls first: they are already reported above and must not
            # be mistaken for a second, distinct code.
            codes = df[column].dropna().unique()
            if len(codes) > 1:
                notes.append(f"multiple_{label}s({len(codes)})")
            elif len(codes) == 1 and codes[0] != expected_codes[column]:
                notes.append(f"{label}_mismatch(expected:{expected_codes[column]},got:{codes[0]})")
        if notes:
            consistency_warnings.append({**entry, 'notes': notes})

        # rule 5: share of null measurements
        missing_values = int(df['value'].isna().sum())
        missing_pct = missing_values / total_rows * 100
        if missing_pct > high_missing_threshold:
            all_issues['high_missing_values'].append({
                **entry,
                'missing_count': missing_values,
                'total_rows': total_rows,
                'missing_pct': round(missing_pct, 2)
            })
            has_issue = True

        return has_issue

    def print_section(title, items, total_label, none_message,
                      detail=None, sort_key=None, limit=20):
        """Print one report section, listing at most ``limit`` files."""
        print("\n" + rule)
        print(title)
        print(rule)
        if not items:
            print(none_message)
            return
        print(f"{total_label}: {len(items)}\n")
        if sort_key is None:
            ordered = items
        else:
            # worst files first
            ordered = sorted(items, key=lambda x: x[sort_key], reverse=True)
        for item in ordered[:limit]:
            print(f"  {item['year']}/{item['file']}")
            if detail is not None:
                for line in detail(item):
                    print(f"    {line}")
        if len(items) > limit:
            print(f"  ... and {len(items) - limit} more")

    def missing_line(item):
        return (f"Missing: {item['missing_count']}/{item['total_rows']} ({item['missing_pct']}%)",)

    print("\n" + banner)
    print("Missing Data Analysis for LAQN Data Files")
    print("Checking all files in: data/laqn/std/")
    print(banner)

    # Get all year directories
    year_dirs = sorted(d for d in self.std_dir.iterdir() if d.is_dir() and d.name.isdigit())

    print(f"\nFound {len(year_dirs)} year directories: {[d.name for d in year_dirs]}")

    total_files = 0
    files_with_issues = 0

    for year_dir in year_dirs:
        year = year_dir.name
        print("\n" + rule)
        print(f"YEAR: {year}")
        print(rule)

        csv_files = sorted(year_dir.glob('*.csv'))
        print(f"Total files in {year}: {len(csv_files)}")

        # Snapshot so the per-year figures can be derived from the shared lists.
        counts_before = {key: len(items) for key, items in all_issues.items()}
        year_issues = 0

        for filepath in csv_files:
            total_files += 1
            try:
                flagged = check_file(filepath, year)
            except Exception as e:
                # Best-effort scan: any failure while reading or analysing a
                # file is reported as a format error and the scan continues.
                all_issues['format_errors'].append({
                    'year': year,
                    'file': filepath.name,
                    'path': str(filepath),
                    'error': str(e)
                })
                flagged = True
            if flagged:
                year_issues += 1

        files_with_issues += year_issues
        year_counts = {key: len(items) - counts_before[key] for key, items in all_issues.items()}

        # Year summary
        print(f"\nYear {year} Summary:")
        print(f"  Total files: {len(csv_files)}")
        print(f"  Files with issues: {year_issues}")
        print(f"    - Empty files: {year_counts['empty_files']}")
        print(f"    - Duplicate timestamps: {year_counts['duplicate_timestamps']}")
        print(f"    - High missing values (>{high_missing_threshold}%): {year_counts['high_missing_values']}")

    # PRINT DETAILED REPORT
    print("\n" + banner)
    print("Missing Data Analysis Report")
    print(banner)

    print(f"\nTotal files processed: {total_files}")
    print(f"Files with issues: {files_with_issues}")
    if total_files > 0:
        print(f"Issue rate: {(files_with_issues/total_files*100):.1f}%")

    # 1. EMPTY FILES
    print_section(
        "EMPTY FILES", all_issues['empty_files'],
        "Total", "No empty files found",
    )

    # 2. COLUMN ERRORS
    print_section(
        "Column errors for missing required columns", all_issues['column_errors'],
        "Total", " All files have required columns",
        detail=lambda item: (
            f"Missing: {item['missing_columns']}",
            f"Has: {item['actual_columns']}",
        ),
    )

    # 3. DUPLICATE TIMESTAMPS
    print_section(
        "duplicate timestamp", all_issues['duplicate_timestamps'],
        "Total files with duplicates", "  No duplicate timestamps found",
        detail=lambda item: (
            f"Duplicates: {item['duplicate_count']}/{item['total_rows']} ({item['duplicate_pct']}%)",
        ),
        sort_key='duplicate_pct',
    )

    # 4. MISSING SITECODE
    print_section(
        "missing sitecode", all_issues['missing_sitecode'],
        "Total files affected", "  No missing SiteCode values found",
        detail=missing_line, sort_key='missing_pct',
    )

    # 5. MISSING SPECIESCODE
    print_section(
        "Missing speciescode", all_issues['missing_speciescode'],
        "Total files affected", " No missing SpeciesCode values found",
        detail=missing_line, sort_key='missing_pct',
    )

    # 6. HIGH MISSING VALUES
    print_section(
        f"Missing values (>{high_missing_threshold}%)", all_issues['high_missing_values'],
        f"Total files with >{high_missing_threshold}% missing",
        f"No files with >{high_missing_threshold}% missing values",
        detail=missing_line, sort_key='missing_pct',
    )

    # 7. FORMAT ERRORS
    print_section(
        "format errors (Cannot read file)", all_issues['format_errors'],
        "Total files with errors", " All files can be read successfully",
        detail=lambda item: (f"Error: {item['error'][:80]}",),
        limit=10,
    )

    # 8. CODE CONSISTENCY (informational, not part of the issue totals)
    print_section(
        "SiteCode/SpeciesCode consistency warnings (not counted as issues)",
        consistency_warnings,
        "Total files with warnings", "  No SiteCode/SpeciesCode inconsistencies found",
        detail=lambda item: item['notes'],
    )

    # FINAL SUMMARY
    print("\n" + banner)
    print("FINAL SUMMARY")
    print(banner)

    total_issues = sum(len(items) for items in all_issues.values())

    print(f"\nTotal issues found: {total_issues}")
    print(f"  Empty files: {len(all_issues['empty_files'])}")
    print(f"  Column errors: {len(all_issues['column_errors'])}")
    print(f"  Duplicate timestamps: {len(all_issues['duplicate_timestamps'])}")
    print(f"  Missing SiteCode: {len(all_issues['missing_sitecode'])}")
    print(f"  Missing SpeciesCode: {len(all_issues['missing_speciescode'])}")
    print(f"  High missing values (>{high_missing_threshold}%): {len(all_issues['high_missing_values'])}")
    print(f"  Format errors: {len(all_issues['format_errors'])}")

    if total_issues == 0:
        print("\n All files passed quality checks! ")
    else:
        print(f"\n⚠ {total_issues} issues need attention")

    print("\n" + banner + "\n")