# LAQN dataset Find Missing Parts

- I will identify the missing values and data gaps in the LAQN dataset and decide how to address them.
- I'll start by importing the relevant modules and displaying the initial file paths.

In [3]:
import pandas as pd
from pathlib import Path    
import os
from typing import Dict, List, Tuple, Optional
import logging
from tqdm import tqdm
from datetime import datetime

# Configure logging once for this notebook: INFO and above, with each
# record showing timestamp, logger name, level and message.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger shared by the functions defined below.
logger = logging.getLogger(__name__)


The functions below discover the data and check data quality metrics before cleaning.

In [4]:
def __init__(self, base_dir: Optional[Path] = None):
        """Initialize LAQN cleaner with directory paths.

        Args:
            base_dir: Project root that contains the ``data/laqn`` tree.
                Accepts a ``Path`` or a path string. When omitted, the root
                is taken as three levels above this source file; if
                ``__file__`` is not defined (for example when the code runs
                in a notebook cell or an interactive session) the current
                working directory is used instead and a warning is logged.

        Side effects:
            Creates ``<base_dir>/data/laqn/clean`` (including parents) if it
            does not exist yet.
        """
        if base_dir is None:
            try:
                base_dir = Path(__file__).resolve().parent.parent.parent
            except NameError:
                # __file__ only exists when the code is loaded from a file;
                # notebook cells and the REPL do not define it.
                base_dir = Path.cwd()
                logger.warning(
                    "__file__ is not defined; using current working "
                    "directory as base_dir: %s",
                    base_dir,
                )
        else:
            # Accept plain strings as well as Path objects.
            base_dir = Path(base_dir)

        self.base_dir = base_dir
        laqn_dir = base_dir / 'data' / 'laqn'
        self.raw_dir = laqn_dir / 'monthly_data'
        self.processed_dir = laqn_dir / 'processed'
        self.clean_dir = laqn_dir / 'clean'

        # Only the output folder is created here; the raw and processed
        # folders are left untouched.
        self.clean_dir.mkdir(parents=True, exist_ok=True)

        logger.info("LAQN Cleaner initialized")
        logger.info("Raw data: %s", self.raw_dir)
        logger.info("Output: %s", self.clean_dir)


Data quality function, what it does:
- Counts total rows in dataset
- Identifies missing values per column (count + percentage)
- Counts duplicate rows based on timestamp
- Detects negative values in measurements
- Checks timestamp format issues

In [6]:
def data_quality(self, df: pd.DataFrame, filename: str) -> Dict:
        """
        Collect data quality metrics for one LAQN file before cleaning.

        Args:
            df: Raw measurements. Every column is checked for missing
                values; the '@MeasurementDateGMT' and '@Value' columns are
                inspected additionally when they are present.
            filename: Label stored in the result so the report can be
                traced back to its source file.

        Returns:
            Dict holding only plain Python values (so it can be passed to
            json.dumps directly):
            - filename: the label passed in
            - total_rows: number of rows in df
            - missing_values: {column: {'count', 'percentage'}} for every
              column that has at least one null
            - duplicate_count: rows repeating an earlier
              '@MeasurementDateGMT' value (0 if the column is absent)
            - negative_values: rows whose '@Value' is below zero
              (0 if the column is absent)
            - timestamp_issues: True when '@MeasurementDateGMT' has object
              dtype, i.e. it is not a datetime64 column
        """
        total_rows = len(df)
        assessment = {
            'filename': filename,
            'total_rows': total_rows,
            'missing_values': {},
            'duplicate_count': 0,
            'negative_values': 0,
            'timestamp_issues': False
        }

        # missing values: one vectorised pass, then keep affected columns only
        for col, null_count in df.isnull().sum().items():
            missing = int(null_count)
            if missing > 0:
                # total_rows cannot be 0 here: an empty frame has no nulls.
                assessment['missing_values'][col] = {
                    'count': missing,
                    'percentage': round(missing / total_rows * 100, 2)
                }

        if '@MeasurementDateGMT' in df.columns:
            # duplicates: the first occurrence of a timestamp is not counted
            # int() turns the numpy scalar into a JSON-serialisable value.
            assessment['duplicate_count'] = int(
                df.duplicated(subset=['@MeasurementDateGMT']).sum()
            )
            # timestamp format: object dtype means the column is not datetime64
            assessment['timestamp_issues'] = bool(
                df['@MeasurementDateGMT'].dtype == 'object'
            )

        # negative values
        if '@Value' in df.columns:
            # Coerce first so string-typed readings (e.g. '' or '-2.0') do
            # not raise on comparison; unparseable entries become NaN and
            # are never counted as negative.
            values = pd.to_numeric(df['@Value'], errors='coerce')
            assessment['negative_values'] = int((values < 0).sum())

        return assessment

The test script for this function is below.