In [6]:
import os
import pandas as pd
import numpy as np

# Function to calculate AQI for a single pollutant
def calculate_aqi(concentration, pollutant):
    """Return the AQI sub-index for a single pollutant reading.

    The pollutant is looked up in the module-level ``breakpoints`` table,
    whose entries are ``(C_lo, C_hi, I_lo, I_hi)`` bands. The bands must be
    listed in ascending concentration order. Within the matching band the
    index is linearly interpolated between ``I_lo`` and ``I_hi``.

    Args:
        concentration: Measured concentration; may be NaN/None.
        pollutant: Key into ``breakpoints`` (for example ``'pm25'``).

    Returns:
        The sub-index as an ``int`` rounded to the nearest integer, or
        ``None`` when the reading is missing, the pollutant has no band
        table, or the reading lies below the first band or above the last.
    """
    if pd.isna(concentration):  # Skip if NaN
        return None

    previous_band_top = None  # I_hi of the band just below the current one
    for C_lo, C_hi, I_lo, I_hi in breakpoints.get(pollutant, []):
        if concentration < C_lo:
            # The reading sits in the gap between two adjacent bands (for
            # example 12.05 between 12 and 12.1). Treat it as the top of the
            # lower band instead of dropping it. Before the first band there
            # is no lower band, so this yields None.
            return previous_band_top
        if concentration <= C_hi:
            slope = (I_hi - I_lo) / (C_hi - C_lo)
            # Round rather than truncate: truncation biases every value
            # downwards and can turn a float result of 99.999... into 99.
            return int(round(slope * (concentration - C_lo) + I_lo))
        previous_band_top = I_hi
    return None

# Function to calculate overall AQI for a row
def calculate_overall_aqi(row):
    """Return the overall AQI for one row: the largest per-pollutant sub-index.

    Every pollutant key of the module-level ``breakpoints`` table that is
    also present in ``row`` is passed to ``calculate_aqi``; sub-indices that
    come back as ``None`` are ignored.

    Args:
        row: Mapping-like record (for example a DataFrame row) keyed by
            pollutant name.

    Returns:
        The maximum sub-index as an ``int``, or ``None`` when no pollutant
        in the row produced a sub-index.
    """
    sub_indices = (
        calculate_aqi(row[name], name)
        for name in breakpoints.keys()
        if name in row
    )
    usable = [value for value in sub_indices if value is not None]
    if not usable:
        return None
    return int(max(usable))  # Ensure result is int
    
# Batch-process every city folder under base_dir: clean each region CSV,
# add an overall 'AQI' column and write the result under output_dir/<city>/.

# Path to the parent directory
base_dir = "../Datasets/"  # Change this to your root directory
output_dir = "./ProcessedData/"  # Directory to save processed files
os.makedirs(output_dir, exist_ok=True)  # Ensure output directory exists

# Breakpoint table read by calculate_aqi() / calculate_overall_aqi().
# For each pollutant: a list of (C_lo, C_hi, I_lo, I_hi) bands in ascending
# concentration order. It never changes between files, so it is built once
# here instead of once per CSV inside the loops.
breakpoints = {
    'pm25': [
        (0, 12, 0, 50),
        (12.1, 35.4, 51, 100),
        (35.5, 55.4, 101, 150),
        (55.5, 150.4, 151, 200),
        (150.5, 250.4, 201, 300),
        (250.5, 350.4, 301, 400),
        (350.5, 500.4, 401, 500),
    ],
    'pm10': [
        (0, 54, 0, 50),
        (55, 154, 51, 100),
        (155, 254, 101, 150),
        (255, 354, 151, 200),
        (355, 424, 201, 300),
        (425, 504, 301, 400),
        (505, 604, 401, 500),
    ],
    'no2': [
        (0, 53, 0, 50),
        (54, 100, 51, 100),
        (101, 360, 101, 150),
        (361, 649, 151, 200),
        (650, 1249, 201, 300),
        (1250, 1649, 301, 400),
        (1650, 2049, 401, 500),
    ],
    'so2': [
        (0, 35, 0, 50),
        (36, 75, 51, 100),
        (76, 185, 101, 150),
        (186, 304, 151, 200),
        (305, 604, 201, 300),
        (605, 804, 301, 400),
        (805, 1004, 401, 500),
    ],
    'o3': [
        (0, 54, 0, 50),
        (55, 70, 51, 100),
        (71, 85, 101, 150),
        (86, 105, 151, 200),
        (106, 200, 201, 300),
        (201, 300, 301, 400),
        (301, 400, 401, 500),
    ],
}

# Iterate through each city
for city in os.listdir(base_dir):
    city_path = os.path.join(base_dir, city)

    # Only sub-directories of base_dir are treated as cities
    if not os.path.isdir(city_path):
        continue

    print(f"\nProcessing city: {city}")

    # Create the output directory for the city
    city_output_dir = os.path.join(output_dir, city)
    os.makedirs(city_output_dir, exist_ok=True)

    # Iterate through each region (CSV file) in the city
    for csv_file in os.listdir(city_path):
        # Anything that is not a CSV file is ignored
        if not csv_file.endswith('.csv'):
            continue

        csv_path = os.path.join(city_path, csv_file)
        region_name = os.path.splitext(csv_file)[0]  # Extract region name

        # NOTE(review): this per-region directory is created but nothing is
        # written into it here (the CSV goes to city_output_dir below). It is
        # kept in case later steps rely on it -- confirm or remove.
        region_output_dir = os.path.join(city_output_dir, region_name)
        os.makedirs(region_output_dir, exist_ok=True)

        print(f"  Reading data for region: {csv_file}")

        # Read the CSV file
        df = pd.read_csv(csv_path, encoding="utf-8")

        # Remove leading/trailing spaces in column names. This has to happen
        # before any column is accessed by name, otherwise a header such as
        # ' date' would make the lookups below fail with KeyError.
        df.columns = df.columns.str.strip()

        # Parse 'date' (unparseable values become NaT), then order the rows
        # chronologically and renumber the index.
        df['date'] = pd.to_datetime(df['date'], errors='coerce')
        df = df.sort_values(by='date').reset_index(drop=True)

        # Remove leading/trailing spaces from all string-type columns
        df = df.apply(lambda col: col.str.strip() if col.dtypes == 'object' else col)

        # Every column except 'date': coerce to numeric (bad values -> NaN),
        # then fill gaps forward and, for leading gaps, backward.
        for col in df.columns:
            if col == 'date':
                continue
            df[col] = pd.to_numeric(df[col], errors='coerce')
            df[col] = df[col].ffill().bfill()

        # Convert float columns to integers. A column that still holds
        # NaN/inf (for example a pollutant with no readings at all, which
        # ffill/bfill cannot repair) is left as float: astype(int) would
        # raise on it, and calculate_aqi() already skips NaN readings.
        for col in df.columns:
            if df[col].dtypes in ['float64'] and np.isfinite(df[col]).all():
                df[col] = df[col].astype(int)

        # Add AQI column
        df['AQI'] = df.apply(calculate_overall_aqi, axis=1)

        # Save processed file for the region
        region_output_path = os.path.join(city_output_dir, csv_file)
        df.to_csv(region_output_path, index=False)
        print(f"    Saved processed file to {region_output_path}")
                                


Processing city: Bangalore
  Reading data for region: btm,-bangalore-air-quality.csv
    Saved processed file to ./ProcessedData/Bangalore/btm,-bangalore-air-quality.csv
  Reading data for region: hebbal,-bengaluru-air-quality.csv
    Saved processed file to ./ProcessedData/Bangalore/hebbal,-bengaluru-air-quality.csv
  Reading data for region: peenya,-bangalore-air-quality.csv
    Saved processed file to ./ProcessedData/Bangalore/peenya,-bangalore-air-quality.csv

Processing city: Thiruvananthapuram
  Reading data for region: plammoodu,-thiruvananthapuram-air-quality.csv
    Saved processed file to ./ProcessedData/Thiruvananthapuram/plammoodu,-thiruvananthapuram-air-quality.csv
  Reading data for region: kariavattom,-thiruvananthapuram-air-quality.csv
    Saved processed file to ./ProcessedData/Thiruvananthapuram/kariavattom,-thiruvananthapuram-air-quality.csv

Processing city: Delhi
  Reading data for region: r.k.-puram, delhi-air-quality.csv
    Saved processed file to ./ProcessedDa