In [3]:
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import time
import warnings
import json
import os

warnings.filterwarnings('ignore')


def get_all_india_cities():
    """Return the lookup table of Indian cities used for data collection.

    Returns:
        dict: Maps each city name to a dict with the keys ``'lat'`` and
        ``'lng'`` (float coordinates), ``'state'`` (state or union territory
        name) and ``'region'`` (one of North, South, East, West, Central or
        Northeast). A fresh dict is built on every call, in the order the
        rows are listed below.
    """
    # One row per city: (name, lat, lng, state, region).
    city_rows = (
        # Major Metropolitan Cities
        ('Mumbai', 19.0760, 72.8777, 'Maharashtra', 'West'),
        ('Delhi', 28.6139, 77.2090, 'Delhi', 'North'),
        ('Bangalore', 12.9716, 77.5946, 'Karnataka', 'South'),
        ('Chennai', 13.0827, 80.2707, 'Tamil Nadu', 'South'),
        ('Kolkata', 22.5726, 88.3639, 'West Bengal', 'East'),
        ('Hyderabad', 17.3850, 78.4867, 'Telangana', 'South'),

        # Delhi NCR
        ('New Delhi', 28.6139, 77.2090, 'Delhi', 'North'),
        ('Gurgaon', 28.4595, 77.0266, 'Haryana', 'North'),
        ('Noida', 28.5355, 77.3910, 'Uttar Pradesh', 'North'),
        ('Faridabad', 28.4089, 77.3178, 'Haryana', 'North'),
        ('Ghaziabad', 28.6692, 77.4538, 'Uttar Pradesh', 'North'),

        # Maharashtra
        ('Pune', 18.5204, 73.8567, 'Maharashtra', 'West'),
        ('Nagpur', 21.1458, 79.0882, 'Maharashtra', 'West'),
        ('Nashik', 19.9975, 73.7898, 'Maharashtra', 'West'),
        ('Aurangabad', 19.8762, 75.3433, 'Maharashtra', 'West'),
        ('Solapur', 17.6599, 75.9064, 'Maharashtra', 'West'),
        ('Thane', 19.2183, 72.9781, 'Maharashtra', 'West'),

        # Gujarat
        ('Ahmedabad', 23.0225, 72.5714, 'Gujarat', 'West'),
        ('Surat', 21.1702, 72.8311, 'Gujarat', 'West'),
        ('Vadodara', 22.3072, 73.1812, 'Gujarat', 'West'),
        ('Rajkot', 22.3039, 70.8022, 'Gujarat', 'West'),
        ('Gandhinagar', 23.2156, 72.6369, 'Gujarat', 'West'),

        # Rajasthan
        ('Jaipur', 26.9124, 75.7873, 'Rajasthan', 'North'),
        ('Jodhpur', 26.2389, 73.0243, 'Rajasthan', 'North'),
        ('Udaipur', 24.5854, 73.7125, 'Rajasthan', 'North'),
        ('Kota', 25.2138, 75.8648, 'Rajasthan', 'North'),
        ('Ajmer', 26.4499, 74.6399, 'Rajasthan', 'North'),

        # Uttar Pradesh
        ('Lucknow', 26.8467, 80.9462, 'Uttar Pradesh', 'North'),
        ('Kanpur', 26.4499, 80.3319, 'Uttar Pradesh', 'North'),
        ('Agra', 27.1767, 78.0081, 'Uttar Pradesh', 'North'),
        ('Varanasi', 25.3176, 82.9739, 'Uttar Pradesh', 'North'),
        ('Allahabad', 25.4358, 81.8463, 'Uttar Pradesh', 'North'),
        ('Meerut', 28.9845, 77.7064, 'Uttar Pradesh', 'North'),
        ('Bareilly', 28.3670, 79.4304, 'Uttar Pradesh', 'North'),

        # Karnataka
        ('Mysore', 12.2958, 76.6394, 'Karnataka', 'South'),
        ('Hubli', 15.3647, 75.1240, 'Karnataka', 'South'),
        ('Mangalore', 12.9141, 74.8560, 'Karnataka', 'South'),
        ('Belgaum', 15.8497, 74.4977, 'Karnataka', 'South'),

        # Tamil Nadu
        ('Coimbatore', 11.0168, 76.9558, 'Tamil Nadu', 'South'),
        ('Madurai', 9.9252, 78.1198, 'Tamil Nadu', 'South'),
        ('Salem', 11.6643, 78.1460, 'Tamil Nadu', 'South'),
        ('Tiruchirappalli', 10.7905, 78.7047, 'Tamil Nadu', 'South'),
        ('Tirunelveli', 8.7139, 77.7567, 'Tamil Nadu', 'South'),
        ('Vellore', 12.9165, 79.1325, 'Tamil Nadu', 'South'),

        # Andhra Pradesh & Telangana
        ('Vijayawada', 16.5062, 80.6480, 'Andhra Pradesh', 'South'),
        ('Visakhapatnam', 17.6868, 83.2185, 'Andhra Pradesh', 'South'),
        ('Guntur', 16.3067, 80.4365, 'Andhra Pradesh', 'South'),
        ('Tirupati', 13.6288, 79.4192, 'Andhra Pradesh', 'South'),
        ('Warangal', 17.9784, 79.6001, 'Telangana', 'South'),

        # Kerala
        ('Kochi', 9.9312, 76.2673, 'Kerala', 'South'),
        ('Thiruvananthapuram', 8.5241, 76.9366, 'Kerala', 'South'),
        ('Kozhikode', 11.2588, 75.7804, 'Kerala', 'South'),
        ('Thrissur', 10.5276, 76.2144, 'Kerala', 'South'),

        # West Bengal
        ('Howrah', 22.5958, 88.2636, 'West Bengal', 'East'),
        ('Durgapur', 23.5204, 87.3119, 'West Bengal', 'East'),
        ('Asansol', 23.6839, 86.9523, 'West Bengal', 'East'),
        ('Siliguri', 26.7271, 88.3953, 'West Bengal', 'East'),

        # Punjab & Haryana
        ('Chandigarh', 30.7333, 76.7794, 'Chandigarh', 'North'),
        ('Amritsar', 31.6340, 74.8723, 'Punjab', 'North'),
        ('Ludhiana', 30.9010, 75.8573, 'Punjab', 'North'),
        ('Jalandhar', 31.3260, 75.5762, 'Punjab', 'North'),
        ('Patiala', 30.3398, 76.3869, 'Punjab', 'North'),

        # Odisha
        ('Bhubaneswar', 20.2961, 85.8245, 'Odisha', 'East'),
        ('Cuttack', 20.4625, 85.8828, 'Odisha', 'East'),
        ('Rourkela', 22.2604, 84.8536, 'Odisha', 'East'),

        # Madhya Pradesh
        ('Bhopal', 23.2599, 77.4126, 'Madhya Pradesh', 'Central'),
        ('Indore', 22.7196, 75.8577, 'Madhya Pradesh', 'Central'),
        ('Jabalpur', 23.1815, 79.9864, 'Madhya Pradesh', 'Central'),
        ('Gwalior', 26.2183, 78.1828, 'Madhya Pradesh', 'Central'),
        ('Ujjain', 23.1765, 75.7885, 'Madhya Pradesh', 'Central'),

        # Chhattisgarh
        ('Raipur', 21.2514, 81.6296, 'Chhattisgarh', 'Central'),
        ('Bhilai', 21.1938, 81.3509, 'Chhattisgarh', 'Central'),

        # Bihar & Jharkhand
        ('Patna', 25.5941, 85.1376, 'Bihar', 'East'),
        ('Gaya', 24.7914, 85.0002, 'Bihar', 'East'),
        ('Ranchi', 23.3441, 85.3096, 'Jharkhand', 'East'),
        ('Jamshedpur', 22.8046, 86.2029, 'Jharkhand', 'East'),
        ('Dhanbad', 23.7957, 86.2304, 'Jharkhand', 'East'),

        # Northeast India
        ('Guwahati', 26.1445, 91.7362, 'Assam', 'Northeast'),
        ('Shillong', 25.5788, 91.8933, 'Meghalaya', 'Northeast'),
        ('Imphal', 24.6637, 93.9063, 'Manipur', 'Northeast'),
        ('Aizawl', 23.1645, 92.9376, 'Mizoram', 'Northeast'),
        ('Agartala', 23.8315, 91.2868, 'Tripura', 'Northeast'),
        ('Kohima', 25.6751, 94.1086, 'Nagaland', 'Northeast'),
        ('Itanagar', 27.0844, 93.6053, 'Arunachal Pradesh', 'Northeast'),

        # Uttarakhand
        ('Dehradun', 30.3165, 78.0322, 'Uttarakhand', 'North'),
        ('Haridwar', 29.9457, 78.1642, 'Uttarakhand', 'North'),
        ('Nainital', 29.3803, 79.4636, 'Uttarakhand', 'North'),

        # Jammu & Kashmir
        ('Jammu', 32.7266, 74.8570, 'Jammu and Kashmir', 'North'),
        ('Srinagar', 34.0837, 74.7973, 'Jammu and Kashmir', 'North'),

        # Himachal Pradesh
        ('Shimla', 31.1048, 77.1734, 'Himachal Pradesh', 'North'),
        ('Dharamshala', 32.2190, 76.3234, 'Himachal Pradesh', 'North'),

        # Goa
        ('Panaji', 15.4909, 73.8278, 'Goa', 'West'),

        # Union Territories
        ('Puducherry', 11.9416, 79.8083, 'Puducherry', 'South'),
        ('Port Blair', 11.6234, 92.7265, 'Andaman and Nicobar Islands', 'South'),

        # Sikkim
        ('Gangtok', 27.3314, 88.6134, 'Sikkim', 'Northeast'),

        # Ladakh (Union Territory)
        ('Leh', 34.1526, 77.5771, 'Ladakh', 'North'),

        # Dadra and Nagar Haveli and Daman and Diu (Union Territory)
        ('Silvassa', 20.2730, 73.0115, 'Dadra and Nagar Haveli and Daman and Diu', 'West'),
        ('Daman', 20.3974, 72.8320, 'Dadra and Nagar Haveli and Daman and Diu', 'West'),
        ('Diu', 20.7144, 70.9842, 'Dadra and Nagar Haveli and Daman and Diu', 'West'),

        # Lakshadweep (Union Territory)
        ('Kavaratti', 10.5618, 72.6369, 'Lakshadweep', 'South'),
    )

    # Expand each row into the per-city dict shape callers index into.
    return {
        name: {'lat': lat, 'lng': lng, 'state': state, 'region': region}
        for name, lat, lng, state, region in city_rows
    }


class IndiaAirQualityCollector:
    """Collect historical air-pollution readings for Indian cities.

    For every city returned by ``get_all_india_cities`` the collector requests
    the OpenWeatherMap pollution history in date chunks, flattens each reading
    into one record, writes the per-city records to a JSON file under
    ``data_dir`` and finally returns all records as one pandas DataFrame.

    Note that weather is fetched once per city from the *current* weather
    endpoint, so the same snapshot is attached to every historical record of
    that city.
    """

    def __init__(self, api_key, retries=3, backoff_factor=2, rate_limit_sec=1.0):
        """Store the configuration and make sure the cache directory exists.

        Args:
            api_key: OpenWeatherMap API key sent as the ``appid`` parameter.
            retries: Maximum number of attempts per HTTP request.
            backoff_factor: Multiplier applied to the wait between attempts.
            rate_limit_sec: Seconds to sleep after each pollution request.
        """
        self.api_key = api_key
        # HTTPS on both endpoints: the API key travels in the query string and
        # must not be sent in clear text.
        self.pollution_url = "https://api.openweathermap.org/data/2.5/air_pollution/history"
        self.weather_url = "https://api.openweathermap.org/data/2.5/weather"
        self.cities = get_all_india_cities()
        self.retries = retries
        self.backoff_factor = backoff_factor
        self.rate_limit_sec = rate_limit_sec
        self.error_log_file = "error_log.txt"
        self.data_dir = "city_data_cache"
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(self.data_dir, exist_ok=True)

    def _redact(self, text):
        """Return ``text`` with every occurrence of the API key masked."""
        key = str(self.api_key) if self.api_key else ""
        # An empty key must be skipped: str.replace("", ...) would insert the
        # mask between every character.
        return text.replace(key, "***") if key else text

    def _retry_request(self, url, params):
        """GET ``url`` and return the decoded JSON body, retrying on failure.

        Each failed attempt is printed and appended to ``error_log_file``
        with the API key masked. The wait between attempts starts at one
        second and is multiplied by ``backoff_factor`` after each failure.

        Returns:
            The parsed JSON response, or ``None`` when every attempt failed.
        """
        wait = 1
        for attempt in range(1, self.retries + 1):
            try:
                response = requests.get(url, params=params, timeout=30)
                response.raise_for_status()
                return response.json()
            except Exception as e:
                # Deliberately broad: one bad request must not abort a long
                # multi-city run; the failure is logged and retried instead.
                # Both the params dict and the HTTP error text embed the key,
                # so the whole message is redacted before it is emitted.
                err_msg = self._redact(
                    f"Attempt {attempt} failed for URL {url} with params {params}: {e}"
                )
                print(err_msg)
                with open(self.error_log_file, "a", encoding="utf-8") as f:
                    f.write(f"{datetime.now()}: {err_msg}\n")
                if attempt < self.retries:
                    time.sleep(wait)
                    wait *= self.backoff_factor
        return None

    def get_historical_pollution_data(self, lat, lng, start_date, end_date):
        """Fetch pollution history for a coordinate pair and datetime range.

        Args:
            lat: Latitude of the location.
            lng: Longitude of the location.
            start_date: ``datetime`` marking the start of the range.
            end_date: ``datetime`` marking the end of the range.

        Returns:
            The parsed JSON response, or ``None`` if the request failed.
        """
        print(f"Fetching pollution data for {lat},{lng} from {start_date.date()} to {end_date.date()}")
        params = {
            'lat': lat,
            'lon': lng,
            'start': int(start_date.timestamp()),
            'end': int(end_date.timestamp()),
            'appid': self.api_key
        }
        return self._retry_request(self.pollution_url, params)

    def get_current_weather_data(self, lat, lng):
        """Fetch the current weather (metric units) for a coordinate pair.

        Returns:
            The parsed JSON response, or ``None`` if the request failed.
        """
        print(f"Fetching current weather data for {lat},{lng}")
        params = {
            'lat': lat,
            'lon': lng,
            'appid': self.api_key,
            'units': 'metric'
        }
        return self._retry_request(self.weather_url, params)

    def get_aqi_category(self, aqi_value):
        """Map a numeric AQI value (1-5) to its label; anything else is 'Unknown'."""
        categories = {1: 'Good', 2: 'Fair', 3: 'Moderate', 4: 'Poor', 5: 'Very Poor'}
        return categories.get(aqi_value, 'Unknown')

    def save_city_data(self, city_name, data):
        """Write one city's records to ``<data_dir>/<city>_data.json``.

        Spaces in the city name are replaced by underscores. Missing weather
        values are float NaN, which ``json.dump`` writes as the non-standard
        token ``NaN``.
        """
        filename = os.path.join(self.data_dir, f"{city_name.replace(' ', '_')}_data.json")
        with open(filename, "w") as f:
            json.dump(data, f)
        print(f"Intermediate data saved for {city_name} to {filename}")

    @staticmethod
    def _extract_weather(weather_data):
        """Pull (temp_c, humidity, wind_speed, precipitation) out of a weather response.

        All four values are NaN when the response is missing or has no
        ``'main'`` section. Precipitation is the 1h rain volume if present,
        otherwise the 1h snow volume, otherwise 0.0.
        """
        if not weather_data or 'main' not in weather_data:
            return np.nan, np.nan, np.nan, np.nan

        main = weather_data['main']
        precipitation = 0.0
        if 'rain' in weather_data and '1h' in weather_data['rain']:
            precipitation = weather_data['rain']['1h']
        elif 'snow' in weather_data and '1h' in weather_data['snow']:
            precipitation = weather_data['snow']['1h']

        return (
            main.get('temp', np.nan),
            main.get('humidity', np.nan),
            weather_data.get('wind', {}).get('speed', np.nan),
            precipitation,
        )

    def _build_record(self, city_name, coords, item, weather):
        """Flatten one pollution reading plus the city's weather into a record dict."""
        temp_c, humidity, wind_speed, precipitation = weather
        # Naive conversion: the timestamp is rendered in the machine's local timezone.
        dt_object = datetime.fromtimestamp(item.get('dt', 0))
        components = item.get('components', {})
        aqi_val = item.get('main', {}).get('aqi')
        weekday = dt_object.weekday()

        return {
            'City': city_name,
            'State': coords['state'],
            'Region': coords['region'],
            'Latitude': coords['lat'],
            'Longitude': coords['lng'],
            'Datetime': dt_object.strftime('%Y-%m-%d %H:%M:%S'),
            'Date': dt_object.strftime('%Y-%m-%d'),
            'Year': dt_object.year,
            'Month': dt_object.month,
            'Day': dt_object.day,
            'Hour': dt_object.hour,
            'Day_of_Week': weekday,
            'Is_Weekend': 1 if weekday >= 5 else 0,
            'AQI': aqi_val,
            'AQI_Category': self.get_aqi_category(aqi_val),
            'CO': components.get('co'),
            'NO': components.get('no'),
            'NO2': components.get('no2'),
            'O3': components.get('o3'),
            'SO2': components.get('so2'),
            'PM2.5': components.get('pm2_5'),
            'PM10': components.get('pm10'),
            'NH3': components.get('nh3'),
            'Temperature_C': temp_c,
            'Humidity_Percent': humidity,
            'WindSpeed_mps': wind_speed,
            'Precipitation_mm': precipitation
        }

    def collect_comprehensive_data(self, days_back=365, chunk_days=30):
        """Collect pollution records for every city and return them as a DataFrame.

        The range ``now - days_back`` .. ``now`` is requested in consecutive
        chunks of at most ``chunk_days`` days. After each city its records are
        saved with ``save_city_data`` so a later failure does not lose them.

        Args:
            days_back: How many days before now the collection starts.
            chunk_days: Maximum length in days of a single history request.

        Returns:
            pandas.DataFrame with one row per pollution reading.

        Raises:
            ValueError: If ``chunk_days`` is not positive (the chunk loop
                would otherwise never advance).
        """
        if chunk_days <= 0:
            raise ValueError(f"chunk_days must be a positive number of days, got {chunk_days!r}")

        all_data = []
        end_date = datetime.now()
        start_date = end_date - timedelta(days=days_back)
        chunk_span = timedelta(days=chunk_days)
        total_cities = len(self.cities)

        print(f"\n🚀 Starting data collection for {total_cities} cities")
        print(f"📅 Date range: {start_date.date()} to {end_date.date()}")

        for idx, (city_name, coords) in enumerate(self.cities.items(), 1):
            print(f"\n[{idx:3d}/{total_cities}] Processing {city_name}, {coords['state']}")

            # Fetch current weather once per city; it is reused for all records.
            weather = self._extract_weather(
                self.get_current_weather_data(coords['lat'], coords['lng'])
            )

            city_records = []
            current_start = start_date

            # Fetch pollution data chunk-wise
            while current_start < end_date:
                current_end = min(current_start + chunk_span, end_date)

                pollution_json = self.get_historical_pollution_data(
                    coords['lat'], coords['lng'], current_start, current_end
                )

                if pollution_json and 'list' in pollution_json and pollution_json['list']:
                    city_records.extend(
                        self._build_record(city_name, coords, item, weather)
                        for item in pollution_json['list']
                    )
                else:
                    print(f"  No pollution data for {city_name} from {current_start.date()} to {current_end.date()}")

                current_start = current_end
                time.sleep(self.rate_limit_sec)  # rate limiting between chunk requests

            all_data.extend(city_records)
            print(f"✓ Collected {len(city_records):,} records for {city_name}")

            # Save intermediate city data to JSON
            self.save_city_data(city_name, city_records)

        print(f"\n🎉 Data collection completed! Total records: {len(all_data):,}")

        # Convert to DataFrame before returning
        return pd.DataFrame(all_data)


if __name__ == "__main__":
    # Read the key from the environment so the secret never lives in source
    # control or in shared notebook exports.
    OPENWEATHER_API_KEY = os.environ.get("OPENWEATHER_API_KEY")
    if not OPENWEATHER_API_KEY:
        raise SystemExit(
            "Set the OPENWEATHER_API_KEY environment variable before running this script."
        )

    collector = IndiaAirQualityCollector(
        api_key=OPENWEATHER_API_KEY,
        retries=3,          # Number of retries for failed requests
        backoff_factor=2,   # Backoff multiplier between retries
        rate_limit_sec=1.0  # Delay between API calls in seconds
    )
    # Collect data for the last 365 days in 30-day chunks to respect API limits
    df = collector.collect_comprehensive_data(days_back=365, chunk_days=30)

    # Save the full combined dataset as a timestamped CSV in the working directory
    output_filename = f"india_air_quality_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"

    df.to_csv(output_filename, index=False)    # Save without row indices
    print(f"\n✅ Full dataset saved as: {output_filename}")

    # Preview first few rows
    print(df.head())

    # Show total records count
    print(f"\nTotal records collected: {len(df):,}")



🚀 Starting data collection for 100 cities
📅 Date range: 2024-08-26 to 2025-08-26

[  1/100] Processing Mumbai, Maharashtra
Fetching current weather data for 19.076,72.8777
Fetching pollution data for 19.076,72.8777 from 2024-08-26 to 2024-09-25
Fetching pollution data for 19.076,72.8777 from 2024-09-25 to 2024-10-25
Fetching pollution data for 19.076,72.8777 from 2024-10-25 to 2024-11-24
Fetching pollution data for 19.076,72.8777 from 2024-11-24 to 2024-12-24
Fetching pollution data for 19.076,72.8777 from 2024-12-24 to 2025-01-23
Fetching pollution data for 19.076,72.8777 from 2025-01-23 to 2025-02-22
Fetching pollution data for 19.076,72.8777 from 2025-02-22 to 2025-03-24
Fetching pollution data for 19.076,72.8777 from 2025-03-24 to 2025-04-23
Fetching pollution data for 19.076,72.8777 from 2025-04-23 to 2025-05-23
Fetching pollution data for 19.076,72.8777 from 2025-05-23 to 2025-06-22
Fetching pollution data for 19.076,72.8777 from 2025-06-22 to 2025-07-22
Fetching pollution data 