In [1]:
import os
%pwd

'd:\\ML-Projects\\03-Air-Quality-Index-Predictor\\research'

In [2]:
os.chdir("../")
%pwd

'd:\\ML-Projects\\03-Air-Quality-Index-Predictor'

In [3]:
from dataclasses import dataclass
from pathlib import Path
from datetime import datetime

@dataclass(frozen=True)
class DataCollectionConfig:
    root_dir: Path
    api_key: str
    city_info: dict
    start_date: datetime
    end_date: datetime
    output_file: Path

In [4]:
from Air_Quality_Predictor.constants import *
from Air_Quality_Predictor.utils.common import read_yaml, create_directories

In [5]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_collection_config(self) -> DataCollectionConfig:
        config = self.config.data_collection
        
        data_collection_config = DataCollectionConfig(
            root_dir = config.root_dir,
            api_key = config.api_key,
            city_info = config.city_info,
            start_date = datetime.strptime(config['start_date'], "%Y-%m-%d"),
            end_date = datetime.strptime(config['end_date'], "%Y-%m-%d"),
            output_file = config.output_file
        )

        return data_collection_config
    

In [6]:
import requests
import pandas as pd
from datetime import datetime, timedelta
import time
from pathlib import Path
from Air_Quality_Predictor.logging import logger

In [7]:
class DataCollection:
    def __init__(self, config: DataCollectionConfig):
        self.config = config
        self.pollution_url = "http://api.openweathermap.org/data/2.5/air_pollution/history"
        self.pollution_data_df = pd.DataFrame()

    def fetch_pollution_data(self, lat: float, lon: float, start: datetime, end: datetime):
        params = {
            "lat": lat,
            "lon": lon,
            "start": int(start.timestamp()),
            "end": int(end.timestamp()),
            "appid": self.config.api_key
        }
        response = requests.get(self.pollution_url, params=params)
        return response

    def collect_data(self):
        current_date = self.config.start_date
        months_fetched = 0

        while current_date <= self.config.end_date:
            end_of_month = (current_date.replace(day=1) + timedelta(days=32)).replace(day=1) - timedelta(days=1)
            if end_of_month > self.config.end_date:
                end_of_month = self.config.end_date

            pollution_response = self.fetch_pollution_data(self.config.city_info["lat"], self.config.city_info["lon"], current_date, end_of_month)
            
            if pollution_response.status_code == 200:
                pollution_data = pollution_response.json()
                
                if pollution_data.get("list"):
                    daily_agg_pollution = {
                        "pm2_5": 0,
                        "pm10": 0,
                        "o3": 0,
                        "no2": 0,
                        "so2": 0,
                        "co": 0,
                        "count": 0
                    }
                    for hourly_data in pollution_data["list"]:
                        daily_agg_pollution["pm2_5"] += hourly_data["components"].get("pm2_5", 0)
                        daily_agg_pollution["pm10"] += hourly_data["components"].get("pm10", 0)
                        daily_agg_pollution["o3"] += hourly_data["components"].get("o3", 0)
                        daily_agg_pollution["no2"] += hourly_data["components"].get("no2", 0)
                        daily_agg_pollution["so2"] += hourly_data["components"].get("so2", 0)
                        daily_agg_pollution["co"] += hourly_data["components"].get("co", 0)
                        daily_agg_pollution["count"] += 1
                    
                    if daily_agg_pollution["count"] > 0:
                        pollution_row = {
                            "city": self.config.city_info["city"],
                            "date": current_date.strftime("%Y-%m-%d"),
                            "pm2_5": round(daily_agg_pollution["pm2_5"] / daily_agg_pollution["count"], 3),
                            "pm10": round(daily_agg_pollution["pm10"] / daily_agg_pollution["count"], 3),
                            "o3": round(daily_agg_pollution["o3"] / daily_agg_pollution["count"], 3),
                            "no2": round(daily_agg_pollution["no2"] / daily_agg_pollution["count"], 3),
                            "so2": round(daily_agg_pollution["so2"] / daily_agg_pollution["count"], 3),
                            "co": round(daily_agg_pollution["co"] / daily_agg_pollution["count"], 3),
                        }
                        
                        row_df = pd.DataFrame([pollution_row])
                        self.pollution_data_df = pd.concat([self.pollution_data_df, row_df], ignore_index=True)
            
            else:
                logger.error(f"Error fetching pollution data for {self.config.city_info['city']} on {current_date.strftime('%Y-%m-%d')}")

            current_date += timedelta(days=1)
            if current_date.month != (current_date - timedelta(days=1)).month:
                months_fetched += 1
                logger.info(f"Monthly data fetched: {months_fetched} months")
                logger.info("-" * 50)
            
            # To ensure we do not exceed 60 requests per minute
            time.sleep(1)

    

    def save_data(self):
        city_name = self.config.city_info["city"]
        output_file = Path(self.config.output_file) / f"{city_name}_pollutant_data.csv"
        self.pollution_data_df.to_csv(output_file, index=False)
        logger.info(f"Pollution data for {city_name} has been saved to {output_file}")


In [8]:
try:
    config_manager = ConfigurationManager()
    data_collection_config = config_manager.get_data_collection_config()
    data_collection = DataCollection(config=data_collection_config)
    
    data_collection.collect_data()
    data_collection.save_data()
    
except Exception as e:
    raise e

[2024-06-03 07:35:28,447 : INFO : common : yaml file: config\config.yaml loaded successfully]
[2024-06-03 07:35:28,447 : INFO : common : yaml file: params.yaml loaded successfully]
[2024-06-03 07:35:28,447 : INFO : common : Created directory at: artifacts]
[2024-06-03 07:36:28,152 : INFO : 2817069000 : Monthly data fetched: 1 months]
[2024-06-03 07:36:28,152 : INFO : 2817069000 : --------------------------------------------------]
[2024-06-03 07:37:22,958 : INFO : 2817069000 : Monthly data fetched: 2 months]
[2024-06-03 07:37:22,958 : INFO : 2817069000 : --------------------------------------------------]
[2024-06-03 07:38:20,881 : INFO : 2817069000 : Monthly data fetched: 3 months]
[2024-06-03 07:38:20,881 : INFO : 2817069000 : --------------------------------------------------]
[2024-06-03 07:39:14,618 : INFO : 2817069000 : Monthly data fetched: 4 months]
[2024-06-03 07:39:14,618 : INFO : 2817069000 : --------------------------------------------------]
[2024-06-03 07:40:21,608 : INFO

TypeError: unsupported operand type(s) for /: 'str' and 'str'