In [1]:
import os
# Display the kernel's current working directory (IPython magic).
%pwd

'd:\\ML-Projects\\03-Air-Quality-Index-Predictor\\research'

In [2]:
# Move one level up from the current working directory so that relative
# paths used later resolve from the parent folder.
# NOTE(review): this is a relative chdir, so it is not idempotent --
# re-running this cell moves up one more level each time.
os.chdir("../")
# Display the working directory after the change (IPython magic).
%pwd

'd:\\ML-Projects\\03-Air-Quality-Index-Predictor'

In [3]:
import warnings
# Install a global "ignore" filter: every warning category is suppressed
# from this point on.
# NOTE(review): this also hides deprecation and pandas chained-assignment
# warnings raised by the cells below.
warnings.filterwarnings("ignore")

In [4]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the data-transformation stage."""
    # Directory into which the stage writes its train/test CSV files.
    root_dir: Path
    # Location of the input CSV that the stage reads.
    data_path: Path

In [5]:
from Air_Quality_Predictor.constants import *
from Air_Quality_Predictor.utils.common import read_yaml, create_directories

In [6]:
class ConfigurationManager:
    """Reads the project's YAML files and hands out per-stage config objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        """
        Load both YAML files and make sure the artifacts root directory exists.

        Parameters:
        config_filepath: Path of the main configuration YAML file.
        params_filepath: Path of the parameters YAML file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Later stages write below this directory, so create it up front.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """
        Build the settings object for the data-transformation stage.

        Creates the stage's root directory as a side effect.

        Returns:
        DataTransformationConfig: Populated from the `data_transformation`
        section of the loaded configuration.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])

        return DataTransformationConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
        )

In [7]:
import os
from Air_Quality_Predictor.utils.common import breakpoints_dict, get_subindex
from Air_Quality_Predictor.logging import logger
import pandas as pd
import numpy as np 
from datetime import datetime,timedelta

In [8]:
class DataTransformation:
    """
    Turns the raw pollutant CSV into train and test CSV files.

    The helper methods that take a DataFrame modify it in place (unless
    stated otherwise) and return it, so they can be used either way.
    """

    # Raw column name -> pollutant name used by the rest of the class.
    POLLUTANT_COLUMN_NAMES = {
        'co': 'CO', 'no2': 'NO2', 'o3': 'O3',
        'pm10': 'PM10', 'pm2_5': 'PM2.5', 'so2': 'SO2',
    }

    # Sub-index columns whose row-wise maximum becomes the AQI.
    SUBINDEX_COLUMNS = (
        "PM2.5_SubIndex", "PM10_SubIndex", "SO2_SubIndex",
        "NO2_SubIndex", "CO_SubIndex", "O3_SubIndex",
    )

    # Columns written to the train/test files ('date' becomes the index).
    OUTPUT_COLUMNS = (
        'date', 'city', 'CO', 'NO2', 'O3', 'PM10', 'PM2.5', 'SO2', 'AQI_calculated',
    )

    def __init__(self, config: "DataTransformationConfig"):
        """
        Parameters:
        config (DataTransformationConfig): Provides `data_path` (input CSV)
            and `root_dir` (output directory).
        """
        # The annotation is a string so that defining this class does not
        # require the config class to be evaluated first.
        self.config = config

    def rename_pollutants_columns(self, df):
        """
        Rename the lower-case pollutant columns to their upper-case names.

        Columns missing from `df` are skipped. `df` is modified in place.

        Returns:
        pd.DataFrame: The same DataFrame, renamed.
        """
        df.rename(columns=self.POLLUTANT_COLUMN_NAMES, inplace=True)
        return df

    def convert_date_column(self, df, date_column, date_format):
        """
        Parse a string date column into datetimes.

        Parameters:
        df (pd.DataFrame): The DataFrame containing the data (modified in place).
        date_column (str): Name of the column to parse.
        date_format (str): strftime-style format of the stored dates.

        Returns:
        pd.DataFrame: The same DataFrame with the parsed column.
        """
        df[date_column] = pd.to_datetime(df[date_column], format=date_format)
        return df

    def set_negative_to_zero(self, df, column_name):
        """
        Replace negative values in one column with zero.

        Missing values are left as they are. `df` is modified in place.

        Returns:
        pd.DataFrame: The same DataFrame with the clipped column.
        """
        # Vectorized lower bound instead of a Python-level max() per row.
        df[column_name] = df[column_name].clip(lower=0)
        return df

    def calculate_subindices(self, df, breakpoints_dict):
        """
        Compute one sub-index Series per pollutant column.

        Parameters:
        df (pd.DataFrame): The DataFrame containing the pollutant columns.
        breakpoints_dict (dict): Maps a column name to the breakpoints that
            are passed to `get_subindex` for that column.

        Returns:
        dict: Column name -> Series of sub-index values. `df` is not modified.
        """
        subindices = {}
        for column, breakpoints in breakpoints_dict.items():
            subindices[column] = df[column].apply(lambda x: get_subindex(x, breakpoints))
        return subindices

    def calculate_AQI(self, new_df):
        """
        Add an `AQI_calculated` column: the rounded row-wise maximum of the
        sub-index columns.

        Rows whose PM2.5 and PM10 sub-indices sum to zero or less get NaN
        instead. `new_df` is modified in place.

        Returns:
        pd.DataFrame: The same DataFrame with the new column.
        """
        new_df["AQI_calculated"] = new_df[list(self.SUBINDEX_COLUMNS)].max(axis=1).round()
        no_particulate_reading = new_df["PM2.5_SubIndex"] + new_df["PM10_SubIndex"] <= 0
        # np.nan rather than np.NaN: the upper-case alias was removed in NumPy 2.0.
        new_df.loc[no_particulate_reading, "AQI_calculated"] = np.nan
        return new_df

    def convert_CO_to_mg_per_m3(self, dataframe):
        """
        Convert the CO column from μg/m3 to mg/m3 (divide by 1000).

        `dataframe` is modified in place.

        Returns:
        pd.DataFrame: The same DataFrame with the converted column.
        """
        dataframe['CO'] = dataframe['CO'] / 1000
        return dataframe

    def select_columns(self, df, columns_to_keep):
        """
        Selects only the specified columns and drops the rest from the DataFrame.

        Parameters:
        df (pd.DataFrame): The DataFrame containing the data.
        columns_to_keep (list): A list of column names to keep in the DataFrame.

        Returns:
        pd.DataFrame: The DataFrame with only the selected columns.
        """
        return df[columns_to_keep]

    def _split_by_date(self, dataset, train_end, test_end):
        """
        Split a frame with a sorted datetime index into train and test parts.

        Train covers everything up to and including `train_end`; test starts
        the following day and runs up to and including `test_end`.

        Returns:
        tuple: (train_data, test_data)
        """
        train_end = pd.Timestamp(train_end)
        test_start = train_end + timedelta(days=1)
        test_end = pd.Timestamp(test_end)
        return dataset.loc[:train_end], dataset.loc[test_start:test_end]

    def convert(self, train_end="2023-12-30", test_end="2024-05-31"):
        """
        Run the whole transformation and write the train/test CSV files.

        Reads `config.data_path`, cleans the pollutant columns, derives the
        AQI, and writes `train_dataset.csv` and `test_dataset.csv` into
        `config.root_dir`.

        Parameters:
        train_end: Last date (inclusive) of the training set.
        test_end: Last date (inclusive) of the test set.
        """
        dataset = pd.read_csv(self.config.data_path)
        logger.info("Data read successfully")

        dataset = self.rename_pollutants_columns(dataset)
        logger.info("Data renamed successfully")

        dataset = self.convert_date_column(dataset, 'date', "%d-%m-%Y")
        logger.info("Data date adjusted successfully")

        dataset = self.set_negative_to_zero(dataset, 'O3')
        logger.info("O3 data adjusted successfully")

        # CO is rescaled before the sub-indices are computed from it.
        dataset = self.convert_CO_to_mg_per_m3(dataset)
        logger.info("CO data converted successfully")

        subindices = self.calculate_subindices(dataset, breakpoints_dict)
        logger.info("Data subindices calculated successfully")

        for column, subindex_values in subindices.items():
            dataset[f"{column}_SubIndex"] = subindex_values
        logger.info("Data subindex added  successfully")

        dataset = self.calculate_AQI(dataset)
        logger.info("Data AQI calculated successfully")

        dataset = self.select_columns(dataset, list(self.OUTPUT_COLUMNS))
        logger.info("Column selected successfully")

        # Non-inplace chain: avoids mutating the column-selected frame.
        dataset = dataset.set_index('date').sort_index(ascending=True)

        train_data, test_data = self._split_by_date(dataset, train_end, test_end)

        train_data.to_csv(os.path.join(self.config.root_dir, "train_dataset.csv"))
        test_data.to_csv(os.path.join(self.config.root_dir, "test_dataset.csv"))
        logger.info("Train and Test data made successfully")


In [9]:
# Run the data-transformation stage end to end: load the configuration,
# build the stage's settings, then execute the conversion.
# Exceptions are left to propagate so the notebook shows the full
# traceback; a handler that only re-raises would add nothing.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.convert()

[2024-06-03 09:59:36,340 : INFO : common : yaml file: config\config.yaml loaded successfully]
[2024-06-03 09:59:36,341 : INFO : common : yaml file: params.yaml loaded successfully]
[2024-06-03 09:59:36,343 : INFO : common : Created directory at: artifacts]
[2024-06-03 09:59:36,343 : INFO : common : Created directory at: artifacts/data_transformation]
[2024-06-03 09:59:36,354 : INFO : 2892427142 : Data read successfully]
[2024-06-03 09:59:36,355 : INFO : 2892427142 : Data renamed successfully]
[2024-06-03 09:59:36,375 : INFO : 2892427142 : Data date adjusted successfully]
[2024-06-03 09:59:36,378 : INFO : 2892427142 : O3 data adjusted successfully]


TypeError: DataTransformation.convert_CO_to_mg_per_m3() takes 1 positional argument but 2 were given