In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import os

# Load the dataset
file_path = 'AirQuality.csv'

# Value the UCI Air Quality file stores in place of a missing reading.
# NOTE(review): taken from the dataset's published description; confirm it
# applies to the copy of the CSV used here.
MISSING_SENTINEL = -200


def clean_numeric_columns(frame, columns, missing_sentinel=MISSING_SENTINEL):
    """Convert decimal-comma columns to floats and blank out missing readings.

    Each listed column is rewritten in place: commas are replaced by dots,
    the text is parsed as a number (unparseable text becomes NaN), and any
    value equal to ``missing_sentinel`` is replaced by NaN so that a later
    imputation step treats it as missing instead of as a measurement.

    Args:
        frame: DataFrame holding the columns to clean; modified in place.
        columns: Names of the columns to convert.
        missing_sentinel: Numeric placeholder that marks a missing value.

    Returns:
        The same ``frame`` object, for convenience.
    """
    for col in columns:
        as_text = frame[col].astype(str).str.replace(',', '.', regex=False)
        numeric = pd.to_numeric(as_text, errors='coerce')
        # Without this, the placeholder would be averaged into the imputer's
        # mean and into the scaler's mean/variance as if it were real data.
        frame[col] = numeric.mask(numeric == missing_sentinel)
    return frame


if not os.path.exists(file_path):
    print(f"Error: File not found at {file_path}")
else:
    # The file is semicolon-separated and uses a decimal comma.
    data = pd.read_csv(file_path, encoding="latin1", sep=';')

    # Display the first few rows and column names
    print(data.head())
    print(data.columns)

    # 1. Data Cleaning
    # Every measurement column; 'Date', 'Time' and the 'Unnamed' columns are
    # deliberately left out and pass through the preprocessor untouched.
    num_features = ['CO(GT)', 'PT08.S1(CO)', 'NMHC(GT)', 'C6H6(GT)', 'PT08.S2(NMHC)',
                    'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)',
                    'T', 'RH', 'AH']

    # Parse the decimal-comma text and turn sentinel readings into NaN so the
    # mean imputer below actually fills them.
    clean_numeric_columns(data, num_features)

    # 2. Preprocessing
    # Numerical columns: fill NaN with the column mean, then standardise.
    num_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    # No categorical features are available in this dataset; the transformer
    # is defined but not wired into the ColumnTransformer below.
    cat_features = []
    cat_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    # Only the numerical transformer is applied.
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", num_transformer, num_features),
        ],
        remainder='passthrough'  # Keep other columns (like 'Date', 'Time', and Unnamed)
    )
    # Return a DataFrame (with generated column names) instead of an ndarray.
    preprocessor.set_output(transform="pandas")

    # Apply the transformations to the Input data
    data_preprocessed = preprocessor.fit_transform(data)

    # Inspect the transformed frame, including the column names it was given.
    print(data_preprocessed.head())

    # 3. Data Splitting
    # No target variable has been chosen for this dataset yet, so the frame is
    # not split into X / y or into train / test sets here.

         Date      Time CO(GT)  PT08.S1(CO)  NMHC(GT) C6H6(GT)  PT08.S2(NMHC)  \
0  10/03/2004  18.00.00    2,6       1360.0     150.0     11,9         1046.0   
1  10/03/2004  19.00.00      2       1292.0     112.0      9,4          955.0   
2  10/03/2004  20.00.00    2,2       1402.0      88.0      9,0          939.0   
3  10/03/2004  21.00.00    2,2       1376.0      80.0      9,2          948.0   
4  10/03/2004  22.00.00    1,6       1272.0      51.0      6,5          836.0   

   NOx(GT)  PT08.S3(NOx)  NO2(GT)  PT08.S4(NO2)  PT08.S5(O3)     T    RH  \
0    166.0        1056.0    113.0        1692.0       1268.0  13,6  48,9   
1    103.0        1174.0     92.0        1559.0        972.0  13,3  47,7   
2    131.0        1140.0    114.0        1555.0       1074.0  11,9  54,0   
3    172.0        1092.0    122.0        1584.0       1203.0  11,0  60,0   
4    131.0        1205.0    116.0        1490.0       1110.0  11,2  59,6   

       AH  Unnamed: 15  Unnamed: 16  
0  0,7578         