# Data Cleaning - Air Quality Dataset
This notebook performs cleaning and preprocessing of the Air Quality dataset.

In [None]:

import pandas as pd
import numpy as np

# Read the raw air-quality CSV into a DataFrame and report its dimensions
df = pd.read_csv(filepath_or_buffer="dataset/air_quality.csv")
initial_shape = df.shape
print("Initial shape:", initial_shape)
# Last expression of the cell: shows the first rows as a preview
df.head()


In [None]:

# Count the missing entries in each column and print the per-column totals
missing_per_column = df.isnull().sum()
print("Missing values per column:")
print(missing_per_column)


In [None]:

# Fill missing numeric values with column mean.
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on df[col]: that form is chained assignment, which
# emits a FutureWarning on pandas 2.x and leaves df unmodified once
# Copy-on-Write is in effect.
for col in df.select_dtypes(include=[np.number]).columns:
    df[col] = df[col].fillna(df[col].mean())

# Only numeric columns are filled above, so non-numeric columns may still
# report missing values in this summary.
print("Missing values after filling:")
print(df.isnull().sum())


In [None]:

# Remove fully duplicated rows in place, keeping the first occurrence of each
df.drop_duplicates(keep="first", inplace=True)
deduplicated_shape = df.shape
print("Shape after dropping duplicates:", deduplicated_shape)


In [None]:

from sklearn.preprocessing import MinMaxScaler

# Select the numeric columns, then rescale them with a MinMaxScaler
# (default settings) and write the scaled values back into the DataFrame.
num_cols = df.select_dtypes(include=[np.number]).columns
scaler = MinMaxScaler()

scaled_values = scaler.fit_transform(df[num_cols])
df[num_cols] = scaled_values
print("Dataset normalized.")
# Last expression of the cell: preview of the scaled data
df.head()


In [None]:

# Write the cleaned DataFrame to CSV, omitting the index column
df.to_csv(path_or_buf="dataset/air_quality_cleaned.csv", index=False)
print("Cleaned dataset saved as dataset/air_quality_cleaned.csv")
