<a href="https://colab.research.google.com/github/2303A51072/AIML-2025_B16/blob/main/2303A51072_27_B16.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, silhouette_score

# Load dataset correctly
file_path = "/content/AirQualityUCI.xlsx"
data = pd.read_excel(file_path, decimal=",", engine='openpyxl')  # Use openpyxl for .xlsx files

# Data cleaning
data = data.rename(columns=lambda x: x.strip())  # Remove spaces from column names
if 'Unnamed: 15' in data.columns and 'Unnamed: 16' in data.columns:
    data = data.drop(["Unnamed: 15", "Unnamed: 16"], axis=1)  # Drop unnecessary columns if they exist

# Ensure 'Date' and 'Time' are strings before concatenating
data['Datetime'] = pd.to_datetime(data['Date'].astype(str) + ' ' + data['Time'].astype(str), errors='coerce')
data = data.drop(columns=['Date', 'Time'])  # Drop redundant columns
data = data.dropna()  # Remove rows with missing values

# 1. Identify the top 5 reasons for air quality issues
correlations = data.corr()['CO(GT)'].abs().sort_values(ascending=False).iloc[1:6]
top_reasons = correlations.index.tolist()
print("Top 5 reasons for air quality issues:", top_reasons)

# 2. Identify the day of the week with most air quality issues
data['DayOfWeek'] = data['Datetime'].dt.day_name()
most_issues_day = data.groupby('DayOfWeek')['CO(GT)'].mean().idxmax()
print("Day with most air quality issues:", most_issues_day)

# 3. Find the max and min air quality levels
max_air_quality = data['CO(GT)'].max()
min_air_quality = data['CO(GT)'].min()
print(f"Max air quality: {max_air_quality}, Min air quality: {min_air_quality}")

# 4. Identify the highest and lowest temperatures of air quality
highest_temp = data.loc[data['T'].idxmax()]
lowest_temp = data.loc[data['T'].idxmin()]
print("Highest temperature for air quality:")
print(highest_temp)
print("\nLowest temperature for air quality:")
print(lowest_temp)

# 5. Apply Classification or Clustering Model
# Feature Selection
features = data[['CO(GT)', 'T', 'RH', 'AH']]
labels = np.where(data['CO(GT)'] > 5, 1, 0)  # Binary classification (High pollution > 5)

# Preprocessing
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Classification Model
X_train, X_test, y_train, y_test = train_test_split(features_scaled, labels, test_size=0.3, random_state=42)
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Clustering Model
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(features_scaled)
data['Cluster'] = clusters
print("\nClustering Silhouette Score:", silhouette_score(features_scaled, clusters))


Top 5 reasons for air quality issues: ['NO2(GT)', 'NOx(GT)', 'Datetime', 'NMHC(GT)', 'PT08.S3(NOx)']
Day with most air quality issues: Friday
Max air quality: 11.9, Min air quality: -200.0
Highest temperature for air quality:
CO(GT)                           2.3
PT08.S1(CO)                   1186.0
NMHC(GT)                        -200
C6H6(GT)                   15.068536
PT08.S2(NMHC)                1150.75
NOx(GT)                        166.0
PT08.S3(NOx)                   680.0
NO2(GT)                        157.0
PT08.S4(NO2)                  1686.0
PT08.S5(O3)                  1234.75
T                               44.6
RH                            12.625
AH                          1.168406
Datetime         2004-07-22 16:00:00
DayOfWeek                   Thursday
Name: 3214, dtype: object

Lowest temperature for air quality:
CO(GT)                           1.7
PT08.S1(CO)                   -200.0
NMHC(GT)                         222
C6H6(GT)                      -200.0
PT08.S2(