In [None]:
!pip install kaggle



In [None]:
from google.colab import files

# Bind the result instead of leaving the call as the cell's last expression:
# the returned dict maps each uploaded file name to its raw bytes, so echoing
# it writes the Kaggle username and API key into the notebook's saved output.
uploaded = files.upload()
assert 'kaggle.json' in uploaded, "Expected a file named kaggle.json to be uploaded"

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [None]:
# Move the uploaded kaggle.json into ~/.kaggle/, creating the directory first
# (-p makes mkdir a no-op when it already exists).
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the air-pollution image dataset archive with the Kaggle CLI.
!kaggle datasets download -d adarshrouniyar/air-pollution-image-dataset-from-india-and-nepal


Dataset URL: https://www.kaggle.com/datasets/adarshrouniyar/air-pollution-image-dataset-from-india-and-nepal
License(s): Attribution-NonCommercial-ShareAlike 3.0 IGO (CC BY-NC-SA 3.0 IGO)
Downloading air-pollution-image-dataset-from-india-and-nepal.zip to /content
 75% 480M/636M [00:13<00:05, 32.7MB/s]

In [None]:
import os
new_folder = '/content/extracted_dataset/'
os.makedirs(new_folder, exist_ok=True)
!unzip -q /content/air-pollution-image-dataset-from-india-and-nepal.zip -d /content/extracted_dataset/

In [None]:
import tensorflow as tf
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras import layers, Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Load the train / validation / test metadata tables.
train_df = pd.read_csv('/content/extracted_dataset/Dataset_for_AQI_Classification/Dataset_for_AQI_Classification/train_data.csv')
val_df = pd.read_csv('/content/extracted_dataset/Dataset_for_AQI_Classification/Dataset_for_AQI_Classification/val_data.csv')
test_df = pd.read_csv('/content/extracted_dataset/Dataset_for_AQI_Classification/Dataset_for_AQI_Classification/testing_data.csv')

# Numerical features (exclude AQI to avoid leakage)
numerical_cols = ['PM2.5', 'PM10', 'O3', 'CO', 'SO2', 'NO2', 'Year', 'Month', 'Day', 'Hour']
scaler = StandardScaler()

for split_df in (train_df, val_df, test_df):
    # 'Hour' may be stored as text such as "HH:MM"; keep only the part before
    # the first ':' (values without ':' are left untouched).
    split_df['Hour'] = split_df['Hour'].astype(str).str.split(':').str[0]

    # Convert every feature column to numeric, coercing unparseable values to NaN.
    for col in numerical_cols:
        split_df[col] = pd.to_numeric(split_df[col], errors='coerce')

# Impute missing values in ALL splits with the TRAINING means. The scaler below
# is fit on the training split only, so imputation must use the same reference
# statistics; filling val/test with their own means would let evaluation-set
# statistics leak into the features and make the splits inconsistent.
train_means = train_df[numerical_cols].mean()
for split_df in (train_df, val_df, test_df):
    split_df[numerical_cols] = split_df[numerical_cols].fillna(train_means)

# Standardise: fit on train, apply the same transform to val and test.
X_train_num = scaler.fit_transform(train_df[numerical_cols])
X_val_num = scaler.transform(val_df[numerical_cols])
X_test_num = scaler.transform(test_df[numerical_cols])

# Training images: rescale to [0, 1] and apply random augmentation.
datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
    zoom_range=0.2,
    fill_mode='nearest'
)

# Validation / test images: rescale only. Random rotations, shifts, flips and
# zooms must not be applied to evaluation data, otherwise the reported metrics
# are measured on distorted images and change from run to run.
eval_datagen = ImageDataGenerator(rescale=1./255)

batch_size = 64

IMG_DIR = '/content/extracted_dataset/Air Pollution Image Dataset/Air Pollution Image Dataset/Combined_Dataset/All_img/'

# One shared, sorted class list so that a given label maps to the same one-hot
# index in every split, even if a split happens to lack one of the classes.
class_names = sorted(pd.concat([train_df, val_df, test_df])['AQI_Class'].dropna().unique())


def _make_image_flow(image_datagen, frame):
    """Build a dataframe-backed image iterator with the settings shared by all splits.

    Args:
        image_datagen: the ImageDataGenerator that transforms the images.
        frame: DataFrame with a 'Filename' column (relative to IMG_DIR) and an
            'AQI_Class' label column.

    Returns:
        The iterator returned by ``flow_from_dataframe``; it yields
        (image batch, one-hot label batch) pairs in dataframe order.
    """
    return image_datagen.flow_from_dataframe(
        frame,
        directory=IMG_DIR,
        x_col='Filename',
        y_col='AQI_Class',
        target_size=(224, 224),
        class_mode='categorical',
        classes=class_names,
        batch_size=batch_size,
        # NOTE(review): kept unshuffled as in the original; numerical rows are
        # paired with images via the iterator's index_array in hybrid_generator.
        shuffle=False
    )


train_generator = _make_image_flow(datagen, train_df)
val_generator = _make_image_flow(eval_datagen, val_df)
test_generator = _make_image_flow(eval_datagen, test_df)

# CNN Branch (MobileNetV2)
cnn_input = layers.Input(shape=(224, 224, 3), name='image_input')
base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Fine-tune only the last 20 layers of the backbone; everything before is frozen.
base_model.trainable = True
for layer in base_model.layers[:-20]:
    layer.trainable = False

# The image generators deliver pixels in [0, 1] (rescale=1./255), but the
# ImageNet MobileNetV2 weights were trained on inputs in [-1, 1]. Map the range
# here so the (mostly frozen) pretrained filters see the distribution they expect.
x = layers.Rescaling(scale=2.0, offset=-1.0, name='to_mobilenet_range')(cnn_input)
# NOTE(review): Keras' fine-tuning guide recommends calling the backbone with
# training=False to keep BatchNorm statistics frozen - consider adopting it.
x = base_model(x)
x = layers.GlobalAveragePooling2D()(x)
x = layers.Dense(256, activation='relu')(x)
x = layers.Dropout(0.5)(x)
x = layers.Dense(128, activation='relu')(x)
cnn_features = layers.Dropout(0.5)(x)

# Numerical Branch: small MLP over the standardised tabular features.
num_input = layers.Input(shape=(len(numerical_cols),), name='num_input')
y = layers.Dense(64, activation='relu')(num_input)
y = layers.Dense(32, activation='relu')(y)
num_features = layers.Dense(16, activation='relu')(y)

# Combine Branches: concatenate both embeddings and classify into 6 classes.
combined = layers.concatenate([cnn_features, num_features])
z = layers.Dense(64, activation='relu')(combined)
z = layers.Dropout(0.5)(z)
output = layers.Dense(6, activation='softmax')(z)

# Build and compile model. Input order is (image, numerical features).
model = Model(inputs=[cnn_input, num_input], outputs=output)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss='categorical_crossentropy',
    metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
)
model.summary()

# Custom generator for hybrid model
def hybrid_generator(img_generator, num_data, batch_size):
    """Yield ((image_batch, numerical_batch), labels) forever for the two-input model.

    Args:
        img_generator: batch iterator exposing ``reset()``, ``batch_index``,
            ``batch_size`` and ``index_array``, and returning
            ``(images, labels)`` from ``next()``.
        num_data: array of numerical features, indexable by an array of row
            positions taken from ``img_generator.index_array``.
        batch_size: required batch length; batches of any other length
            (e.g. the final partial batch of a pass) are skipped.

    Yields:
        ``((img_batch, num_batch), labels)`` where ``num_batch`` holds the
        numerical rows of exactly the samples in ``img_batch``.

    Note:
        If no batch of length ``batch_size`` can ever be produced, the loop
        spins without yielding.
    """
    img_generator.reset()
    while True:
        # Must be read BEFORE next(): producing a batch advances batch_index
        # (or wraps it to 0), so reading it afterwards points at the *following*
        # batch and pairs images with the wrong numerical rows.
        batch_start = img_generator.batch_index * img_generator.batch_size
        img_batch, labels = next(img_generator)
        if len(img_batch) != batch_size:
            continue  # Skip incomplete batches
        # index_array is read AFTER next() because the iterator may create or
        # reorder it lazily when a new pass starts.
        batch_indices = img_generator.index_array[batch_start:batch_start + len(img_batch)]
        num_batch = num_data[batch_indices]
        if len(num_batch) != len(img_batch):
            continue  # Skip if numerical batch doesn't match image batch size
        yield (img_batch, num_batch), labels

# Element spec of one generated batch: ((images, numerical features), labels).
image_spec = tf.TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32)
numeric_spec = tf.TensorSpec(shape=(None, len(numerical_cols)), dtype=tf.float32)
label_spec = tf.TensorSpec(shape=(None, 6), dtype=tf.float32)  # one-hot, 6 classes
output_signature = ((image_spec, numeric_spec), label_spec)


def _train_batches():
    """Return a new hybrid generator over the training split."""
    return hybrid_generator(train_generator, X_train_num, batch_size)


def _val_batches():
    """Return a new hybrid generator over the validation split."""
    return hybrid_generator(val_generator, X_val_num, batch_size)


def _test_batches():
    """Return a new hybrid generator over the test split."""
    return hybrid_generator(test_generator, X_test_num, batch_size)


# Wrap each generator factory in a tf.data.Dataset.
train_dataset = tf.data.Dataset.from_generator(_train_batches, output_signature=output_signature)
val_dataset = tf.data.Dataset.from_generator(_val_batches, output_signature=output_signature)
test_dataset = tf.data.Dataset.from_generator(_test_batches, output_signature=output_signature)

# Number of complete batches per split (any remainder is not counted).
train_steps, val_steps, test_steps = (
    len(split_df) // batch_size for split_df in (train_df, val_df, test_df)
)

# Callbacks: stop when val_loss has not improved for 5 epochs (restoring the
# best weights), and multiply the learning rate by 0.2 after 3 stagnant epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=3,
    min_lr=1e-5,
)
training_callbacks = [early_stopping, reduce_lr]

# Fit for at most 20 epochs over the complete batches of each split.
history = model.fit(
    train_dataset,
    epochs=20,
    steps_per_epoch=train_steps,
    validation_data=val_dataset,
    validation_steps=val_steps,
    callbacks=training_callbacks,
)

# Test-set metrics come back in compile order: loss, accuracy, precision, recall.
test_metrics = model.evaluate(test_dataset, steps=test_steps)
test_loss, test_acc, test_precision, test_recall = test_metrics
print(f"Test accuracy: {test_acc:.4f}, Precision: {test_precision:.4f}, Recall: {test_recall:.4f}")

# Training curves: accuracy on the left panel, loss on the right.
fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 4))

acc_ax.plot(history.history['accuracy'], label='Train Accuracy')
acc_ax.plot(history.history['val_accuracy'], label='Val Accuracy')
acc_ax.set_title('Model Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()

loss_ax.plot(history.history['loss'], label='Train Loss')
loss_ax.plot(history.history['val_loss'], label='Val Loss')
loss_ax.set_title('Model Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

plt.show()

# Predict on the test set, ensuring all samples are included.
#
# hybrid_generator skips the final incomplete batch and then loops forever, so
# asking model.predict(test_dataset) for ceil(n / batch_size) steps returns a
# last step made of samples from the *start* of the data, which no longer line
# up with y_true. Walk the image iterator batch by batch instead, so every
# sample is predicted exactly once and in order.
total_test_samples = len(test_df)
test_steps_exact = len(test_generator)  # batches per pass, including a partial last one

test_generator.reset()
test_pred_batches = []
for step in range(test_steps_exact):
    step_images, _step_labels = test_generator[step]
    # Row positions of the samples in this batch.
    # NOTE(review): assumes no rows were dropped by filename validation, so
    # positions index X_test_num directly - confirm image count == len(test_df).
    step_rows = test_generator.index_array[
        step * test_generator.batch_size:(step + 1) * test_generator.batch_size
    ]
    step_features = X_test_num[step_rows].astype('float32')
    test_pred_batches.append(model.predict_on_batch([step_images, step_features]))

y_pred = np.concatenate(test_pred_batches, axis=0)
y_pred_classes = y_pred.argmax(axis=1)

# True labels in iterator order (the iterator was created with shuffle=False).
y_true = np.asarray(test_generator.classes)[:len(y_pred)]

# Confusion matrix and classification report. `labels` is pinned so the report
# still has six rows if a class is absent from the test labels or predictions.
target_names = ['a_Good', 'b_Moderate', 'c_Unhealthy_for_Sensitive_Groups', 'd_Unhealthy', 'e_Very_Unhealthy', 'f_Severe']
class_labels = list(range(len(target_names)))
print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred_classes, labels=class_labels))
print("Classification Report:\n", classification_report(y_true, y_pred_classes, labels=class_labels, target_names=target_names))

In [None]:
# Holdout set evaluation
# NOTE(review): the holdout is sampled from train + val + test combined, so it
# contains rows the model was fitted on; treat these metrics as optimistic
# rather than as an independent estimate.
all_df = pd.concat([train_df, val_df, test_df], ignore_index=True)
train_val_df, holdout_df = train_test_split(all_df, test_size=0.2, random_state=42, stratify=all_df['AQI_Class'])
X_holdout_num = scaler.transform(holdout_df[numerical_cols])

# Evaluation images are only rescaled; random augmentation would distort the
# inputs and make the metrics vary between runs.
holdout_datagen = ImageDataGenerator(rescale=1./255)
holdout_generator = holdout_datagen.flow_from_dataframe(
    holdout_df,
    directory='/content/extracted_dataset/Air Pollution Image Dataset/Air Pollution Image Dataset/Combined_Dataset/All_img/',
    x_col='Filename',
    y_col='AQI_Class',
    target_size=(224, 224),
    class_mode='categorical',
    # Reuse the training label order so one-hot indices match what the model learned.
    classes=list(train_generator.class_indices),
    batch_size=batch_size,
    shuffle=False
)

# Create holdout dataset
holdout_dataset = tf.data.Dataset.from_generator(
    lambda: hybrid_generator(holdout_generator, X_holdout_num, batch_size),
    output_signature=output_signature
)

# hybrid_generator only yields complete batches and then starts over, so
# evaluate over the number of COMPLETE batches; a ceiling here would score the
# first batch twice.
total_holdout_samples = len(holdout_df)
holdout_steps = total_holdout_samples // batch_size

# Evaluate on holdout set (complete batches only)
holdout_loss, holdout_acc, holdout_precision, holdout_recall = model.evaluate(holdout_dataset, steps=holdout_steps)
print(f"Holdout accuracy: {holdout_acc:.4f}, Precision: {holdout_precision:.4f}, Recall: {holdout_recall:.4f}")

# Predict on every holdout sample, in order, by walking the image iterator
# directly (this includes the final partial batch).
holdout_generator.reset()
holdout_pred_batches = []
for step in range(len(holdout_generator)):
    step_images, _step_labels = holdout_generator[step]
    # NOTE(review): assumes no rows were dropped by filename validation, so
    # these positions index X_holdout_num directly - confirm.
    step_rows = holdout_generator.index_array[
        step * holdout_generator.batch_size:(step + 1) * holdout_generator.batch_size
    ]
    step_features = X_holdout_num[step_rows].astype('float32')
    holdout_pred_batches.append(model.predict_on_batch([step_images, step_features]))

y_pred_holdout = np.concatenate(holdout_pred_batches, axis=0)
y_pred_classes_holdout = y_pred_holdout.argmax(axis=1)
y_true_holdout = np.asarray(holdout_generator.classes)[:len(y_pred_holdout)]

# `labels` is pinned so the report keeps six rows even if a class is missing.
holdout_target_names = ['a_Good', 'b_Moderate', 'c_Unhealthy_for_Sensitive_Groups', 'd_Unhealthy', 'e_Very_Unhealthy', 'f_Severe']
holdout_labels = list(range(len(holdout_target_names)))
print("Holdout Confusion Matrix:\n", confusion_matrix(y_true_holdout, y_pred_classes_holdout, labels=holdout_labels))
print("Holdout Classification Report:\n", classification_report(y_true_holdout, y_pred_classes_holdout, labels=holdout_labels, target_names=holdout_target_names))