In [1]:
from sklearn.preprocessing import RobustScaler
import pandas as pd
from torch.utils.data import Dataset, DataLoader, random_split
from PIL import Image
import torch
import os
from torchvision import transforms
import torch.nn as nn
import torchvision.models as models
from sklearn.preprocessing import MinMaxScaler
import joblib

In [2]:
df = pd.read_csv(
    "/Users/avanigupta/pm-estimation-from-images/data/final_data.csv"
)

In [3]:
df.dtypes
df["Hour"] = pd.to_numeric(df["Hour"], errors="coerce")

In [4]:
df.dtypes

Location              object
Filename              object
Year                   int64
Month                  int64
Day                    int64
Hour                 float64
AQI                    int64
PM2.5                float64
PM10                 float64
O3                   float64
CO                   float64
SO2                  float64
NO2                  float64
AQI_Class             object
AQI_Class_encoded      int64
exists                  bool
dtype: object

In [5]:
label_cols = ['AQI','PM2.5','PM10','O3','CO','SO2','NO2']

scaler = MinMaxScaler()

# Fit only on label columns
df[label_cols] = scaler.fit_transform(df[label_cols])

# Save scaler for inference later
joblib.dump(scaler, "/Users/avanigupta/pm-estimation-from-images/models/label_scaler.save")

['/Users/avanigupta/pm-estimation-from-images/models/label_scaler.save']

In [6]:
class AirQualityDataset(Dataset):
    def __init__(self, df, img_dir, transform=None):
        self.df = df.reset_index(drop=True)
        self.img_dir = img_dir
        self.transform = transform
        self.label_cols = ['AQI','PM2.5','PM10','O3','CO','SO2','NO2']

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        filename = row["Filename"].strip()  # remove leading/trailing spaces
        img_path = os.path.join(self.img_dir, filename)

        if not os.path.exists(img_path):
            print(f"File not found: {img_path}")
            return None  # optionally skip this sample

        img = Image.open(img_path).convert("RGB")
        if self.transform:
            img = self.transform(img)

        # Convert labels to float
        labels = torch.tensor(
            row[self.label_cols].astype(float).values,
            dtype=torch.float32
        )

        return img, labels


In [7]:
img_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        [0.485, 0.456, 0.406],
        [0.229, 0.224, 0.225]
    )
])


In [8]:
img_dir = "/Users/avanigupta/pm-estimation-from-images/data/archive/Air Pollution Image Dataset/Air Pollution Image Dataset/Combined_Dataset/All_img"  # update if needed

dataset = AirQualityDataset(
    df=df,
    img_dir=img_dir,
    transform=img_transforms
)


In [9]:
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

train_dataset, test_dataset = random_split(
    dataset,
    [train_size, test_size]
)


In [10]:
batch_size = 32

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True
)

test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False
)


In [11]:
def get_model(model_name):
    if model_name == "resnet18":
        model = models.resnet18(
            weights=models.ResNet18_Weights.IMAGENET1K_V1
        )
        model.fc = nn.Linear(512, 7)

    elif model_name == "resnet34":
        model = models.resnet34(
            weights=models.ResNet34_Weights.IMAGENET1K_V1
        )
        model.fc = nn.Linear(512, 7)

    elif model_name == "mobilenet_v2":
        model = models.mobilenet_v2(
            weights=models.MobileNet_V2_Weights.IMAGENET1K_V1
        )
        model.classifier[1] = nn.Linear(
            model.classifier[1].in_features, 7
        )

    else:
        raise ValueError("Unknown model name")

    return model


In [12]:
model_names = [
    "resnet34",
    "resnet18",
    "mobilenet_v2"
]

In [13]:


model_dir = "/Users/avanigupta/pm-estimation-from-images/models"
os.makedirs(model_dir, exist_ok=True)
torch.save(test_dataset, "/Users/avanigupta/pm-estimation-from-images/models/test_dataset.pt")
num_epochs = 25
results = {}

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [15]:
save_dir = "/Users/avanigupta/pm-estimation-from-images/models"
os.makedirs(save_dir, exist_ok=True)

In [16]:
for model_name in model_names:
    print(f"\nTraining model: {model_name}")

    model = get_model(model_name).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    # ---------- TRAIN ----------
    for epoch in range(num_epochs):
        print("yes")
        model.train()
        total_loss = 0

        for imgs, labels in train_loader:
            imgs = imgs.to(device)
            labels = labels.to(device)

            preds = model(imgs)
            loss = criterion(preds, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"[{model_name}] Epoch {epoch+1}: Loss = {avg_loss:.4f}")

    # ---------- EVALUATE ----------
    model.eval()
    test_loss = 0

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for imgs, labels in test_loader:
            imgs = imgs.to(device)
            labels = labels.to(device)

            preds = model(imgs)
            loss = criterion(preds, labels)
            test_loss += loss.item()

            all_preds.append(preds.cpu())
            all_labels.append(labels.cpu())

    test_loss /= len(test_loader)
    print(f"[{model_name}] Test MSE (scaled): {test_loss:.4f}")

    # Stack all batches
    all_preds = torch.cat(all_preds).numpy()
    all_labels = torch.cat(all_labels).numpy()

    # Load scaler
    scaler = joblib.load("/Users/avanigupta/pm-estimation-from-images/models/label_scaler.save")

    # Inverse transform
    all_preds_unscaled = scaler.inverse_transform(all_preds)
    all_labels_unscaled = scaler.inverse_transform(all_labels)

    # ---------- SAVE MODEL ----------
    model_path = os.path.join(save_dir, f"{model_name}_aqi.pth")
    torch.save(model.state_dict(), model_path)
    print(f"ðŸ’¾ Saved model â†’ {model_path}")


Training model: resnet34
yes
[resnet34] Epoch 1: Loss = 0.0270
yes
[resnet34] Epoch 2: Loss = 0.0052
yes
[resnet34] Epoch 3: Loss = 0.0036
yes
[resnet34] Epoch 4: Loss = 0.0030
yes
[resnet34] Epoch 5: Loss = 0.0026
yes
[resnet34] Epoch 6: Loss = 0.0022
yes
[resnet34] Epoch 7: Loss = 0.0019
yes
[resnet34] Epoch 8: Loss = 0.0017
yes
[resnet34] Epoch 9: Loss = 0.0014
yes
[resnet34] Epoch 10: Loss = 0.0014
yes
[resnet34] Epoch 11: Loss = 0.0014
yes
[resnet34] Epoch 12: Loss = 0.0013
yes
[resnet34] Epoch 13: Loss = 0.0012
yes
[resnet34] Epoch 14: Loss = 0.0011
yes
[resnet34] Epoch 15: Loss = 0.0011
yes
[resnet34] Epoch 16: Loss = 0.0011
yes
[resnet34] Epoch 17: Loss = 0.0010
yes
[resnet34] Epoch 18: Loss = 0.0010
yes
[resnet34] Epoch 19: Loss = 0.0012
yes
[resnet34] Epoch 20: Loss = 0.0010
yes
[resnet34] Epoch 21: Loss = 0.0010
yes
[resnet34] Epoch 22: Loss = 0.0008
yes
[resnet34] Epoch 23: Loss = 0.0008
yes
[resnet34] Epoch 24: Loss = 0.0010
yes
[resnet34] Epoch 25: Loss = 0.0009
[resnet3