In [81]:
from sklearn.preprocessing import RobustScaler
import pandas as pd
from torch.utils.data import Dataset, DataLoader, random_split
from PIL import Image
import torch
import os
from torchvision import transforms
import torch.nn as nn
import torchvision.models as models


In [None]:
df = pd.read_csv(
    "/Users/avanigupta/pm-estimation-from-images/data/final_data.csv"
)

In [84]:
df.dtypes
df["Hour"] = pd.to_numeric(df["Hour"], errors="coerce")

In [85]:
df.dtypes

Location              object
Filename              object
Year                   int64
Month                  int64
Day                    int64
Hour                 float64
AQI                  float64
PM2.5                float64
PM10                 float64
O3                   float64
CO                   float64
SO2                  float64
NO2                  float64
AQI_Class             object
AQI_Class_encoded      int64
exists                  bool
dtype: object

In [86]:
class AirQualityDataset(Dataset):
    def __init__(self, df, img_dir, transform=None):
        self.df = df.reset_index(drop=True)
        self.img_dir = img_dir
        self.transform = transform
        self.label_cols = ['AQI','PM2.5','PM10','O3','CO','SO2','NO2']

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        filename = row["Filename"].strip()  # remove leading/trailing spaces
        img_path = os.path.join(self.img_dir, filename)

        if not os.path.exists(img_path):
            print(f"File not found: {img_path}")
            return None  # optionally skip this sample

        img = Image.open(img_path).convert("RGB")
        if self.transform:
            img = self.transform(img)

        # Convert labels to float
        labels = torch.tensor(
            row[self.label_cols].astype(float).values,
            dtype=torch.float32
        )

        return img, labels


In [87]:
img_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        [0.485, 0.456, 0.406],
        [0.229, 0.224, 0.225]
    )
])


In [88]:
img_dir = "/Users/avanigupta/pm-estimation-from-images/data/archive/Air Pollution Image Dataset/Air Pollution Image Dataset/Combined_Dataset/All_img"  # update if needed

dataset = AirQualityDataset(
    df=df,
    img_dir=img_dir,
    transform=img_transforms
)


In [89]:
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

train_dataset, test_dataset = random_split(
    dataset,
    [train_size, test_size]
)


In [90]:
batch_size = 32

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True
)

test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False
)


In [91]:
class AQIModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
        self.model.fc = nn.Linear(512, 7)

    def forward(self, x):
        return self.model(x)


In [92]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AQIModel().to(device)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)


In [93]:
for epoch in range(30):
    model.train()
    total_loss = 0

    for imgs, labels in train_loader:
        imgs = imgs.to(device)
        labels = labels.to(device)
        
        preds = model(imgs)
        loss = criterion(preds, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(
        f"Epoch {epoch+1}, "
        f"Loss = {total_loss / len(train_loader):.4f}"
    )


Epoch 1, Loss = 0.1050
Epoch 2, Loss = 0.0415
Epoch 3, Loss = 0.0321
Epoch 4, Loss = 0.0254
Epoch 5, Loss = 0.0219
Epoch 6, Loss = 0.0206
Epoch 7, Loss = 0.0186
Epoch 8, Loss = 0.0170
Epoch 9, Loss = 0.0150
Epoch 10, Loss = 0.0142
Epoch 11, Loss = 0.0135
Epoch 12, Loss = 0.0122
Epoch 13, Loss = 0.0113
Epoch 14, Loss = 0.0110
Epoch 15, Loss = 0.0101
Epoch 16, Loss = 0.0105
Epoch 17, Loss = 0.0096
Epoch 18, Loss = 0.0094
Epoch 19, Loss = 0.0091
Epoch 20, Loss = 0.0081
Epoch 21, Loss = 0.0078
Epoch 22, Loss = 0.0081
Epoch 23, Loss = 0.0074
Epoch 24, Loss = 0.0071
Epoch 25, Loss = 0.0074
Epoch 26, Loss = 0.0066
Epoch 27, Loss = 0.0073
Epoch 28, Loss = 0.0067
Epoch 29, Loss = 0.0064
Epoch 30, Loss = 0.0064


In [43]:
import os

def filter_existing_images(df, img_dir):
    before_size = len(df)

    # Remove leading/trailing spaces from filenames
    df["Filename"] = df["Filename"].astype(str).str.strip()

    mask = df["Filename"].apply(
        lambda x: os.path.exists(os.path.join(img_dir, x))
    )

    df_filtered = df[mask].reset_index(drop=True)
    after_size = len(df_filtered)

    print(f"Number of samples before filtering: {before_size}")
    print(f"Number of samples after filtering:  {after_size}")
    print(f"Removed samples: {before_size - after_size}")

    return df_filtered



df = filter_existing_images(df, img_dir)

dataset = AirQualityDataset(
    df=df,
    img_dir=img_dir,
    transform=img_transforms
)

Number of samples before filtering: 12240
Number of samples after filtering:  0
Removed samples: 12240
