
# 🌍 Milestone 1: Data Collection, Exploration, and Preprocessing

This notebook performs:
- Dataset loading
- Data exploration (class distribution, image samples)
- Image preprocessing
- Dataset splitting (Train / Validation / Test)


In [3]:

# 📦 Import required libraries
import os
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torchvision.datasets import ImageFolder
from torchvision import transforms
from torch.utils.data import random_split
from collections import Counter
import numpy as np


In [4]:

# 📁 Load the EuroSAT dataset (update path if needed)
data_dir = r'C:\users\a\downloads\EuroSAT'  # <- Edit this to match your local path

transform = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])  # normalize to [-1, 1]
])

from torch.utils.data import Subset

full_dataset = ImageFolder(root=data_dir, transform=transform)
subset_indices = list(range(0, len(full_dataset), 10))  # load every 10th image only
dataset = Subset(full_dataset, subset_indices)
class_names = full_dataset.classes  # ✔️ access from the original full dataset
print(f"Total images: {len(dataset)}")
print(f"Classes: {class_names}")


Total images: 2700
Classes: ['2750']


In [None]:

# 📊 Class distribution analysis
labels = [label for _, label in dataset]
label_counts = Counter(labels)

plt.figure(figsize=(10, 5))
sns.barplot(x=[class_names[i] for i in label_counts.keys()], y=list(label_counts.values()))
plt.title("Class Distribution")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:

# 🖼️ Display sample images from the dataset
def show_samples(dataset, class_names, n=5):
    plt.figure(figsize=(15, 5))
    for i in range(n):
        img, label = dataset[i * 200]  # spread out samples
        img = img.permute(1, 2, 0).numpy()  # reshape to HWC
        img = img * 0.5 + 0.5  # unnormalize
        plt.subplot(1, n, i + 1)
        plt.imshow(img)
        plt.title(class_names[label])
        plt.axis('off')
    plt.show()

show_samples(dataset, class_names)


In [None]:

# ✂️ Split the dataset into training, validation, and testing sets
train_size = int(0.7 * len(dataset))
val_size = int(0.2 * len(dataset))
test_size = len(dataset) - train_size - val_size

train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

print(f"Train size: {len(train_dataset)}")
print(f"Validation size: {len(val_dataset)}")
print(f"Test size: {len(test_dataset)}")
