# Dataloader File

This file's purpose is to get, prepare, explore, transform and load the images from the dataset.

In [1]:
import torch 
from torch import nn

# Display the installed PyTorch version so the environment used for this
# notebook is recorded alongside its outputs.
torch.__version__

'2.2.2'

In [2]:
import torchvision

# Display the installed torchvision version so the environment used for this
# notebook is recorded alongside its outputs.
torchvision.__version__

'0.17.2'

In [4]:
# Device-agnostic setup: default to the CPU and switch to the GPU only when
# PyTorch reports that CUDA is usable on this machine.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Bare expression so the notebook displays the selected device.
device

'cpu'

## Data Setup

Since we use Kaggle to access this dataset, the Kaggle API must be set up first. You can create an API token from your Kaggle account settings page:
https://www.kaggle.com/settings

Additionally, here is the link to the dataset website: https://www.kaggle.com/datasets/puneet6060/intel-image-classification

### Getting Data

In [None]:
import os
import zipfile
import random
from pathlib import Path
import subprocess

# --- Configuration -----------------------------------------------------------
# Kaggle dataset identifier passed to the Kaggle CLI
dataset = 'puneet6060/intel-image-classification'

# Fraction of the images in the archive to extract, and the seed that makes
# the random selection identical on every "Restart & Run All".
SUBSET_FRACTION = 0.25
SUBSET_SEED = 42

# Navigate one level up to place the data folder at the same level as the src folder
base_path = Path("..")  # Represents one level above the current directory
data_path = base_path / 'data'
zip_path = data_path / 'intel-image-classification.zip'
subset_path = data_path / 'intel-image-classification-subset'

# Create data directory if it doesn't exist
data_path.mkdir(parents=True, exist_ok=True)

# --- Download ----------------------------------------------------------------
# Download the zip file if it doesn't already exist
if not zip_path.exists():
    print(f"Downloading {dataset} dataset...")
    # check=True raises CalledProcessError when the Kaggle CLI exits with a
    # non-zero status (e.g. missing credentials), so the failure is reported
    # here instead of later as a confusing "zip file not found" error.
    subprocess.run(
        ['kaggle', 'datasets', 'download', '-d', dataset, '-p', str(data_path)],
        check=True,
    )
    if not zip_path.exists():
        raise FileNotFoundError(
            f"Download finished but {zip_path} was not created."
        )
else:
    print(f"{zip_path} already exists. Skipping download.")

# --- Subset extraction -------------------------------------------------------
# Treat the subset as present only when the directory actually contains
# something; an empty directory left behind by a failed run must not cause
# extraction to be skipped forever.
subset_ready = subset_path.is_dir() and any(subset_path.iterdir())

if not subset_ready:
    print(f"Extracting a {SUBSET_FRACTION:.0%} subset of the dataset...")

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        # List all image files in the zip. The extension match is
        # case-insensitive, and the list is sorted so the seeded sample below
        # does not depend on the archive's internal ordering.
        all_files = sorted(
            f for f in zip_ref.namelist()
            if f.lower().endswith(('.jpg', '.png'))
        )
        if not all_files:
            raise RuntimeError(f"No .jpg/.png files found in {zip_path}.")

        # Select a random fraction of the files using a dedicated, seeded
        # generator so the global `random` state is left untouched.
        subset_size = int(len(all_files) * SUBSET_FRACTION)
        subset_files = random.Random(SUBSET_SEED).sample(all_files, subset_size)

        # Create the target directory only once the archive has been opened
        # and sampled successfully, then extract the selected files.
        subset_path.mkdir(parents=True, exist_ok=True)
        zip_ref.extractall(subset_path, members=subset_files)
else:
    print(f"Subset already exists at {subset_path}. Skipping extraction.")

print(f"Subset extracted to: {subset_path}")