In [1]:
import torch
import torchvision
import torchvision.transforms as transforms
import sys

from torch.utils.data import DataLoader
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
import torch.optim as optim
import json
import glob
import os
from PIL import Image
import random

import bottleneck
from resnet_pytorch import ResNet
from datasets import load_dataset, load_metric, Dataset, DatasetDict

# load .env file
from dotenv import load_dotenv
load_dotenv()

sys.path.insert(0, '../')
from data_loader import get_data_to_load, split_json_and_image_files, load_json_files, load_image_files, load_json_file, load_image_file

## Pre-processing data

In [None]:
# Ask data_loader for the list of data files to work with.
# NOTE(review): the recorded run printed "Limited files: 200" for limit=100,
# so the limit presumably counts JSON/image pairs — confirm in data_loader.
list_files = get_data_to_load(
    loading_file='../3_data_preparation/4_data_cleaning/updated_data_list',
    file_location='../3_data_preparation/01_enriching/.data',
    image_file_location='../1_data_collection/.data',
    allow_new_file_creation=True,
    from_remote_only=True,
    download_link='env',
    limit=100,
    shuffle_seed=43,
    allow_file_location_env=True,
    allow_json_file_location_env=True,
    allow_image_file_location_env=True,
)

# Separate the combined listing into JSON paths and image paths, then pair
# the two lists positionally.
json_files, image_files = split_json_and_image_files(list_files)
paired_files = list(zip(json_files, image_files))
# data = load_json_files(json_files)
# countries = [item['country_name'] for item in data]
# coordinates = [item['coordinates'] for item in data]
# images = load_image_files(image_files)

Getting files list from remote
Got files list from remote
Parsed files list from remote
All remote files: 274796
Filtering out unpaired files
Filtered out 17666 unpaired files
Relevant files: 257130
Limited files: 200
200


In [None]:
import torch
import json
from torchvision import transforms
from torch.utils.data import Dataset
from PIL import Image
import numpy as np

class CustomImageNameDataset(Dataset):
    """Dataset that yields file paths instead of loaded data.

    Item ``idx`` is the tuple ``(image_paths[idx], json_paths[idx])``.

    Args:
        image_paths: Sequence of image file paths.
        json_paths: Sequence of JSON file paths, index-aligned with
            ``image_paths``.
        transform: Optional callable. It is stored on the instance but is not
            applied by this class.

    Raises:
        ValueError: If ``image_paths`` and ``json_paths`` differ in length.
    """

    def __init__(self, image_paths, json_paths, transform=None):
        # Fail early: a mismatch would otherwise surface as an IndexError in
        # the middle of iteration, or silently ignore surplus JSON paths.
        if len(image_paths) != len(json_paths):
            raise ValueError(
                f"Mismatch in number of images ({len(image_paths)}) "
                f"and labels ({len(json_paths)})"
            )
        self.image_paths = image_paths
        self.json_paths = json_paths
        self.transform = transform

    def __len__(self):
        """Return the number of (image, JSON) path pairs."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return the ``(image_path, json_path)`` pair stored at ``idx``."""
        return self.image_paths[idx], self.json_paths[idx]

# Image preprocessing pipeline: resize to 100x100, convert to a tensor, then
# normalise each of the three channels with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
        transforms.Resize(size=(100, 100)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

In [None]:
# Model inputs are the image paths and the targets are the JSON paths; the two
# lists have to line up one-to-one.
input_images, labels = image_files, json_files
assert len(input_images) == len(labels), "Mismatch in number of images and labels"

In [None]:
# Spot-check a single label path using ordinary indexing
labels[5]

'../.data/geoguessr_result_singleplayer_VROl268f9c1R4jWy_0.json'

In [None]:
# Dataset of (image path, JSON path) pairs, served in shuffled batches of 64
file_name_dataset = CustomImageNameDataset(
    image_paths=input_images,
    json_paths=labels,
    transform=transform,
)
file_name_loader = DataLoader(
    dataset=file_name_dataset,
    batch_size=64,
    shuffle=True,
)

In [None]:
# Accumulators filled from the batch processed below
countries = []
coordinates = []
transformed_images = []

# Batch variables get their own names so the notebook-level `image_files` and
# `labels` lists (read again by later cells) are not overwritten by the loop.
for batch_image_paths, batch_json_paths in file_name_loader:
    batch_images = load_image_files(batch_image_paths)
    batch_labels = load_json_files(batch_json_paths)
    countries.extend(item['country_name'] for item in batch_labels)
    coordinates.extend(item['coordinates'] for item in batch_labels)
    # `transform` is a per-image pipeline, so it is applied image by image.
    # NOTE(review): assumes load_image_files returns an iterable of PIL images — confirm.
    transformed_images.extend(transform(image) for image in batch_images)
    # The labels are iterated as a list of dicts above, so report sizes with
    # len() rather than a `.shape` attribute.
    print("Images in batch:", len(batch_images))
    print("Labels in batch:", len(batch_labels))
    break  # After printing the first batch, exit the loop

FileNotFoundError: [Errno 2] No such file or directory: '../.data/geoguessr_location_singleplayer_NVY1TBOqwZCElNOL_2.png'

In [None]:
# Preload images and labels into tensors
images = []
labels_data = []

for img_path, json_path in zip(input_images, labels):
    # Load and transform the image. The context manager closes the image file
    # explicitly once the RGB copy has been produced.
    with Image.open(img_path) as raw_image:
        images.append(transform(raw_image.convert('RGB')))

    # Load and process the JSON file for labels
    with open(json_path, 'r', encoding='utf-8') as file:
        file_data = json.load(file)
    # Default to [0, 0] if no coordinates.
    # NOTE(review): [0, 0] is indistinguishable from a real coordinate pair —
    # consider skipping such samples instead.
    label = file_data['coordinates'] if 'coordinates' in file_data else [0, 0]
    labels_data.append(torch.tensor(label, dtype=torch.float32))

# Create dataset from preloaded data
class CustomImageDataset(Dataset):
    """Dataset wrapper around preloaded images and their labels.

    Item ``idx`` is the tuple ``(images[idx], labels[idx])``; the reported
    length is the number of images.
    """

    def __init__(self, images, labels):
        self.images, self.labels = images, labels

    def __len__(self):
        image_count = len(self.images)
        return image_count

    def __getitem__(self, idx):
        sample = self.images[idx]
        target = self.labels[idx]
        return sample, target

# Wrap the preloaded tensors in a dataset and serve them in shuffled batches of 2
dataset = CustomImageDataset(images=images, labels=labels_data)
loader = DataLoader(dataset=dataset, batch_size=2, shuffle=True)

In [None]:
# Example usage of the DataLoader.
# The loop variables are batch-specific names so that the notebook-level
# `images` and `labels` are not replaced by the batch tensors.
for batch_images, batch_labels in loader:
    print(batch_images.shape, batch_labels)
    break  # Break after printing first batch for demonstration

In [None]:
class CustomImageDataset(Dataset):
    """Pairs preloaded images with their labels by index.

    NOTE(review): a class with this name is also defined in an earlier cell;
    running this cell rebinds the name to this definition.

    Args:
        images (list of torch.Tensor): List of images as tensors.
        labels (list): List of labels.
    """

    def __init__(self, images, labels):
        self.images, self.labels = images, labels

    def __len__(self):
        """Return the number of stored images."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for position ``idx``."""
        # Both entries are handed back exactly as stored; no conversion is done here
        return self.images[idx], self.labels[idx]

In [None]:
from torchvision import transforms
from PIL import Image
import glob
import numpy as np

# Preprocessing for each loaded image: resize to 100x100, convert to a tensor,
# and normalise all three channels with mean 0.5 / std 0.5.
transform = transforms.Compose(
    [
        transforms.Resize(size=(100, 100)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

# Load images.
# glob.glob gives no ordering guarantee; sorting makes the image order
# reproducible so that a label list can be aligned with it.
# NOTE(review): this path starts with '../../' while the file-listing cell uses
# '../1_data_collection/.data' — confirm which one is correct.
image_paths = sorted(glob.glob('../../1_data_collection/.data/*.png'))
images = []
for img_path in image_paths:
    # The context manager closes the image file explicitly after conversion
    with Image.open(img_path) as raw_image:
        images.append(transform(raw_image.convert('RGB')))

# Assume labels are loaded and processed into a corresponding list `labels`
# NOTE(review): `[...]` below is a real Python value — a one-element list
# holding Ellipsis — not an omitted list, so until it is replaced `labels`
# has length 1 regardless of how many images were loaded.
labels = [...]  # This should match the length and order of `images`

# Create dataset
# The dataset reports len(images) as its length and indexes `labels` with the
# same index, so the two lists need equal lengths for every item to be readable.
dataset = CustomImageDataset(images, labels)

In [None]:
import glob
import json

def load_coordinate_labels(file_paths):
    """Read the ``coordinates`` entry from each JSON file in ``file_paths``.

    Args:
        file_paths: Iterable of paths to JSON files.

    Returns:
        list: One entry per file that could be opened and parsed, in input
        order: the file's ``coordinates`` value, or ``None`` when that key is
        absent. Files that fail to open or parse are reported with ``print``
        and contribute no entry.
    """
    loaded = []
    for file_path in file_paths:
        try:
            # JSON text is read as UTF-8 rather than the platform default
            with open(file_path, 'r', encoding='utf-8') as file:
                file_data = json.load(file)
            if "coordinates" in file_data:
                loaded.append(file_data['coordinates'])
            else:
                # Keep a placeholder so the key's absence stays visible
                loaded.append(None)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON from {file_path}: {e}")
        except Exception as e:
            # Best effort: report the problem and carry on with the next file
            print(f"Error opening or reading {file_path}: {e}")
    return loaded


# Path where the JSON files are stored
json_path = '../../1_data_collection/.data/*.json'
# Sorted so that the "first N files" below are the same on every run;
# glob.glob itself gives no ordering guarantee.
files = sorted(glob.glob(json_path))
number_of_files = 20  # Limit the number of files to process

if not files:
    print("No files found. Check the directory path.")
    labels = []
else:
    labels = load_coordinate_labels(files[:number_of_files])

# Output how many files were successfully loaded with labels
print(f"Loaded labels from {len(labels)} files.")


### Splitting dataset into training, validation and testing sets

In [None]:
# create dataset
# A copy is stored so that shuffling below does not reorder `labels` itself.
data = {}
data["input"] = list(labels)

# Define split ratios
train_ratio = 0.7
val_ratio = 0.20
test_ratio = 0.10  # informational only: the test split takes whatever remains

# Shuffle data with a fixed seed so the splits are reproducible across runs
# (same seed value as the `shuffle_seed` used when listing the files).
split_seed = 43
random.Random(split_seed).shuffle(data["input"])

# Calculate split indices
total_images = len(data["input"])
train_end = int(train_ratio * total_images)
val_end = train_end + int(val_ratio * total_images)

# Split the data
train_data = data["input"][:train_end]
val_data = data["input"][train_end:val_end]
test_data = data["input"][val_end:]

print(f"Train set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")
print(f"Test set size: {len(test_data)}")


In [None]:
# Build the DatasetDict from the splits created in the previous cell.
# DatasetDict maps split names to `datasets.Dataset` objects, so each list is
# wrapped first. The Hugging Face class is imported under an alias because the
# bare name `Dataset` is rebound to torch.utils.data.Dataset earlier in this
# notebook.
from datasets import Dataset as HFDataset

# NOTE(review): the column is named "input" to mirror the `data["input"]` key;
# the values are the coordinate labels — rename if a clearer name is preferred.
dataset = DatasetDict({
    "train": HFDataset.from_dict({"input": train_data}),
    "validation": HFDataset.from_dict({"input": val_data}),
    "test": HFDataset.from_dict({"input": test_data}),
})

In [None]:
# Show the object currently bound to `dataset` as the cell's output
dataset

In [None]:
# Training-time preprocessing: random horizontal flip, then tensor conversion
# and per-channel normalisation with mean 0.5 / std 0.5.
transform_train = transforms.Compose(
    [
        transforms.RandomHorizontalFlip(),
        # transforms.RandomCrop(32, padding=4),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

# Evaluation-time preprocessing: the same conversion and normalisation, with
# no random augmentation.
transform_test = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

### Creating Dataloaders

In [None]:
# CIFAR-10 training split (downloaded into ./data if needed), shuffled batches of 128
train = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform_train,
)
trainloader = torch.utils.data.DataLoader(
    dataset=train,
    batch_size=128,
    shuffle=True,
    num_workers=2,
)

# CIFAR-10 test split, served in order with the evaluation transform
test = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transform_test,
)
testloader = torch.utils.data.DataLoader(
    dataset=test,
    batch_size=128,
    shuffle=False,
    num_workers=2,
)

## Model

In [None]:
# Load a pretrained 'resnet18' from resnet_pytorch, requesting num_classes=2.
# NOTE(review): the training cells below operate on `net`, not on `model` —
# confirm which network is meant to be trained.
model = ResNet.from_pretrained('resnet18', num_classes=2)

In [None]:
#print(model)

## Training

In [None]:
# Class names, one per line, in the order listed by the original cell
classes = [
    'plane',
    'car',
    'bird',
    'cat',
    'deer',
    'dog',
    'frog',
    'horse',
    'ship',
    'truck',
]

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU instead of
# failing on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# NOTE(review): `ResNet50` is not defined or imported in the visible cells —
# confirm where it comes from before running on a fresh kernel.
net = ResNet50(10).to(device)

# Cross-entropy loss, SGD with momentum and weight decay, and a scheduler that
# cuts the learning rate by 10x after 5 epochs without improvement in the
# metric passed to scheduler.step().
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.9, weight_decay=0.0001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=5)

In [None]:
EPOCHS = 200
LOG_EVERY = 100  # minibatch interval between running-loss reports

# Send each batch to whichever device the model's parameters live on, instead
# of assuming CUDA.
device = next(net.parameters()).device

for epoch in range(EPOCHS):
    net.train()  # make sure training-mode layers (dropout, batch norm) are active
    losses = []
    running_loss = 0.0
    batches_in_window = 0
    # `targets` (not `labels`) so the notebook-level `labels` is left untouched
    for i, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()

        outputs = net(inputs)
        loss = criterion(outputs, targets)

        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        losses.append(batch_loss)
        running_loss += batch_loss
        batches_in_window += 1

        if i % LOG_EVERY == 0 and i > 0:
            # Divide by the number of batches actually accumulated: the first
            # window spans batches 0..100, i.e. 101 batches, not 100.
            print(f'Loss [{epoch+1}, {i}](epoch, minibatch): ', running_loss / batches_in_window)
            running_loss = 0.0
            batches_in_window = 0

    # Step the scheduler on the epoch's mean training loss; skipped when the
    # loader produced no batches, which would otherwise divide by zero.
    if losses:
        avg_loss = sum(losses) / len(losses)
        scheduler.step(avg_loss)

print('Training Done')