In [71]:
import torch
import torchvision
import torchvision.transforms as transforms
from datasets import load_dataset, load_metric, Dataset, DatasetDict
from torch.utils.data import DataLoader
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
import torch.optim as optim
import json
import glob
import os
from PIL import Image
import random

import bottleneck
from resnet_pytorch import ResNet

## Pre-processing data

### Example of processing labels

In [72]:
# loading labels
files = glob.glob('../1_data_collection/.data/*.json')
number_of_files = 20

labels = []  # Initialize a list to store data from all files

if not files:
    print("No files found. Check the directory path.")
else:
    for file_path in files[:number_of_files]:
        try:
            with open(file_path, 'r') as file:
                # Load the content of the current file
                file_data = json.load(file)
                if "coordinates" in file_data:
                    labels.append(file_data)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON from {file_path}: {e}")
        except Exception as e:
            print(f"Error opening or reading {file_path}: {e}")

# Output how many files were successfully loaded
print(f"Loaded data from {len(labels)} files.")


Loaded data from 19 files.


In [73]:
labels

[{'coordinates': [25.727112, 30.561267], 'duration': 10.427},
 {'coordinates': [35.596395, 64.120085], 'duration': 9.881},
 {'coordinates': [48.02805, 32.543271], 'duration': 10.303},
 {'coordinates': [44.324729, 2.441296], 'duration': 10.366},
 {'coordinates': [9.940982, 33.602745], 'duration': 10.401},
 {'coordinates': [47.887935, -27.093841], 'duration': 10.309},
 {'coordinates': [34.571072, 31.374634], 'duration': 9.925},
 {'coordinates': [-12.265574, 27.792694], 'duration': 9.905},
 {'coordinates': [30.621082, 22.407837], 'duration': 9.81},
 {'coordinates': [28.444196, 30.43578], 'duration': 10.419},
 {'coordinates': [27.150664, -19.849945], 'duration': 10.317},
 {'coordinates': [8.535778, -59.270355], 'duration': 10.068},
 {'coordinates': [33.491815, 20.167575], 'duration': 10.122},
 {'coordinates': [20.132692, -72.791385], 'duration': 10.217},
 {'coordinates': [17.895419, -11.1704], 'duration': 9.982},
 {'coordinates': [28.949394, 18.223732], 'duration': 10.443},
 {'coordinates'

In [74]:
# Just load the first image, avoid Multiplayer
labels[0]["coordinates"]

[25.727112, 30.561267]

### Example of processing input images

In [75]:
# loading images
files = glob.glob('../1_data_collection/.data/*.png')

if not files:
    print("No files found. Check the directory path.")
else:
    images = [] 
    for file_path in files[:number_of_files]:
        try:
            # channels, height, width is the pytorch convention
            with Image.open(file_path) as img:
                img_array = np.array(img)
                img_array = img_array.transpose((2, 0, 1))
                #images.append(img_array)
                images.append(file_path)
        except IOError as e:
            print(f"Error opening or reading {file_path}: {e}")

# Output how many files were successfully processed
print(f"Processed {len(images)} image files.")
#display(Image.open(images[0]))

Processed 20 image files.


In [76]:
# Inside the loop, convert the image to a numpy array and store it
with Image.open(file_path) as img:
    width, height = img.size
    img_array = np.array(img)
    print(f"Image size (height, width): {height}x{width}")
    print(f"Image array shape: {img_array.shape}")
    print(f"Image mode: {img.mode}")
    print(f"Image format: {img.format}")

Image size (height, width): 180x320
Image array shape: (180, 320, 4)
Image mode: RGBA
Image format: PNG


In [77]:
#images[0].shape
images[0]

'../1_data_collection/.data/geoguessr_location_singleplayer_rm8goISVT6SLjGR9_4_resized.png'

### Class of creating dataset

In [78]:
class CustomImageDataset(Dataset):
    def __init__(self, image_paths, labels, transform=None):
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        label = self.labels[idx]
        image = Image.open(img_path).convert('RGB')
        
        if self.transform:
            image = self.transform(image)

        return image, label

# Define transformations
transform = transforms.Compose([
    transforms.Resize((100, 100)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

In [79]:
# split before to train, validation, and test
dataset = CustomImageDataset(images, labels, transform=transform)
loader = DataLoader(dataset, batch_size=2, shuffle=True)

In [80]:
loader

<torch.utils.data.dataloader.DataLoader at 0x136dfe9f0>

In [81]:
loader.dataset[0][0].shape

torch.Size([3, 100, 100])

## new code

In [82]:
import json
import sys
import os
sys.path.insert(0, '../')
from data_loader import get_data_to_load


list = get_data_to_load(loading_file='./data_list', file_location='../1_data_collection/.data/', image_file_location='../1_data_collection/.data/', allow_new_file_creation=True, from_remote_only=True, download_link='env', limit=100, shuffle_seed=43)

# List of data to load
print(len(list))

Getting files list from remote
Got files list from remote
Parsed files list from remote
All remote files: 274796
Filtering out unpaired files
Filtered out 17666 unpaired files
Relevant files: 257130
Limited files: 200
200


In [83]:
list[0]

'../1_data_collection/.data/geoguessr_result_singleplayer_vDB6RsXTYiw5LgQu_1.json'

In [84]:
with open(list[0], 'r') as file:
      file_data = json.load(file)
      label = file_data['coordinates'] if 'coordinates' in file_data else None

FileNotFoundError: [Errno 2] No such file or directory: '../1_data_collection/.data/geoguessr_result_singleplayer_vDB6RsXTYiw5LgQu_1.json'

In [60]:
import torch
import json
from torchvision import transforms
from torch.utils.data import Dataset
from PIL import Image
import glob
import numpy as np

class CustomImageDataset(Dataset):
    def __init__(self, image_paths, json_paths, transform=None):
        self.image_paths = image_paths
        self.json_paths = json_paths
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        # Load image
        img_path = self.image_paths[idx]
        image = Image.open(img_path).convert('RGB')
        if self.transform:
            image = self.transform(image)

        # Load labels from JSON file
        json_path = self.json_paths[idx]
        with open(json_path, 'r') as file:
            file_data = json.load(file)
            label = file_data['coordinates'] if 'coordinates' in file_data else None

        return image, label

# Define transformations
transform = transforms.Compose([
    transforms.Resize((100, 100)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

In [61]:
input_images = [item for item in list if item.endswith('.png')]
labels = [item for item in list if item.endswith('.json')]

assert len(input_images) == len(labels), "Mismatch in number of images and labels"

In [62]:
labels[0]

'../1_data_collection/.data/geoguessr_result_singleplayer_vDB6RsXTYiw5LgQu_1.json'

In [65]:
dataset = CustomImageDataset(input_images, labels, transform=transform)
loader = DataLoader(dataset, batch_size=2, shuffle=True)

In [66]:
for images, labels in loader:
    print("Images batch shape:", images.shape)
    print("Labels batch shape:", labels.shape)
    break  # After printing the first batch, exit the loop

FileNotFoundError: [Errno 2] No such file or directory: '../1_data_collection/.data/geoguessr_location_singleplayer_oftZ2slpVUv3ipAr_0.png'

In [38]:
# Preload images and labels into tensors
images = []
labels_data = []

for img_path, json_path in zip(input_images, labels):
    # Load and transform the image
    image = Image.open(img_path).convert('RGB')
    image = transform(image)
    images.append(image)

    # Load and process the JSON file for labels
    with open(json_path, 'r') as file:
        file_data = json.load(file)
        label = file_data['coordinates'] if 'coordinates' in file_data else [0, 0]  # Default to [0,0] if no coordinates
        labels_data.append(torch.tensor(label, dtype=torch.float32))

# Create dataset from preloaded data
class CustomImageDataset(Dataset):
    def __init__(self, images, labels):
        self.images = images
        self.labels = labels

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        return self.images[idx], self.labels[idx]

# Initialize dataset and DataLoader
dataset = CustomImageDataset(images, labels_data)
loader = DataLoader(dataset, batch_size=2, shuffle=True)

FileNotFoundError: [Errno 2] No such file or directory: '../1_data_collection/.data/geoguessr_location_singleplayer_3jd33hbXzAHBrYDj_4.png'

In [None]:
# Example usage of the DataLoader
for images, labels in loader:
    print(images.shape, labels)
    break  # Break after printing first batch for demonstration

In [135]:
class CustomImageDataset(Dataset):
    def __init__(self, images, labels):
        """
        Args:
            images (list of torch.Tensor): List of images as tensors.
            labels (list): List of labels.
        """
        self.images = images
        self.labels = labels

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        # Retrieve image and label already in the correct format
        image = self.images[idx]
        label = self.labels[idx]

        return image, label

In [137]:
from torchvision import transforms
from PIL import Image
import glob
import numpy as np

# Define transformations
transform = transforms.Compose([
    transforms.Resize((100, 100)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# Load images
image_paths = glob.glob('../../1_data_collection/.data/*.png')
images = []
for img_path in image_paths:
    image = Image.open(img_path).convert('RGB')
    image = transform(image)
    images.append(image)

# Assume labels are loaded and processed into a corresponding list `labels`
labels = [...]  # This should match the length and order of `images`

# Create dataset
dataset = CustomImageDataset(images, labels)

KeyboardInterrupt: 

In [138]:
import glob
import json

# Path where the JSON files are stored
json_path = '../../1_data_collection/.data/*.json'
files = glob.glob(json_path)
number_of_files = 20  # Limit the number of files to process

labels = []  # Initialize a list to store labels from the files

if not files:
    print("No files found. Check the directory path.")
else:
    for file_path in files[:number_of_files]:
        try:
            with open(file_path, 'r') as file:
                # Load the content of the current file
                file_data = json.load(file)
                # Check if 'coordinates' key exists
                if "coordinates" in file_data:
                    # Append the coordinates or the whole data, depending on need
                    labels.append(file_data['coordinates'])
                else:
                    # Handle cases where 'coordinates' might not exist
                    labels.append(None)  # Or some default value, e.g., {}
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON from {file_path}: {e}")
        except Exception as e:
            print(f"Error opening or reading {file_path}: {e}")

# Output how many files were successfully loaded with labels
print(f"Loaded labels from {len(labels)} files.")


Loaded labels from 20 files.


### Splitting dataset into training, validation and testing sets

In [43]:
# create dataset
data = {}
data["input"] = labels

# Define split ratios
train_ratio = 0.7
val_ratio = 0.20
test_ratio = 0.10

# Shuffle data randomly
random.shuffle(data["input"])

# Calculate split indices
total_images = len(data["input"])
train_end = int(train_ratio * total_images)
val_end = train_end + int(val_ratio * total_images)

# Split the data
train_data = data["input"][:train_end]
val_data = data["input"][train_end:val_end]
test_data = data["input"][val_end:]

print(f"Train set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")
print(f"Test set size: {len(test_data)}")


Train set size: 14
Validation set size: 4
Test set size: 2


In [44]:
#Loading dataet and crearting, split before!!!
dataset = DatasetDict({
    "train": train_data,
    "validation": val_data,
    "test": test_data
})

In [45]:
dataset

DatasetDict({
    train: [{'coordinates': [9.940982, 33.602745], 'duration': 10.401}, {'coordinates': [10.23696, -28.960029], 'duration': 10.434}, {'coordinates': [17.895419, -11.1704], 'duration': 9.982}, {'coordinates': [28.444196, 30.43578], 'duration': 10.419}, {'coordinates': [30.621082, 22.407837], 'duration': 9.81}, {'coordinates': [25.727112, 30.561267], 'duration': 10.427}, {'coordinates': [47.887935, -27.093841], 'duration': 10.309}, {'coordinates': [48.02805, 32.543271], 'duration': 10.303}, {'country': 'Tunisia', 'guesses': {'1': {'incorrect': []}, '2': {'incorrect': []}, '3': {'incorrect': []}, '4': {'incorrect': []}, '5': {'incorrect': []}, '6': {'incorrect': []}, '7': {'incorrect': []}, '8': {'incorrect': []}, '9': {'incorrect': []}, '10': {'incorrect': []}, '11': {'incorrect': []}, '12': {'incorrect': ['Türkiye', 'United States']}, '13': {'incorrect': ['Türkiye', 'United States']}, '14': {'incorrect': ['Türkiye', 'United States']}, '15': {'incorrect': ['Türkiye', 'Unite

In [None]:
transform_train = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    #transforms.RandomCrop(32, padding=4),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

### Creating Dataloaders

In [None]:
train = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)

trainloader = torch.utils.data.DataLoader(train, batch_size=128, shuffle=True, num_workers=2)

test = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)

testloader = torch.utils.data.DataLoader(test, batch_size=128,shuffle=False, num_workers=2)

## Model

In [None]:
model = ResNet.from_pretrained('resnet18', num_classes=2)

Loaded pretrained weights for resnet18.


In [None]:
#print(model)

## Training

In [None]:
classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']

In [None]:
net = ResNet50(10).to('cuda')

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.9, weight_decay=0.0001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor = 0.1, patience=5)

In [None]:
EPOCHS = 200
for epoch in range(EPOCHS):
    losses = []
    running_loss = 0
    for i, inp in enumerate(trainloader):
        inputs, labels = inp
        inputs, labels = inputs.to('cuda'), labels.to('cuda')
        optimizer.zero_grad()
    
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        
        if i%100 == 0 and i > 0:
            print(f'Loss [{epoch+1}, {i}](epoch, minibatch): ', running_loss / 100)
            running_loss = 0.0

    avg_loss = sum(losses)/len(losses)
    scheduler.step(avg_loss)
            
print('Training Done')