In [1]:
# Important on gpuHub: uncomment the next line to install/upgrade the project requirements
# !pip install -r ../../requirements.txt --upgrade

In [2]:
import wandb
import sys
import os

import torch
from torchvision import transforms

# load .env file
from dotenv import load_dotenv
from geo_model_trainer import GeoModelTrainer
from image_data_handler import ImageDataHandler

# Put the parent directory first on the module search path so that the
# `data_loader` import directly below can be resolved from there.
sys.path.insert(0, "../")
from data_loader import get_data_to_load, hash_filenames

# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is the CUDA switch for synchronous kernel launches;
# presumably set here to get accurate stack traces while debugging — confirm it is still wanted.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [3]:
# Location of the optional .env file, relative to this notebook
env_path = "../../.env"

# A token that is already exported in the environment wins; the .env file is
# only consulted (and only if it exists) when no token is set yet.
WANDB_TOKEN = os.environ.get("WANDB_TOKEN")
if not WANDB_TOKEN:
    if os.path.exists(env_path):
        load_dotenv(env_path)
        WANDB_TOKEN = os.environ.get("WANDB_TOKEN")

In [4]:
# Report whether a CUDA device is visible and, if so, summarise device 0
if not torch.cuda.is_available():
    print("No GPU found.")
else:
    print("GPU is available.")
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")

    # One property lookup serves both the memory and the capability report
    device_properties = torch.cuda.get_device_properties(0)

    # Memory figures are converted from bytes to GB (1e9 bytes)
    total_memory = device_properties.total_memory / 1e9
    print(f"Total Memory: {total_memory:.2f} GB")

    allocated_memory = torch.cuda.memory_allocated(0) / 1e9
    print(f"Allocated Memory: {allocated_memory:.2f} GB")

    cached_memory = torch.cuda.memory_reserved(0) / 1e9
    print(f"Cached Memory: {cached_memory:.2f} GB")

    # Remaining device properties
    print(f"CUDA Capability: {device_properties.major}.{device_properties.minor}")
    print(f"Multi-Processor Count: {device_properties.multi_processor_count}")

No GPU found.


# Decide which GPU to run on and pick the best settings for it

In [3]:
# --- Fixed settings -------------------------------------------------------
entity = "nlp_ls"
original_image_size = [180, 320]

running_device = "colab_T4"
data_augmentation = "full_augmentation_v2"  # e.g. "base_augmentation" or "full_augmentation_v2"
predict_coordinates = True
predict_regions = False

# --- Defaults, used as-is when running_device matches no preset below ------
image_size = [80, 130]
NUMBER_OF_FILES = 0
BATCH_SIZE = 400
USE_MAPPED = True

# --- Device presets ---------------------------------------------------------
if running_device in ("colab_T4", "gpuHub", "gpuHub_augmentedv2"):
    # Unmapped images at the low image resolution; only the batch size differs per device
    USE_MAPPED = False
    BATCH_SIZE = {"colab_T4": 300, "gpuHub": 200, "gpuHub_augmentedv2": 100}[running_device]
elif running_device == "colab_A100":
    # Mapped images at the original (high) image resolution
    image_size, BATCH_SIZE, NUMBER_OF_FILES = original_image_size, 200, 79000

## Loading data

In [5]:
# Choose the cleaned file list that matches the mapped / non-mapped setting
if USE_MAPPED:
    loading_file = "../3_data_preparation/04_data_cleaning/updated_data_list_more"
else:
    loading_file = "../3_data_preparation/04_data_cleaning/updated_data_list_non_mapped"

# Resolve the list of data files; the load and additional-save callbacks are
# requested as well (return_load_and_additional_save_callback=True).
list_files, load_callback, additional_save_callback = get_data_to_load(
    loading_file=loading_file,
    file_location="../3_data_preparation/01_enriching/.data",
    image_file_location="../1_data_collection/.data",
    allow_new_file_creation=False,
    from_remote_only=True,
    download_link="default",
    limit=NUMBER_OF_FILES,
    shuffle_seed=42,
    allow_file_location_env=True,
    allow_json_file_location_env=True,
    allow_image_file_location_env=True,
    allow_download_link_env=True,
    return_load_and_additional_save_callback=True,
)

All local files: 705681
Relevant files: 705681


In [None]:
import shutil
import subprocess

# WARNING: destructive one-off export cell. It deletes and renames files inside the
# dataset directory, zips what is left, and copies the archive to Google Drive.

# Dataset directory: the directory of the first listed file with its last four characters removed.
# NOTE(review): the [:-4] slice presumably strips a fixed suffix of the directory name — confirm.
first_file = list_files[0]
first_file_dir = os.path.dirname(first_file)[:-4]

# Regular files currently present in the dataset directory (names and full paths)
basenames = [f for f in os.listdir(first_file_dir) if os.path.isfile(os.path.join(first_file_dir, f))]
files_in_dir = [os.path.join(first_file_dir, f) for f in basenames]

# Names present on disk but absent from list_files
basenames_list_set = {os.path.basename(f) for f in list_files}
basenames_set = set(basenames)
missing_files = basenames_set - basenames_list_set

# Delete only the unlisted files that carry the "geoguessr" prefix.
# The path is rebuilt directly instead of searching a list, which keeps this loop linear.
for missing_file in missing_files:
    if missing_file.startswith("geoguessr"):
        os.remove(os.path.join(first_file_dir, missing_file))

# Rename the listed files to location_<6-digit index>.<ext>, json files first, then images.
# NOTE(review): json and png files are numbered independently, so a json/png pair only keeps
# the same index if both sequences appear in the same order in list_files — confirm.
json_files = [file for file in list_files if file.endswith(".json")]
image_files = [file for file in list_files if file.endswith(".png")]
for extension, files in ((".json", json_files), (".png", image_files)):
    for index, file in enumerate(files):
        os.replace(file, os.path.join(first_file_dir, f"location_{index:06d}{extension}"))

# Remove everything in the directory that is neither a .json nor a .png
for entry in os.listdir(first_file_dir):
    entry_path = os.path.join(first_file_dir, entry)
    if entry_path.endswith((".png", ".json")):
        continue
    if os.path.isfile(entry_path):
        os.remove(entry_path)
    elif os.path.isdir(entry_path):
        shutil.rmtree(entry_path)

# Create data.zip inside the dataset directory. Arguments are passed as a list (no shell),
# so paths with spaces or shell metacharacters are safe, and a failing zip raises.
subprocess.run(["zip", "-qr", "data.zip", "."], cwd=first_file_dir, check=True)

# Copy the archive to Google Drive
shutil.copy(os.path.join(first_file_dir, "data.zip"), "/content/drive/MyDrive/")

In [6]:
# Replace the configured file limit with half the number of listed files.
# NOTE(review): presumably every sample contributes two entries to list_files
# (one .json and one .png), making this the sample count — confirm.
NUMBER_OF_FILES = len(list_files) // 2
print(NUMBER_OF_FILES)

81505


## Processing and loading data

In [7]:
# Prediction target: regions take precedence over coordinates, countries are the fallback
if predict_regions:
    prediction_type = "regions"
elif predict_coordinates:
    prediction_type = "coordinates"
else:
    prediction_type = "countries"

# Dataset split
train_ratio, val_ratio, test_ratio = 0.7, 0.2, 0.1

# Settings handed to the data handler together with the transforms
preprocessing_config = {
    "data_augmentation": data_augmentation,
    "height": image_size[0],
    "width": image_size[1],
    "train_ratio": train_ratio,
    "val_ratio": val_ratio,
    "test_ratio": test_ratio,
}

# Always applied: resize to the target size, convert to tensor, normalise each channel with mean 0.5 / std 0.5
base_transform = transforms.Compose(
    [
        transforms.Resize((image_size[0], image_size[1])),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]
)

# Optional augmentation, applied before base_transform; stays None unless the v2 augmentation is selected
augmented_transform = None
if data_augmentation == "full_augmentation_v2":
    augmentation_steps = [
        # RandomPerspective(distortion_scale=0.75, p=0.5) is deliberately left out in v2:
        # its black bars really hurt the performance at this image size.
        # Random crop covering 75-100% of the image, resized back to the original size
        transforms.RandomResizedCrop((original_image_size[0], original_image_size[1]), scale=(0.75, 1.0)),
        # Random rotation by up to 10 degrees (sadly also causes black borders)
        transforms.RandomRotation(10),
        # Random brightness (lower limit to simulate night, upper limit for bright daylight),
        # contrast, saturation and hue
        transforms.ColorJitter(brightness=(0.5, 1.5), contrast=(0.5, 1.5), saturation=(0.5, 1.5), hue=(-0.1, 0.1)),
    ]
    augmented_transform = transforms.Compose(augmentation_steps)

In [8]:
# Create the dataloaders through ImageDataHandler

# Hash of the file list, used as a unique identifier for this dataset
hashed_filenames = hash_filenames(list_files)

cache = True
# When True, .pth and .zip files are moved instead of copied: saves space and
# should be fine, but could cause issues.
move_files = False

# Detect whether this code runs in a notebook: get_ipython only exists there
try:
    get_ipython()
except NameError:
    running_in_notebook = False
    print("Running in a script.")
else:
    running_in_notebook = True
    print("Running in a notebook.")

data_handler = ImageDataHandler(
    list_files,
    augmented_transform,
    base_transform,
    preprocessing_config,
    prediction_type,
    batch_size=BATCH_SIZE,
    train_ratio=train_ratio,
    val_ratio=val_ratio,
    test_ratio=test_ratio,
    cache=cache,
    cache_load_callback=load_callback,
    cache_additional_save_callback=additional_save_callback,
    save_test_data=True,
    # Only inspect transformed images interactively, and only for the v2 augmentation
    inspect_transformed=running_in_notebook and (data_augmentation == "full_augmentation_v2"),
    move_files=move_files,
    get_cache=True,
)

# Dataloaders for the three splits
train_dataloader = data_handler.train_loader
val_dataloader = data_handler.val_loader
test_dataloader = data_handler.test_loader

# Label mappings exposed by the handler
country_to_index = data_handler.country_to_index
region_to_index = data_handler.region_to_index
region_index_to_middle_point = data_handler.region_index_to_middle_point
region_index_to_country_index = data_handler.region_index_to_country_index

# Path of the test data, in case it should be pushed to wandb
test_data_path = data_handler.test_data_path
# Id of a previous run if the test data was already pushed to wandb (to save space)
run_link = data_handler.run_link
# Path of the run link file to create if it does not exist yet (if test data should be pushed)
run_link_path = data_handler.run_link_path

# Summary of the loaded dataset
print("Dataset size:", NUMBER_OF_FILES)
print("Dataset identifier:", hashed_filenames)
print(f"Count of different countries: {len(country_to_index)}")
print(f"Count of different regions: {len(region_to_index)}")

Running in a notebook.
Using cached data from: data_81505_data_augmentation=base_augmentationheight=1test_ratio=0.1train_ratio=0.7val_ratio=0.2width=1&63289b51067a4c6ede4c44c23a329d82ab4964ed43942794430a9b71ec685b5c.pth
Data loaded.
Creating new run link at run_81505_data_augmentation=base_augmentationheight=1test_ratio=0.1train_ratio=0.7val_ratio=0.2width=1&63289b51067a4c6ede4c44c23a329d82ab4964ed43942794430a9b71ec685b5c.wandb
Saving test data to test_data.pth
Test data saved.
Dataset size: 81505
Dataset identifier: 63289b51067a4c6ede4c44c23a329d82ab4964ed43942794430a9b71ec685b5c
Count of different countries: 75
Count of different regions: 4596


In [9]:
# len(train_dataloader.dataset) counts individual samples, not batches, so it is labelled as such
print("Number of train samples:", len(train_dataloader.dataset))

# Print the first batch as an example, to see the structure; each batch unpacks
# into images, coordinates, country indices and region indices
for images, coordinates, country_indices, region_indices in train_dataloader:
    print("Images batch shape:", images.shape)
    print("Coordinates batch shape:", coordinates.shape)
    print(coordinates[0])
    print("Country indices:", country_indices.shape)
    print(country_indices[0])
    print("Region indices:", region_indices.shape)
    print(region_indices[0])
    # Only the first batch is needed
    break

Number of train batches: 57053 
Images batch shape: torch.Size([200, 3, 1, 1])
Coordinates batch shape: torch.Size([200, 2])
tensor([12.9247, 77.8241])
Country indices: torch.Size([200])
tensor(28)
Region handler: torch.Size([200])
tensor(1476)


## Training

In [10]:
model_types = ["resnet50", "efficientnet_b3"]  # e.g. "efficientnet_b1", "resnet50", "mobilenet_v2"

# Log in with the explicit token when one was found, otherwise fall back to wandb's default login
if WANDB_TOKEN:
    wandb.login(key=WANDB_TOKEN)
else:
    wandb.login()

# The prediction target does not depend on the model, so project, output size and
# sweep metric are resolved once instead of once per model.
# NOTE(review): coordinates take priority here when both flags are set, whereas
# prediction_type (preprocessing cell) prefers regions — enable only one of the flags.
if predict_coordinates:
    project_name = "predicting-coordinates"
    # NOTE(review): 3 outputs for coordinates — presumably a 3D encoding of lat/lon; confirm in GeoModelTrainer
    num_classes = 3
    sweep_goal = "minimize"
    sweep_metric_name = "Validation Distance (km)"
elif predict_regions:
    project_name = "predicting-region"
    num_classes = len(region_to_index)
    sweep_goal = "minimize"
    sweep_metric_name = "Validation Distance (km)"
else:
    project_name = "predicting-country"
    num_classes = len(country_to_index)
    sweep_goal = "maximize"
    sweep_metric_name = "Validation Accuracy Top 1"


def set_run_link(config, run):
    """Run-start callback passed to GeoModelTrainer as ``run_start_callback``.

    While ``run_link_path`` is set, the id of ``run`` is stored in ``run_link`` and
    written to that file, ``run_link_path`` is cleared so the file is written only once,
    and ``additional_save_callback`` is called if it is not None. On later calls, if
    ``run_link`` is set, it is logged to wandb as ``test_data_run_id``.

    Args:
        config: Unused here; part of the callback signature.
        run: Object whose ``id`` attribute is recorded as the run link.
    """
    global run_link
    global run_link_path
    if run_link_path is not None:
        run_link = run.id
        with open(run_link_path, "w") as f:
            f.write(run_link)
        # Only write once
        run_link_path = None
        if additional_save_callback is not None:
            additional_save_callback()
    elif run_link is not None:
        wandb.log({"test_data_run_id": run_link})


# One grid sweep per model type
for model_type in model_types:
    sweep_config = {
        "name": f"dspro2-basemodel-{model_type}-datasize-{NUMBER_OF_FILES}-input_imagesize-{image_size[0]}x{image_size[1]}",
        "method": "grid",
        "metric": {"goal": sweep_goal, "name": sweep_metric_name},
        "parameters": {
            "learning_rate": {"values": [1e-3]},
            "optimizer": {"values": ["adamW"]},
            # The only hyperparameter with more than one value, i.e. the actual grid
            "weight_decay": {"values": [0, 1e-1, 1e-2, 1e-3]},
            "epochs": {"values": [20]},
            "dataset_size": {"values": [NUMBER_OF_FILES]},
            "dataset_identifier": {"values": [hashed_filenames]},
            "seed": {"values": [42]},
            "model_name": {"values": [model_type]},
            "input_image_size": {"values": [image_size]},
            "predict_coordinates": {"values": [predict_coordinates]},
            "mapped_data": {"values": [USE_MAPPED]},
            "different_countries": {"values": [len(country_to_index) if country_to_index is not None else 0]},
            "different_regions": {"values": [len(region_to_index) if region_to_index is not None else 0]},
            "data_augmentation": {"values": [data_augmentation]},
            "predict_regions": {"values": [predict_regions]},
            "batch_size": {"values": [BATCH_SIZE]},
        },
    }

    sweep_id = wandb.sweep(sweep=sweep_config, project=f"dspro2-{project_name}", entity=entity)

    # A fresh trainer per sweep; predict_regions is passed as None when coordinates are predicted
    trainer = GeoModelTrainer(
        datasize=NUMBER_OF_FILES,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        num_classes=num_classes,
        predict_coordinates=predict_coordinates,
        country_to_index=country_to_index,
        region_to_index=region_to_index,
        region_index_to_middle_point=region_index_to_middle_point,
        region_index_to_country_index=region_index_to_country_index,
        predict_regions=predict_regions if not predict_coordinates else None,
        test_data_path=test_data_path,
        run_start_callback=set_run_link,
    )

    wandb.agent(sweep_id, function=trainer.train)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkillusions[0m. Use [1m`wandb login --relogin`[0m to force relogin


Create sweep with ID: azze6syd
Sweep URL: https://wandb.ai/nlp_ls/dspro2-predicting-temp/sweeps/azze6syd


[34m[1mwandb[0m: Agent Starting Run: ifp48yu5 with config:
[34m[1mwandb[0m: 	data_augmentation: base_augmentation
[34m[1mwandb[0m: 	dataset_identifier: 63289b51067a4c6ede4c44c23a329d82ab4964ed43942794430a9b71ec685b5c
[34m[1mwandb[0m: 	dataset_size: 81505
[34m[1mwandb[0m: 	different_countries: 75
[34m[1mwandb[0m: 	epochs: 50
[34m[1mwandb[0m: 	input_image_size: [1, 1]
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	mapped_data: True
[34m[1mwandb[0m: 	model_name: efficientnet_b1
[34m[1mwandb[0m: 	optimizer: adamW
[34m[1mwandb[0m: 	predict_coordinates: False
[34m[1mwandb[0m: 	seed: 42
[34m[1mwandb[0m: 	weight_decay: 0.1
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


Create sweep with ID: 8r6jm1p1
Sweep URL: https://wandb.ai/nlp_ls/dspro2-predicting-temp/sweeps/8r6jm1p1
VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>


[34m[1mwandb[0m: Agent Starting Run: uvae6ird with config:
[34m[1mwandb[0m: 	data_augmentation: base_augmentation
[34m[1mwandb[0m: 	dataset_identifier: 63289b51067a4c6ede4c44c23a329d82ab4964ed43942794430a9b71ec685b5c
[34m[1mwandb[0m: 	dataset_size: 81505
[34m[1mwandb[0m: 	different_countries: 75
[34m[1mwandb[0m: 	epochs: 50
[34m[1mwandb[0m: 	input_image_size: [1, 1]
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	mapped_data: True
[34m[1mwandb[0m: 	model_name: mobilenet_v2
[34m[1mwandb[0m: 	optimizer: adamW
[34m[1mwandb[0m: 	predict_coordinates: False
[34m[1mwandb[0m: 	seed: 42
[34m[1mwandb[0m: 	weight_decay: 0.1
