In [1]:
# important for gpuhub
# !pip install -r ../../requirements.txt --upgrade

## Importing Libraries and tokens

In [236]:
import wandb
import sys
import os

import torch
from torchvision import transforms

# load .env file
from dotenv import load_dotenv
from wandb_best_runs_downloader import WandbDownloader
#from geo_model_trainer import GeoModelTrainer
#from image_data_handler import TestImageDataHandler

#torch.backends.cudnn.benchmark = False
#torch.backends.cudnn.deterministic = True

sys.path.insert(0, '../')
from data_loader import get_data_to_load, hash_filenames

os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [237]:
# Location of the repo-level .env file, used only as a fallback source for the token
env_path = '../../.env'

# Prefer a token that is already present in the process environment
WANDB_TOKEN = os.environ.get('WANDB_TOKEN')
if not WANDB_TOKEN:
    # Nothing usable in the environment: load the .env file (if it exists) and look again
    if os.path.exists(env_path):
        load_dotenv(env_path)
        WANDB_TOKEN = os.environ.get('WANDB_TOKEN')

In [238]:
# Report whether PyTorch can see a CUDA device and, if so, its memory and hardware details
if not torch.cuda.is_available():
    print("No GPU found.")
else:
    print("GPU is available.")

    # Name of device 0
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")

    # Query the device properties once; reused below for memory and hardware details
    device_properties = torch.cuda.get_device_properties(0)

    # Memory figures are converted from bytes to GB (1e9 bytes)
    total_memory = device_properties.total_memory / 1e9
    print(f"Total Memory: {total_memory:.2f} GB")

    allocated_memory = torch.cuda.memory_allocated(0) / 1e9
    print(f"Allocated Memory: {allocated_memory:.2f} GB")

    # Reported under the label "Cached"; the value comes from torch.cuda.memory_reserved
    cached_memory = torch.cuda.memory_reserved(0) / 1e9
    print(f"Cached Memory: {cached_memory:.2f} GB")

    # Remaining hardware properties
    print(f"CUDA Capability: {device_properties.major}.{device_properties.minor}")
    print(f"Multi-Processor Count: {device_properties.multi_processor_count}")

No GPU found.


## Loading files from wandb

In [239]:
# Pass the token explicitly when one was found; otherwise call wandb.login() without arguments.
# The call stays the last expression so the cell still displays its return value.
login_kwargs = {'key': WANDB_TOKEN} if WANDB_TOKEN else {}
wandb.login(**login_kwargs)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/lukasstoeckli/.netrc


True

In [240]:
import pprint

# Settings used to select the best run(s) on Weights & Biases
entity = "nlp_ls"
project = "dspro2-predicting-country"
metric_name = "Validation Accuracy Top 1"
data_augmentation = "full_augmentation_v2"  # Replace with the desired augmentation
datasize = 81505  # Replace with the desired datasize
file_names_to_download = [".pth", ".json"]
image_size = [80, 130]

# Use the class name that the imports cell actually brings into scope
# (`from wandb_best_runs_downloader import WandbDownloader`); the previous
# `WandbBestRunsDownloader` is never imported and raises NameError on a fresh kernel.
downloader = WandbDownloader(entity, project, data_augmentation, datasize, image_size)
run_data = downloader.get_and_collect_best_runs(metric_name, file_names_to_download)

# run_data is indexed below as run_data["Best Run 1"]["metrics"] / ["files"]
pprint.pprint(run_data)

{'Best Run 1': {'files': {'artifact/858477759/wandb_manifest.json': 'https://api.wandb.ai/files/nlp_ls/dspro2-predicting-country/10adb1sx/artifact/858477759/wandb_manifest.json',
                          'best_model': 'https://api.wandb.ai/files/nlp_ls/dspro2-predicting-country/10adb1sx/best_model_checkpointmodel_efficientnet_b3_lr_0.01_opt_adamW_weightDecay_0.1_imgSize_[80, '
                                        '130]_predict_coordinates_False.pth',
                          'country_to_index.json': 'https://api.wandb.ai/files/nlp_ls/dspro2-predicting-country/10adb1sx/country_to_index.json',
                          'files/country_to_index.json': 'https://api.wandb.ai/files/nlp_ls/dspro2-predicting-country/10adb1sx/files/country_to_index.json',
                          'test_data': 'https://api.wandb.ai/files/nlp_ls/dspro2-predicting-country/yns8ucfa/test_data.pth',
                          'wandb-metadata.json': 'https://api.wandb.ai/files/nlp_ls/dspro2-predicting-country/10ad

In [251]:
# Print the top-1 / top-3 / top-5 validation accuracy recorded for the best run
best_run_metrics = run_data["Best Run 1"]["metrics"]
for top_k in (1, 3, 5):
    metric_key = f"Validation Accuracy Top {top_k}"
    print(f"Validation Accuracy Top {top_k}: ", best_run_metrics[metric_key])


Validation Accuracy Top 1:  0.38654070302435434
Validation Accuracy Top 3:  0.6087970063186308
Validation Accuracy Top 5:  0.7050487700141096


In [245]:
# Display the files collected for the best run (file name -> wandb download URL)
run_data["Best Run 1"]["files"]

{'artifact/858477759/wandb_manifest.json': 'https://api.wandb.ai/files/nlp_ls/dspro2-predicting-country/10adb1sx/artifact/858477759/wandb_manifest.json',
 'best_model': 'https://api.wandb.ai/files/nlp_ls/dspro2-predicting-country/10adb1sx/best_model_checkpointmodel_efficientnet_b3_lr_0.01_opt_adamW_weightDecay_0.1_imgSize_[80, 130]_predict_coordinates_False.pth',
 'country_to_index.json': 'https://api.wandb.ai/files/nlp_ls/dspro2-predicting-country/10adb1sx/country_to_index.json',
 'files/country_to_index.json': 'https://api.wandb.ai/files/nlp_ls/dspro2-predicting-country/10adb1sx/files/country_to_index.json',
 'wandb-metadata.json': 'https://api.wandb.ai/files/nlp_ls/dspro2-predicting-country/10adb1sx/wandb-metadata.json',
 'wandb-summary.json': 'https://api.wandb.ai/files/nlp_ls/dspro2-predicting-country/10adb1sx/wandb-summary.json',
 'test_data': 'https://api.wandb.ai/files/nlp_ls/dspro2-predicting-country/yns8ucfa/test_data.pth'}

## Loading data

In [5]:
# Number of files to load (passed to get_data_to_load as `limit`)
NUMBER_OF_FILES = 79000 # 100000
# Set to False to use non-mapped data (singleplayer distribution), has more data
USE_MAPPED = True

# Pick the data list that matches the mapped / non-mapped choice
if USE_MAPPED:
    loading_file = '../3_data_preparation/04_data_cleaning/updated_data_list_more'
else:
    loading_file = '../3_data_preparation/04_data_cleaning/updated_data_list_non_mapped'

# Get the list of data files together with the zip-load and additional-save callbacks
list_files, zip_load_callback, additional_save_callback = get_data_to_load(
    loading_file=loading_file,
    file_location='../3_data_preparation/01_enriching/.data',
    image_file_location='../1_data_collection/.data',
    allow_new_file_creation=False,
    from_remote_only=True,
    download_link='default',
    limit=NUMBER_OF_FILES,
    shuffle_seed=42,
    allow_file_location_env=True,
    allow_json_file_location_env=True,
    allow_image_file_location_env=True,
    allow_download_link_env=True,
    return_zip_load_and_additional_save_callback=True,
)

Getting files list from remote
Got files list from remote
Parsed files list from remote
All remote files: 705681
All local files: 705681
Relevant files: 705681
Limited files: 158000


In [6]:
# Print half the number of entries in list_files.
# NOTE(review): presumably each sample contributes two files (the loader log reports twice
# NUMBER_OF_FILES as "Limited files") — confirm against get_data_to_load.
print(len(list_files) // 2)

(79000, 79000, 79000)

## Processing and loading data

In [7]:
# Augmentation mode; only "full_augmentation" enables the random transforms defined below
data_augmentation = "base_augmentation"

# Target image size as [height, width]. Default was 50, 50
image_size = [80, 130]
# Original size is  pixelHeight: 180, pixelWidth: 320
# image_size = [180, 320]
image_height, image_width = image_size

preprocessing_config = {
    'data_augmentation': data_augmentation,
    'height': image_height,
    'width': image_width,
}

# Resize every image to the target size
base_transform = transforms.Compose([
    transforms.Resize((image_height, image_width)),
])

# Convert to a tensor and normalize each of the three channels with mean 0.5 and std 0.5
final_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# Random augmentations are built only for "full_augmentation"; otherwise this stays None
augmented_transform = (
    transforms.Compose([
        transforms.RandomRotation(10),          # Randomly rotate the image by up to 10 degrees
        transforms.ColorJitter(
            brightness=(0.5, 1.5),  # Randomly change brightness (lower limit to simulate night, upper limit for bright daylight)
            contrast=(0.5, 1.5),    # Randomly change contrast
            saturation=(0.5, 1.5),  # Randomly change saturation
            hue=(-0.1, 0.1)         # Randomly change hue
        ),
    ])
    if data_augmentation == "full_augmentation"
    else None
)

In [8]:
# Creating the test DataLoader with the data handler class

# The import of TestImageDataHandler is commented out in the imports cell, so it is
# brought into scope here; without it this cell raises NameError on a fresh kernel.
from image_data_handler import TestImageDataHandler

# Hash the files list to get a unique identifier for the data
hashed_filenames = hash_filenames(list_files)

# Passed to the data handler together with the zip-load / additional-save callbacks
cache = True

data_handler = TestImageDataHandler(
    list_files,
    base_transform,
    augmented_transform,
    final_transform,
    preprocessing_config,
    batch_size=200,
    cache=cache,
    cache_zip_load_callback=zip_load_callback,
    cache_additional_save_callback=additional_save_callback,
)
test_dataloader = data_handler.test_loader
country_to_index = data_handler.country_to_index
test_data_path = data_handler.test_data_path

# Print the dataset size and identifier, and the count of different countries
print("Dataset size:", NUMBER_OF_FILES)
print("Dataset identifier:", hashed_filenames)
print(f"Count of different countries: {len(country_to_index)}")

Loading train images and labels: 100%|██████████| 277/277 [03:06<00:00,  1.48it/s]
Loading val images and labels:  16%|█▋        | 13/79 [00:16<01:58,  1.79s/it]

: 

In [None]:
# len(test_dataloader.dataset) is the length of the underlying test dataset, not a
# count of train batches, so the label is corrected (and the stray "" argument dropped).
print("Number of test samples:", len(test_dataloader.dataset))

# Print first batch as an example, to see the structure
for images, coordinates, country_indices in test_dataloader:
    print("Images batch shape:", images.shape)
    print("Coordinates batch shape:", coordinates.shape)
    print(coordinates[0])
    print("Country indices:", country_indices.shape)
    print(country_indices[0])
    break  # only the first batch is needed

## Evaluating the model

In [None]:
# TODO: Load model and files from wandb before the data handler is created
# TODO: Create the Dataloader
# TODO: Test the model
# TODO: Show the different models with the best results (also do it for different data sizes and mapped/non-mapped data)