In [1]:
# important for gpuhub
# !pip install -r ../../requirements.txt --upgrade

## Importing Libraries and tokens

In [1]:
import wandb
import os
import pickle
import torch

# load .env file
from dotenv import load_dotenv
from wandb_downloader import WandbDownloader
from geo_model_tester import GeoModelTester
from image_data_handler_test import TestImageDataHandler

os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [2]:
WANDB_TOKEN = os.getenv('WANDB_TOKEN')
# Define where to run
env_path = '../../.env'
if not WANDB_TOKEN and os.path.exists(env_path):
  load_dotenv(env_path)
  WANDB_TOKEN = os.getenv('WANDB_TOKEN')

In [3]:
# Check if GPU is available
if torch.cuda.is_available():
    print("GPU is available.")
    
    # Print the name of the GPU
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    
    # Print the total and available memory
    total_memory = torch.cuda.get_device_properties(0).total_memory / 1e9  # Convert bytes to GB
    print(f"Total Memory: {total_memory:.2f} GB")

    allocated_memory = torch.cuda.memory_allocated(0) / 1e9  # Convert bytes to GB
    print(f"Allocated Memory: {allocated_memory:.2f} GB")

    cached_memory = torch.cuda.memory_reserved(0) / 1e9  # Convert bytes to GB
    print(f"Cached Memory: {cached_memory:.2f} GB")

    # Print other properties
    device_properties = torch.cuda.get_device_properties(0)
    print(f"CUDA Capability: {device_properties.major}.{device_properties.minor}")
    print(f"Multi-Processor Count: {device_properties.multi_processor_count}")
else:
    print("No GPU found.")

No GPU found.


## Loading files from wandb

In [4]:
wandb.login(key=WANDB_TOKEN) if WANDB_TOKEN else wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mluki-st[0m ([33mnlp_ls[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/lukasstoeckli/.netrc


True

In [5]:
# Example usage:
entity = "nlp_ls"
project = "dspro2-predicting-country"
metric_name = "Validation Accuracy Top 1"
data_augmentation = "full_augmentation_v2"  # Replace with the desired augmentation
datasize = 81505  # Replace with the desired datasize
file_names_to_download = [".pth", ".json"]
image_size = [80, 130]

downloader = WandbDownloader(entity, project, data_augmentation, datasize, image_size)
run_data = downloader.get_and_collect_best_runs(metric_name, file_names_to_download)

# Now run_data contains all necessary information for further processing
import pprint
pprint.pprint(run_data)

{'Best Run 1': {'files': {'artifact/858477759/wandb_manifest.json': 'https://api.wandb.ai/files/nlp_ls/dspro2-predicting-country/10adb1sx/artifact/858477759/wandb_manifest.json',
                          'best_model': 'https://api.wandb.ai/files/nlp_ls/dspro2-predicting-country/10adb1sx/best_model_checkpointmodel_efficientnet_b3_lr_0.01_opt_adamW_weightDecay_0.1_imgSize_[80, '
                                        '130]_predict_coordinates_False.pth',
                          'country_to_index.json': 'https://api.wandb.ai/files/nlp_ls/dspro2-predicting-country/10adb1sx/country_to_index.json',
                          'files/country_to_index.json': 'https://api.wandb.ai/files/nlp_ls/dspro2-predicting-country/10adb1sx/files/country_to_index.json',
                          'test_data': 'https://api.wandb.ai/files/nlp_ls/dspro2-predicting-country/yns8ucfa/test_data.pth',
                          'wandb-metadata.json': 'https://api.wandb.ai/files/nlp_ls/dspro2-predicting-country/10ad

In [10]:
run_data.keys()

dict_keys(['Best Run 1', 'Best Run 2', 'Best Run 3', 'Best Run 4', 'Best Run 5'])

In [6]:
# Print the validation accuracy for the top 1, 3, and 5 predictions
for i in [1, 3, 5]:
    print(f"Validation Accuracy Top {i}: ", run_data["Best Run 1"]["metrics"][f"Validation Accuracy Top {i}"])

Validation Accuracy Top 1:  0.38654070302435434
Validation Accuracy Top 3:  0.6087970063186308
Validation Accuracy Top 5:  0.7050487700141096


In [11]:
run_data["Best Run 1"]["parameters"]

{'seed': 42,
 'epochs': 50,
 'optimizer': 'adamW',
 'model_name': 'efficientnet_b3',
 'mapped_data': True,
 'dataset_size': 81505,
 'weight_decay': 0.1,
 'learning_rate': 0.01,
 'input_image_size': [80, 130],
 'data_augmentation': 'full_augmentation_v2',
 'dataset_identifier': '63289b51067a4c6ede4c44c23a329d82ab4964ed43942794430a9b71ec685b5c',
 'different_countries': 75,
 'predict_coordinates': False}

In [7]:
run_data["Best Run 1"]["files"]

{'artifact/858477759/wandb_manifest.json': 'https://api.wandb.ai/files/nlp_ls/dspro2-predicting-country/10adb1sx/artifact/858477759/wandb_manifest.json',
 'best_model': 'https://api.wandb.ai/files/nlp_ls/dspro2-predicting-country/10adb1sx/best_model_checkpointmodel_efficientnet_b3_lr_0.01_opt_adamW_weightDecay_0.1_imgSize_[80, 130]_predict_coordinates_False.pth',
 'country_to_index.json': 'https://api.wandb.ai/files/nlp_ls/dspro2-predicting-country/10adb1sx/country_to_index.json',
 'files/country_to_index.json': 'https://api.wandb.ai/files/nlp_ls/dspro2-predicting-country/10adb1sx/files/country_to_index.json',
 'wandb-metadata.json': 'https://api.wandb.ai/files/nlp_ls/dspro2-predicting-country/10adb1sx/wandb-metadata.json',
 'wandb-summary.json': 'https://api.wandb.ai/files/nlp_ls/dspro2-predicting-country/10adb1sx/wandb-summary.json',
 'test_data': 'https://api.wandb.ai/files/nlp_ls/dspro2-predicting-country/yns8ucfa/test_data.pth'}

## Loading metrics for best runs for each datasize, with and without augmentation

In [20]:
# Load best runs
def load_best_runs(entity, project, metric_name, data_augmentation, datasize, image_size, file_names_to_download):
    downloader = WandbDownloader(entity, project, data_augmentation, datasize, image_size)
    return downloader.get_and_collect_best_runs(metric_name, file_names_to_download)

In [48]:
# Parameters & Configurations
entity = "nlp_ls"
metric_name = "Validation Accuracy Top 1"
file_names_to_download = [".pth", ".json"]

configurations = [
    {"project": "dspro2-predicting-country", "data_augmentation": "base_augmentation", "datasize": 81505, "image_size": [80, 130]},
    {"project": "dspro2-predicting-country", "data_augmentation": "full_augmentation_v2", "datasize": 81505, "image_size": [80, 130]},
    {"project": "dspro2-basemodel-predicting-country", "data_augmentation": "base_augmentation", "datasize": 330000, "image_size": [80, 130]},
    {"project": "dspro2-basemodel-predicting-country", "data_augmentation": "base_augmentation", "datasize": 79000, "image_size": [180, 320]},
]

# Collect results and save them
results = {}
for config in configurations:
    key = f"{config['project']}_{config['data_augmentation']}_{config['datasize']}_{config['image_size']}"
    results[key] = load_best_runs(entity, config['project'], metric_name, config['data_augmentation'], config['datasize'], config['image_size'], file_names_to_download)

In [49]:
results

{'dspro2-predicting-country_base_augmentation_81505_[80, 130]': {'Best Run 1': {'id': '5mag34f1',
   'parameters': {'seed': 42,
    'epochs': 50,
    'optimizer': 'adamW',
    'model_name': 'efficientnet_b1',
    'mapped_data': True,
    'dataset_size': 81505,
    'weight_decay': 0.1,
    'learning_rate': 0.01,
    'input_image_size': [80, 130],
    'data_augmentation': 'base_augmentation',
    'dataset_identifier': '63289b51067a4c6ede4c44c23a329d82ab4964ed43942794430a9b71ec685b5c',
    'different_countries': 75,
    'predict_coordinates': False},
   'metrics': {'Validation Accuracy Top 1': 0.49414146371388257, 'test_data_run_id': 'axsmkdio', 'Validation Accuracy Top 3': 0.7262131157597693, '_step': 24, '_runtime': 4649.802455186844, '_timestamp': 1717797350.018479, 'Train Accuracy Top 3': 0.999193732143796, 'Train Accuracy Top 5': 0.9997020314444464, 'Validation Accuracy Top 5': 0.8084780074842034, '_wandb': {'runtime': 4843}, 'Train Loss': 0.03310475957623512, 'Validation Loss': 3.25

In [51]:
import json

# Save results to a JSON file
with open('best_runs.json', 'w') as json_file:
    json.dump(results, json_file, indent=4)

print("Results saved to best_runs.json")

TypeError: Object of type HTTPSummary is not JSON serializable

## Loading data and creating data loader

In [12]:
# Creating Dataloaders with the classes
test_dataset = run_data["Best Run 1"]["files"]['test_data']
files = run_data["Best Run 1"]["files"]
country_to_index = files.get('country_to_index.json', None)
region_to_index = files.get('region_to_index.json', None)
region_index_to_middle_point = files.get('region_index_to_middle_point.json', None)
region_index_to_country_index = files.get('region_index_to_country_index.json', None)

data_handler = TestImageDataHandler(test_dataset, country_to_index, region_to_index, region_index_to_middle_point, region_index_to_country_index)
test_dataloader = data_handler.test_loader
country_to_index = data_handler.country_to_index
region_to_index = data_handler.region_to_index
region_index_to_middle_point = data_handler.region_index_to_middle_point
region_index_to_country_index = data_handler.region_index_to_country_index

Loading test data from test_data.pth
Test data loaded.


AttributeError: 'TestImageDataHandler' object has no attribute 'region_handler'

## Evaluating the model

In [None]:
num_classes = run_data["Best Run 1"]["parameters"]["different_countries"]

geo_model_tester = GeoModelTester(
    datasize=1000, 
    train_dataloader=None, 
    val_dataloader=None, 
    test_dataloader=test_dataloader, 
    num_classes=num_classes, 
    predict_coordinates=False, 
    country_to_index=country_to_index,
    region_to_index=region_to_index,
    region_index_to_middle_point=region_index_to_middle_point,
    region_index_to_country_index=region_index_to_country_index,
    predict_regions=False
)

In [11]:
# TODO: Debug the region problem because old dataset does not have regions
# TODO: Test the model from best runs
# TODO: Show the different models with the best results (also do it for different data sizes and mapped/non-mapped data)
model_name = run_data["Best Run 1"]["parameters"]["model_name"]
pretrained_weights = run_data["Best Run 1"]["files"]['best_model']

geo_model_tester.test(model_type=model_name, model_path=pretrained_weights)

TypeError: 'bool' object is not subscriptable