In [1]:
import os
import torch
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import torchvision.models as models
import torchvision.transforms as transforms
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim.lr_scheduler import CosineAnnealingLR
from sklearn.metrics import f1_score

In [2]:
class TrainDataset(Dataset):
    def __init__(self, data_dir, metadata, subset, transform=None):
        self.subset = subset  # Type de sous-ensemble (ex: train, val, test)
        self.transform = transform  # Transformation à appliquer aux échantillons
        self.data_dir = data_dir  # Répertoire contenant les fichiers de données
        self.metadata = metadata  # Données de métadonnées
        # Supprimer les lignes sans speciesId et réinitialiser les index
        self.metadata = self.metadata.dropna(subset=["speciesId"]).reset_index(drop=True)
        self.metadata['speciesId'] = self.metadata['speciesId'].astype(int)  # Convertir speciesId en entier
        
        self.label_dict = self.metadata.groupby('surveyId')['speciesId'].apply(list).to_dict()
        self.presence_dict = self.metadata.groupby(['surveyId', 'speciesId'])['presence'].first().to_dict()
        
        self.metadata = self.metadata.drop_duplicates(subset="surveyId").reset_index(drop=True)

    def __len__(self):
        # Retourne le nombre de surveyId uniques
        return len(self.metadata)

    def __getitem__(self, idx):
        
        survey_id = self.metadata.surveyId[idx]
        sample = torch.nan_to_num(torch.load(os.path.join(self.data_dir, f"GLC24-PA-{self.subset}-landsat-time-series_{survey_id}_cube.pt")))
        species_ids = self.label_dict.get(survey_id, [])  # Obtenir la liste des species IDs pour le survey ID
        label = torch.zeros(num_classes)  
        for species_id in species_ids:
            label[species_id] = self.presence_dict.get((survey_id, species_id), 0.0)

        # Assurer que l'échantillon est dans le bon format pour la transformation
        if isinstance(sample, torch.Tensor):
            sample = sample.permute(1, 2, 0)  # Changer la forme du tenseur de (C, H, W) à (H, W, C)
            sample = sample.numpy()  

        if self.transform:
            sample = self.transform(sample)

        return sample, label, survey_id

In [3]:
batch_size = 64
transform = transforms.Compose([
    transforms.ToTensor()
])

num_classes = 11255
#Load Training metadata
train_data_path = "/home/dakbarin/data/data/GEOLIFECLEF/GLC24-PA-train-landsat-time-series"
train_metadata_path = "/home/dakbarin/data/data/GEOLIFECLEF/metadata_for_presence_all.csv"
train_metadata = pd.read_csv(train_metadata_path)
train_dataset = TrainDataset(train_data_path, train_metadata, subset="train", transform=transform)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=16)

test_data_path = "/home/dakbarin/data/data/GEOLIFECLEF/GLC24-PA-train-landsat-time-series"
test_metadata_path = "/home/dakbarin/data/data/GEOLIFECLEF/metadata_for_presence_all.csv"
test_metadata = pd.read_csv(train_metadata_path)
test_dataset = TrainDataset(test_data_path, test_metadata, subset="train", transform=transform)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, num_workers=0)

print(len(train_loader))


985


In [4]:
class ModifiedResNet18(nn.Module):
    def __init__(self, num_classes):
        super(ModifiedResNet18, self).__init__()

        self.norm_input = nn.LayerNorm([6,4,21])
        self.resnet18 = models.resnet18(weights=None)
        # We have to modify the first convolutional layer to accept 4 channels instead of 3
        self.resnet18.conv1 = nn.Conv2d(6, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.resnet18.maxpool = nn.Identity()
        self.ln = nn.LayerNorm(1000)
        self.fc1 = nn.Linear(1000, 2056)
        self.fc2 = nn.Linear(2056, num_classes)

    def forward(self, x):
        x = self.norm_input(x)
        x = self.resnet18(x)
        x = self.ln(x)
        x = self.fc1(x)
        x = self.fc2(x)
        return x

In [5]:
# Check if cuda is available
device = torch.device("cpu")

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("DEVICE = CUDA")

num_classes = 11255 # Number of all unique classes within the PO and PA data.
model = ModifiedResNet18(num_classes).to(device)


DEVICE = CUDA


In [6]:
# Hyperparameters
learning_rate = 0.0002
num_epochs = 20
positive_weigh_factor = 1.0

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = CosineAnnealingLR(optimizer, T_max=25, verbose=True)



In [9]:
import time

print(f"Training for {num_epochs} epochs started.")

for epoch in range(num_epochs):
    start_time = time.time()  # Start the timer for this epoch
    
    model.train()
    # Wrap the train_loader with tqdm
    for batch_idx, (data, targets, _) in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")):
        data = data.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        outputs = model(data)

        pos_weight = targets * positive_weigh_factor  # All positive weights are equal to 10
        criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)
        loss = criterion(outputs, targets)

        loss.backward()
        optimizer.step()

        if batch_idx % 278 == 0:
            print(f"Epoch {epoch+1}/{num_epochs}, Batch {batch_idx}/{len(train_loader)}, Loss: {loss.item()}")

    scheduler.step()
    print("Scheduler:", scheduler.state_dict())

    end_time = time.time()  # Stop the timer for this epoch
    epoch_duration = end_time - start_time
    print(f"Epoch {epoch+1} completed in {epoch_duration:.2f} seconds.")

# Save the trained model
model.eval()
torch.save(model.state_dict(), "resnet18-with-bioclimatic-cubes3.pth")

Training for 20 epochs started.


Epoch 1/20:   0%|          | 0/985 [00:00<?, ?it/s]

Epoch 1/20, Batch 0/985, Loss: 0.7045856714248657
Epoch 1/20, Batch 278/985, Loss: 0.011697283945977688
Epoch 1/20, Batch 556/985, Loss: 0.012349762953817844
Epoch 1/20, Batch 834/985, Loss: 0.011131570674479008
Scheduler: {'T_max': 25, 'eta_min': 0, 'base_lrs': [0.0002], 'last_epoch': 1, 'verbose': True, '_step_count': 2, '_get_lr_called_within_step': False, '_last_lr': [0.0001992114701314478]}
Epoch 1 completed in 52.03 seconds.


Epoch 2/20:   0%|          | 0/985 [00:00<?, ?it/s]

Epoch 2/20, Batch 0/985, Loss: 0.011535607278347015
Epoch 2/20, Batch 278/985, Loss: 0.010793199762701988


KeyboardInterrupt: 

In [28]:
num_species = 11255
batch_size = 64
device = 'cuda'
def remove_module_prefix(state_dict):
    new_state_dict = {}
    for k, v in state_dict.items():
        if k.startswith('module.'):
            new_state_dict[k[7:]] = v
        else:
            new_state_dict[k] = v
    return new_state_dict

# Charger l'état du modèle entraîné
model = ModifiedResNet18(num_species).to(device)
state_dict = torch.load("/home/dakbarin/data/models/resnet18_with_bioclimatic_cubes_epoch_16.pth")
state_dict = remove_module_prefix(state_dict)
model.load_state_dict(state_dict)


<All keys matched successfully>

In [8]:

#odel = ModifiedResNet18(num_classes).to(device)
#state_dict = torch.load("/home/dakbarin/data/models/resnet18-with-bioclimatic-cubes3.pth")
#model.load_state_dict(state_dict)

def test_model(model, test_loader, device):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for data in test_loader:
            inputs, labels, _ = data
            inputs, labels = inputs.to(device), labels.to(device)  # Transférer les données sur le même dispositif
            outputs = model(inputs)
            preds = torch.sigmoid(outputs)  # Utiliser la fonction sigmoid pour obtenir des probabilités
            
            # Convertir les prédictions et les labels en valeurs binaires
            preds_binary = (preds > 0.5).int()
            
            all_preds.append(preds_binary.cpu().numpy())
            all_labels.append(labels.cpu().numpy())
    
    return np.concatenate(all_preds), np.concatenate(all_labels)

# Tester le modèle
preds, labels = test_model(model, test_loader, device)

preds = preds.astype(int)
labels = labels.astype(int)

# Calculer le F1 score micro
f1_micro = f1_score(labels, preds, average='micro')
print(f"F1 Score Micro: {f1_micro}")

KeyboardInterrupt: 

In [23]:
class TestDataset(TrainDataset):
    def __init__(self, data_dir, metadata, subset, transform=None):
        self.subset = subset
        self.transform = transform
        self.data_dir = data_dir
        self.metadata = metadata
        
    def __getitem__(self, idx):
        
        survey_id = self.metadata.surveyId[idx]
        sample = torch.nan_to_num(torch.load(os.path.join(self.data_dir, f"GLC24-PA-{self.subset}-landsat_time_series_{survey_id}_cube.pt")))

        if isinstance(sample, torch.Tensor):
            sample = sample.permute(1, 2, 0)  # Change tensor shape from (C, H, W) to (H, W, C)
            sample = sample.numpy()

        if self.transform:
            sample = self.transform(sample)

        return sample, survey_id
    
# Load Test metadata
test_data_path = "/home/dakbarin/data/data/GEOLIFECLEF/GLC24-PA-test-landsat_time_series"
test_metadata_path = "/home/dakbarin/data/data/GEOLIFECLEF/GLC24-PA-metadata-test.csv"
test_metadata = pd.read_csv(test_metadata_path, delimiter =';')
test_dataset = TestDataset(test_data_path, test_metadata, subset="test", transform=transform)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

In [37]:
from tqdm.notebook import tqdm
import torch

with torch.no_grad():
    all_preds = []
    surveys = []
    for data, surveyID in tqdm(test_loader, total=len(test_loader)):
        data = data.to(device)
        predictions = model(data)
        
        preds = torch.sigmoid(predictions)  # Appliquer sigmoid avant la conversion en numpy
        preds_binary = (preds > 0.9).int()
        
        all_preds.append(preds_binary.cpu().numpy())
        surveys.extend(surveyID.cpu().numpy())

  0%|          | 0/74 [00:00<?, ?it/s]

In [38]:
with torch.no_grad():
    all_predictions = []
    surveys = []
    top_k_indices = None
    for data, surveyID in tqdm(test_loader, total=len(test_loader)):

        data = data.to(device)
        predictions = model(data)
        predictions = torch.sigmoid(predictions).cpu().numpy()
        print(predictions[0,642])

        # Sellect top-25 values as predictions
        top_25 = np.argsort(-predictions, axis=1)[:, :500] 
        if top_k_indices is None:
            top_k_indices = top_25
        else:
            top_k_indices = np.concatenate((top_k_indices, top_25), axis=0)

        surveys.extend(surveyID.cpu().numpy())

  0%|          | 0/74 [00:00<?, ?it/s]

0.0020326113
0.0012975612
0.002156962
0.001861093
0.0018279508
0.002208804
0.0013880328
0.0021042519
0.0013884115
0.0017737538
0.0014538981
5.0315593e-05
0.0027619451
0.0013386263
0.0015388025
0.0024310711
0.0031582278
0.001251995
0.0011549818
0.0015126732
0.002023811
0.001910845
0.0017412904
0.0022508393
0.0019879558
0.0024842292
0.00097367127
0.0008113582
0.0017538437
0.0009668477
0.0016370277
0.0022209045
0.0016761761
0.0002811433
0.0018631986
0.0007181519
0.0015413117
0.0020072258
0.0025743723
0.0016864135
0.0020383
0.0019478108
0.0021896085
0.0010005254
0.0016190639
0.0015712319
0.001437789
0.0031574876
0.0023934497
0.00076946785
0.004016435
0.0007154298
0.0019806686
0.0010469322
0.0009526873
0.0008430004
0.0013295043
0.002196231
0.0012516696
0.0014193463
0.00039052436
0.0006419512
0.0009329487
0.002703704
0.0019057964
0.0015103264
0.0018504629
0.0023980765
0.0014161653
0.0022701996
0.0015969276
0.0029920165
0.002217706
0.002770718


In [12]:
print(len(all_preds))
print(len(surveys))

74
4716


In [39]:
data_concatenated = [' '.join(map(str, row)) for row in top_k_indices]

pd.DataFrame(
    {'surveyId': surveys,
     'predictions': data_concatenated,
    }).to_csv("submission.csv", index = False)