In [1]:
import os
import torch
import tqdm
import numpy as np
import pandas as pd
import torchvision.models as models
import torchvision.transforms as transforms
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim.lr_scheduler import CosineAnnealingLR
from sklearn.metrics import f1_score

In [2]:
class TrainDataset(Dataset):
    """Multi-label dataset pairing one saved tensor cube per survey with its species.

    Each item is ``(sample, label, survey_id)``:

    * ``sample`` is the cube loaded from
      ``<data_dir>/GLC24-PA-<subset>-landsat-time-series_<surveyId>_cube.pt`` with
      NaNs replaced, axes permuted ``(1, 2, 0)`` and converted to a NumPy array,
      then passed through ``transform`` when one is given.
    * ``label`` is a float multi-hot vector of length ``num_classes`` with a 1 at
      every ``speciesId`` recorded for the survey.
    * ``survey_id`` is the ``surveyId`` value of the row.

    Args:
        data_dir: Directory holding the ``*_cube.pt`` files.
        metadata: DataFrame with at least ``surveyId`` and ``speciesId`` columns,
            one row per (survey, species) observation. It is not modified.
        subset: Name inserted into the cube file name (e.g. ``"train"``).
        transform: Optional callable applied to the NumPy sample.
        num_classes: Length of the multi-hot label vector. Defaults to 11255,
            the value the notebook assigns to its ``num_classes`` variable, so
            the dataset no longer depends on a module-level global being set
            before items are read.

    Raises:
        ValueError: If any ``speciesId`` lies outside ``[0, num_classes)``.
    """

    def __init__(self, data_dir, metadata, subset, transform=None, num_classes=11255):
        self.subset = subset
        self.transform = transform
        self.data_dir = data_dir
        self.num_classes = num_classes

        # Rows without a species cannot contribute to a label; dropna returns a
        # new frame, so the caller's DataFrame is left untouched.
        labelled = metadata.dropna(subset=["speciesId"]).reset_index(drop=True)
        labelled["speciesId"] = labelled["speciesId"].astype(int)

        # Fail fast: a negative id would silently wrap around when used as an
        # index, and an id >= num_classes would only fail once that survey is read.
        out_of_range = (labelled["speciesId"] < 0) | (labelled["speciesId"] >= num_classes)
        if out_of_range.any():
            raise ValueError(
                f"speciesId values must lie in [0, {num_classes}); "
                f"found {int(out_of_range.sum())} out-of-range row(s)"
            )

        # surveyId -> list of every species id observed in that survey.
        self.label_dict = labelled.groupby("surveyId")["speciesId"].apply(list).to_dict()

        # One row per survey: this is what indexing and len() operate on.
        self.metadata = labelled.drop_duplicates(subset="surveyId").reset_index(drop=True)

    def __len__(self):
        """Return the number of distinct surveys that have at least one species."""
        return len(self.metadata)

    def __getitem__(self, idx):
        """Load the cube and build the multi-hot label for the survey at ``idx``."""
        survey_id = self.metadata.surveyId[idx]
        cube_path = os.path.join(
            self.data_dir,
            f"GLC24-PA-{self.subset}-landsat-time-series_{survey_id}_cube.pt",
        )
        # NOTE(review): torch.load may unpickle arbitrary objects depending on the
        # installed PyTorch version's weights_only default; only point data_dir
        # at trusted files.
        sample = torch.nan_to_num(torch.load(cube_path))

        # Multi-hot target: set every observed species index to 1 in one shot.
        species_ids = self.label_dict.get(survey_id, [])
        label = torch.zeros(self.num_classes)
        label[torch.as_tensor(species_ids, dtype=torch.long)] = 1

        # torch.nan_to_num only accepts tensors, so `sample` is always a tensor
        # here. Reorder (C, H, W) -> (H, W, C) and hand a NumPy array to the
        # transform pipeline.
        sample = sample.permute(1, 2, 0).numpy()

        if self.transform:
            sample = self.transform(sample)

        return sample, label, survey_id
    

In [12]:
# ---- Data configuration and loaders ----
batch_size = 64

# ToTensor converts the (H, W, C) NumPy array produced by TrainDataset into a
# (C, H, W) tensor.
transform = transforms.Compose([
    transforms.ToTensor()
])

# Only these metadata columns are needed to build the multi-hot labels.
cols = ['surveyId', 'speciesId']

# Length of the multi-hot label vector; TrainDataset.__getitem__ reads this name.
num_classes = 11255

# Training split: the first 200000 metadata rows.
train_data_path = "/home/dakbarin/data/data/GEOLIFECLEF/GLC24-PA-train-landsat-time-series"
train_metadata_path = "/home/dakbarin/data/data/GEOLIFECLEF/GLC24_PA_metadata_train.csv"
train_metadata_full = pd.read_csv(train_metadata_path, delimiter=';')
train_metadata = train_metadata_full.iloc[:200000][cols]
train_dataset = TrainDataset(train_data_path, train_metadata, subset="train", transform=transform)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)

# Held-out split: rows 500000..600000 of the metadata file named by
# test_metadata_path (previously the train path variable was read here by
# mistake). The CSV is parsed only once when both paths point at the same file.
# NOTE(review): the split is by row position, so a survey whose rows straddle a
# slice boundary gets a partial label set, and nothing here guarantees that a
# surveyId cannot appear in both ranges -- confirm against the CSV ordering.
test_data_path = "/home/dakbarin/data/data/GEOLIFECLEF/GLC24-PA-train-landsat-time-series"
test_metadata_path = "/home/dakbarin/data/data/GEOLIFECLEF/GLC24_PA_metadata_train.csv"
if test_metadata_path == train_metadata_path:
    test_metadata_full = train_metadata_full
else:
    test_metadata_full = pd.read_csv(test_metadata_path, delimiter=';')
test_metadata = test_metadata_full.iloc[500000:600001][cols]
test_dataset = TrainDataset(test_data_path, test_metadata, subset="train", transform=transform)
# Evaluation does not benefit from shuffling; a fixed order keeps runs comparable.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

# The full frames are no longer needed once the two slices have been taken.
del train_metadata_full, test_metadata_full

# Sanity check: load one item and show the tensor shape the model will receive.
sample, label, surveyId = train_dataset[0]

print(sample.shape)


torch.Size([6, 4, 21])


In [4]:
class ModifiedResNet18(nn.Module):
    """ResNet-18 adapted to small multi-channel cubes, with a two-layer linear head.

    The input is layer-normalised over its full ``input_shape``, passed through
    a torchvision ResNet-18 whose stem has been replaced (3x3 stride-1
    convolution, no max-pool), and the backbone output is layer-normalised and
    mapped to ``num_classes`` logits by two linear layers.

    Args:
        num_classes: Number of output logits.
        input_shape: ``(channels, height, width)`` of one input sample. Defaults
            to ``(6, 4, 21)``, the shape printed for a training sample in this
            notebook.

    Submodule names and construction order are kept stable so that saved
    ``state_dict`` checkpoints remain loadable.
    """

    def __init__(self, num_classes, input_shape=(6, 4, 21)):
        super().__init__()
        in_channels = input_shape[0]

        # Normalise each sample over all of its (C, H, W) elements.
        self.norm_input = nn.LayerNorm(list(input_shape))
        self.resnet18 = models.resnet18(weights=None)
        # Replace the stock 3-channel 7x7/stride-2 stem with a 3x3/stride-1
        # convolution that accepts `in_channels` channels (6 by default), and
        # drop the max-pool, so the small spatial grid is not downsampled
        # before the residual stages.
        self.resnet18.conv1 = nn.Conv2d(in_channels, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.resnet18.maxpool = nn.Identity()
        # The backbone's own fc layer is left in place, so it emits 1000 features.
        self.ln = nn.LayerNorm(1000)
        self.fc1 = nn.Linear(1000, 2056)
        self.fc2 = nn.Linear(2056, num_classes)

    def forward(self, x):
        """Return raw logits of shape ``(batch, num_classes)`` for a batch ``x``."""
        x = self.norm_input(x)
        x = self.resnet18(x)
        x = self.ln(x)
        # NOTE(review): there is no activation between fc1 and fc2, so together
        # they form a single affine map. Left unchanged because adding one would
        # alter the behaviour of already-trained checkpoints.
        x = self.fc1(x)
        x = self.fc2(x)
        return x

In [5]:
# Pick the compute device: CUDA when a GPU is visible, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("DEVICE = CUDA")

# Number of all unique classes within the PO and PA data.
num_classes = 11255

# Build the network and place its parameters on the selected device.
model = ModifiedResNet18(num_classes)
model = model.to(device)


DEVICE = CUDA


In [6]:
# Optimisation settings for the training run.
num_epochs = 20
learning_rate = 2e-4
# Multiplier applied to the loss weight of positive targets in the training loop.
positive_weigh_factor = 1.0

# AdamW over every model parameter, with a cosine learning-rate decay that the
# training loop steps once per epoch.
# NOTE(review): T_max (25) is larger than num_epochs (20), so the schedule never
# reaches its minimum within this run -- confirm that this is intentional.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)
scheduler = CosineAnnealingLR(optimizer=optimizer, T_max=25, verbose=True)



In [7]:
print(f"Training for {num_epochs} epochs started.")

# BCEWithLogitsLoss only applies pos_weight to the positive-target term, so for
# 0/1 targets the former per-batch `targets * positive_weigh_factor` is
# equivalent to one constant per-class vector. Build the loss once instead of
# re-creating it (and a fresh weight tensor) on every batch.
criterion = torch.nn.BCEWithLogitsLoss(
    pos_weight=torch.full(
        (num_classes,), positive_weigh_factor, dtype=torch.float32, device=device
    )
)

for epoch in range(num_epochs):
    model.train()
    for batch_idx, (data, targets, _) in enumerate(train_loader):

        data = data.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        outputs = model(data)

        # Multi-label objective: an independent sigmoid/BCE term per class.
        loss = criterion(outputs, targets)

        loss.backward()
        optimizer.step()

        # Progress log every 278 batches; the captured run shows 189 batches per
        # epoch, so in practice only batch 0 of each epoch is printed.
        if batch_idx % 278 == 0:
            print(f"Epoch {epoch+1}/{num_epochs}, Batch {batch_idx}/{len(train_loader)}, Loss: {loss.item()}")

    # The learning-rate schedule advances once per epoch.
    scheduler.step()
    print("Scheduler:",scheduler.state_dict())

# Save the trained model
# NOTE(review): this writes to the current working directory, while the
# evaluation cell loads the checkpoint from an absolute path -- confirm that the
# two locations are the same file.
model.eval()
torch.save(model.state_dict(), "resnet18-with-bioclimatic-cubes1.pth")

Training for 20 epochs started.
Epoch 1/20, Batch 0/189, Loss: 0.7075073719024658
Scheduler: {'T_max': 25, 'eta_min': 0, 'base_lrs': [0.0002], 'last_epoch': 1, 'verbose': True, '_step_count': 2, '_get_lr_called_within_step': False, '_last_lr': [0.0001992114701314478]}
Epoch 2/20, Batch 0/189, Loss: 0.006964107975363731
Scheduler: {'T_max': 25, 'eta_min': 0, 'base_lrs': [0.0002], 'last_epoch': 2, 'verbose': True, '_step_count': 3, '_get_lr_called_within_step': False, '_last_lr': [0.0001968583161128631]}
Epoch 3/20, Batch 0/189, Loss: 0.006491994485259056
Scheduler: {'T_max': 25, 'eta_min': 0, 'base_lrs': [0.0002], 'last_epoch': 3, 'verbose': True, '_step_count': 4, '_get_lr_called_within_step': False, '_last_lr': [0.00019297764858882514]}
Epoch 4/20, Batch 0/189, Loss: 0.005732167977839708
Scheduler: {'T_max': 25, 'eta_min': 0, 'base_lrs': [0.0002], 'last_epoch': 4, 'verbose': True, '_step_count': 5, '_get_lr_called_within_step': False, '_last_lr': [0.00018763066800438636]}
Epoch 5/20, 

In [8]:
# Rebuild the architecture and restore the trained weights for evaluation.
# Fall back to the CPU when no GPU is visible, and remap the checkpoint's
# tensors onto that device, so this cell also runs on a CPU-only machine.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ModifiedResNet18(num_classes).to(device)
model.load_state_dict(
    torch.load('/home/dakbarin/data/models/resnet18-with-bioclimatic-cubes1.pth', map_location=device)
)

def test_model(model, test_loader, device):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for data in test_loader:
            inputs, labels, _ = data
            inputs, labels = inputs.to(device), labels.to(device)  # Transférer les données sur le même dispositif
            outputs = model(inputs)
            preds = torch.sigmoid(outputs)  # Utiliser la fonction sigmoid pour obtenir des probabilités
            
            # Convertir les prédictions et les labels en valeurs binaires
            preds_binary = (preds > 0.5).int()
            
            all_preds.append(preds_binary.cpu().numpy())
            all_labels.append(labels.cpu().numpy())
    
    return np.concatenate(all_preds), np.concatenate(all_labels)

# Run the evaluation pass over the held-out loader.
preds, labels = test_model(model=model, test_loader=test_loader, device=device)

# Cast both arrays to plain integers before scoring.
preds, labels = preds.astype(int), labels.astype(int)

# Micro-averaged F1: counts are pooled over every (sample, class) pair.
f1_micro = f1_score(y_true=labels, y_pred=preds, average='micro')
print(f"F1 Score Micro: {f1_micro}")

F1 Score Micro: 0.2493468773626571
