In [9]:
import os
import torch
from tqdm.notebook import tqdm
import numpy as np
import time
import pandas as pd
import torchvision.models as models
import torchvision.transforms as transforms
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim.lr_scheduler import CosineAnnealingLR
from sklearn.metrics import f1_score

In [10]:
# Number of species classes: length of the multi-hot label vector built by the
# dataset and size passed to the classifier.
num_species = 11_255
# Seed for the generator that drives the train/validation split.
seed = 32
class TrainDataset(Dataset):
    """Dataset yielding one averaged cube and one multi-hot label per ``group_number``.

    Rows of ``metadata`` without a ``speciesId`` are discarded. For every remaining
    ``group_number`` the dataset keeps the list of its species ids and the list of
    its survey ids; an item is the element-wise mean of the cubes loaded for those
    survey ids together with a multi-hot vector of the species ids.

    Args:
        data_dir: Directory containing the ``*_cube.pt`` tensor files.
        metadata: DataFrame with ``group_number``, ``surveyId`` and ``speciesId``
            columns. It is not modified.
        subset: Subset tag inserted in the cube file names (e.g. ``"train"``).
        transform: Optional callable applied to the NumPy sample just before it
            is returned.
        num_classes: Length of the label vector. When ``None`` (default) the
            module-level ``num_species`` is read each time an item is fetched.
    """

    def __init__(self, data_dir, metadata, subset, transform=None, num_classes=None):
        self.subset = subset
        self.transform = transform
        self.data_dir = data_dir
        self.num_classes = num_classes

        # Drop rows without a speciesId, then store the ids as integers.
        observations = metadata.dropna(subset=["speciesId"]).reset_index(drop=True)
        observations["speciesId"] = observations["speciesId"].astype(int)

        # Keep a single row per (group, species) pair.
        observations = observations.drop_duplicates(
            subset=["group_number", "speciesId"]
        ).reset_index(drop=True)

        self.species_dict = observations.groupby("group_number")["speciesId"].apply(list).to_dict()
        # NOTE(review): this list is built after the (group, species) de-duplication,
        # so a survey appears once per retained row of its group and the mean computed
        # in __getitem__ is weighted accordingly - confirm this is intended.
        self.surveyId_dict = observations.groupby("group_number")["surveyId"].apply(list).to_dict()

        # One row per group: this frame defines the length and the index -> group mapping.
        self.metadata = observations.drop_duplicates(subset="group_number").reset_index(drop=True)

    def __len__(self):
        """Return the number of distinct ``group_number`` values."""
        return len(self.metadata)

    def __getitem__(self, idx):
        """Return ``(sample, label, group_number)`` for the group at position ``idx``.

        ``sample`` is the mean cube with its first axis moved last and converted to
        a NumPy array, then passed through ``transform`` when one was given.
        ``label`` is a float tensor holding 1 at every species id of the group.

        Raises:
            IndexError: If a species id falls outside the label vector.
        """
        group_number = self.metadata.loc[idx, "group_number"]
        survey_ids = self.surveyId_dict.get(group_number, [])

        # Load every cube of the group, replacing NaNs by zeros.
        cubes = [
            torch.nan_to_num(
                torch.load(
                    os.path.join(
                        self.data_dir,
                        f"GLC24-PO-{self.subset}-landsat_time_series_{survey_id}_cube.pt",
                    )
                )
            )
            for survey_id in survey_ids
        ]

        if cubes:
            final_sample = torch.stack(cubes).mean(dim=0)
        else:
            # No survey recorded for this group: fall back to an all-zero cube.
            final_sample = torch.zeros((6, 4, 21))

        n_classes = num_species if self.num_classes is None else self.num_classes
        label = torch.zeros(n_classes)
        species_ids = self.species_dict.get(group_number, [])
        if species_ids:
            label[torch.as_tensor(species_ids, dtype=torch.long)] = 1

        # Move the first axis last and hand a NumPy array to the transform.
        final_sample = final_sample.permute(1, 2, 0).numpy()

        if self.transform:
            final_sample = self.transform(final_sample)

        return final_sample, label, group_number

In [11]:
batch_size = 32
transform = transforms.Compose([transforms.ToTensor()])

# os.cpu_count() returns None when the CPU count cannot be determined; fall back
# to 0 (load in the main process) because DataLoader needs an integer.
# NOTE(review): this requests one worker per logical CPU for each loader (128 on
# the recorded run) - consider capping it.
NUM_WORKERS = os.cpu_count() or 0
print(NUM_WORKERS)
num_species = 11255

# Load training metadata
train_data_path = "/home/dakbarin/data/data/GEOLIFECLEF/GLC24-PO-train-landsat_time_series"
train_metadata_path = "/home/dakbarin/data/data/GEOLIFECLEF/PO_grouped.csv"
train_metadata = pd.read_csv(train_metadata_path)
train_dataset = TrainDataset(train_data_path, train_metadata, subset="train", transform=transform)

# 85 % / 15 % split; the validation part takes the remainder so that the two
# sizes always add up to the dataset length.
n_train = int(len(train_dataset) * 0.85)
n_val = len(train_dataset) - n_train
training, validation = random_split(
    train_dataset,
    [n_train, n_val],
    generator=torch.Generator().manual_seed(seed),
)
train_loader = DataLoader(training, batch_size=batch_size, shuffle=True, num_workers=NUM_WORKERS)
val_loader = DataLoader(validation, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS)



128


In [12]:
# Smoke test: fetch one item, then report the dataset length and the sample shape.
sample_index = 1000
final_sample, label, group_number = train_dataset[sample_index]
for reported in (len(train_dataset), final_sample.shape):
    print(reported)

52873
torch.Size([6, 4, 21])


In [13]:
class ModifiedResNet18(nn.Module):
    """ResNet-18 backbone with a 6-channel stem and a two-layer linear head.

    The input is layer-normalised over a ``(6, 4, 21)`` trailing shape, passed
    through a ResNet-18 whose first convolution takes 6 channels and whose
    max-pool is disabled, layer-normalised again over 1000 features and mapped
    to ``num_classes`` logits by two linear layers.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()

        self.norm_input = nn.LayerNorm([6, 4, 21])
        self.resnet18 = models.resnet18(weights=None)
        # Replace the stem so it accepts 6 input channels (3x3 kernel, stride 1)
        # and turn the max-pool into a no-op.
        self.resnet18.conv1 = nn.Conv2d(6, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.resnet18.maxpool = nn.Identity()
        self.ln = nn.LayerNorm(1000)
        self.fc1 = nn.Linear(1000, 2056)
        # Size the output layer from the constructor argument rather than from a
        # module-level global.
        self.fc2 = nn.Linear(2056, num_classes)

    def forward(self, x):
        """Return raw logits of shape ``(..., num_classes)`` for input ``x``."""
        x = self.norm_input(x)
        x = self.resnet18(x)
        x = self.ln(x)
        # NOTE(review): no activation sits between fc1 and fc2; left unchanged so
        # that existing checkpoints keep producing the same outputs.
        x = self.fc1(x)
        x = self.fc2(x)
        return x

In [14]:
# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("DEVICE = CUDA")

model = ModifiedResNet18(num_species).to(device)
# map_location remaps the stored tensors onto the selected device, so a checkpoint
# saved from a GPU run can also be restored on a CPU-only machine.
state_dict = torch.load(
    "/home/dakbarin/data/models/resnet18_with_bioclimatic_cubes_epoch_20.pth",
    map_location=device,
)
model.load_state_dict(state_dict)

DEVICE = CUDA


<All keys matched successfully>

In [15]:
# Optimisation settings
learning_rate = 2e-4
num_epochs = 11
# NOTE(review): not referenced by the loss created below - confirm whether it was
# meant to be used.
positive_weigh_factor = 1.0

# Multi-label objective on raw logits, AdamW optimiser, cosine learning-rate decay.
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)
# NOTE(review): T_max (25) differs from num_epochs (11) - confirm this is intended.
scheduler = CosineAnnealingLR(optimizer=optimizer, T_max=25, verbose=True)

In [16]:
print(f"Training for {num_epochs} epochs started.")

# Checkpoint cadence in epochs. No other cell of this notebook assigns it;
# 1 (save after every epoch) is an assumed value - TODO confirm.
save_every = 1

# Per-epoch loss history, rewritten to CSV after every epoch.
losses = {
    "epoch": [],
    "train_loss": [],
    "val_loss": []
}

# Training loop
for epoch in range(num_epochs):
    start_time = time.time()
    print(f"Epoch {epoch+1}/{num_epochs} | Training...")
    model.train()
    running_loss = 0.0
    for data, targets, _ in tqdm(train_loader, desc="Training", leave=False):
        data = data.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        out = model(data)

        loss = criterion(out, targets)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # The learning-rate schedule advances once per epoch.
    scheduler.step()
    epoch_time = time.time() - start_time
    avg_train_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_train_loss}, Time: {epoch_time:.2f} seconds")

    # Validation phase
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        print(f"Epoch {epoch+1}/{num_epochs} | Validation...")
        for data, targets, _ in tqdm(val_loader, desc="Validation)", leave=False):
            data = data.to(device)
            targets = targets.to(device)

            out = model(data)

            loss = criterion(out, targets)

            val_loss += loss.item()

    # Average over the validation batches that were actually iterated.
    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Validation Loss: {avg_val_loss}")

    # Record the losses of this epoch
    losses["epoch"].append(epoch + 1)
    losses["train_loss"].append(avg_train_loss)
    losses["val_loss"].append(avg_val_loss)

    # Persist the loss history to a CSV file
    df = pd.DataFrame(losses)
    df.to_csv("training_losses.csv", index=False)

    # Save the model checkpoint (the file name carries the 0-based epoch index)
    if epoch % save_every == 0:
        torch.save(model.state_dict(), f"resnet18_with_bioclimatic_cubes_epoch_{epoch}_fine_tuned.pth")

Training for 11 epochs started.
Epoch 1/11 | Training...


Training:   0%|          | 0/1405 [00:00<?, ?it/s]

KeyboardInterrupt: 