In [1]:
import os
import torch
import time
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import torchvision.models as models
import torchvision.transforms as transforms
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim.lr_scheduler import CosineAnnealingLR
from sklearn.metrics import f1_score

In [2]:
num_species = 11255
seed = 32
class TrainDataset(Dataset):
    def __init__(self, data_dir, metadata, subset, transform=None):
        self.subset = subset  # Type de sous-ensemble (ex: train, val, test)
        self.transform = transform  # Transformation à appliquer aux échantillons
        self.data_dir = data_dir  # Répertoire contenant les fichiers de données
        self.metadata = metadata
        # Supprimer les lignes sans speciesId et réinitialiser les index
        self.metadata = self.metadata.dropna(subset=["speciesId"]).reset_index(drop=True)
        self.metadata['speciesId'] = self.metadata['speciesId'].astype(int)  # Convertir speciesId en entier
        
        self.metadata = self.metadata.drop_duplicates(subset=["group_number","speciesId"]).reset_index(drop=True)
        
        self.species_dict = self.metadata.groupby('group_number')['speciesId'].apply(list).to_dict()
        self.surveyId_dict = self.metadata.groupby('group_number')['surveyId'].apply(list).to_dict()
        
        self.metadata = self.metadata.drop_duplicates(subset="group_number").reset_index(drop=True)

    def __len__(self):
        # Retourne le nombre de surveyId uniques
        return len(self.metadata)

    def __getitem__(self, idx):
        
        group_number = self.metadata.loc[idx,"group_number"]
        final_sample = torch.zeros((6,4,21))
        survey_ids = self.surveyId_dict.get(group_number, [])
        
        # Charger tous les tenseurs en une seule fois et les empiler
        samples = [torch.nan_to_num(torch.load(os.path.join(self.data_dir, f"GLC24-PA-{self.subset}-landsat-time-series_{survey_id}_cube.pt"))) 
                   for survey_id in survey_ids]

        # Si aucun tenseur n'a été chargé, renvoyer un tenseur nul
        if len(samples) == 0:
            final_sample = torch.zeros((6, 4, 21))
        else:
            stacked_samples = torch.stack(samples)
            final_sample = stacked_samples.mean(dim=0)
        
        species_ids = self.species_dict.get(group_number, [])  # Obtenir la liste des species IDs pour le group_number
        label = torch.zeros(num_species)  
        
        for species_id in species_ids:
            label[species_id] = 1

        # Assurer que l'échantillon est dans le bon format pour la transformation
        if isinstance(final_sample, torch.Tensor):
            final_sample = final_sample.permute(1, 2, 0)  # Changer la forme du tenseur de (C, H, W) à (H, W, C)
            final_sample = final_sample.numpy()  

        if self.transform:
            final_sample = self.transform(final_sample)

        return final_sample, label, group_number

In [3]:
batch_size = 128
transform = transforms.Compose([
    transforms.ToTensor()
])
NUM_WORKERS = os.cpu_count()
print(NUM_WORKERS)
num_species = 11255
#Load Training metadata
train_data_path = "/home/dakbarin/data/data/GEOLIFECLEF/GLC24-PA-train-landsat-time-series"
train_metadata_path = "/home/dakbarin/data/data/GEOLIFECLEF/PA_grouped_2.csv"
train_metadata = pd.read_csv(train_metadata_path)
train_dataset = TrainDataset(train_data_path, train_metadata, subset="train", transform=transform)
training, validation = random_split(train_dataset,
                                    [int(len(train_dataset)*0.85), len(train_dataset)-int(len(train_dataset)*0.85)],
                                    generator=torch.Generator().manual_seed(seed))
train_loader = DataLoader(training, batch_size=batch_size, shuffle=True, num_workers=NUM_WORKERS)
val_loader = DataLoader(validation, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS)



128


  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
class ModifiedResNet18(nn.Module):
    def __init__(self, num_classes):
        super(ModifiedResNet18, self).__init__()

        self.norm_input = nn.LayerNorm([6,4,21])
        self.resnet18 = models.resnet18(weights=None)
        # We have to modify the first convolutional layer to accept 4 channels instead of 3
        self.resnet18.conv1 = nn.Conv2d(6, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.resnet18.maxpool = nn.Identity()
        self.ln = nn.LayerNorm(1000)
        self.fc1 = nn.Linear(1000, 2056)
        self.fc2 = nn.Linear(2056, num_species)

    def forward(self, x):
        x = self.norm_input(x)
        x = self.resnet18(x)
        x = self.ln(x)
        x = self.fc1(x)
        x = self.fc2(x)
        return x

In [5]:
# Check if cuda is available
device = torch.device("cpu")

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("DEVICE = CUDA")

model = ModifiedResNet18(num_species).to(device)

DEVICE = CUDA


In [6]:
# Hyperparameters
learning_rate = 0.0002
num_epochs = 21
positive_weigh_factor = 1.0

criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = CosineAnnealingLR(optimizer, T_max=25, verbose=True)



In [163]:
print(f"Training for {num_epochs} epochs started.")
losses = {
    "epoch": [],
    "train_loss": [],
    "val_loss": []
    }
    
# Training loop
for epoch in range(num_epochs):
    start_time = time.time()
    print(f"Epoch {epoch+1}/{num_epochs} | Training...")
    model.train()
    running_loss = 0.0
    for data, targets, _ in tqdm(train_loader, desc=f"Training", leave=False):
        data = data.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        out = model(data)

        loss = criterion(out, targets)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    scheduler.step()
    epoch_time = time.time() - start_time
    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {running_loss/len(train_loader)}, Time: {epoch_time:.2f} seconds")

    # Validation phase
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        print(f"Epoch {epoch+1}/{num_epochs} | Validation...")
        for data, targets, _ in tqdm(val_loader, desc=f"Validation)", leave=False):
            data = data.to(device)
            targets = targets.to(device)

            out = model(data)

            loss = criterion(out, targets)

            val_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs}, Validation Loss: {val_loss/len(val_loader)}")

    # Enregistrer les pertes dans la structure
    losses["epoch"].append(epoch + 1)
    losses["train_loss"].append(running_loss / len(train_loader))
    losses["val_loss"].append(val_loss / len(val_loader))

    # Sauvegarder les pertes dans un fichier CSV
    df = pd.DataFrame(losses)
    df.to_csv("training_losses.csv", index=False)

    # Save the model checkpoint
    if epoch % 5 == 0:
        torch.save(model.state_dict(), f"resnet18_with_bioclimatic_cubes_epoch_{epoch}.pth")

Training for 21 epochs started.
Epoch 1/21 | Training...


Training:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 1/21, Training Loss: 0.026662551651426246, Time: 28.42 seconds
Epoch 1/21 | Validation...


Validation):   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 1/21, Validation Loss: 0.009456161875277758
Epoch 2/21 | Training...


Training:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 2/21, Training Loss: 0.009106248766288495, Time: 27.20 seconds
Epoch 2/21 | Validation...


Validation):   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 2/21, Validation Loss: 0.008340551430592313
Epoch 3/21 | Training...


Training:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 3/21, Training Loss: 0.00817587808444259, Time: 27.89 seconds
Epoch 3/21 | Validation...


Validation):   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 3/21, Validation Loss: 0.007877044525230303
Epoch 4/21 | Training...


Training:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 4/21, Training Loss: 0.007660949160606506, Time: 27.09 seconds
Epoch 4/21 | Validation...


Validation):   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 4/21, Validation Loss: 0.007489879382774234
Epoch 5/21 | Training...


Training:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 5/21, Training Loss: 0.007269490758752387, Time: 27.24 seconds
Epoch 5/21 | Validation...


Validation):   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 5/21, Validation Loss: 0.007376850946457125
Epoch 6/21 | Training...


Training:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 6/21, Training Loss: 0.00693398478357226, Time: 27.38 seconds
Epoch 6/21 | Validation...


Validation):   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 6/21, Validation Loss: 0.007311321154702455
Epoch 7/21 | Training...


Training:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 7/21, Training Loss: 0.006646078930163149, Time: 27.62 seconds
Epoch 7/21 | Validation...


Validation):   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 7/21, Validation Loss: 0.007136419095331803
Epoch 8/21 | Training...


Training:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 8/21, Training Loss: 0.006400505726989568, Time: 27.48 seconds
Epoch 8/21 | Validation...


Validation):   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 8/21, Validation Loss: 0.007112213686923496
Epoch 9/21 | Training...


Training:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 9/21, Training Loss: 0.006150297527567724, Time: 28.07 seconds
Epoch 9/21 | Validation...


Validation):   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 9/21, Validation Loss: 0.007141460970160551
Epoch 10/21 | Training...


Training:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 10/21, Training Loss: 0.005916293519496751, Time: 27.40 seconds
Epoch 10/21 | Validation...


Validation):   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 10/21, Validation Loss: 0.007156037667300552
Epoch 11/21 | Training...


Training:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 11/21, Training Loss: 0.005713557241107808, Time: 27.23 seconds
Epoch 11/21 | Validation...


Validation):   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 11/21, Validation Loss: 0.007209551476989873
Epoch 12/21 | Training...


Training:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 12/21, Training Loss: 0.005527837835042999, Time: 27.35 seconds
Epoch 12/21 | Validation...


Validation):   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 12/21, Validation Loss: 0.0072095445793820545
Epoch 13/21 | Training...


Training:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 13/21, Training Loss: 0.005351420999452305, Time: 27.25 seconds
Epoch 13/21 | Validation...


Validation):   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 13/21, Validation Loss: 0.0072499740635976195
Epoch 14/21 | Training...


Training:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 14/21, Training Loss: 0.005195136681680431, Time: 27.17 seconds
Epoch 14/21 | Validation...


Validation):   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 14/21, Validation Loss: 0.0073005020240088925
Epoch 15/21 | Training...


Training:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 15/21, Training Loss: 0.005036946312848772, Time: 27.39 seconds
Epoch 15/21 | Validation...


Validation):   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 15/21, Validation Loss: 0.007389370031887665
Epoch 16/21 | Training...


Training:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 16/21, Training Loss: 0.004901002407115832, Time: 27.26 seconds
Epoch 16/21 | Validation...


Validation):   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 16/21, Validation Loss: 0.007417170316330157
Epoch 17/21 | Training...


Training:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 17/21, Training Loss: 0.00478205628421032, Time: 27.41 seconds
Epoch 17/21 | Validation...


Validation):   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 17/21, Validation Loss: 0.007436480198521167
Epoch 18/21 | Training...


Training:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 18/21, Training Loss: 0.004660098447223727, Time: 27.65 seconds
Epoch 18/21 | Validation...


Validation):   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 18/21, Validation Loss: 0.0075590949127217755
Epoch 19/21 | Training...


Training:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 19/21, Training Loss: 0.00456693164711253, Time: 27.60 seconds
Epoch 19/21 | Validation...


Validation):   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 19/21, Validation Loss: 0.00756289427226875
Epoch 20/21 | Training...


Training:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 20/21, Training Loss: 0.004480961460379486, Time: 27.47 seconds
Epoch 20/21 | Validation...


Validation):   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 20/21, Validation Loss: 0.007619827301823534
Epoch 21/21 | Training...


Training:   0%|          | 0/178 [00:00<?, ?it/s]

Epoch 21/21, Training Loss: 0.004410194855685649, Time: 27.44 seconds
Epoch 21/21 | Validation...


Validation):   0%|          | 0/32 [00:00<?, ?it/s]

Epoch 21/21, Validation Loss: 0.0076485860627144575


In [9]:
num_species = 11255
batch_size = 64
device = 'cuda'
def remove_module_prefix(state_dict):
    new_state_dict = {}
    for k, v in state_dict.items():
        if k.startswith('module.'):
            new_state_dict[k[7:]] = v
        else:
            new_state_dict[k] = v
    return new_state_dict

# Charger l'état du modèle entraîné
model = ModifiedResNet18(num_species).to(device)
state_dict = torch.load("/home/dakbarin/data/models/resnet18_with_bioclimatic_cubes_epoch_4_fine-tuned.pth")
state_dict = remove_module_prefix(state_dict)
model.load_state_dict(state_dict)

<All keys matched successfully>

In [10]:
model = ModifiedResNet18(num_species).to(device)
state_dict = torch.load("/home/dakbarin/data/models/resnet18_with_bioclimatic_cubes_epoch_4_fine-tuned.pth")
model.load_state_dict(state_dict)

def test_model(model, test_loader, device):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for data in test_loader:
            inputs, labels, _ = data
            inputs, labels = inputs.to(device), labels.to(device)  # Transférer les données sur le même dispositif
            outputs = model(inputs)
            preds = torch.sigmoid(outputs)  # Utiliser la fonction sigmoid pour obtenir des probabilités
            
            # Convertir les prédictions et les labels en valeurs binaires
            preds_binary = (preds > 0.6).int()
            
            all_preds.append(preds_binary.cpu().numpy())
            all_labels.append(labels.cpu().numpy())
    
    return np.concatenate(all_preds), np.concatenate(all_labels)

# Tester le modèle
preds, labels = test_model(model, val_loader, device)

preds = preds.astype(int)
labels = labels.astype(int)

# Calculer le F1 score micro
f1_micro = f1_score(labels, preds, average='micro')
print(f"F1 Score Micro: {f1_micro}")

F1 Score Micro: 0.0


In [11]:
class TestDataset(TrainDataset):
    def __init__(self, data_dir, metadata, subset, transform=None):
        self.subset = subset
        self.transform = transform
        self.data_dir = data_dir
        self.metadata = metadata
        
    def __getitem__(self, idx):
        
        survey_id = self.metadata.surveyId[idx]
        sample = torch.nan_to_num(torch.load(os.path.join(self.data_dir, f"GLC24-PA-{self.subset}-landsat_time_series_{survey_id}_cube.pt")))

        if isinstance(sample, torch.Tensor):
            sample = sample.permute(1, 2, 0)  # Change tensor shape from (C, H, W) to (H, W, C)
            sample = sample.numpy()

        if self.transform:
            sample = self.transform(sample)

        return sample, survey_id
    
# Load Test metadata
test_data_path = "/home/dakbarin/data/data/GEOLIFECLEF/GLC24-PA-test-landsat_time_series"
test_metadata_path = "/home/dakbarin/data/data/GEOLIFECLEF/GLC24-PA-metadata-test.csv"
test_metadata = pd.read_csv(test_metadata_path, delimiter =';')
test_dataset = TestDataset(test_data_path, test_metadata, subset="test", transform=transform)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

In [124]:
from tqdm.notebook import tqdm
import torch

with torch.no_grad():
    all_preds = []
    surveys = []
    for data, surveyID in tqdm(test_loader, total=len(test_loader)):
        data = data.to(device)
        predictions = model(data)
        
        preds = torch.sigmoid(predictions)  # Appliquer sigmoid avant la conversion en numpy
        preds_binary = (preds > 0.5).int()
        
        all_preds.append(preds_binary.cpu().numpy())
        surveys.extend(surveyID.cpu().numpy())

  0%|          | 0/148 [00:00<?, ?it/s]

In [12]:
with torch.no_grad():
    all_predictions = []
    surveys = []
    top_k_indices = None
    for data, surveyID in tqdm(test_loader, total=len(test_loader)):

        data = data.to(device)
        predictions = model(data)
        predictions = torch.sigmoid(predictions).cpu().numpy()

        # Sellect top-25 values as predictions
        top_25 = np.argsort(-predictions, axis=1)[:, :20] 
        if top_k_indices is None:
            top_k_indices = top_25
        else:
            top_k_indices = np.concatenate((top_k_indices, top_25), axis=0)

        surveys.extend(surveyID.cpu().numpy())

  0%|          | 0/74 [00:00<?, ?it/s]

In [13]:
data_concatenated = [' '.join(map(str, row)) for row in top_k_indices]

pd.DataFrame(
    {'surveyId': surveys,
     'predictions': data_concatenated,
    }).to_csv("submission.csv", index = False)