In [1]:
import os
import torch
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import torchvision.models as models
import torchvision.transforms as transforms
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim.lr_scheduler import CosineAnnealingLR
from sklearn.metrics import f1_score
import torch.nn.functional as F
from torch import optim

In [2]:
num_species = 11255 # Nombre de toutes les classes uniques dans les données PO et PA.
num_classes = 5
seed = 42

class TrainDataset(Dataset):
    def __init__(self, data_dir, metadata, subset, transform=None):
        self.subset = subset  # Type de sous-ensemble (ex: train, val, test)
        self.transform = transform  # Transformation à appliquer aux échantillons
        self.data_dir = data_dir  # Répertoire contenant les fichiers de données
        self.metadata = metadata  # Données de métadonnées
        # Supprimer les lignes sans speciesId et réinitialiser les index
        self.metadata = self.metadata.dropna(subset=["speciesId"]).reset_index(drop=True)
        self.metadata['speciesId'] = self.metadata['speciesId'].astype(int)  # Convertir speciesId en entier
        
        # Colonnes des labels de présence
        self.label_columns = ['absence', 'presence_a_2_digit', 
                              'presence_a_3_digit', 'presence_a_4_digit', 'presence_seule']
        
        # Créer un dictionnaire des labels par surveyId
        self.label_dict = self.metadata.groupby('surveyId', group_keys=False).apply(
            lambda x: x.set_index('speciesId')[self.label_columns].to_dict(orient='index')
        ).to_dict()
        
        # Supprimer les doublons de surveyId et réinitialiser les index
        self.metadata = self.metadata.drop_duplicates(subset="surveyId").reset_index(drop=True)

    def __len__(self):
        # Retourne le nombre de surveyId uniques
        return len(self.metadata)

    def __getitem__(self, idx):
        survey_id = self.metadata.surveyId[idx]
        sample = torch.nan_to_num(torch.load(os.path.join(self.data_dir, f"GLC24-PA-{self.subset}-landsat-time-series_{survey_id}_cube.pt")))
        labels = self.label_dict.get(survey_id, {})
        label = torch.zeros((num_species, len(self.label_columns)))  
        label[:,0] = 1
        # Remplir le tenseur de labels avec les données de présence
        for species_id, presence_data in labels.items():
            if species_id < num_species:  # S'assurer que species_id est dans la plage valide
                label[species_id] = torch.tensor(list(presence_data.values()), dtype=torch.float32)

        # S'assurer que l'échantillon est au bon format pour la transformation
        if isinstance(sample, torch.Tensor):
            # Changer la forme du tenseur de (C, H, W) à (H, W, C)
            sample = sample.permute(1, 2, 0)  
            sample = sample.numpy()  

        # Appliquer la transformation si elle est définie
        if self.transform:
            sample = self.transform(sample)

        # Retourner l'échantillon, les labels et le surveyId
        return sample, label, survey_id


In [3]:
# Dataset and DataLoader
batch_size = 64
transform = transforms.Compose([
    transforms.ToTensor()
])

#Load Training metadata
train_data_path = "/home/dakbarin/data/data/GEOLIFECLEF/GLC24-PA-train-landsat-time-series"
train_metadata_path = "/home/dakbarin/data/data/GEOLIFECLEF/metadata_with_classes.csv"
train_metadata = pd.read_csv(train_metadata_path)
train_dataset = TrainDataset(train_data_path, train_metadata, subset="train", transform=transform)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)

test_data_path = "/home/dakbarin/data/data/GEOLIFECLEF/GLC24-PA-train-landsat-time-series"
test_metadata_path = "/home/dakbarin/data/data/GEOLIFECLEF/metadata_with_classes_test.csv"
test_metadata = pd.read_csv(test_metadata_path)
test_dataset = TrainDataset(test_data_path, test_metadata, subset="train", transform=transform)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, num_workers=4)



In [4]:
class ModifiedResNet18(nn.Module):
    def __init__(self, num_species):
        super(ModifiedResNet18, self).__init__()
        self.norm_input = nn.LayerNorm([6, 4, 21])
        self.resnet18 = models.resnet18(weights=None)
        self.resnet18.conv1 = nn.Conv2d(6, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.resnet18.maxpool = nn.Identity()
        self.ln = nn.LayerNorm(1000)
        self.fc0 = nn.Linear(1000, 2056)
        self.fc1 = nn.Linear(2056, num_species*num_classes)
        

    def forward(self, x):
        x = self.norm_input(x)
        x = self.resnet18(x)
        x = self.ln(x)
        x = self.fc0(x)
        x = self.fc1(x)
        x = x.view(-1, num_species, num_classes)
        
        return x

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ModifiedResNet18(num_species).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.005)
criterion = nn.CrossEntropyLoss()
scheduler = CosineAnnealingLR(optimizer, T_max=25, verbose=True)



In [None]:
num_epochs = 28
for epoch in tqdm(range(num_epochs), desc="Epochs"):
    model.train()
    running_loss = 0.0
    for data, targets, _ in tqdm(train_loader, desc="Training", leave=False):
        data = data.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        out = model(data)
        
        # Convertir les targets en indices de classe
        target_indices = torch.argmax(targets, dim=2)
        
        # Calculer la perte pour chaque espèce individuellement
        loss = 0.0
        for i in range(num_species):
            loss += criterion(out[:, i, :], target_indices[:, i])

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    scheduler.step()
    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {running_loss/len(train_loader)}")

    # Validation phase
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for data, targets, _ in tqdm(test_loader, desc="Validation", leave=False):
            data = data.to(device)
            targets = targets.to(device)

            out = model(data)
            
            target_indices = torch.argmax(targets, dim=2)

            # Calculer la perte pour chaque espèce individuellement
            loss = 0.0
            for i in range(num_species):
                loss += criterion(out[:, i, :], target_indices[:, i])
            
            val_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs}, Validation Loss: {val_loss/len(test_loader)}")

# Save the trained model
torch.save(model.state_dict(), "resnet18_with_bioclimatic_cubes2.pth")

Epochs:   0%|          | 0/28 [00:00<?, ?it/s]

Training:   0%|          | 0/189 [00:00<?, ?it/s]

Epoch 1/28, Training Loss: 8690.983835371713


Validation:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1/28, Validation Loss: 5566.052783203125


Training:   0%|          | 0/189 [00:00<?, ?it/s]

Epoch 2/28, Training Loss: 3751.296443394252


Validation:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 2/28, Validation Loss: 236.05105743408203


Training:   0%|          | 0/189 [00:00<?, ?it/s]

In [12]:
num_species = 11255
batch_size = 64
device = 'cuda'
def remove_module_prefix(state_dict):
    new_state_dict = {}
    for k, v in state_dict.items():
        if k.startswith('module.'):
            new_state_dict[k[7:]] = v
        else:
            new_state_dict[k] = v
    return new_state_dict

# Charger l'état du modèle entraîné
model = ModifiedResNet18(num_species).to(device)
state_dict = torch.load("/home/dakbarin/data/models/resnet18_with_bioclimatic_cubes_epoch_16.pth")
state_dict = remove_module_prefix(state_dict)
model.load_state_dict(state_dict)


<All keys matched successfully>

In [13]:
class TestDataset(Dataset):
    def __init__(self, data_dir, metadata, subset, transform=None):
        self.subset = subset
        self.transform = transform
        self.data_dir = data_dir
        self.metadata = metadata
        self.metadata = self.metadata.dropna(subset=["speciesId"]).reset_index(drop=True)
        self.metadata['speciesId'] = self.metadata['speciesId'].astype(int)
        self.label_dict = self.metadata.groupby('surveyId')['speciesId'].apply(list).to_dict()
        
        self.metadata = self.metadata.drop_duplicates(subset="surveyId").reset_index(drop=True)

    def __len__(self):
        return len(self.metadata)

    def __getitem__(self, idx):
        survey_id = self.metadata.surveyId[idx]
        sample = torch.nan_to_num(torch.load(os.path.join(self.data_dir, f"GLC24-PA-{self.subset}-landsat-time-series_{survey_id}_cube.pt")))

        species_ids = self.label_dict.get(survey_id, [])
        label = torch.zeros(num_species)
        for species_id in species_ids:
            label_id = species_id
            label[label_id] = 1

        if isinstance(sample, torch.Tensor):
            sample = sample.permute(1, 2, 0)
            sample = sample.numpy()

        if self.transform:
            sample = self.transform(sample)

        return sample, label, survey_id

# Définir la transformation
transform = transforms.Compose([
    transforms.ToTensor()
])

# Charger les métadonnées et les données de test
test_data_path = "/home/dakbarin/data/data/GEOLIFECLEF/GLC24-PA-train-landsat-time-series"
test_metadata_path = "/home/dakbarin/data/data/GEOLIFECLEF/GLC24_PA_metadata_train.csv"
test_metadata = pd.read_csv(test_metadata_path, delimiter=';').iloc[500000:600001]
test_dataset = TestDataset(test_data_path, test_metadata, subset="train", transform=transform)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=True, num_workers=4)


In [14]:
import torch
from torch.utils.data import DataLoader, random_split
from sklearn.metrics import f1_score
from torchvision import transforms
import pandas as pd
import numpy as np


def evaluate_model(model, data_loader, device, num_species):
    model.eval()
    all_targets = []
    all_preds = []

    with torch.no_grad():
        for data, targets, survey_id in data_loader:
            data = data.to(device)
            targets = targets.to(device).float()
            
            # Obtenir les sorties du modèle
            out = model(data)
            
            # Appliquer la fonction softmax pour obtenir des probabilités
            m = nn.Softmax(dim=2)
            probs = m(out)
            
            topk_preds = torch.topk(probs[:, :, 4], 25, dim=1).indices
            topk_labels = torch.zeros_like(probs[:, :, 4])
            topk_labels.scatter_(1, topk_preds, 1)

            all_preds.append(topk_labels.cpu().numpy())
            all_targets.append(targets.cpu().numpy())

    # Convertir les listes en tableaux numpy et les aplatir
    all_preds = np.vstack(all_preds).reshape(-1, num_species)
    all_targets = np.vstack(all_targets).reshape(-1, num_species)

    # Calculer le F1 score
    f1 = f1_score(all_targets, all_preds, average='micro')

    return f1


# Évaluer le modèle
f1 = evaluate_model(model, test_loader, device, num_species)
print(f"F1 Score: {f1}")


F1 Score: 0.019883471765074007


In [15]:
#tests Kaggle
class TestDataset(TrainDataset):
    def __init__(self, data_dir, metadata, subset, transform=None):
        self.subset = subset
        self.transform = transform
        self.data_dir = data_dir
        self.metadata = metadata
        
    def __getitem__(self, idx):
        
        survey_id = self.metadata.surveyId[idx]
        sample = torch.nan_to_num(torch.load(os.path.join(self.data_dir, f"GLC24-PA-{self.subset}-landsat_time_series_{survey_id}_cube.pt")))

        if isinstance(sample, torch.Tensor):
            sample = sample.permute(1, 2, 0)  # Change tensor shape from (C, H, W) to (H, W, C)
            sample = sample.numpy()

        if self.transform:
            sample = self.transform(sample)

        return sample, survey_id
    
# Load Test metadata
test_data_path = "/home/dakbarin/data/data/GEOLIFECLEF/GLC24-PA-test-landsat_time_series"
test_metadata_path = "/home/dakbarin/data/data/GEOLIFECLEF/GLC24-PA-metadata-test.csv"
test_metadata = pd.read_csv(test_metadata_path, delimiter =';')
test_dataset = TestDataset(test_data_path, test_metadata, subset="test", transform=transform)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=5)

In [16]:
from tqdm.notebook import tqdm
with torch.no_grad():
    all_predictions = []
    surveys = []
    top_k_indices = None
    for data, surveyID in tqdm(test_loader, total=len(test_loader)):

        data = data.to(device)
        preds = model(data)
        m = nn.Softmax(dim=2)
        probs = m(preds)

        predictions = torch.zeros(probs[:, :, 0].shape, device=device)

        # Mettez à jour predictions avec les valeurs de probs pour la classe 4
        predictions = torch.where(probs.argmax(dim=2) == 0, probs[:, :, 0], predictions)
        print(predictions[0,111])

        # Convertir predictions sur la mémoire de l'hôte pour utiliser np.argsort
        predictions_cpu = predictions.cpu().numpy()

        # Sélectionner les top-25 valeurs comme prédictions
        top_25 = np.argsort(-predictions_cpu, axis=1)[:, :25]
        if top_k_indices is None:
            top_k_indices = top_25
        else:
            top_k_indices = np.concatenate((top_k_indices, top_25), axis=0)

        surveys.extend(surveyID.cpu().numpy())


  0%|          | 0/74 [00:00<?, ?it/s]

tensor(0.9775, device='cuda:0')
tensor(0.9839, device='cuda:0')
tensor(0.9844, device='cuda:0')
tensor(0.9781, device='cuda:0')
tensor(0.9830, device='cuda:0')
tensor(0.9764, device='cuda:0')
tensor(0.9829, device='cuda:0')
tensor(0.9771, device='cuda:0')
tensor(0.9778, device='cuda:0')
tensor(0.9847, device='cuda:0')
tensor(0.9843, device='cuda:0')
tensor(0.9828, device='cuda:0')
tensor(0.9836, device='cuda:0')
tensor(0.9795, device='cuda:0')
tensor(0.9842, device='cuda:0')
tensor(0.9835, device='cuda:0')
tensor(0.9820, device='cuda:0')
tensor(0.9770, device='cuda:0')
tensor(0.9461, device='cuda:0')
tensor(0.9768, device='cuda:0')
tensor(0.9795, device='cuda:0')
tensor(0.9796, device='cuda:0')
tensor(0.9770, device='cuda:0')
tensor(0.9847, device='cuda:0')
tensor(0.9771, device='cuda:0')
tensor(0.9783, device='cuda:0')
tensor(0.9813, device='cuda:0')
tensor(0.9419, device='cuda:0')
tensor(0.9844, device='cuda:0')
tensor(0.9786, device='cuda:0')
tensor(0.9784, device='cuda:0')
tensor(0

In [17]:
data_concatenated = [' '.join(map(str, row)) for row in top_k_indices]

pd.DataFrame(
    {'surveyId': surveys,
     'predictions': data_concatenated,
    }).to_csv("submission.csv", index = False)
