<a href="https://colab.research.google.com/github/ElCald/CIFAR10/blob/main/projet_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Projet LSTM
Eliot Calderon y Mora & Clément Jourd'heuil

## Dataset

In [1]:
# imports
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [2]:
# Load the Atlantic Hurricane Tracks dataset (1851-2017).
url_dataset = "https://raw.githubusercontent.com/ElCald/CIFAR10/refs/heads/main/Projet/Atlantic%20Hurricane%20Tracks%201851%20to%202017.csv"
df = pd.read_csv(url_dataset)

# Keep only the observations that have both coordinates, stored as floats.
df = df.dropna(subset=['Observation_Latitude', 'Observation_Longitude'])
df['Observation_Latitude'] = df['Observation_Latitude'].astype(float)
df['Observation_Longitude'] = df['Observation_Longitude'].astype(float)

print(df.head())
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit an extra "None" line).
df.info()

   index                 Date  Storm_ID  Observation_Latitude  \
0     16  1851-08-16 00:00:00  AL041851                  13.4   
1     17  1851-08-16 06:00:00  AL041851                  13.7   
2     18  1851-08-16 12:00:00  AL041851                  14.0   
3     19  1851-08-16 18:00:00  AL041851                  14.4   
4     20  1851-08-17 00:00:00  AL041851                  14.9   

   Observation_Longitude  
0                  -48.0  
1                  -49.5  
2                  -51.0  
3                  -52.8  
4                  -54.6  
<class 'pandas.core.frame.DataFrame'>
Index: 45511 entries, 0 to 45563
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   index                  45511 non-null  int64  
 1   Date                   45511 non-null  object 
 2   Storm_ID               45511 non-null  object 
 3   Observation_Latitude   45511 non-null  float64
 4   Observation_Longitude  4551

In [3]:
# Sanity checks: count of missing values per column, followed by summary
# statistics of the two coordinate columns.
missing_per_column = df.isna().sum()
print(missing_per_column)

coordinate_stats = df[['Observation_Latitude', 'Observation_Longitude']].describe()
print(coordinate_stats)

index                    0
Date                     0
Storm_ID                 0
Observation_Latitude     0
Observation_Longitude    0
dtype: int64
       Observation_Latitude  Observation_Longitude
count          45511.000000           45511.000000
mean              27.086342             -65.161328
std               10.110653              19.281995
min                7.200000            -109.500000
25%               19.000000             -80.400000
50%               26.400000             -67.400000
75%               33.300000             -52.000000
max               75.500000               0.000000


## Fonctions

In [4]:
def create_sequences(df, seq_len=15, pred_step=3):
    """
    Build sliding-window samples, one storm track at a time.

    The latitude/longitude columns are standardised (zero mean, unit
    variance) with statistics computed once over every row of ``df``. Each
    storm (rows sharing a ``Storm_ID``, ordered by ``Date``) is then cut into
    windows. A window never spans two storms, and ``df`` itself is not
    modified, so the function can be called repeatedly on the same frame.

    Args:
        df: DataFrame with the columns 'Storm_ID', 'Date',
            'Observation_Latitude' and 'Observation_Longitude'.
        seq_len: number of consecutive observations in each input window.
        pred_step: how many observations after the end of the window the
            target lies (1 = the observation right after the window).

    Returns:
        X: float32 tensor of shape (n_samples, seq_len, 2).
        Y: float32 tensor of shape (n_samples, 2).
        Both are empty (n_samples == 0) when no storm is long enough.

    Raises:
        ValueError: if ``seq_len`` or ``pred_step`` is smaller than 1.
    """
    if seq_len < 1 or pred_step < 1:
        raise ValueError("seq_len and pred_step must be positive integers")

    coord_cols = ['Observation_Latitude', 'Observation_Longitude']
    sequences = []
    targets = []

    if len(df) > 0:
        # Global scaling statistics, computed once. np.std uses the population
        # formula (ddof=0); this is meant to mirror what StandardScaler does.
        all_coords = df[coord_cols].to_numpy(dtype=np.float64)
        mean = all_coords.mean(axis=0)
        std = all_coords.std(axis=0)
        std[std == 0] = 1.0  # constant column: avoid dividing by zero

        for _, group in df.groupby('Storm_ID'):
            group = group.sort_values('Date')
            # Only this storm's observations, scaled without touching df.
            coords = (group[coord_cols].to_numpy(dtype=np.float64) - mean) / std

            # Sliding window; storms shorter than seq_len + pred_step yield nothing.
            for i in range(len(coords) - seq_len - pred_step + 1):
                sequences.append(coords[i:i + seq_len])
                targets.append(coords[i + seq_len + pred_step - 1])  # observation at t + pred_step

    if not sequences:
        # Keep the documented rank even when there is no sample at all.
        return (
            torch.empty((0, seq_len, len(coord_cols)), dtype=torch.float32),
            torch.empty((0, len(coord_cols)), dtype=torch.float32),
        )

    X = torch.tensor(np.array(sequences), dtype=torch.float32)
    Y = torch.tensor(np.array(targets), dtype=torch.float32)
    return X, Y



In [5]:
class HurricaneDataset(Dataset):
    """Map-style dataset that pairs each input sequence with its target.

    ``X`` and ``Y`` are stored as given and indexed in parallel: sample
    ``idx`` is the tuple ``(X[idx], Y[idx])``.
    """

    def __init__(self, X, Y):
        self.X, self.Y = X, Y

    def __getitem__(self, idx):
        features = self.X[idx]
        target = self.Y[idx]
        return features, target

    def __len__(self):
        # The number of samples is taken from the inputs only.
        return len(self.X)


In [6]:
def create_dataloaders(X, Y, batch_size=32):
    """
    Split the samples 70% train / 15% validation / 15% test and wrap each
    part in a DataLoader.

    Both splits go through ``train_test_split`` with ``random_state=42``.
    Only the training loader shuffles its batches.

    Returns:
        (train_loader, val_loader, test_loader)
    """
    # First set 30% aside, then cut that hold-out in half (validation / test).
    X_train, X_holdout, Y_train, Y_holdout = train_test_split(
        X, Y, test_size=0.3, random_state=42
    )
    X_val, X_test, Y_val, Y_test = train_test_split(
        X_holdout, Y_holdout, test_size=0.5, random_state=42
    )

    def make_loader(inputs, labels, shuffle):
        return DataLoader(
            HurricaneDataset(inputs, labels), batch_size=batch_size, shuffle=shuffle
        )

    return (
        make_loader(X_train, Y_train, True),
        make_loader(X_val, Y_val, False),
        make_loader(X_test, Y_test, False),
    )


In [7]:
class LSTMModelSimple(nn.Module):
    """Stacked LSTM followed by a linear layer applied to the last time step.

    Args:
        input_size: number of features per time step.
        hidden_size: width of the LSTM hidden state; also the input width of
            the final linear layer, so the two always agree.
        output_size: number of values predicted per sequence.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability between LSTM layers. Forced to 0 when
            ``num_layers`` is 1, since there is no inter-layer connection.
    """

    def __init__(self, input_size=1, hidden_size=50, output_size=1, num_layers=2, dropout=0.2):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
        )
        # Kept so existing code that accesses `model.relu` still works;
        # forward() does not apply it.
        self.relu = nn.ReLU()
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a batch-first input (batch, seq, input_size) to (batch, output_size)."""
        out, _ = self.lstm(x)
        out = out[:, -1, :]  # output of the top layer at the last time step
        return self.fc(out)


## Entrainement

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def evaluate(model, loader, criterion):
    """Return the sample-weighted mean of `criterion` for `model` over `loader`.

    Puts the model in eval mode and runs without gradients. Returns NaN when
    the loader yields no sample.
    """
    model.eval()
    total_loss = 0.0
    n_samples = 0
    with torch.no_grad():
        for X_batch, Y_batch in loader:
            X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)
            batch_size = X_batch.size(0)
            # Weight by batch size so a smaller last batch does not skew the mean.
            total_loss += criterion(model(X_batch), Y_batch).item() * batch_size
            n_samples += batch_size
    return total_loss / n_samples if n_samples else float('nan')


def train(model, train_loader, val_loader, epochs=20, criterion=None, optimizer=None, verbose=True):
    """Train `model` and return the per-epoch loss history.

    Args:
        model: network to optimise; expected to already live on `device`.
        train_loader: yields (inputs, targets) batches used for the updates.
        val_loader: yields validation batches, or None to skip validation.
        epochs: number of passes over `train_loader`.
        criterion: loss function; defaults to `nn.MSELoss()`.
        optimizer: optimiser bound to `model`'s parameters; defaults to
            Adam with lr=0.001.
        verbose: when True, print one line per epoch.

    Returns:
        A list with one dict per epoch: {'train_loss': float,
        'val_loss': float or None}.
    """
    # Explicit arguments instead of module-level globals, so the function
    # also works when nothing else has been defined yet.
    if criterion is None:
        criterion = nn.MSELoss()
    if optimizer is None:
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    history = []
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        n_samples = 0
        for X_batch, Y_batch in train_loader:
            X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)
            optimizer.zero_grad()
            output = model(X_batch)
            loss = criterion(output, Y_batch)
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * X_batch.size(0)
            n_samples += X_batch.size(0)

        # Mean over the whole epoch rather than the loss of the last batch.
        train_loss = running_loss / n_samples if n_samples else float('nan')
        val_loss = evaluate(model, val_loader, criterion) if val_loader is not None else None
        history.append({'train_loss': train_loss, 'val_loss': val_loss})

        if verbose:
            message = f"Epoch {epoch+1}/{epochs}, Loss: {train_loss:.4f}"
            if val_loss is not None:
                message += f", Val Loss: {val_loss:.4f}"
            print(message)
    return history


seq_lengths = [5, 10, 15, 20]  # candidate window lengths to compare
pred_step = 3

results = {}


for seq_len in seq_lengths:
    print(f"Testing seq_len = {seq_len}")

    # Same seed for every candidate, so weight initialisation and batch order
    # do not differ between runs.
    torch.manual_seed(42)

    # Build the windows
    X, Y = create_sequences(df, seq_len=seq_len, pred_step=pred_step)

    # Build the DataLoaders
    train_loader, val_loader, _ = create_dataloaders(X, Y, batch_size=32)

    # Fresh model for this window length
    model = LSTMModelSimple(input_size=2, hidden_size=50, output_size=2).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # Short training run; validation is done once afterwards, not every epoch.
    train(model, train_loader, None, epochs=10,
          criterion=criterion, optimizer=optimizer, verbose=False)

    # Score on the validation split
    val_loss = evaluate(model, val_loader, criterion)
    results[seq_len] = val_loss
    print(f"Validation MSE for seq_len={seq_len}: {val_loss:.4f}")

print("Résultats :", results)


Testing seq_len = 5


## Evaluation

In [None]:
# `model` is the network left over from the last iteration of the search loop,
# so it is evaluated with the window length of that iteration (`seq_len`).
# create_dataloaders splits with a fixed random_state, so rebuilding the
# loaders from the same windows is meant to give back the test split that was
# held out during training, rather than scoring on every window (which would
# include the training samples).
X_all, Y_all = create_sequences(df, seq_len=seq_len, pred_step=pred_step)
_, _, test_loader = create_dataloaders(X_all, Y_all, batch_size=32)
test_dataset = test_loader.dataset
X_test, Y_test = test_dataset.X, test_dataset.Y

model.eval()
test_loss = 0
with torch.no_grad():
    for X_batch, Y_batch in test_loader:
        X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)
        output = model(X_batch)
        loss = criterion(output, Y_batch)
        # Weight by batch size so the final division gives a per-sample mean.
        test_loss += loss.item() * X_batch.size(0)

test_loss /= len(test_loader.dataset)
print(f"Test MSE: {test_loss:.4f}")
