In [57]:
import torch
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
import pandas as pd
import numpy as np
from dataclasses import dataclass

In [44]:
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
import pandas as pd
import numpy as np
from dataclasses import dataclass
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
# Function to pad an array to a specific shape
def to_shape(a, shape):
    # Unpack the target shape
    y_, x_ = shape

    # Get the current shape of the array
    y, x = a.shape

    # Calculate the padding needed in the y and x directions
    y_pad = y_ - y
    x_pad = x_ - x
    output = np.zeros()
    # Pad the array using numpy's pad function
    return np.pad(
        a,
        [(0, 1), (0, 1)],
        # Calculate the padding for each dimension
        #((y_pad // 2, y_pad // 2 + y_pad % 2), (x_pad // 2, x_pad // 2 + x_pad % 2)),
        mode="constant",
    )


# Function to pad data and labels to a specific shape
def apply_padding(data_df, N, T_max):
    # Define the final shape of the data and labels
    final_shape = (N, T_max, 3)

    # Initialize the final data and labels with zeros
    final_data = np.zeros(final_shape)
    final_label = np.zeros((N, T_max, 3))

    # Select a random subset of trajectory indices
    if len(data_df["traj_idx"].unique()) < N:
        selected_ids = np.random.choice(
            data_df["traj_idx"].unique(), size=N, replace=True
        )
    else:
        selected_ids = np.random.choice(
            data_df["traj_idx"].unique(), size=N, replace=False
        )

    # Iterate over the selected trajectory indices
    for n, id in enumerate(selected_ids):
        # Filter the data for the current trajectory index
        exp = data_df[data_df["traj_idx"] == id]
        
        # Extract the data and labels for the current trajectory
        data = exp[["frame", "x", "y"]].to_numpy()
        data[:,0] = data[:,0] - data[0,0] + 1 #putting first frame rate to 1
        data[:,1] = data[:,1] - data[0,1] #putting initial position to 0
        data[:,2] = data[:,2] - data[0,2] #putting initital position to 0        # print(exp["frame"])
        label = exp[["alpha", "D", "state"]].to_numpy()
        ## adding one to the states
        label[:,2] = label[:,2] + 1
        # If the data is longer than T_max, truncate it
        if data.shape[0] > T_max:
            final_data[n, :, :] = data[:T_max, :]
            final_label[n, :, :] = label[:T_max, :]

        # Otherwise, pad the data to T_max
        else:
            # print((label.shape, T_max))
            final_data[n, :data.shape[0], :] = data
            final_label[n, :data.shape[0], :] = label

    # Return the padded data and labels
    return final_data, final_label


# Define a function to normalize data
def normalize_df(data):
    # Calculate displacement in x and y directions
    # Normalize by substring mean and dividing by variance.

    displacement_x = []
    displacement_y = []
    for _, group in data.groupby("traj_idx"):
        x = np.asarray(group["x"])
        y = np.asarray(group["y"])
        d_x = x[1:] - x[:-1]
        d_y = y[1:] - y[:-1]
        displacement_x = displacement_x + list(d_x)
        displacement_y = displacement_y + list(d_y)

    # Calculate variance in x and y directions
    variance_x = np.sqrt(np.std(displacement_x))
    variance_y = np.sqrt(np.std(displacement_y))

    # Normalize data
    data.loc[:, "x"] = (data["x"] - data["x"].mean()) / variance_x
    data.loc[:, "y"] = (data["y"] - data["y"].mean()) / variance_y


def normalize_np(data):

    displacement_x = []
    displacement_y = []
    for n in range(data.shape[0]):
        x = data[n, :, 1]
        y = data[n, :, 2]
        d_x = x[1:] - x[:-1]
        d_y = y[1:] - y[:-1]
        displacement_x = displacement_x + list(d_x)
        displacement_y = displacement_y + list(d_y)

    # Calculate variance in x and y directions
    variance_x = np.sqrt(np.std(displacement_x))
    variance_y = np.sqrt(np.std(displacement_y))

    # Normalize data

    data[:, :, 1] = (data[:, :, 1] - np.mean(data[:, :, 1])) / variance_x
    data[:, :, 2] = (data[:, :, 2] - np.mean(data[:, :, 2])) / variance_x

    return data


# Define a function to list directory tree with pathlib
def list_directory_tree_with_pathlib(starting_directory):
    path_object = Path(starting_directory)
    folders = []
    for file_path in path_object.rglob("*.csv"):
        folders.append(file_path)
    return folders


# Define a custom dataset class for all data
@dataclass
class Dataset_all_data(Dataset):
    # Initialize filenames and transform flag
    # Pad value should be a tuple such as (N, Tmax)
    filenames: list
    transform: bool = False
    pad: None | tuple = None
    noise: bool = False

    def __len__(self):
        # Return the number of files
        return len(self.filenames)

    def __getitem__(self, idx):
        # Read csv file and extract data and label
        df = pd.read_csv(self.filenames[idx])

        if self.pad is None:
            data = df[["traj_idx", "frame", "x", "y"]]
            label = np.asarray(df[["alpha", "D"]])
            label_2 = np.asarray(df["state"])

        else:
            if len(self.pad) != 2:
                raise ValueError("pad value should be set as (N, T_max)")
            data, label = apply_padding(df, *self.pad)
            data = data[:,:,1:] ## Removing the frame column
            label_2 = label[:, :, -1]
            label_2[label_2[:, :] > 0] = label_2[label_2[:, :] > 0] 
            label = label[:, :, :-1]

        # Normalize data if transform flag is True
        if self.transform:
            if self.pad is None:
                normalize_df(data)
                data = np.asarray(data)
            else:
                data = normalize_np(data)

        if self.noise:
            data = add_noise(data)
        
        # Normalize D between 0 and 1

        # label[:,:,1][label[:,:,1] != 0] = np.log(label[:,:,1][label[:,:,1] != 0]) #- np.log(1e-6)) #/   (np.log(1e12) - np.log(1e-6))
        # label = label[:,:,1]
        label_regression = np.zeros((label.shape[0], 2))

        # print(np.unique(label_2))
        
        for i in range(label.shape[0]):
            K = np.unique(label[i,:,1][label[i,:,1] != 0])
            if  len(K) == 2:
                label_regression[i,:] = K

                if label[i,0,1] != label_regression[i,0]:
                    label_regression[i,:] = label_regression[i,::-1]

            

            elif len(K) == 1:
                states = label_2[i,:]
                if 1 in states:
                    # print(np.unique(states))
                    if states[0] == 1:
                        label_regression[i,:] = [0, K[0]]
                    else:
                        label_regression[i,:] = [K[0], 0]
                    
                    # print(label_regression[i,:])

                else:
                    label_regression[i,:] = [K[0],K[0]] 

            else:
                if  np.unique(label[i,:,1]) == 0:
                    label_regression[i,:] = [0,0]
                else :

                    # print(np.unique(label[i,:,1]))

                    # print(Ds)
                    raise Exception("more than 2 diffusions")


        label_segmentation = np.zeros((label_2.shape[0], label_2.shape[1]))


        for i in range(label.shape[0]):
            if label_regression[i,0] == label_regression[i,1]:
                position = label[i,:,1] == label_regression[i,0]
                label_segmentation[i,position] = 1
            else:

                
                position_1 = label[i,:,1] == label_regression[i,0]
                position_2 = label[i,:,1] == label_regression[i,1]

                

                label_segmentation[i,position_1] = 1
                label_segmentation[i,position_2] = 2

                


        return torch.from_numpy(data.astype(np.float32)), torch.from_numpy(label_segmentation.astype(np.float32))
            # torch.from_numpy(label_2.astype(np.float32)),
        
    
def add_noise(data):
    noise_amplitude = np.random.choice([0.01, 0.1,])
    noise = np.random.normal(0, noise_amplitude, data[:,:,:].shape)
    data[:,:,:][data[:,:,1:] != 0] = data[:,:,:][data[:,:,1:] != 0] + data[:,:,:][data[:,:,1:] != 0]*noise
    return  data

In [45]:
all_data_set = list_directory_tree_with_pathlib(
    r"/home/m.lavaud/Documents/dataset",)
np.random.shuffle(all_data_set)

In [46]:
training_dataset = Dataset_all_data(
        all_data_set[:2000], transform=False, pad=(20, 200)
    )

In [47]:
test = iter(training_dataset)

In [74]:
a = next(test)[1]


tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.]])

In [None]:
]

In [72]:
for i in range(100):
    a = next(test)[1]
    assert np.sum(a[:,0]) == 20


TypeError: sum() received an invalid combination of arguments - got (out=NoneType, axis=NoneType, ), but expected one of:
 * (*, torch.dtype dtype)
      didn't match because some of the keywords were incorrect: out, axis
 * (tuple of ints dim, bool keepdim, *, torch.dtype dtype)
 * (tuple of names dim, bool keepdim, *, torch.dtype dtype)


In [21]:
dataloader = DataLoader(training_dataset, shuffle=True, batch_size=10, num_workers=0)


ValueError: num_samples should be a positive integer value, but got num_samples=0

In [63]:
from einops import rearrange

In [64]:
from mamba_ssm import Mamba

In [169]:
class alpha_regression(nn.Module):
    def __init__(self, d_model,d_state,  d_conv, expand,dropout = 0.2, device="cuda"):
        super().__init__()
        self.device = device

        self.mamba = Mamba(d_model = d_model, d_state=d_state, d_conv=d_conv, expand=expand).to(device)
        self.dropout = nn.Dropout(p=dropout).to(device)
        self.fc = nn.Linear(in_features = 200*d_model, out_features=2).to(device)
        self.softplus = nn.Softplus()

    def forward(self, input):

        mamba_out = self.mamba(input)
        mamba_out = rearrange(mamba_out, "b l c -> b (l c)")
        mamba_out = self.dropout(mamba_out)
        out = self.fc(mamba_out)

        return self.softplus(out)
    


In [170]:
model = alpha_regression(d_model=2, d_state=1, d_conv=3, expand=2)

In [174]:
class MSLELoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()
        
        
    def forward(self, pred, actual):
        return (self.mse(torch.log(pred + 1), torch.log(actual + 1)))


regression_criterion = MSLELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)



In [175]:
max_epoch = 10
total_running_loss = []
for epoch in range(max_epoch):
    running_regresssion_loss = []
    with tqdm(dataloader, unit="batch") as tepoch:
        model.train()

        for inputs, regression_targets in tepoch:

            tepoch.set_description(f"Epoch {epoch}")

            inputs = inputs.to("cuda")
            inputs = torch.flatten(inputs, start_dim=0, end_dim=1)

            regression_targets = torch.flatten(
                regression_targets, start_dim=0, end_dim=1,
            ).to("cuda")

            optimizer.zero_grad()

            regression_output  = model(inputs)
            regression_output = torch.squeeze(regression_output)
            
            regression_loss = regression_criterion(
                regression_output, regression_targets
            )
            # stop
            regression_loss.backward()
            optimizer.step()

            tepoch.set_postfix(
                regression_loss=regression_loss.item(),
            )

            running_regresssion_loss.append(regression_loss.item())
        total_running_loss.append(np.mean(running_regresssion_loss))

Epoch 0: 100%|██████████| 100/100 [01:09<00:00,  1.44batch/s, regression_loss=3.72]
Epoch 1: 100%|██████████| 100/100 [01:04<00:00,  1.55batch/s, regression_loss=0.555]
Epoch 2: 100%|██████████| 100/100 [01:12<00:00,  1.38batch/s, regression_loss=0.489]
Epoch 3: 100%|██████████| 100/100 [01:22<00:00,  1.22batch/s, regression_loss=0.53]
Epoch 4: 100%|██████████| 100/100 [01:21<00:00,  1.22batch/s, regression_loss=1.35]
Epoch 5: 100%|██████████| 100/100 [01:19<00:00,  1.26batch/s, regression_loss=2.64]
Epoch 6: 100%|██████████| 100/100 [01:18<00:00,  1.28batch/s, regression_loss=1.61]
Epoch 7: 100%|██████████| 100/100 [01:18<00:00,  1.28batch/s, regression_loss=0.154]
Epoch 8: 100%|██████████| 100/100 [01:18<00:00,  1.28batch/s, regression_loss=0.958]
Epoch 9: 100%|██████████| 100/100 [01:19<00:00,  1.26batch/s, regression_loss=0.334]


In [176]:
total_running_loss

[1.286618869304657,
 1.236090176999569,
 1.2193493622541427,
 1.1904808430373668,
 1.188451837003231,
 1.1836995378136634,
 1.1599791413545608,
 1.1390000320971012,
 1.1240574173629283,
 1.1385772958397866]