In [1]:
# default_exp datasets_phenom

In [4]:
#export
from andi_datasets.models_phenom import models_phenom
from andi_datasets.datasets_phenom import *

import inspect
import numpy as np
import pandas as pd

import os
import warnings

In [2]:
import matplotlib.pyplot as plt

# Class constructor

The class is initialized by instantiating `models_phenom` and inspecting its methods to collect the available models.

In [43]:
# export
class datasets_phenom():
    ''' Generator of datasets of trajectories built from the phenomenological
    diffusion models implemented in `models_phenom`. '''

    def __init__(self):
        ''' Constructor of the class: collects the available diffusion models. '''
        self._get_models()

    @staticmethod
    def _list_model_methods(models):
        ''' Returns the (name, bound method) pairs of the object `models`, sorted by
        name, skipping special methods such as `__init__`.

        The special methods are filtered by name rather than by position: dropping
        the first entry of the list is only correct when `__init__` exists and sorts
        before every other method name. '''
        return [(name, func)
                for name, func in inspect.getmembers(models, inspect.ismethod)
                if not (name.startswith('__') and name.endswith('__'))]

    def _get_models(self):
        ''' Loads the models available in `models_phenom` and stores them in:
        :avail_models_name (list of str): names of the models.
        :avail_models_func (list): bound methods generating each model, in the same
        order as avail_models_name. '''
        available_models = self._list_model_methods(models_phenom())
        self.avail_models_name = [name for name, _ in available_models]
        self.avail_models_func = [func for _, func in available_models]

    def _get_inputs_models(self, model, get_default_values = False):
        ''' Returns the names of the input parameters of a model.
        Inputs:
        :model (str): name of the model, as stored in avail_models_name.
        :get_default_values (bool): if True, also return the default values.
        Returns:
        :params (list of str): parameter names (without `self`).
        :defaults (tuple or None): only if get_default_values is True. Default
        values as given by `inspect.getfullargspec`. '''
        model_f = self.avail_models_func[self.avail_models_name.index(model)]
        spec = inspect.getfullargspec(model_f)
        # For bound methods the first reported argument is `self`
        params = spec.args[1:]
        if get_default_values:
            return params, spec.defaults
        return params

    def _get_states(self):
        ''' Definition of the possible states found in the ANDI 2022 challenge and their
        assigned label:
        0: immobile; 1: confined; 2: brownian; 3: anomalous '''

        self._states = ['immobile', 'confined', 'brownian', 'anomalous']
        

# `create_dataset`

This function receives a list of dictionaries, each containing the properties of the trajectories to be created. The compulsory input for each dictionary is the key `model`, which defines the phenomenological diffusion model from which to create the trajectories. The rest of the properties are the ones of the model called. If no properties are given, the function automatically chooses the default parameters of the model (check `models_phenom` for details).

In [3]:
# export
class datasets_phenom(datasets_phenom):

    def create_dataset(self,
                       T = None,
                       N_model = None,
                       dics = False,
                       path = '',
                       save = False, load = False):
        ''' Creates a dataset of trajectories from the given model dictionaries,
        optionally saving it to / loading it from disk.
        Inputs:
        :T (int or None): if not None, overrides the key 'T' of every dictionary.
        :N_model (int or None): if not None, overrides the key 'N' of every dictionary.
        :dics (dict, list of dict or False): properties of the trajectories to create.
        Each dictionary must contain the key 'model'. If False, one dictionary per
        available model is used, with default parameters.
        :path (str): folder where datasets are saved / loaded. The empty string
        stands for the current working directory.
        :save (bool): if True, save the created datasets in path.
        :load (bool): if True, load the datasets from path. Takes priority over save.
        Returns:
        :trajs, labels: output of `_create_trajectories`.
        Raises:
        :FileNotFoundError: if load is True and path does not exist. '''

        self.T = T
        self.N_model = N_model
        # File names are built as path + name, so a non-empty path must end with a
        # separator. An empty path is left untouched.
        self.path = os.path.join(path, '') if path else path
        self.dics = dics

        # Managing dictionaries
        # If the input is a single dictionary, transform it to list
        if isinstance(self.dics, dict): self.dics = [self.dics]
        # if dics is False, we select trajectories from all models with default values
        if self.dics is False: self.dics = [{'model': model} for model in self.avail_models_name]
        # Work on copies: the dictionaries are completed in place later on (N, T and
        # default values), which must not leak into the objects owned by the caller
        self.dics = [dict(dic) for dic in self.dics]

        # Managing folders of the datasets
        self.save = save
        self.load = load
        if self.load:
            self.save = False
        if self.save or self.load:
            # An empty path means the current directory, which always exists
            folder = self.path if self.path else os.curdir
            if not os.path.exists(folder):
                if self.load:
                    raise FileNotFoundError('The directory from where you want to load the dataset does not exist')
                os.makedirs(folder)

        # Create trajectories
        trajs, labels = self._create_trajectories()

        return trajs, labels

# `_create_trajectories`, `_save_trajectories`, `_load_trajectories`
Auxiliary functions to `create_dataset` that allow for creating, loading and saving trajectories.

In [4]:
# export
class datasets_phenom(datasets_phenom):

    def _create_trajectories(self):
        ''' Creates (or loads) the trajectories of every dictionary in self.dics and
        stacks them with np.hstack.
        Returns:
        :data_t: stacked trajectories of all the dictionaries.
        :data_l: stacked labels of all the dictionaries.
        Raises:
        :ValueError: if self.dics is empty, or (raised by numpy) if the outputs of
        the different models cannot be stacked together. '''

        all_trajs, all_labels = [], []

        for dic in self.dics:

            dataset_idx, df = self._inspect_dic(dic)

            # If the dataset does not yet exists.
            # NOTE: compare with `is False`, since 0 is a valid dataset index.
            if dataset_idx is False:
                # Retrieve function of the diffusion model
                model_f = self.avail_models_func[self.avail_models_name.index(dic['model'])]
                # Create dictionary with only the arguments of the model
                dic_args = {key: value for key, value in dic.items() if key != 'model'}

                trajs, labels = model_f(**dic_args)

                # Save the trajectories if asked
                if self.save:
                    self._save_trajectories(trajs = trajs,
                                            labels = labels,
                                            dic = dic,
                                            df = df,
                                            dataset_idx = dataset_idx,
                                            path = self.path)
            else:
                trajs, labels = self._load_trajectories(model_name = dic['model'],
                                                        dataset_idx = dataset_idx,
                                                        path = self.path)

            all_trajs.append(trajs)
            all_labels.append(labels)

        if len(all_trajs) == 0:
            raise ValueError('No model dictionaries were given, no trajectories can be created.')
        # A single dataset is returned as is
        if len(all_trajs) == 1:
            return all_trajs[0], all_labels[0]

        # Stack datasets. Incompatible shapes raise here instead of silently
        # discarding the trajectories of the previous models.
        return np.hstack(all_trajs), np.hstack(all_labels)

    def _save_trajectories(self, trajs, labels, dic, df, dataset_idx, path):
        ''' Saves a dataset in path and registers its properties in the CSV handler
        of the model.
        Inputs:
        :trajs, labels (arrays): trajectories and labels, stacked with np.stack
        (hence they need to have the same shape).
        :dic (dict): properties of the dataset. Must contain the key 'model'.
        :df (DataFrame): current content of the CSV handler of the model.
        :dataset_idx: not used, kept for compatibility.
        :path (str): prefix prepended to the file names. '''

        # The new dataset takes the next free row of the handler as index
        file_name = path+dic['model']+'_'+str(df.shape[0])+'.npy'

        # Save information in CSV handler.
        # DataFrame.append was removed in pandas 2.0, hence we build the new row
        # explicitly and concatenate it.
        new_row = pd.DataFrame([dic])
        if df.empty:
            # Keep the column order of the handler
            columns = list(df.columns) + [key for key in dic if key not in df.columns]
            df = new_row.reindex(columns = columns)
        else:
            df = pd.concat([df, new_row], ignore_index = True)
        df.to_csv(path+dic['model']+'.csv')

        # Save trajectories and labels
        data = np.stack((trajs, labels))
        np.save(file_name, data)

    def _load_trajectories(self, model_name, dataset_idx, path):
        ''' Loads the dataset number dataset_idx of the model model_name from path.
        Returns:
        :trajs, labels: first and second elements of the saved array. '''

        file_name = path+model_name+'_'+str(dataset_idx)+'.npy'
        data = np.load(file_name)
        return data[0], data[1]
    

# `_inspect_dic`
Given a dictionary, this function checks that it fulfils the constraints of the program and checks the validity of the save/load actions.

In [5]:
# export
class datasets_phenom(datasets_phenom):

    def _inspect_dic(self, dic):
        ''' Checks the information of the dictionaries and manages loading/saving information.

        The input dictionary is completed IN PLACE: the keys 'N' and 'T' are
        overwritten by self.N_model and self.T (if not None) and every missing
        argument of the model is set to its default value.
        Inputs:
        :dic (dict): properties of the dataset. Must contain the key 'model'.
        Returns:
        :dataset_idx: index of the dataset in the CSV handler if it has to be
        loaded, False if it has to be created.
        :df (DataFrame): content of the CSV handler of the model (empty if the
        handler does not exist yet).
        Raises:
        :ValueError: if the keys of the dictionary do not match the columns of the
        handler, or if load is True and the dataset does not exist. '''

        # Add time and number of trajectories information
        if self.N_model is not None:
            dic['N'] = self.N_model
        if self.T is not None:
            dic['T'] = self.T

        model_m = dic['model']
        model_f = self.avail_models_func[self.avail_models_name.index(model_m)]
        # Check arguments and defaults from model's function
        spec = inspect.getfullargspec(model_f)
        args = spec.args[1:]
        # defaults is None if the function has no default values
        defaults = spec.defaults or ()

        # Check if CSV with information of dataset exists. If not, create it
        try:
            df = pd.read_csv(self.path+model_m+'.csv', index_col=0)
        except (OSError, pd.errors.EmptyDataError):
            # No (readable) handler yet: start an empty one with the arguments
            # of the model and the model name as columns
            df = pd.DataFrame(columns = args+['model'])

        # Assign missing keys in dic with default values. The defaults correspond
        # to the LAST len(defaults) arguments of the function.
        first_with_default = len(args) - len(defaults)
        for arg, default in zip(args[first_with_default:], defaults):
            dic.setdefault(arg, default)

        # Check if updated keys of dic equal keys of csv.
        if set(df.keys()) != set(dic.keys()):
            raise ValueError('Input model dictionary does not match model´s properties')

        # Check if the dataset already exists:
        df_conditions = df.copy()
        df_conditions = df_conditions.where(pd.notnull(df_conditions), None) # Need in case of empty elements because defaults are None
        for key in dic:
            # We need to transform it to str to do a fair comparison between matrices (e.g. transition matrix, Ds, alphas,...)
            df_conditions = df_conditions.loc[(df_conditions[key].astype(str) == str(dic[key]))]
            if len(df_conditions.index) == 0:
                break

        # If dataset exists
        if len(df_conditions.index) > 0:
            # if the dataset exists and save was True, do not save but load
            if self.save:
                wrn_str = f'The dataset you want to save already exists (file: {model_m}_{df_conditions.index[0]}.npy). Switching to Load mode.'
                warnings.warn(wrn_str)
                dataset_idx = df_conditions.index[0]
            elif self.load:
                dataset_idx = df_conditions.index[0]
            else:
                # Neither saving nor loading: create the trajectories anew
                dataset_idx = False

        # If dataset does not exist
        else:
            if self.load:
                raise ValueError('The dataset you want to load does not exist.')
            # False marks the dataset as to be created. This allows to mix
            # saving and loading
            dataset_idx = False

        return dataset_idx, df

# `_get_args`
Given the name of a model, returns the names of its input parameters and, optionally, their default values.

In [15]:
# export
class datasets_phenom(datasets_phenom):
    def _get_args(self, model, return_defaults = False):
        ''' Returns the names of the input parameters of a model.
        Inputs:
        :model (str): name of the model, as stored in avail_models_name.
        :return_defaults (bool): if True, also return the default values.
        Returns:
        :args (list of str): names of the parameters, without the leading `self`.
        :defaults (tuple or None): only if return_defaults is True. '''
        position = self.avail_models_name.index(model)
        spec = inspect.getfullargspec(self.avail_models_func[position])
        # Drop the first reported argument (the bound instance)
        arg_names = spec.args[1:]
        return (arg_names, spec.defaults) if return_defaults else arg_names

# Define states from given labels
Given an array of labels and their correspondence to the ANDI 2022 state labels, return an array with the state of each trajectory at every timestep.

In [16]:
# export
class datasets_phenom(datasets_phenom):
    def _extract_state(self, label_values, states, labels):
        '''
        Translates an array of label values into the corresponding state labels.
        Inputs:
        :label_values (array) (size: # of states): values of any property for every existing state.
        :states (array) (size: # of states): state label assigned to each entry of
        label_values, following the ANDI 2022 state labels:
        0: immobile; 1: confined; 2: brownian; 3: anomalous.
        :labels (array) (size: N x T): values of the labels over time
        Returns:
        :array with the shape and dtype of labels, containing the state of every
        element. Elements not equal to any of label_values are left at 0.
        '''

        state_map = np.zeros_like(labels)

        for value, state_id in zip(label_values, states):
            matches = labels == value
            state_map[matches] = state_id

        return state_map

In [55]:
# Quick check of `_extract_state`: label value 0 is mapped to state 4 and
# label value 1 to state 7
a = np.random.randint(0, 2, (2, 4))
val = [0, 1]
state = [4, 7]
b = datasets_phenom()._extract_state(val, state, a)
a, b

(array([[1, 0, 1, 0],
        [0, 1, 1, 0]]),
 array([[7, 4, 7, 4],
        [4, 7, 7, 4]]))

# Test

In [17]:
# from andi_datasets.datasets_phenom import datasets_phenom
# Instantiate the dataset generator; the constructor collects the available models
dp = datasets_phenom()

In [22]:
# Argument names and default values of the `immobile_traps` model
dp._get_args('immobile_traps', return_defaults=True)

(['N', 'T', 'L', 'r', 'Pu', 'Pb', 'D', 'alpha', 'Nt', 'traps_pos', 'deltaT'],
 (10, 100, 5, 1, 0.1, 0.01, [1, 1, 1], [1, 1, 1], 10, None, 1))

In [36]:
path = 'datasets/'

# One dictionary per model; arguments not given take the model's default value
main = [
    {'model': 'dimerization', 'N': 40},
    {'model': 'immobile_traps', 'N': 53},
]

# T = 500 overrides the length of the trajectories of every model; N_model = None
# keeps the number of trajectories given in each dictionary
trajs, labels = dp.create_dataset(T=500, N_model=None, dics=main)

  0%|          | 0/499 [00:00<?, ?it/s]

  0%|          | 0/499 [00:00<?, ?it/s]

In [37]:
# The trajectories of both models are stacked along the second axis: 40 + 53 = 93
trajs.shape

(500, 93, 2)

# NBDEV Export

In [38]:
# Run nbdev's `notebook2script`; the output below lists the converted notebooks
from nbdev.export import notebook2script
notebook2script()

Converted analysis_methods.ipynb.
Converted datasets_phenom.ipynb.
Converted datasets_theory.ipynb.
Converted models_phenom.ipynb.
Converted models_theory.ipynb.
Converted utils_challenge.ipynb.
Converted utils_trajectories.ipynb.
