In [None]:
#export
from pathlib import Path
import urllib.request as u_request
from zipfile import ZipFile
import csv 
import pandas as pd
from andi import andi_datasets, normalize
import numpy as np

from fastai.text.all import *

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
# default_exp data

# Data

> Here we deal with the data acquisition and processing.

## Data acquisition

In [None]:
#export
# Root folder (relative to the notebook) under which the functions below read and write datasets.
DATA_PATH = Path("../data")

In [None]:
#export
def acquire_data(train=True, val=True):
    """Obtains the train and validation datasets of the competition.

    Each requested archive is downloaded, extracted into `DATA_PATH` and its top-level
    folder renamed to `train` / `val` respectively.

    The train url might fail. Get it from https://drive.google.com/drive/folders/1RXziMCO4Y0Fmpm5bmjcpy-Genhzv4QJ4

    train: download and extract the training set.
    val:   download and extract the validation set.
    """
    DATA_PATH.mkdir(exist_ok=True)
    
    train_url = ("https://doc-4k-88-drive-data-export.googleusercontent.com/download/qh9kfuk2n3khcj0qvrn9t3a4j19nve1a/" + 
                "rqpd3tajosn0gta5f9mmbbb1e4u8csnn/1599642000000/17390da5-4567-4189-8a62-1749e1b19b06/108540842544374891611/" + 
                "ADt3v-N9HwRAxXINIFMKGcsrjzMlrvhOOYitRyphFom1Ma-CUUekLTkDp75fOegXlyeVVrTPjlnqDaK0g6iI7eDL9YJw91-" + 
                "jiityR3iTfrysZP6hpGA62c4lkZbjGp_NJL-XSDUlPcwiVi5Hd5rFtH1YYP0tiiFCoJZsTT4akE8fjdrkZU7vaqFznxuyQDA8YGaiuYlKu" + 
                "-F1HiAc9kG_k9EMgkMncNflNJtlugxH5pFcNDdrYiOzIINRIRivt5ScquQ_s4KyuV-zYOQ_g2_VYri8YAg0IqbBrcO-exlp5j-" +
                "t02GDh5JZKU3Hky5b70Z8brCL5lvK0SFAFIKOer45ZrFaACA3HGRNJg==?authuser=0&nonce=k5g7m53pp3cqq&user=" + 
                "108540842544374891611&hash=m7kmrh87gmekjhrdcpbhuf1kj13ui0l2")
    val_url = ("https://competitions.codalab.org/my/datasets/download/7ea12913-dfcf-4a50-9f5d-8bf9666e9bb4")

    if train: 
        data = _download_bytes(train_url)
        _write_bytes(data, DATA_PATH)
        train_path = DATA_PATH/"Development dataset for Training"
        train_path.rename(train_path.parent/"train")
        
    if val: 
        data = _download_bytes(val_url)
        _write_bytes(data, DATA_PATH)
        val_path = DATA_PATH/"validation_for_scoring"
        val_path.rename(val_path.parent/"val")
        
    # The metadata folder is only present when an extracted archive shipped one;
    # removing it unconditionally would crash (e.g. when nothing was downloaded).
    macosx_path = DATA_PATH/"__MACOSX"
    if macosx_path.exists(): rmtree(macosx_path)
    
def _download_bytes(url):
    "Downloads data from `url` as bytes"
    # The context manager closes the response even when `read` raises.
    with u_request.urlopen(url) as response:
        return response.read()

def _write_bytes(data, path):
    "Stores `data` (bytes) as a zip file and extracts that file into `path`."
    _unzip_file(_zip_bytes(data), new_path=path)

def _zip_bytes(data, path=None):
    "Writes `data` (bytes) to `path` with a `.zip` suffix (default `../temp.zip`) and returns that path."
    target = Path("../temp") if path is None else path
    target = target.with_suffix(".zip")
    target.write_bytes(data)
    return target
        
def _unzip_file(file_path, new_path=None, purge=True):
    """Extracts the `.zip` counterpart of `file_path` into `new_path`.

    Without `new_path`, the archive is extracted into `file_path` stripped of its suffix.
    With `purge`, the zip file is deleted after extraction.
    """
    archive = file_path.with_suffix(".zip")
    destination = file_path.with_suffix("") if new_path is None else new_path
    with ZipFile(archive, 'r') as zf:
        zf.extractall(destination)
    if purge:
        archive.unlink()
        
def rmtree(root):
    "Recursively deletes the directory `root` and everything inside it."
    for p in root.iterdir():
        # `is_dir` follows symlinks: only descend into real directories so that the
        # target of a symlinked folder is left untouched and the link itself is removed.
        if p.is_dir() and not p.is_symlink(): rmtree(p)
        else: p.unlink()
    root.rmdir()

In [None]:
# Build one custom validation set per dimension from the raw task-2 pickles, cutting
# every trajectory to a random length given by `torch.randint(10, 1000, (1,))`.
for dim in range(1, 4):
    trajs = pd.read_pickle(DATA_PATH/f"custom_val/dataset_{dim}D_task_2.pkl")['dataset_og_t2']
    # Rows are gathered in a list and turned into a frame once per dimension:
    # `DataFrame.append` is quadratic and no longer exists in pandas >= 2.0.
    records = []
    for traj in trajs:
        model, exp, x = traj[0], traj[1], traj[2:]  
        x = tensor(x).view(dim,-1).T
        x = x[:torch.randint(10, 1000, (1,))]
        records.append({'dim': dim, 'model': model, 'exp': exp, 'x': x, 'len': len(x)})

    # A fresh frame per dimension keeps lower-dimensional trajectories out of the
    # higher-dimensional files.
    df = pd.DataFrame(records, columns=['dim', 'model', 'exp', 'x', 'len'], dtype=object)
    df.to_pickle(DATA_PATH/f"custom_val/custom_{dim}D.pkl")

## Data conditioning

In [None]:
#export
def _match_any(column, values):
    "Boolean array marking the entries of `column` that equal any of `values`."
    mask = np.zeros(len(column), dtype=bool)
    for value in values:
        mask |= (column == value).to_numpy(dtype=bool)
    return mask

def load_custom_data(dim=1, models=None, exps=None, path=None):
    """Loads data from custom dataset.

    dim:    dimension used to build the default file name `DATA_PATH/custom{dim}.pkl`.
    models: iterable of model labels to keep; `None` keeps all of them.
    exps:   iterable of exponents to keep; `None` keeps all of them.
    path:   explicit pickle path that overrides the default location.

    Returns the rows matching both filters with a fresh index.
    """
    path = DATA_PATH/f"custom{dim}.pkl" if path is None else path
    # NOTE(review): `read_pickle` can execute arbitrary code; only load trusted files.
    df = pd.read_pickle(path)
    # Masks are kept strictly boolean: summing boolean Series yields integers, and an
    # integer mask makes `df[mask]` a column lookup instead of a row filter.
    mod_mask = _match_any(df['model'], models) if models is not None else np.ones(df.shape[0], dtype=bool)
    exp_mask = _match_any(df['exp'], exps) if exps is not None else np.ones(df.shape[0], dtype=bool)
    mask = mod_mask & exp_mask
    return df[mask].reset_index(drop=True)
    
def load_data(task, dim=1, ds='train'):
    """Loads 'train' or 'val' data of corresponding dimension.

    Reads `DATA_PATH/ds/task{task}.pkl`; when that file does not exist yet it is first
    generated from the raw text files. Returns only the rows whose `dim` column equals
    `dim`, with a fresh index.
    """
    path = DATA_PATH/ds
    try: 
        df = pd.read_pickle(path/f"task{task}.pkl")
    # Only a missing pickle triggers the rebuild; a bare `except` would also swallow
    # KeyboardInterrupt and unrelated failures.
    except FileNotFoundError: 
        _txt2df(task, ds=[ds])
        df = pd.read_pickle(path/f"task{task}.pkl")    
    return df[df['dim']==dim].reset_index(drop=True)

def _parse_trajectory(row):
    "Splits a raw csv `row` into its dimension and the trajectory reshaped to (len, dim)."
    dim, x = int(row[0]), row[1:]
    return dim, tensor(x).view(dim, -1).T

def _txt2df(task, ds=['train', 'val']):
    """Extracts dataset and saves it in df form

    For every split named in `ds`, the raw `task{task}.txt` file (downloaded first when
    missing) is parsed and pickled next to it as `task{task}.pkl`. Train rows carry the
    label read from `ref{task}.txt`; validation rows get an empty-string label.
    """
    if 'train' in ds:
        train_path = DATA_PATH/"train"
        if not (train_path/f"task{task}.txt").exists(): acquire_data(train=True, val=False)
        # Rows are collected in a list and converted once: `DataFrame.append` is
        # quadratic and no longer exists in pandas >= 2.0.
        records = []
        with open(train_path/f"task{task}.txt", "r") as D, open(train_path/f"ref{task}.txt") as Y:
            trajs = csv.reader(D, delimiter=";", lineterminator="\n", quoting=csv.QUOTE_NONNUMERIC)
            labels = csv.reader(Y, delimiter=";", lineterminator="\n", quoting=csv.QUOTE_NONNUMERIC)
            for t, y in zip(trajs, labels):
                dim, x = _parse_trajectory(t)
                # Value comparison: `is` checks identity and only happens to work for small ints.
                label = tensor(y[1:]) if task == 3 else y[1]
                records.append({'dim': dim, 'y': label, 'x': x, 'len': len(x)})

        df = pd.DataFrame(records, columns=['dim', 'y', 'x', 'len'], dtype=object)
        df.to_pickle(train_path/f"task{task}.pkl")
        
    if 'val' in ds: 
        val_path = DATA_PATH/"val"
        task_path = val_path/f"task{task}.txt"
        if not task_path.exists(): acquire_data(train=False, val=True)
        records = []
        with open(task_path, "r") as D:
            trajs = csv.reader(D, delimiter=";", lineterminator="\n", quoting=csv.QUOTE_NONNUMERIC)
            for t in trajs:
                dim, x = _parse_trajectory(t)
                records.append({'dim': dim, 'x': x, 'len': len(x)})
        
        df = pd.DataFrame(records, columns=['dim', 'x', 'len'], dtype=object)
        df['y'] = ""
        df.to_pickle(val_path/f"task{task}.pkl")

## Dataloaders

In [None]:
#export
def pad_trajectories(samples, pad_value=0, pad_first=True, backwards=False):
    """Normalizes every trajectory and pads it to the longest length among `samples`.

    Trajectories are assumed to have shape (len, dim). Padding rows hold `pad_value` and
    go before the trajectory when `pad_first`, after it otherwise. With `backwards` the
    padded trajectory is flipped along the length axis; the padding side is inverted
    beforehand, so after the flip it sits on the side requested by `pad_first`.
    """
    longest = max([traj.shape[0] for traj, _ in samples])
    pad_before = (not pad_first) if backwards else pad_first

    def _normalize_and_pad(traj):
        traj = normalize_trajectory(traj)
        n_missing = longest - traj.shape[0]
        filler = traj.new_zeros((n_missing, traj.shape[1])) + pad_value
        pieces = [filler, traj] if pad_before else [traj, filler]
        padded = torch.cat(pieces)
        return padded.flip(0) if backwards else padded

    return L((_normalize_and_pad(traj), label) for traj, label in samples)

def normalize_trajectory(traj):
    """Rebuilds `traj` from its displacements scaled by their standard deviation.

    Displacements along the first axis are divided by their per-column standard
    deviation and accumulated again; the first position of the result is zero.
    """
    steps = traj[1:] - traj[:-1]
    scaled_steps = steps / steps.std(0)
    rebuilt = torch.zeros_like(traj)
    rebuilt[1:] = scaled_steps.cumsum(0)
    return rebuilt

In [None]:
#export
@delegates(pad_trajectories)
def get_custom_dls(target='model', dim=1, models=None, exps=None, bs=128, split_pct=0.2, path=None, balance=False, **kwargs):
    """Obtain `DataLoaders` from custom dataset filtered by `models` and `exps` to predict `target`.

    target:    column used as label; 'exp' is kept as stored, any other column is cast to int.
    balance:   subsample the data so every model has the same number of trajectories.
    split_pct: fraction of the samples held out for validation.
    kwargs:    forwarded to `pad_trajectories`.
    """
    data = load_custom_data(dim=dim, models=models, exps=exps, path=path)
    if balance: data = _subsample_df(data)
    # Compare by value: `is` on strings tests identity and depends on interning.
    labels = data[target] if target == 'exp' else data[target].astype(int)
    ds = L(zip(data['x'], labels))
    sorted_dl = partial(SortedDL, before_batch=partial(pad_trajectories, **kwargs), shuffle=True)
    return get_dls_from_ds(ds, sorted_dl, split_pct=split_pct, bs=bs)
    
@delegates(pad_trajectories)
def get_discriminative_dls(task, dim=1, bs=128, split_pct=0.2, ds='train', **kwargs):
    """Builds `DataLoaders` for classification/regression models on `task`.

    Labels are used as stored for task 1 and cast to int for any other task.
    Extra `kwargs` are forwarded to `pad_trajectories`.
    """
    df = load_data(task, dim=dim, ds=ds)
    labels = df['y'] if task==1 else df['y'].astype(int)
    items = L(zip(df['x'], labels))
    collate = partial(pad_trajectories, **kwargs)
    dl_type = partial(SortedDL, before_batch=collate, shuffle=True)
    return get_dls_from_ds(items, dl_type, split_pct=split_pct, bs=bs)

@delegates(SortedDL.__init__)
def get_turning_point_dls(task=3, dim=1, bs=128, split_pct=0.2, ds='train', **kwargs):
    """Builds `DataLoaders` to predict change points in trajectories.

    The target is the first entry of every stacked label. Extra `kwargs` are forwarded
    to `SortedDL`.
    """
    df = load_data(task, dim=dim, ds=ds)
    targets = torch.stack(list(df['y'].values))[:, 0]
    items = L(zip(df['x'], targets))
    dl_type = partial(SortedDL, shuffle=True, **kwargs)
    return get_dls_from_ds(items, dl_type, split_pct=split_pct, bs=bs)
    
@delegates(pad_trajectories)
def get_1vall_dls(target=0, dim=1, models=None, exps=None, bs=128, split_pct=0.2, **kwargs):
    """Builds `DataLoaders` for a one-vs-all problem on the custom dataset.

    The label is 0 for trajectories whose model equals `target` and 1 for the rest.
    Extra `kwargs` are forwarded to `pad_trajectories`.
    """
    df = load_custom_data(dim=dim, models=models, exps=exps)
    is_other = (df['model'] != target).astype(int)
    items = L(zip(df['x'], is_other))
    collate = partial(pad_trajectories, **kwargs)
    dl_type = partial(SortedDL, before_batch=collate, shuffle=True)
    return get_dls_from_ds(items, dl_type, split_pct=split_pct, bs=bs)
    
@delegates(pad_trajectories)
def get_validation_dl(task, dim=1, bs=64, ds='val', **kwargs):
    """Builds a single `DataLoader` over the `ds` split of `task`.

    Batches are padded with `pad_trajectories`, which receives the extra `kwargs`.
    """
    df = load_data(task, dim=dim, ds=ds)
    items = L(zip(df['x'], df['y']))
    collate = partial(pad_trajectories, **kwargs)
    return DataLoader(items, bs=bs, before_batch=collate, device=default_device())

def get_dls_from_ds(ds, dl_type, split_pct=0.2, bs=128):
    """Randomly splits `ds` into train/validation sets and wraps them in `DataLoaders`.

    The first `int(len(ds)*split_pct)` indices of a random permutation form the
    validation set, the rest the training set. `dl_type` is the loader class to use.
    """
    shuffled = L(int(i) for i in torch.randperm(len(ds)))
    n_valid = int(len(ds)*split_pct)
    valid_idx, train_idx = shuffled[:n_valid], shuffled[n_valid:]
    return DataLoaders.from_dsets(ds[train_idx], ds[valid_idx], bs=bs, dl_type=dl_type, device=default_device())

def _subsample_df(df):
    "Randomly keeps, for every model in `df`, as many rows as the least populated model has."
    labels = df.model.unique()
    per_model = [df[df.model==label] for label in labels]
    limit = min([len(group) for group in per_model])
    picked = []
    for group in per_model:
        # shuffle the rows first so the kept ones are a random subset
        picked.append(group.sample(frac=1).head(limit))
    return pd.concat(picked, ignore_index=True)

In [None]:
# Build the task-1 dataloaders for 2D trajectories.
dls = get_discriminative_dls(task=1, dim=2)

In [None]:
# Draw a single batch and inspect the shapes of its inputs and targets.
x, y = dls.one_batch()
x.shape, y.shape

(torch.Size([128, 999, 2]), torch.Size([128]))

## Custom dataset

In [None]:
#export
def create_custom_dataset(N, max_T=1000, min_T=10, dimensions=[1, 2, 3], save=True):
    """Creates a dataset of simulated trajectories for every dimension in `dimensions`.

    For each dimension the trajectories are generated with `andi_datasets` at length
    `max_T`, normalized, perturbed with Gaussian localization noise, rescaled through
    random factors applied to their displacements and finally cut to a random length
    drawn with `np.random.randint(min_T, max_T)`.

    N:          base amount used to derive the number of trajectories per model and exponent.
    max_T:      length of the generated trajectories before cutting.
    min_T:      lower bound for the random cut length.
    dimensions: dimensions to generate.
    save:       pickle every dimension to `DATA_PATH/custom{dim}.pkl`.

    Returns the `DataFrame` of the last processed dimension (an empty one when
    `dimensions` is empty).
    """
    columns = ['dim', 'model', 'exp', 'x', 'len']
    ad = andi_datasets()
    exponents = np.arange(0.05, 2.01, 0.05)
    n_exp, n_models = len(exponents), len(ad.avail_models_name)
    # Trajectories per model and exponent. Arbitrarily chosen to fulfill balanced classes
    N_per_model = np.ceil(1.6*N/5)
    subdif, superdif = n_exp//2, n_exp//2+1
    num_per_class = np.zeros((n_models, n_exp))
    num_per_class[:2,:subdif] = np.ceil(N_per_model/subdif)         # ctrw, attm
    num_per_class[2, :] = np.ceil(N_per_model/(n_exp-1))            # fbm
    num_per_class[2, exponents == 2] = 0                            # fbm can't be ballistic
    num_per_class[3, subdif:] = np.ceil((N_per_model/superdif)*0.8) # lw
    num_per_class[4, :] = np.ceil(N_per_model/n_exp)                # sbm
    
    # Defined up front so an empty `dimensions` still returns a frame.
    df = pd.DataFrame(columns=columns, dtype=object)
    for dim in dimensions:             
        dataset = ad.create_dataset(T=max_T, N=num_per_class, exponents=exponents, 
                                    dimension=dim, models=np.arange(n_models))            

        # Normalize trajectories
        n_traj = dataset.shape[0]
        norm_trajs = normalize(dataset[:, 2:].reshape(n_traj*dim, max_T))
        dataset[:, 2:] = norm_trajs.reshape(dataset[:, 2:].shape)

        # Add localization error, Gaussian noise with sigma = [0.1, 0.5, 1]
        loc_error_amplitude = np.random.choice(np.array([0.1, 0.5, 1]), size=n_traj*dim)
        loc_error = (np.random.randn(n_traj*dim, int(max_T)).transpose()*loc_error_amplitude).transpose()
        dataset = ad.create_noisy_localization_dataset(dataset, dimension=dim, T=max_T, noise_func=loc_error)
        
        # Add random diffusion coefficients
        trajs = dataset[:, 2:].reshape(n_traj*dim, max_T)
        displacements = trajs[:, 1:] - trajs[:, :-1]
        # Get new diffusion coefficients and displacements
        diffusion_coefficients = np.random.randn(trajs.shape[0])
        new_displacements = (displacements.transpose()*diffusion_coefficients).transpose()  
        # Generate new trajectories and add to dataset
        new_trajs = np.cumsum(new_displacements, axis=1)
        new_trajs = np.concatenate((np.zeros((new_trajs.shape[0], 1)), new_trajs), axis=1)
        dataset[:, 2:] = new_trajs.reshape(dataset[:, 2:].shape)
        
        # Rows are collected in a list and converted once: `DataFrame.append` is
        # quadratic and no longer exists in pandas >= 2.0.
        records = []
        for traj in dataset:
            mod, exp, x = int(traj[0]), traj[1], traj[2:]
            x = cut_trajectory(x, np.random.randint(min_T, max_T), dim=dim)
            x = tensor(x).view(dim, -1).T
            records.append({'dim': dim, 'model': mod, 'exp': exp, 'x': x, 'len': len(x)})
        df = pd.DataFrame(records, columns=columns, dtype=object)
            
        if save:
            DATA_PATH.mkdir(exist_ok=True)
            ds_path = DATA_PATH/f"custom{dim}.pkl"
            df.to_pickle(ds_path, protocol=pickle.HIGHEST_PROTOCOL)
        
    return df

def cut_trajectory(traj, t_cut, dim=1):
    """Keeps the first `t_cut` steps of every dimension of a flattened trajectory.

    `traj` is reshaped to (dim, -1), truncated along the second axis and returned
    flattened again with shape (1, -1).
    """
    per_dim = traj.reshape(dim, -1)
    return per_dim[:, :t_cut].reshape(1, -1)

In [None]:
# Smoke test: generate a tiny dataset (N=20, max_T=25) without writing it to disk.
df = create_custom_dataset(20, max_T=25, save=False)

  dt = (1-np.random.rand(T))**(-1/sigma)
  dt = (1-np.random.rand(T))**(-1/sigma)


## Validation

In [None]:
#export
def validate_model(model, task, dim=1, bs=256, act=False, **kwargs):
    """Runs `model` over the validation data of `task` and `dim` and concatenates the outputs.

    With `act`, the first element of every model output goes through a softmax over
    dimension 1; otherwise the raw model output is used. Extra `kwargs` are forwarded
    to `get_validation_dl`.
    """
    val_dl = get_validation_dl(task, dim=dim, bs=bs, **kwargs)
    if act:
        outputs = [to_detach(model(xb)[0].softmax(1)) for xb, _ in val_dl]
    else:
        outputs = [to_detach(model(xb)) for xb, _ in val_dl]
    return torch.cat(outputs)
    
@delegates(validate_model)
def validate_task(models, task, dims, **kwargs):
    """Validates `models` on task for `dims`.

    Every model is paired with the dimension at the same position, its predictions are
    computed with `validate_model` and all of them are written to
    `DATA_PATH/preds/task{task}.txt`, one line per prediction in the form
    `dim; value;value;...`.

    Raises `ValueError` when the number of models and dimensions differ.
    """
    # NOTE(review): any object defining `__iter__` is taken as a collection of models;
    # confirm this is intended for container-like models.
    if not hasattr(models, '__iter__'): models = [models]
    if not hasattr(dims, '__iter__'): dims = [dims]
    if len(models) != len(dims): 
        # `InputError` is not a defined name; raising it would surface as a NameError.
        raise ValueError(f"There are {len(models)} models and {len(dims)} dimensions")
    pred_path = DATA_PATH/"preds"
    # `parents=True` so this also works before the data folder has been created.
    pred_path.mkdir(parents=True, exist_ok=True)
    task_path = pred_path/f"task{task}.txt"
    preds_dim = [validate_model(model, task, dim=dim, **kwargs) for model, dim in zip(models, dims)]
    
    with open(task_path, "w") as f:
        for dim, preds in zip(dims, preds_dim):
            for pred in preds:
                f.write(f"{int(dim)}; {';'.join(str(i.item()) for i in pred)}\n")

# Export-

In [None]:
#hide
# Run the nbdev export step (`notebook2script`).
from nbdev.export import notebook2script
notebook2script()

Converted 00_data.ipynb.
Converted 01_models.ipynb.
Converted 02_prototypes.ipynb.
Converted 03_utils.ipynb.
Converted 04_analysis.ipynb.
Converted index.ipynb.
