In [1]:
import pandas as pd
pd.set_option('display.max_rows', 500)

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import datetime
import numpy as np

from tqdm.auto import tqdm

In [None]:
def cast_time(x):
    return datetime.datetime.strptime(x,'%Y-%m-%d %H:%M:%S')

def compute_delta_time(x):
    date_1 = sub_df.iloc[0]['datetime']
    date_2 = x
    time_delta = (date_2 - date_1)
    return time_delta.total_seconds()

def extract_day(x):
    return x.strftime("%Y-%m-%d")

In [None]:
def convert_x_to_visualize(x, x_min, x_max):
    num_pixel_on_x = binary_matrix.shape[0]
    return num_pixel_on_x*(x-x_min)/(x_max-x_min)

def convert_y_to_visualize(y, y_min, y_max):
    num_pixel_on_y = binary_matrix.shape[1]
    return num_pixel_on_y*(1-(y-y_min)/(y_max-y_min))

In [None]:
df_edeka = pd.read_csv('./Traiettorie/edeka_dataset.txt', sep=';')
x_min = df_edeka['x'].min()
x_max = df_edeka['x'].max()
y_min = df_edeka['y'].min()
y_max = df_edeka['y'].max()

'''sub_df = df_edeka[df_edeka['tag_id'] == '0x00205F15109C']
sub_df = sub_df.sort_values(by='time')
sub_df['datetime'] = sub_df['time'].apply(cast_time)
sub_df['deltatime'] = sub_df['datetime'].apply(compute_delta_time)
sub_df['day'] = sub_df['datetime'].apply(extract_day)
'''
#single_day = sub_df[sub_df['day'] == '2019-08-16']

In [None]:
def extract_sequences(threshold, min_seq_time_length, min_num_points):
    
    # threshold = posizioni più distanti in tempo di questa threshold spezzano la sequenza
    # min_seq_time_length = selezioniaimo sequenze t.c. il punto finale e iniziale distano almeno min_seq_time_length secondi 
    # min_num_points = selezioniaimo sequenze t.c. il numero di punti sia almeno min_num_points
    
    curr_seq = []
    seq_idx = 0

    new_seq_df = pd.DataFrame(columns = ['tag_id', 'time', 'x', 'y', 'description', 'datetime', 'deltatime', 'seq_idx'])

    
    for tag_id in tqdm(df_edeka['tag_id'].unique()):
        
        sub_df = df_edeka[df_edeka['tag_id'] == tag_id]
        sub_df = sub_df.sort_values(by='time')
        sub_df['datetime'] = sub_df['time'].apply(cast_time)
        sub_df['deltatime'] = sub_df['datetime'].apply(compute_delta_time)
        #sub_df['day'] = sub_df['datetime'].apply(extract_day)
    
        for row in range(len(sub_df)-1):
            if (sub_df.iloc[row+1]['deltatime'] - sub_df.iloc[row]['deltatime']) <= threshold:
                if len(curr_seq) == 0:
                    curr_seq.append(sub_df.iloc[row])
                    curr_seq.append(sub_df.iloc[row+1])
                else:
                    curr_seq.append(sub_df.iloc[row+1])
            else:
                if len(curr_seq) >= min_num_points:
                    #list_of_seq.append(curr_seq)
                    out_df = pd.DataFrame(curr_seq)
                    if (out_df.iloc[-1]['deltatime'] - out_df.iloc[0]['deltatime']) >= min_seq_time_length:
                        out_df = out_df.drop_duplicates(subset=['datetime'])
                        out_df['seq_idx'] = [seq_idx]*len(out_df)
                        seq_idx += 1
                        new_seq_df = pd.concat([new_seq_df, out_df], ignore_index=True, sort=False)

                    curr_seq = []
            
    return new_seq_df

In [None]:
seq_df = extract_sequences(threshold=120, min_seq_time_length=10, min_num_points=5)

In [None]:
seq_df.to_csv('extracted_sequences.csv', index=False) 

In [2]:
seq_df = pd.read_csv('extracted_sequences.csv')

In [3]:
seq_df.head(20)

Unnamed: 0,tag_id,time,x,y,description,datetime,deltatime,seq_idx
0,0x00205F15109C,2019-08-01 13:25:13,25.34,30.61,Basket,2019-08-01 13:25:13,0.0,0
1,0x00205F15109C,2019-08-01 13:25:32,17.66,32.8,Basket,2019-08-01 13:25:32,19.0,0
2,0x00205F15109C,2019-08-01 13:25:33,14.66,33.12,Basket,2019-08-01 13:25:33,20.0,0
3,0x00205F15109C,2019-08-01 13:25:35,13.08,32.87,Basket,2019-08-01 13:25:35,22.0,0
4,0x00205F15109C,2019-08-01 13:25:36,12.49,33.08,Basket,2019-08-01 13:25:36,23.0,0
5,0x00205F15109C,2019-08-01 13:25:38,12.71,31.94,Basket,2019-08-01 13:25:38,25.0,0
6,0x00205F15109C,2019-08-01 13:25:39,11.43,30.7,Basket,2019-08-01 13:25:39,26.0,0
7,0x00205F15109C,2019-08-01 13:25:40,10.17,29.69,Basket,2019-08-01 13:25:40,27.0,0
8,0x00205F15109C,2019-08-01 13:25:41,8.94,28.91,Basket,2019-08-01 13:25:41,28.0,0
9,0x00205F15109C,2019-08-01 13:25:43,7.6,28.75,Basket,2019-08-01 13:25:43,30.0,0


In [4]:
seq_df['seq_idx'].unique()

array([    0,     1,     2, ..., 22591, 22592, 22593], dtype=int64)

In [4]:
from torch.utils.data import Dataset
from tqdm.auto import tqdm

def pad_missing_values(curr_seq):
    curr_seq['deltatime'] = curr_seq['deltatime'] - curr_seq['deltatime'].iloc[0]
    last_timestamp = int(curr_seq['deltatime'].iloc[-1])+1
    filled_seq = dict(zip(list(range(last_timestamp)), [float("NAN")]*last_timestamp))
    for row in range(len(curr_seq)):
        filled_seq[curr_seq['deltatime'].iloc[row]] = curr_seq[['x','y']].iloc[row]
    filled_df = pd.DataFrame.from_dict(filled_seq).T    
    return filled_df.astype(np.float32)


class IndividualTfDataset(Dataset):
    def __init__(self,data):
        super(IndividualTfDataset,self).__init__()

        self.data=data

    def __len__(self):
        return self.data['src'].shape[0]


    def __getitem__(self,index):
        return {'src':torch.Tensor(self.data['src'][index]),
                'trg':torch.Tensor(self.data['trg'][index]),
                'tag_id':self.data['tag_id'][index],
                'seq_idx':self.data['seq_idx'][index],
                }

In [6]:
def split_and_fill_sequences(seq_df, min_seq_length=20, max_seq_length=100, n_obs=40, n_preds=60, step=1):
    
    data_pos=[]
    data_speed=[]
    data_rel_pos=[]
    info_tag_id=[]
    info_seq_idx=[]
    
    data = {}
    
    for seq_idx in tqdm(seq_df['seq_idx'].unique(), total=len(seq_df['seq_idx'].unique())):
        
        curr_seq = seq_df[seq_df['seq_idx']==seq_idx].copy()#.astype(np.float32)
        
        filled_curr_seq = pad_missing_values(curr_seq)
        curr_seq_len = len(filled_curr_seq)
        
        if curr_seq_len < min_seq_length:
            # Consider only sequence with minimum number of data
            continue
        elif curr_seq_len <= max_seq_length:
            # Pad and add to list
            df_nan = pd.DataFrame(np.nan, index=np.arange(max_seq_length - curr_seq_len), columns=['x', 'y'])
            filled_curr_seq = pd.concat([filled_curr_seq, df_nan])
            
            data_pos.append(filled_curr_seq[['x','y']].values)
            data_speed.append(np.concatenate((np.zeros((1,2)), np.diff(filled_curr_seq.values, axis=0)), 0))
            data_rel_pos.append(filled_curr_seq.values - filled_curr_seq.values[0,:])
            
            info_tag_id.append(curr_seq['tag_id'].iloc[0])
            info_seq_idx.append(curr_seq['seq_idx'].iloc[0])
        else:
            # Split to max length and Add each list
            for i in range(1+(curr_seq_len - n_obs - n_preds)//step):
                new_curr_seq = filled_curr_seq[['x','y']].iloc[i*step:i*step+n_obs+n_preds].values
                if np.isnan(new_curr_seq[0,:]).any():
                    # Skip seq if first position is nan
                    continue
                data_pos.append(new_curr_seq)
                data_speed.append(np.concatenate((np.zeros((1,2)), np.diff(new_curr_seq, axis=0)), 0))
                data_rel_pos.append(new_curr_seq - new_curr_seq[0,:])
                
                info_tag_id.append(curr_seq['tag_id'].iloc[0])
                info_seq_idx.append(curr_seq['seq_idx'].iloc[0])
                
        if seq_idx > 5000:
            print("You stopped at:", str(seq_idx))
            break
                
    data_pos_stack = np.stack(data_pos)
    data_speed_stack = np.stack(data_speed)
    data_rel_pos_stack = np.stack(data_rel_pos)
    info_tag_id_stack = np.stack(info_tag_id)
    info_seq_idx_stack = np.stack(info_seq_idx)
    
    all_data = np.concatenate((data_pos_stack, data_speed_stack, data_rel_pos_stack), 2)
    inp = all_data[:,:n_obs,:]
    out = all_data[:,n_obs:,:]
    
    data['src'] = inp
    data['trg'] = out
    data['tag_id'] = info_tag_id_stack
    data['seq_idx'] = info_seq_idx_stack
    
    return data, IndividualTfDataset(data)

In [None]:
data, dataset = split_and_fill_sequences(seq_df)

HBox(children=(FloatProgress(value=0.0, max=22594.0), HTML(value='')))

In [None]:
import pickle

In [None]:
with open('processed_data.pkl', 'wb') as f:
    pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)

In [None]:
data['src'].shape

In [None]:
idx = 4
aa = np.concatenate((data['src'][idx,:,:], data['trg'][idx,:,:]), axis=0)
aa

In [None]:
np.isnan(aa[0,:]).any()

In [None]:
data['trg'][idx,:,:].shape

In [None]:
with open('processed_data.pkl', 'rb') as f:
    data_2 = pickle.load(f)

In [None]:
data_2['src'].shape

In [None]:
import torch

In [None]:
test_dl = torch.utils.data.DataLoader(dataset, batch_size=10, shuffle=False, num_workers=0)

In [None]:
for batch in test_dl:
    print(batch['tag_id'], batch['seq_idx'])
    print(batch['src'].shape)
    print(batch['trg'].shape)
    break

In [None]:
bbb = torch.cat((batch['src'], batch['trg']), 1)
bbb[1,:,:]

In [None]:
seq_idx = 1
curr_seq = seq_df[seq_df['seq_idx']==seq_idx].copy()
filled_curr_seq = pad_missing_values(curr_seq)
filled_curr_seq

In [None]:
data_pos=[]
data_speed=[]
data_rel_pos=[]
data_time=[]

curr_seq_len = len(filled_curr_seq)

for i in range(1+(curr_seq_len - n_obs - n_preds)//step):
    new_curr_seq = filled_curr_seq[['x','y']].iloc[i*step:i*step+n_obs+n_preds].values
    
    data_pos.append(new_curr_seq)
    data_speed.append(np.concatenate((np.zeros((1,2)), np.diff(new_curr_seq, axis=0)), 0))
    data_rel_pos.append(new_curr_seq - new_curr_seq[0,:])
    data_time.append(filled_curr_seq.iloc[i*step:i*step+n_obs+n_preds].index.values)

In [None]:
np.stack(data_pos).shape

In [None]:
def split_or_pad_sequences(seq_df, max_time_length=30, max_seq_length=20):
    
    data_src=[]
    data_trg=[]
    data_time=[]
    data_tag_id=[]
    data_seq_id=[]
    
    for seq_idx in seq_df['seq_idx'].unique():
        
        curr_seq = seq_df[seq_df['seq_idx']==seq_idx]
        curr_seq_len = len(curr_seq)
        curr_time_len = curr_seq['deltatime'].iloc[-1] - curr_seq['deltatime'].iloc[0]
        
        if curr_seq_len < max_seq_length:
            # Pad
        else: 
            if curr_seq_len > max_seq_length:
                

In [None]:
curr_seq = seq_df[seq_df['seq_idx']==1]
curr_seq_len = len(curr_seq)
curr_time_len = curr_seq['deltatime'].iloc[-1] - curr_seq['deltatime'].iloc[0]

In [None]:
curr_seq

In [None]:
curr_seq_len, curr_time_len

In [None]:
max_time_length=30
max_seq_length=20

In [None]:
list_seq = []

if curr_seq_len < max_seq_length:
    # Pad
    pass
else:
    num_new_seq = curr_seq_len - max_seq_length
    
    for ii in range(num_new_seq):
        
        new_curr_seq = curr_seq.iloc[ii:max_seq_length+ii]
        new_curr_seq['deltatime'] = new_curr_seq['deltatime'] - new_curr_seq['deltatime'].iloc[0]
        new_curr_time_len = new_curr_seq.iloc[-1]
        
        if (max_time_length not None) and (new_curr_time_len > max_time_length):
            time_idx = find_index(new_curr_seq['deltatime'], max_time_length)
            
            for ii_2 in range(len(new_curr_seq['deltatime'])-time_idx):
                
                new_curr_seq = curr_seq.iloc[ii_2:max_seq_length+ii_2]
            # Pad and add

In [None]:
curr_seq.iloc[0:10]

In [None]:
curr_seq['deltatime'] - curr_seq['deltatime'].iloc[0]

In [None]:
def find_index(seq, threshold):
    for idx in range(len(seq)):
        if seq.iloc[idx] > threshold:
            return idx-1

In [None]:
ss = curr_seq['deltatime'] - curr_seq['deltatime'].iloc[0]
ss

In [None]:
find_index(ss, 40)

In [None]:
curr_seq_2 = seq_df[seq_df['seq_idx']==1].copy()
curr_seq_2['deltatime'] = curr_seq_2['deltatime'] - curr_seq_2['deltatime'].iloc[0]
curr_seq_2

In [None]:
curr_seq_2.iloc[:20][['x','y','deltatime']]