# Load the sample data
   * Sampled small-scale complete data (travel survey data from 10,000 individuals)
   * Sampled large-scale incomplete data (smart card data from 10,000 + 10,000 individuals)

In [1]:
import numpy as np
import pandas as pd

# Load the sampled complete and incomplete data (paths are relative to the working directory).
# x_train_cond_R / y_train_cond_R: the complete sample (qualitative attributes and trip chains).
# y_train_cond_SC_R: the incomplete sample (trip chains only).
x_train_cond_R = pd.read_csv('Data/train_complete_qualitative.csv')
y_train_cond_R = pd.read_csv('Data/train_complete_tripChain.csv')
y_train_cond_SC_R = pd.read_csv('Data/train_incomplete_tripChain.csv')

In [2]:
# Report (rows, columns) of each raw table, one tuple per line
for raw_table in (x_train_cond_R, y_train_cond_R, y_train_cond_SC_R):
    print(raw_table.shape)

(64141, 9)
(64141, 20)
(112982, 20)


# Data Preprocessing
 * Transforming the long-form data into ndarray sequential form

In [3]:
## Keep only the relevant columns
# 'ID' and 'P_Trip_seq' identify and order the rows, so they are excluded from the
# categorical column lists; JIGA, P_Home_Meanage and P_Home_Older are excluded as well.
x_catcol = x_train_cond_R.columns.drop(['ID','P_Trip_seq'])
y_catcol = y_train_cond_R.columns.drop(['ID','P_Trip_seq',"JIGA","P_Home_Meanage","P_Home_Older"])

## Define data types
# DataFrame.astype('category') converts column by column, each with its own categories.
x_train_cond_R[x_catcol] = x_train_cond_R[x_catcol].astype('category')
y_train_cond_R[y_catcol] = y_train_cond_R[y_catcol].astype('category')
y_train_cond_SC_R[y_catcol] = y_train_cond_SC_R[y_catcol].astype('category')

## Sort by ID and trip sequence (order)
# The incomplete (SC) table is keyed by 'NID' instead of 'ID'.
x_train_cond_R = x_train_cond_R.sort_values(by=['ID','P_Trip_seq'],axis=0)
y_train_cond_R = y_train_cond_R.sort_values(by=['ID','P_Trip_seq'],axis=0)
y_train_cond_SC_R = y_train_cond_SC_R.sort_values(by=['NID','P_Trip_seq'],axis=0)

## Save the trip purposes of each trip in the trip chain

# `samples`: one row per ID with the raw (not one-hot) attributes plus TP_0, TP_1, ... columns.
samples = pd.concat([x_train_cond_R['ID'],x_train_cond_R[x_catcol].drop('P_Trip_purpose',axis=1)],axis=1)

# Helper columns live on a copy so that x_train_cond_R itself is never modified.
# Adding them to x_train_cond_R would make this cell non-idempotent: on a re-run,
# x_catcol would pick up 'idx'/'prod_idx' and they would be dummy-encoded below.
samples_R = x_train_cond_R.copy()
samples_R['idx'] = samples_R.groupby('ID').cumcount()          # 0-based position of the trip within its chain
samples_R['prod_idx'] = 'TP_' + samples_R.idx.astype(str)      # wide-format column label: TP_0, TP_1, ...

Trip_purpose = samples_R.pivot(index='ID',columns='prod_idx',values='P_Trip_purpose')
for col in Trip_purpose.columns:
    # "Z" marks trip slots that do not exist for an individual (NaN after the pivot).
    Trip_purpose[col] = Trip_purpose[col].cat.add_categories("Z").fillna("Z")
samples =  pd.merge(samples.groupby('ID').head(1),Trip_purpose,on="ID")


## Create the one-hot encoded condition table (one row per ID)
x_train_cond = pd.concat([x_train_cond_R['ID'],pd.get_dummies(x_train_cond_R[x_catcol].drop('P_Trip_purpose',axis=1))],axis=1)

# Rebuild the wide trip-purpose table from the same helper copy used above.
Trip_purpose = samples_R.pivot(index='ID',columns='prod_idx',values='P_Trip_purpose')
for col in Trip_purpose.columns:
    Trip_purpose[col] = Trip_purpose[col].cat.add_categories("Z").fillna("Z")

# Keep only individuals that have a second trip (TP_1 is not the "Z" filler).
Trip_purpose = Trip_purpose[Trip_purpose['TP_1'] != 'Z']

Trip_purpose = pd.get_dummies(Trip_purpose)
# Inner merge: IDs dropped by the filter above also disappear from x_train_cond.
x_train_cond =  pd.merge(x_train_cond.groupby('ID').head(1),Trip_purpose,on="ID")


In [4]:
## Split the trip-chain attributes into a sequential (per-trip) and a non-sequential (per-person) part

# Per-trip categorical attributes of the complete and the incomplete (SC) sample
y_train_cat = y_train_cond_R[['isHome','P_Arrival_time','stay_time','tr_time']]
y_train_SC_cat = y_train_cond_SC_R[['isHome','P_Arrival_time','stay_time','tr_time']]

# IDs that survived the condition-table construction; captured before 'ID' is dropped below
kept_ids = x_train_cond['ID']

# Sequential part: one-hot attributes followed by coordinates, person key and trip order
y_train_seq = pd.concat(
    [
        pd.get_dummies(y_train_cat),
        y_train_cond_R[['P_Arrival_x','P_Arrival_y','ID','P_Trip_seq']],
    ],
    axis='columns',
)
y_train_seq = y_train_seq.loc[y_train_seq['ID'].isin(kept_ids)]

y_train_SC_seq = pd.concat(
    [
        pd.get_dummies(y_train_SC_cat),
        y_train_cond_SC_R[['P_Arrival_x','P_Arrival_y','NID','P_Trip_seq']],
    ],
    axis='columns',
)

# Non-sequential part: first row of every person, restricted to the kept IDs, key column removed
y_train_nseq = pd.concat(
    [
        pd.get_dummies(y_train_cond_R[['Age_SC','start_time']]),
        y_train_cond_R[['ID','JIGA','P_Home_Meanage','P_Home_Older']],
    ],
    axis='columns',
).groupby('ID').head(1)
y_train_nseq = y_train_nseq.loc[y_train_nseq['ID'].isin(kept_ids)].drop(columns=['ID'])

y_train_SC_nseq = pd.concat(
    [
        pd.get_dummies(y_train_cond_SC_R[['Age_SC','start_time']]),
        y_train_cond_SC_R[['NID','JIGA','P_Home_Meanage','P_Home_Older']],
    ],
    axis='columns',
).groupby('NID').head(1).drop(columns=['NID'])

# The condition table no longer needs its key column either
x_train_cond = x_train_cond.drop(columns=['ID'])

In [5]:
## Transform the data into sequential ndarray

### Zero padding
def pad(x, max_len=None):
    """Pad one trip chain with all-zero rows up to a fixed number of rows.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows belonging to a single group (one trip chain).
    max_len : int, optional
        Target number of rows. When omitted, the notebook-level ``maxlen`` is used,
        so ``groupby(...).apply(pad)`` keeps working unchanged.

    Returns
    -------
    pandas.DataFrame
        ``x`` followed by ``max_len - len(x)`` rows of zeros, with a fresh 0-based index.
        Every column is padded, including any key columns present in ``x``.

    Raises
    ------
    ValueError
        If ``x`` has more rows than ``max_len``.
    """
    target_len = maxlen if max_len is None else max_len
    n_missing = target_len - len(x)
    if n_missing < 0:
        raise ValueError(
            "cannot pad a sequence of %d rows to length %d" % (len(x), target_len)
        )
    # Size the filler from x itself so it always matches x's columns.
    zero_rows = pd.DataFrame(np.zeros(shape=(n_missing, len(x.columns))), columns=x.columns)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    return pd.concat([x, zero_rows], ignore_index=True)

# Feature count = every column except the trailing four that the previous cell
# appended after the one-hot columns (P_Arrival_x, P_Arrival_y, ID/NID, P_Trip_seq).
num_features = len(y_train_seq.columns)-4
# Fixed number of rows every trip chain is padded to.
# NOTE(review): pad() raises for any chain longer than maxlen — confirm no chain exceeds 5 trips.
maxlen = 5
# Number of individuals per sample; used as the leading dimension of the reshapes below.
num_data = y_train_seq['ID'].nunique()
num_data_SC = y_train_SC_seq['NID'].nunique()

### Add three all-zero dummy columns (named 0, 1, 2) right after the one-hot columns so the
### feature dimension is divisible by 4 in the Transformer (for multi-head attention).
### NOTE(review): three extra columns give a multiple of 4 only if the count was 1 (mod 4) — confirm.
### NOTE(review): not idempotent — a re-run tries to insert column names that already exist.
for i in range(3):
    y_train_seq.insert(num_features,i,0)
    y_train_SC_seq.insert(num_features,i,0)

# Recompute after the insertion: now the one-hot columns plus the three dummy columns.
num_features = len(y_train_seq.columns)-4

### Reshape
# Pad every chain to maxlen rows, then view the flat table as
# (individuals, maxlen, all columns). The key and trip-order columns are still included.
# NOTE(review): relies on groupby().apply() passing the grouping column to pad();
# verify this against the installed pandas version.

y_train_SC_seq=y_train_SC_seq.groupby('NID').apply(pad)
y_train_SC_seq=y_train_SC_seq.to_numpy()
y_train_SC_seq=y_train_SC_seq.reshape(num_data_SC,maxlen,num_features+4)

y_train_seq=y_train_seq.groupby('ID').apply(pad)
y_train_seq=y_train_seq.to_numpy()
y_train_seq=y_train_seq.reshape(num_data,maxlen,num_features+4)

# Save the data as a numpy format
 * Split the data into training and test sets
 * Save the sequential data as ndarray format

In [6]:
from sklearn.model_selection import train_test_split

def choice_train_test_split(X, y, y_ns, test_size=0.2, random_state=1004):
    """Split three row-aligned datasets into train and test parts with one shared draw.

    Parameters
    ----------
    X : pandas.DataFrame
        Condition table, indexed positionally.
    y : numpy.ndarray
        Array with at least two dimensions whose first axis is aligned with the rows of ``X``.
    y_ns : pandas.DataFrame
        Table aligned row by row with ``X``.
    test_size : float
        Fraction of rows assigned to the test part (rounded down).
    random_state : int or None
        Seed for the draw. The same seed yields the same split as seeding NumPy's
        global legacy generator, but the global generator is left untouched.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, y_train_ns, y_test_ns)``. Train rows are in
        drawn (random) order; test rows are in ascending positional order.

    Raises
    ------
    ValueError
        If ``X``, ``y`` and ``y_ns`` do not have the same number of rows.
    """
    n_samples = X.shape[0]
    if len(y) != n_samples or len(y_ns) != n_samples:
        # Positional indexing below would silently mis-pair rows of differing inputs.
        raise ValueError(
            "X, y and y_ns must have the same number of rows, got %d, %d and %d"
            % (n_samples, len(y), len(y_ns))
        )

    test_num = int(n_samples * test_size)
    train_num = n_samples - test_num

    # A local legacy generator reproduces np.random.seed(...) + np.random.choice(...)
    # exactly, without reseeding the process-wide RNG as a side effect.
    rng = np.random.RandomState(random_state)
    train_idx = rng.choice(n_samples, train_num, replace=False)
    test_idx = np.setdiff1d(np.arange(n_samples), train_idx)

    X_train = X.iloc[train_idx, :]
    X_test = X.iloc[test_idx, :]
    y_train = y[train_idx, :]
    y_test = y[test_idx, :]
    y_train_ns = y_ns.iloc[train_idx, :]
    y_test_ns = y_ns.iloc[test_idx, :]

    return X_train, X_test, y_train, y_test, y_train_ns, y_test_ns
 
# Complete sample: 80/20 split of the condition table, sequential array and
# non-sequential table, all sharing one random draw.
X_train, X_test, y_train, y_test, y_train_ns, y_test_ns = choice_train_test_split(
    x_train_cond,
    y_train_seq,
    y_train_nseq,
    test_size=0.2,
    random_state=1004,
)

# Incomplete (SC) sample: shuffled 50/50 split of the sequential array and its
# non-sequential table.
y_train_SC, y_test_SC, y_train_SC_ns, y_test_SC_ns = train_test_split(
    y_train_SC_seq,
    y_train_SC_nseq,
    test_size=0.5,
    shuffle=True,
    random_state=1004,
)

In [7]:
# Sequential arrays go to NumPy binary files (np.save appends '.npy' to each path)
for npy_path, seq_array in (
    ('Data/y_train_seq', y_train),
    ('Data/y_test_seq', y_test),
    ('Data/y_train_SC_seq', y_train_SC),
    ('Data/y_test_SC_seq', y_test_SC),
):
    np.save(npy_path, seq_array, allow_pickle=True)

# Tabular parts go to CSV without the index column
for csv_path, table in (
    ('Data/x_train_cond.csv', X_train),
    ('Data/x_test_cond.csv', X_test),
    ('Data/y_train_nseq.csv', y_train_ns),
    ('Data/y_test_nseq.csv', y_test_ns),
    ('Data/y_train_SC_nseq.csv', y_train_SC_ns),
    ('Data/y_test_SC_nseq.csv', y_test_SC_ns),
):
    table.to_csv(csv_path, index=False)