In [70]:
import gzip
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import random
from collections import Counter

# Load Data

In [2]:
yjmob1 = 'yjmob100k-dataset1.csv.gz' # dataset under normal scenes
yjmob_df = pd.read_csv(yjmob1, compression='gzip')

In [3]:
# Retrieve all ids

uids = yjmob_df['uid'].unique()

In [4]:
# Just to reduce memory space
rand_indicies = [random.randint(0, len(uids)) for _ in range(10000)]
selected_uids = [uid for uid in uids[rand_indicies]]
# selected_uids = uids[:200]

In [5]:
df = yjmob_df[yjmob_df['uid'].isin(selected_uids)] 

In [6]:
# linearization of the 2-dimensional grid, i.e., the original x,y coordinate system
def spatial_token(x, y):
    # x,y are the coordinate location
    # x determines the column order while
    # y determines the row order
    # (x-1) calculates the starting grid-column position
    # (y-1)*200 calculates the start index of the grid-row
    return (x-1)+(y-1)*200

In [7]:
#Location

df['combined_xy'] = df.apply(lambda row: spatial_token(row['x'], row['y']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['combined_xy'] = df.apply(lambda row: spatial_token(row['x'], row['y']), axis=1)


In [8]:
# 8:2 split

train_uids, test_uids = train_test_split(selected_uids, test_size=0.2, random_state=42)

df_train = df[df['uid'].isin(train_uids)]
df_test = df[df['uid'].isin(test_uids)]

In [9]:
STEP_SIZE = 100

In [10]:
def generate_sequences(data, data_t):
    return torch.tensor(data[:STEP_SIZE]),torch.tensor(data[STEP_SIZE]),\
                torch.tensor(data_t[:STEP_SIZE]),torch.tensor(data_t[STEP_SIZE])

In [25]:
# Group data by uid

# grouped_data_train = [group for _, group in df_train.groupby('uid')]
# grouped_data_test = [group for _, group in df_test.groupby('uid')]

# Rationale behind a model being a 'baseline' model

1. No trajectory or sequence of steps awareness
2. No timeline awareness (~ Markov Chain, LSTM)
3. Memory-less (~ Markov Chain)
4. Assume stable and general trajectory for users and at all time (~ somewhat in all models, but not as strong as the two baseline models)

# Baseline Model 1

Average location by time `t`

In [44]:
## Model

# all_t = sorted(df_train['t'].unique())
all_t = [int(i) for i in range(48)]

predictions_by_t = []

for t in range(len(all_t)):
    loc = list(df_train[df_train['t'] == t]['combined_xy'])
    if (len(loc) == 0):
        # handle the case when one or more particular t doesn't exist
        if (t == 0):
            for t in range(t+1, len(all_t)):
                loc_next = list(df_train[df_train['t'] == t]['combined_xy'])
                if (len(loc_next) > 0):
                    predictions_by_t.append(sum(loc_next)/len(loc_next))
                    break
        else:
            predictions_by_t.append(predictions_by_t[-1])
    else:
        predictions_by_t.append(sum(loc)/len(loc))

In [68]:
## Inferece

unique_uid = df_test['uid'].unique()

total_correct = 0

for usr in unique_uid:
    true_label = df_test[df_test['uid']==usr].iloc[[-1]]['combined_xy'].item()
    related_time = df_test[df_test['uid']==usr].iloc[[-1]]['t'].item()
    total_correct += int(true_label == predictions_by_t[related_time])
    
print(f"Accuracy: {total_correct/len(unique_uid)}, Num Correct: {total_correct}, Num Sample: {len(unique_uid)}")

Accuracy: 0.0, Num Correct: 0, Num Sample: 1982


# Baseline Model 2

Most frequencly visited location by time

In [74]:
# Model

all_t = [int(i) for i in range(48)]

predictions_by_freq = []

for t in range(len(all_t)):
    loc = list(df_train[df_train['t'] == t]['combined_xy'])
    if (len(loc) == 0):
        # handle the case when one or more particular t doesn't exist
        if (t == 0):
            for t in range(t+1, len(all_t)):
                loc_next = list(df_train[df_train['t'] == t]['combined_xy'])
                if (len(loc_next) > 0):
                    most_freq_loc, count = Counter(loc_next).most_common(1)[0]
                    predictions_by_freq.append(most_freq_loc)
                    break
        else:
            predictions_by_freq.append(predictions_by_freq[-1])
    else:
        most_freq_loc , count = Counter(loc).most_common(1)[0]
        predictions_by_freq.append(most_freq_loc)

In [76]:
# Inference

# Interesting to see that 15334 is very repetitive, you also see in Transformer that it tends to predict 15334
# would the transformer predicting 15334 considered as a kind of overfitting? (check the confidence)

unique_uid = df_test['uid'].unique()

total_correct = 0

for usr in unique_uid:
    true_label = df_test[df_test['uid']==usr].iloc[[-1]]['combined_xy'].item()
    related_time = df_test[df_test['uid']==usr].iloc[[-1]]['t'].item()
    total_correct += int(true_label == predictions_by_freq[related_time])
    
print(f"Accuracy: {total_correct/len(unique_uid)}, Num Correct: {total_correct}, Num Sample: {len(unique_uid)}")

Accuracy: 0.0010090817356205853, Num Correct: 2, Num Sample: 1982


# Baseline Model 3

Random guesses

In [77]:
# Inference

unique_uid = df_test['uid'].unique()

total_correct = 0

for usr in unique_uid:
    true_label = df_test[df_test['uid']==usr].iloc[[-1]]['combined_xy'].item()
    related_time = df_test[df_test['uid']==usr].iloc[[-1]]['t'].item()
    total_correct += int(random.randint(0,40000-1) == predictions_by_freq[related_time])
    
print(f"Accuracy: {total_correct/len(unique_uid)}, Num Correct: {total_correct}, Num Sample: {len(unique_uid)}")

Accuracy: 0.0, Num Correct: 0, Num Sample: 1982
