In [1]:
import numpy  as np
import os
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from abc import ABC, abstractmethod

In [2]:
def read_data_geolife():
    data = []
    for folder in os.listdir('./Geolife Trajectories 1.3/Geolife Trajectories 1.3/Data'):
        user = folder
        file_path = f"./Geolife Trajectories 1.3/Geolife Trajectories 1.3/Data/{user}/Trajectory"
     
        for filename in os.listdir(file_path):
            record = np.genfromtxt(f'{file_path}/{filename}', delimiter=',',skip_header=6,dtype=None, encoding=None)
            for item in record:
                data_dict = ({'Latitude': item[0], 'Longitude': item[1], '0': item[2], 'Altitute': item[3], 'Date': item[4], 'Date string': item[5], 'Time': item[6], 'UID' : user})
                hour, minute, second = data_dict['Time'].split(':')
                minute = round(int(minute), -1)
                minute = str('%02d' % minute)
                if (minute == '00' or minute == '30') and (hour != prev_hour or minute != prev_minute):
                    data.append(data_dict)
                prev_hour = hour
                prev_minute = minute

    df = pd.DataFrame(data, columns=['Latitude', 'Longitude', '0', 'Altitute', 'Date', 'Date string', 'Time', 'UID'])
    return df


## Remove outliers

In [3]:

def remove_out_ranged(min_range, max_range, df):
    day_record_count = []
    users = df['UID'].unique()
    # check the records for a day
    for user in users:
        dates = df[df['UID'] == user]['Date string'].unique()
        for date in dates:
            num_record = len(df[(df['UID'] == user) & (df['Date string'] == date)])
            if num_record < min_range or num_record > max_range:
                df = df.drop(df[(df['UID'] == user) & (df['Date string'] == date)].index)
    df = df.reset_index(drop=True)
    return df

## Discretize and Interpolate

In [4]:
class Discretizer(ABC):
    '''
    abstract interface for discretizing points
    '''
    
    @abstractmethod
    def discretize(self, points: np.ndarray) -> np.ndarray:
        '''
        discretize points
        '''
        return points
    
class GridDiscretizer(Discretizer):
    '''
    discretize by mapping to center of grid containing the point
    '''

    def __init__(self, xrange, yrange, dim):
        self.xrange = xrange
        self.yrange = yrange
        self.dim = dim

        self.window_width = (
            (xrange[1] - xrange[0]) / dim[0],
            (yrange[1] - yrange[0]) / dim[1]
        )

        self.intercept = (
            xrange[0] + self.window_width[0] / 2,
            yrange[0] + self.window_width[1] / 2
        )

        self.slope = (
            (xrange[1] - xrange[0] - self.window_width[0]) / (dim[0] - 1),
            (yrange[1] - yrange[0] - self.window_width[1]) / (dim[1] - 1)
        )

    def discretize(self, points: np.ndarray) -> np.ndarray:
        assert np.all((self.xrange[0] <= points[:, 0]) & (points[:, 0] <= self.xrange[1]))
        assert np.all((self.yrange[0] <= points[:, 1]) & (points[:, 1] <= self.yrange[1]))

        out = np.zeros_like(points, dtype=float)

        # out[:, 0] = self.intercept[0] + self.slope[0] * (points[:, 0] // self.window_width[0]).clip(0, self.dim[0] - 1)
        # out[:, 1] = self.intercept[1] + self.slope[1] * (points[:, 1] // self.window_width[1]).clip(0, self.dim[1] - 1)

        x_indices = ((points[:, 0] - self.xrange[0]) / self.window_width[0])
        y_indices = ((points[:, 1] - self.yrange[0]) / self.window_width[1])

        # out[:, 0] = self.intercept[0] + self.slope[0] * x_indices.clip(0, self.dim[0] - 1)
        # out[:, 1] = self.intercept[1] + self.slope[1] * y_indices.clip(0, self.dim[0] - 1)
        out[:, 0] = self.intercept[0] + self.slope[0] * np.round(x_indices)
        out[:, 1] = self.intercept[1] + self.slope[1] * np.round(y_indices)
        return out
    
    def points(self):
        x = self.xrange[0] + self.slope[0] * np.arange(0, self.dim[0])
        y = self.yrange[0] + self.slope[1] * np.arange(0, self.dim[1])
        return np.meshgrid(x, y)

class NearestNeighborDiscretizer(Discretizer):
    '''
    discretize by mapping to nearest support
    '''

    def __init__(self, points: np.ndarray):
        self.points = points
        self.nn = NearestNeighbors(n_neighbors=1).fit(points)

    def discretize(self, points: np.ndarray) -> np.ndarray:
        _, indices = self.nn.kneighbors(points)
        return self.points[indices[:, 0], :]



In [20]:
def interpolate(df):
    df['datetime'] = (df['Date string'].astype(str) + ' ' + df['Time'].astype(str))
    df['datetime'] = pd.to_datetime(df['datetime'])
    df['Latitude'] = pd.to_numeric(df['Latitude'])
    df['Longitude'] = pd.to_numeric(df['Longitude'])

    interpolated_df = pd.DataFrame()
    for UID in df['UID'].unique():
        for date in df['Date string'].unique():
            user_df = df[(df['UID'] == UID) & (df['Date string'] == date)]
            
            if not user_df.empty:
                user_df = user_df.set_index('datetime').resample(rule='1S').interpolate(method='linear')
                # print(user_df)
                result = user_df.resample(rule='30T', offset=pd.Timedelta(minutes=0, seconds=1)).interpolate(method='linear')
                interpolated_df = pd.concat([interpolated_df, result])
    return interpolated_df

## Export to formatted csv

In [6]:
def time_to_point(time):
    hour,min,sec = time.split(':')
    hour = int(hour)
    loop_hour = 0
    loop_min = '00'
    count = 0
    for _ in range(48):
        if (hour == loop_hour) and (min == loop_min):
            return count
        if loop_min == '30':
            loop_hour += 1
            loop_min = '00'
        else:
            loop_min = '30'
        count += 1

def standardize_output(df):
    all_dates = (df['Date string'].unique())
    for date in all_dates:
        rows_list = []
        date_df = df[(df['Date string'] == date)]
        all_users_in_date = (date_df['UID'].unique())
        if len(all_users_in_date) >= 10:
            for user in all_users_in_date:
                user_df = date_df[(date_df['UID'] == user)]
                uid = int(user)
                # start from 0
                timepoint = time_to_point(user_df.iloc[0]['Time'])
                lat = (user_df.iloc[0]['Latitude'])
                long = (user_df.iloc[0]['Longitude'])
                for i in range(timepoint):
                    time = i
                    lat = (user_df.iloc[0]['Latitude'])
                    long = (user_df.iloc[0]['Longitude'])
                    dict1 = {'uid':uid, 't':i, 'lat':lat, 'long': long}
                    rows_list.append(dict1)
                # with records
                for index, row in user_df.iterrows():
                    timepoint = time_to_point(row['Time'])
                    lat = (row['Latitude'])
                    long = (row['Longitude'])
                    dict1 = {'uid':uid, 't':timepoint, 'lat':lat, 'long': long}
                    rows_list.append(dict1)
                # append to 47
                for j in range(timepoint+1, 48):
                    time = j
                    dict1 = {'uid':uid, 't':j, 'lat':lat, 'long': long}
                    rows_list.append(dict1)
            
            day_df = pd.DataFrame(rows_list)
            day_df.to_csv(f'data/{date}.csv', index=False) 

## Main procedures for preprocessing of Geolife data

In [27]:
# Read data
df = read_data_geolife()
print('Data reading completed')

# Remove outliers
beijing_min_longitude, beijing_max_longitude = 115.416827, 117.508251
beijing_min_latitude, beijing_max_latitude = 39.442078, 41.058964
filtered_df = df[(df['Latitude'] < beijing_max_latitude) & (df['Latitude'] > beijing_min_latitude) & (df['Longitude'] < beijing_max_longitude) & (df['Longitude'] > beijing_min_longitude)]
filtered_df = filtered_df.reset_index(drop=True)

df = remove_out_ranged(5, 50, filtered_df)
print('Outliers removed')
df.to_csv("checkpoint.csv")

# Interpolate
df = pd.read_csv('checkpoint.csv')
df = interpolate(df)

df['datetime'] = df.index.astype(str)
df[['Date string', 'Time']] = df['datetime'].str.split(' ', expand=True)
df.dropna(inplace=True)
print('Data Interpolated')

# Discretize
discretizer = GridDiscretizer(xrange=(39.442078, 41.058964), yrange=(115.416827, 117.508251), dim=(100, 100))
df[['Latitude', 'Longitude']] = discretizer.discretize(df[['Latitude', 'Longitude']].to_numpy())
print('Data Discretized')


# Standardize output
standardize_output(df) # write to csv.file for further processing
print('Data Exported')

Data reading completed
Outliers removed
Data Interpolated
Data Discretized
Data Exported


In [28]:
print(df)

                       Unnamed: 0   Latitude   Longitude    0    Altitute  \
datetime                                                                    
2008-10-28 01:00:01      0.000000  40.016073  116.305682  0.0  114.000000   
2008-10-28 01:30:01      1.108108  40.016073  116.305682  0.0   72.675676   
2008-10-28 02:00:01      1.918919  40.016073  116.305682  0.0   70.243243   
2008-10-28 02:30:01      3.025475  40.016073  116.305682  0.0  170.395924   
2008-10-28 03:00:01      3.216538  40.016073  116.305682  0.0   98.365354   
...                           ...        ...         ...  ...         ...   
2007-12-11 12:00:01  35490.738658  39.983735  116.326596  0.0  173.622488   
2007-12-11 12:30:01  35490.980268  39.983735  116.326596  0.0  161.732195   
2007-12-11 13:00:01  35491.944032  39.983735  116.326596  0.0  188.636115   
2007-12-11 13:30:01  35492.475818  39.983735  116.305682  0.0  169.994651   
2007-12-11 14:00:01  35492.979033  39.999904  116.305682  0.0  148.532069   