In [1]:
import numpy as np
import pandas as pd
import glob
import os.path
import datetime
import os

def read_plt(plt_file):
    """Load one trajectory ``.plt`` file into a DataFrame of GPS points.

    The first six lines of the file are skipped and the remainder is read as
    header-less CSV. Fields 0, 1 and 3 become ``lat``, ``lon`` and ``alt``;
    fields 5 and 6 are joined into a single ``time`` timestamp; fields 2 and 4
    are discarded.

    Parameters
    ----------
    plt_file : str or path-like or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns ``time``, ``lat``, ``lon``, ``alt`` (in that order), one row
        per line of the file.
    """
    # Fields 5 and 6 are read as text so they can be joined verbatim before
    # parsing. This avoids the nested-list ``parse_dates`` form and
    # ``infer_datetime_format``, both deprecated in pandas 2.x.
    raw = pd.read_csv(plt_file, skiprows=6, header=None, dtype={5: str, 6: str})

    timestamps = pd.to_datetime(raw[5] + ' ' + raw[6])

    # Build the result from only the fields that are kept; 'time' comes first
    # to match the layout the combined-date parsing used to produce.
    return pd.DataFrame({
        'time': timestamps,
        'lat': raw[0],
        'lon': raw[1],
        'alt': raw[3],
    })

# Transportation modes in their canonical order; list position fixes the code.
mode_names = [
    'walk', 'bike', 'bus', 'car', 'subway', 'train',
    'airplane', 'boat', 'run', 'motorcycle', 'taxi',
]
# Mode name -> integer code, counting from 1 (0 is used elsewhere for "no label").
mode_ids = {name: code for code, name in enumerate(mode_names, start=1)}

def read_labels(labels_file):
    """Load a ``labels.txt`` file of labelled time intervals.

    The first line is skipped and the rest is split on runs of whitespace.
    Fields 0+1 form the interval start, fields 2+3 the interval end, and
    field 4 is a mode name that is translated to its integer code through the
    module-level ``mode_ids`` mapping.

    Parameters
    ----------
    labels_file : str or path-like or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns ``start_time``, ``end_time`` (timestamps) and ``label``
        (integer mode code).

    Raises
    ------
    KeyError
        If a mode name in the file is not a key of ``mode_ids``.
    """
    # Everything is read as text and the date/clock halves are joined by hand.
    # ``sep=r'\s+'`` replaces ``delim_whitespace=True``; the explicit
    # ``to_datetime`` calls replace the nested-list ``parse_dates`` form and
    # ``infer_datetime_format`` (all deprecated in pandas 2.x).
    raw = pd.read_csv(labels_file, skiprows=1, header=None, sep=r'\s+', dtype=str)

    labels = pd.DataFrame({
        'start_time': pd.to_datetime(raw[0] + ' ' + raw[1]),
        'end_time': pd.to_datetime(raw[2] + ' ' + raw[3]),
    })

    # Replace the textual mode with its integer encoding; a plain lookup is
    # kept so that an unknown mode fails loudly instead of becoming NaN.
    labels['label'] = [mode_ids[name] for name in raw[4]]

    return labels

def apply_labels(points, labels):
    """Attach a ``label`` column to ``points`` from labelled time intervals.

    Each point is matched to the interval with the latest ``start_time`` that
    is not after the point's ``time``. The point receives that interval's
    ``label`` if its time is strictly before the interval's ``end_time``;
    otherwise it receives 0.

    Parameters
    ----------
    points : pandas.DataFrame
        Must have a ``time`` column. Modified in place: a ``label`` column is
        added or overwritten.
    labels : pandas.DataFrame
        Must have ``start_time``, ``end_time`` and ``label`` columns. Not
        modified; rows may be in any order.

    Returns
    -------
    None
    """
    if labels.empty:
        # With no intervals there is nothing to look up (and indexing position
        # -1 of an empty frame would raise), so every point is unlabelled.
        points['label'] = 0
        return

    # searchsorted is only meaningful on sorted data; sort a copy so the
    # caller's frame is untouched and unsorted input cannot silently mislabel.
    ordered = labels.sort_values('start_time', kind='mergesort')
    interval_ends = ordered['end_time'].values
    interval_codes = ordered['label'].values

    # Position of the last interval whose start is <= the point's time;
    # -1 means the point precedes every interval.
    indices = ordered['start_time'].searchsorted(points['time'], side='right') - 1

    # An index of -1 wraps around to the last interval below, which is harmless
    # because those rows are reset to 0 by the mask.
    no_label = (indices < 0) | (points['time'].values >= interval_ends[indices])
    points['label'] = interval_codes[indices]
    points.loc[no_label, 'label'] = 0

def read_user(user_folder):
    """Load every trajectory of one user folder, labelled where possible.

    Reads all ``Trajectory/*.plt`` files under ``user_folder`` with
    ``read_plt`` and concatenates them. If ``labels.txt`` exists in the
    folder, its intervals are applied with ``apply_labels``; otherwise every
    point gets label 0.

    Parameters
    ----------
    user_folder : str
        Path of the user's folder.

    Returns
    -------
    pandas.DataFrame
        Columns ``time``, ``lat``, ``lon``, ``alt``, ``label``. Empty (zero
        rows, same columns) when the folder contains no ``.plt`` files.
    """
    # glob makes no ordering promise, so sort for a reproducible row order.
    plt_files = sorted(glob.glob(os.path.join(user_folder, 'Trajectory', '*.plt')))

    if not plt_files:
        # pd.concat refuses an empty list; hand back an empty, correctly-shaped
        # frame so one user without trajectories does not abort a batch load.
        return pd.DataFrame({
            'time': pd.Series(dtype='datetime64[ns]'),
            'lat': pd.Series(dtype='float64'),
            'lon': pd.Series(dtype='float64'),
            'alt': pd.Series(dtype='float64'),
            'label': pd.Series(dtype='int64'),
        })

    df = pd.concat([read_plt(f) for f in plt_files])

    labels_file = os.path.join(user_folder, 'labels.txt')
    if os.path.exists(labels_file):
        apply_labels(df, read_labels(labels_file))
    else:
        # No labels file: mark every point as unlabelled.
        df['label'] = 0

    return df

def read_all_users(folder):
    """Load the trajectories of every user folder under ``folder``.

    Each sub-directory of ``folder`` is read with ``read_user`` and tagged
    with a ``user`` column holding the directory name converted to ``int``.
    A progress line is printed per user.

    Parameters
    ----------
    folder : str
        Directory whose sub-directories are named by numeric user id.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all per-user frames with the extra ``user`` column.

    Raises
    ------
    ValueError
        If ``folder`` has no sub-directories, or a sub-directory name is not
        an integer.
    """
    # Keep directories only (stray files such as OS metadata would otherwise
    # be treated as users) and sort them, since listdir order is arbitrary.
    subfolders = sorted(
        sf for sf in os.listdir(folder)
        if os.path.isdir(os.path.join(folder, sf))
    )
    if not subfolders:
        raise ValueError('no user folders found in %r' % (folder,))

    dfs = []
    for i, sf in enumerate(subfolders):
        print('[%d/%d] processing user %s' % (i + 1, len(subfolders), sf))
        df = read_user(os.path.join(folder, sf))
        df['user'] = int(sf)
        dfs.append(df)
    return pd.concat(dfs)

In [2]:
# Load every user folder under the dataset root into one DataFrame.
df = read_all_users("Geolife Trajectories 1.3")

[1/2] processing user 010
[2/2] processing user 179


In [3]:
# Show the combined frame as the cell's output.
df

Unnamed: 0,time,lat,lon,alt,label,user
0,2007-10-21 11:07:59,37.204267,112.167773,2415,0,10
1,2007-10-21 11:08:00,37.204267,112.167770,2415,0,10
2,2007-10-21 11:10:18,37.204510,112.167785,2397,0,10
3,2007-10-21 11:10:19,37.204537,112.167782,2395,0,10
4,2007-10-21 11:10:20,37.204563,112.167775,2393,0,10
...,...,...,...,...,...,...
4704,2008-11-29 08:15:52,40.007802,116.319362,84,0,179
4705,2008-11-29 08:15:54,40.007780,116.319360,88,0,179
4706,2008-11-29 08:15:56,40.007756,116.319362,92,0,179
4707,2008-11-29 08:15:58,40.007740,116.319361,97,0,179


In [6]:
# Reduce each timestamp to its calendar day so rows can be compared with whole dates.
# NOTE(review): this overwrites 'time', so the time of day is no longer present in `df`
# from here on — confirm that is intended.
df['time'] = pd.to_datetime(df['time']).dt.date

# Exclude the given days for users 179 and 10 to make the file smaller.
# A row is kept when it belongs to another user OR falls on another day.
df = df[(df['user'] != 179) | (df['time'] != datetime.date(2008, 11, 29))]
df = df[(df['user'] != 10) | (df['time'] != datetime.date(2008, 4, 3))]

In [8]:
# Save the DataFrame to CSV; index=False leaves the row index out of the file.
df.to_csv("geolife_filtrado.csv", index=False)