In [1]:
import numpy as np
import pandas as pd
import glob
import os.path
import datetime
import os

def read_plt(plt_file):
    """Read one trajectory ``.plt`` file into a DataFrame of GPS points.

    The first six lines of the file are skipped and the rest is parsed as
    header-less CSV with seven fields. Fields 0, 1 and 3 are kept as
    ``lat``, ``lon`` and ``alt``; fields 5 and 6 (date and time of day) are
    joined into a single ``time`` timestamp; fields 2 and 4 are discarded.

    Parameters
    ----------
    plt_file : str or path-like
        Path of the ``.plt`` file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``time``, ``lat``, ``lon``, ``alt`` (in that order), one row
        per point, with a default integer index.
    """
    raw = pd.read_csv(
        plt_file, skiprows=6, header=None,
        names=['lat', 'lon', 'unused_2', 'alt', 'unused_4', 'date', 'clock'],
        usecols=['lat', 'lon', 'alt', 'date', 'clock'],
        # keep both pieces as text so they can be joined before parsing
        dtype={'date': str, 'clock': str},
    )

    # Build the timestamp explicitly: the nested-list form of `parse_dates`
    # and `infer_datetime_format` are deprecated in recent pandas releases.
    timestamps = pd.to_datetime(raw['date'] + ' ' + raw['clock'])

    points = raw[['lat', 'lon', 'alt']].copy()
    # 'time' goes first to preserve the historical column order
    points.insert(0, 'time', timestamps)

    return points

# Known transportation modes; a mode's integer code is its 1-based position
# in this list.
mode_names = [
    'walk', 'bike', 'bus', 'car', 'subway', 'train',
    'airplane', 'boat', 'run', 'motorcycle', 'taxi',
]
mode_ids = dict(zip(mode_names, range(1, len(mode_names) + 1)))

def read_labels(labels_file):
    """Read a whitespace-delimited labels file into a DataFrame.

    The first line of the file is skipped. Every remaining line is expected
    to hold five whitespace-separated fields: start date, start time of day,
    end date, end time of day and the transportation-mode name.

    Parameters
    ----------
    labels_file : str or path-like
        Path of the labels file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``start_time`` and ``end_time`` (timestamps) and ``label``
        (the integer code looked up in ``mode_ids``).

    Raises
    ------
    KeyError
        If a mode name in the file is not a key of ``mode_ids``.
    """
    # `sep=r'\s+'` replaces the deprecated `delim_whitespace=True`; dates are
    # parsed below instead of via the deprecated nested-list `parse_dates`.
    raw = pd.read_csv(
        labels_file, skiprows=1, header=None, sep=r'\s+',
        names=['start_date', 'start_clock', 'end_date', 'end_clock', 'label'],
        dtype=str,
    )

    labels = pd.DataFrame({
        'start_time': pd.to_datetime(raw['start_date'] + ' ' + raw['start_clock']),
        'end_time': pd.to_datetime(raw['end_date'] + ' ' + raw['end_clock']),
        # replace the mode name with its integer encoding
        'label': [mode_ids[name] for name in raw['label']],
    })

    return labels

def apply_labels(points, labels):
    """Add a ``label`` column to ``points`` from labelled time intervals.

    A point receives the label of the interval with the latest
    ``start_time`` that is not after the point's ``time``, provided the
    point's ``time`` is strictly before that interval's ``end_time``.
    Every other point receives label 0.

    Parameters
    ----------
    points : pandas.DataFrame
        Must contain a ``time`` column. Modified in place: a ``label``
        column is added (or overwritten).
    labels : pandas.DataFrame
        Must contain ``start_time``, ``end_time`` and ``label`` columns.
        It is not modified and does not need to be sorted.

    Returns
    -------
    None
    """
    if len(labels) == 0:
        # no intervals at all: nothing can match (indexing would also fail)
        points['label'] = 0
        return

    # searchsorted requires ascending start times, so sort a local copy
    ordered = labels.sort_values('start_time', kind='mergesort')
    starts = ordered['start_time'].values
    ends = ordered['end_time'].values
    codes = ordered['label'].values
    times = points['time'].values

    # index of the last interval whose start is <= the point's time;
    # -1 means the point precedes every interval
    indices = np.searchsorted(starts, times, side='right') - 1

    # an index of -1 wraps to the last interval when used for lookup, but
    # those rows are masked out by the `indices < 0` term
    no_label = (indices < 0) | (times >= ends[indices])

    # positional assignment, so a non-unique index on `points` is fine
    points['label'] = np.where(no_label, 0, codes[indices])

def read_user(user_folder):
    """Load every trajectory of one user and attach transportation labels.

    All ``*.plt`` files inside ``<user_folder>/Trajectory`` are read with
    ``read_plt`` (in sorted file-name order, so the row order is
    reproducible) and concatenated. If ``<user_folder>/labels.txt`` exists
    it is read with ``read_labels`` and applied with ``apply_labels``;
    otherwise every point gets label 0.

    Parameters
    ----------
    user_folder : str
        Folder of a single user.

    Returns
    -------
    pandas.DataFrame
        The concatenated points with an added ``label`` column. The
        per-file indices are kept, so index values may repeat.

    Raises
    ------
    ValueError
        If no ``.plt`` file is found for the user.
    """
    plt_pattern = os.path.join(user_folder, 'Trajectory', '*.plt')
    # glob returns files in arbitrary order; sort for deterministic output
    plt_files = sorted(glob.glob(plt_pattern))
    if not plt_files:
        # pd.concat([]) would raise ValueError too, but with an opaque message
        raise ValueError('no .plt files found matching %s' % plt_pattern)

    df = pd.concat([read_plt(f) for f in plt_files])

    labels_file = os.path.join(user_folder, 'labels.txt')
    if os.path.exists(labels_file):
        apply_labels(df, read_labels(labels_file))
    else:
        df['label'] = 0

    return df

def read_all_users(folder):
    """Load the labelled points of every user folder inside ``folder``.

    A user folder is a sub-directory whose name consists only of digits;
    the integer value of that name is stored in the ``user`` column.
    Other entries (stray files, ``.ipynb_checkpoints`` and the like) are
    ignored instead of crashing the ``int()`` conversion. Folders are
    processed in sorted name order so the result is reproducible.

    Parameters
    ----------
    folder : str
        Directory that contains one sub-directory per user.

    Returns
    -------
    pandas.DataFrame
        Concatenation of ``read_user`` for every user, plus a ``user``
        column.

    Raises
    ------
    ValueError
        If ``folder`` contains no user folder.
    """
    # os.listdir order is arbitrary and may include non-user entries
    user_dirs = sorted(
        name for name in os.listdir(folder)
        if name.isdigit() and os.path.isdir(os.path.join(folder, name))
    )
    if not user_dirs:
        raise ValueError('no user folders found in %s' % folder)

    dfs = []
    for i, sf in enumerate(user_dirs):
        print('[%d/%d] processing user %s' % (i + 1, len(user_dirs), sf))
        df = read_user(os.path.join(folder, sf))
        df['user'] = int(sf)
        dfs.append(df)
    return pd.concat(dfs)

In [2]:
# Load the points of all users under the dataset folder into one DataFrame.
geolife_root = "Geolife Trajectories 1.3"
df = read_all_users(geolife_root)

[1/2] processing user 010
[2/2] processing user 179


In [3]:
# Display the combined DataFrame as the cell output.
df

Unnamed: 0,time,lat,lon,alt,label,user
0,2007-10-21 11:07:59,37.204267,112.167773,2415,0,10
1,2007-10-21 11:08:00,37.204267,112.167770,2415,0,10
2,2007-10-21 11:10:18,37.204510,112.167785,2397,0,10
3,2007-10-21 11:10:19,37.204537,112.167782,2395,0,10
4,2007-10-21 11:10:20,37.204563,112.167775,2393,0,10
...,...,...,...,...,...,...
4704,2008-11-29 08:15:52,40.007802,116.319362,84,0,179
4705,2008-11-29 08:15:54,40.007780,116.319360,88,0,179
4706,2008-11-29 08:15:56,40.007756,116.319362,92,0,179
4707,2008-11-29 08:15:58,40.007740,116.319361,97,0,179


In [4]:
# Create the new 'track_id' column by joining the calendar date of 'time',
# 'label' and 'user' with '-', to serve as a trajectory identifier.
df = df.assign(
    track_id=lambda frame: (
        frame['time'].dt.date.astype(str)
        + '-' + frame['label'].astype(str)
        + '-' + frame['user'].astype(str)
    )
)

df

Unnamed: 0,time,lat,lon,alt,label,user,track_id
0,2007-10-21 11:07:59,37.204267,112.167773,2415,0,10,2007-10-21-0-10
1,2007-10-21 11:08:00,37.204267,112.167770,2415,0,10,2007-10-21-0-10
2,2007-10-21 11:10:18,37.204510,112.167785,2397,0,10,2007-10-21-0-10
3,2007-10-21 11:10:19,37.204537,112.167782,2395,0,10,2007-10-21-0-10
4,2007-10-21 11:10:20,37.204563,112.167775,2393,0,10,2007-10-21-0-10
...,...,...,...,...,...,...,...
4704,2008-11-29 08:15:52,40.007802,116.319362,84,0,179,2008-11-29-0-179
4705,2008-11-29 08:15:54,40.007780,116.319360,88,0,179,2008-11-29-0-179
4706,2008-11-29 08:15:56,40.007756,116.319362,92,0,179,2008-11-29-0-179
4707,2008-11-29 08:15:58,40.007740,116.319361,97,0,179,2008-11-29-0-179


In [5]:
# Keep only the rows whose label is not 0.
df = df.loc[df['label'].ne(0)]

df

Unnamed: 0,time,lat,lon,alt,label,user,track_id
0,2008-04-03 16:00:00,41.765052,83.344790,-777,6,10,2008-04-03-6-10
1,2008-04-03 16:00:01,41.765113,83.345118,-777,6,10,2008-04-03-6-10
2,2008-04-03 16:00:02,41.765173,83.345452,-777,6,10,2008-04-03-6-10
3,2008-04-03 16:00:03,41.765227,83.345795,-777,6,10,2008-04-03-6-10
4,2008-04-03 16:00:04,41.765282,83.346137,-777,6,10,2008-04-03-6-10
...,...,...,...,...,...,...,...
655,2008-11-29 02:29:27,40.029529,116.411977,291,5,179,2008-11-29-5-179
656,2008-11-29 02:29:29,40.029320,116.411975,289,5,179,2008-11-29-5-179
657,2008-11-29 02:29:31,40.029111,116.411963,275,5,179,2008-11-29-5-179
658,2008-11-29 02:29:33,40.028904,116.411962,274,5,179,2008-11-29-5-179


In [6]:
# To keep the Geolife test file small enough to be stored on GitHub, the
# earliest records of user 10 were removed.
# User 10 was chosen because it had more than twice as many records as
# user 179.

# Dates whose records must be removed
datas_a_remover = ['2008-04-03', '2008-04-04', '2008-06-18', '2008-06-19']

# Drop user 10's records that fall on any of those dates, using one mask
# instead of one filtering pass per date
dias_excluidos = [pd.to_datetime(dia).date() for dia in datas_a_remover]
df = df[~((df['user'] == 10) & df['time'].dt.date.isin(dias_excluidos))]

df

Unnamed: 0,time,lat,lon,alt,label,user,track_id
23029,2008-06-20 00:00:00,36.977203,115.693044,92,6,10,2008-06-20-6-10
23030,2008-06-20 00:00:01,36.977469,115.692954,92,6,10,2008-06-20-6-10
23031,2008-06-20 00:00:02,36.977741,115.692864,92,6,10,2008-06-20-6-10
23032,2008-06-20 00:00:03,36.978006,115.692784,92,6,10,2008-06-20-6-10
23033,2008-06-20 00:00:04,36.978275,115.692705,92,6,10,2008-06-20-6-10
...,...,...,...,...,...,...,...
655,2008-11-29 02:29:27,40.029529,116.411977,291,5,179,2008-11-29-5-179
656,2008-11-29 02:29:29,40.029320,116.411975,289,5,179,2008-11-29-5-179
657,2008-11-29 02:29:31,40.029111,116.411963,275,5,179,2008-11-29-5-179
658,2008-11-29 02:29:33,40.028904,116.411962,274,5,179,2008-11-29-5-179


In [7]:
# Sort the table by user, then by time
sort_keys = ['user', 'time']
df = df.sort_values(sort_keys)

In [8]:
# Count how many rows belong to user 10 and to user 179
user_column = df['user']
quantidade_user_10 = int((user_column == 10).sum())
quantidade_user_179 = int((user_column == 179).sum())

print("Quantidade de linhas com o usuário 10:", quantidade_user_10)
print("Quantidade de linhas com o usuário 179:", quantidade_user_179)

Quantidade de linhas com o usuário 10: 244639
Quantidade de linhas com o usuário 179: 119232


In [9]:
# Save the DataFrame to CSV, without writing the index column
df.to_csv(path_or_buf="geolife_teste.csv", index=False)