In [4]:
import pandas as pd
import numpy as np
import os
import shutil

### Data info:

+ There are 182 folders, each folder for each user

+ The date/time of all labels was converted to GMT, even though most of them were created in China.

+ Not only car data! Possible transportation modes are: walk, bike, bus, car, subway, train, airplane, boat, run and motorcycle.

+ 73 users have labeled their trajectories with transportation mode, such as driving, taking a bus, riding a bike and walking. There
is a label file storing the transportation mode labels in each user’s folder.

+ So, we will consider only users who provided data, and consider only bus and car modes.

+ That's sad, because we need to get rid of a big part of data, but it is what it is. We can't do anything about it, but we can always combine this dataset with other datasets to get more data.

### Remove the folders (users) not having labels

In [None]:
# Define the paths
base_path = "Geolife Trajectories 1.3/Data"
filtered_data_path = "FILTERED_DATA"

# Create the FILTERED_DATA folder if it doesn't exist
os.makedirs(filtered_data_path, exist_ok=True)

# Iterate through the folders in the base path
for folder in os.listdir(base_path):
    folder_path = os.path.join(base_path, folder)
    
    # Check if it's a directory and contains a labels.txt file
    if os.path.isdir(folder_path) and "labels.txt" in os.listdir(folder_path):
        # Copy the folder to the FILTERED_DATA directory
        shutil.copytree(folder_path, os.path.join(filtered_data_path, folder))

print(f"Filtered folders have been copied to {filtered_data_path}.")

Filtered folders have been copied to FILTERED_DATA.


In [5]:
import os
# Define the path to the FILTERED_DATA folder
filtered_data_path = "FILTERED_DATA"

# Count the number of folders in FILTERED_DATA
if os.path.exists(filtered_data_path):
    folder_count = len([folder for folder in os.listdir(filtered_data_path) if os.path.isdir(os.path.join(filtered_data_path, folder))])
    print(f"The FILTERED_DATA folder contains {folder_count} folders.")
else:
    print("The FILTERED_DATA folder does not exist.")

The FILTERED_DATA folder contains 69 folders.


**A bit weird, because authors talked about 73 users, but we see that there are 69. Anyway, that's not very important for now**

### Load the data

In [6]:
def read_plt(plt_file):
    points = pd.read_csv(plt_file, skiprows=6, header=None)

    # for clarity rename columns
    points.rename(inplace=True, columns={0: 'lat', 1: 'lon', 3: 'alt', 5: 'day', 6: 'hour'})
    date_format = '%Y-%m-%d %H:%M:%S'
    points['time'] = pd.to_datetime(points['day'] + ' ' + points['hour'], format=date_format)
    # remove unused columns
    points.drop(inplace=True, columns=[2, 4, 'day', 'hour'])
    return points

In [6]:
plt_file = "FILTERED_DATA/010/Trajectory/20070804033032.plt"
df_plt = read_plt(plt_file)
df_plt

Unnamed: 0,lat,lon,alt,time
0,39.921712,116.472343,13,2007-08-04 03:30:32
1,39.921705,116.472343,13,2007-08-04 03:30:33
2,39.921695,116.472345,13,2007-08-04 03:30:34
3,39.921683,116.472342,13,2007-08-04 03:30:35
4,39.921672,116.472342,13,2007-08-04 03:30:36
...,...,...,...,...
1111,39.902912,116.421455,180,2007-08-04 04:14:32
1112,39.902908,116.421432,180,2007-08-04 04:14:33
1113,39.902903,116.421413,180,2007-08-04 04:14:35
1114,39.902892,116.421330,180,2007-08-04 04:14:45


**Convert all the data to a dataset**

**Try on a sample first**

In [9]:
# Initialize an empty DataFrame
all_data = pd.DataFrame()

# Define the path to the FILTERED_DATA folder
filtered_data_path = "FILTERED_DATA"

# Iterate through each folder (user) in FILTERED_DATA
i = 0
for user_folder in os.listdir(filtered_data_path):
    i += 1
    if i == 5:
        break
    user_folder_path = os.path.join(filtered_data_path, user_folder)
    
    # Check if it's a directory
    if os.path.isdir(user_folder_path):
        trajectory_folder = os.path.join(user_folder_path, "Trajectory")
        
        # Check if the Trajectory folder exists
        if os.path.exists(trajectory_folder):
            # Iterate through each .plt file in the Trajectory folder
            for plt_file in os.listdir(trajectory_folder):
                if plt_file.endswith(".plt"):
                    plt_file_path = os.path.join(trajectory_folder, plt_file)
                    
                    # Read the .plt file using the read_plt function
                    points = read_plt(plt_file_path)
                    
                    # Add a user_id column to the DataFrame
                    points["user_id"] = user_folder
                    
                    # Append the DataFrame to the main DataFrame
                    all_data = pd.concat([all_data, points], ignore_index=True)

In [10]:
all_data

Unnamed: 0,lat,lon,alt,time,user_id
0,39.930748,116.306143,0.000000,2008-08-16 07:47:56,104
1,39.930792,116.306167,0.000000,2008-08-16 07:47:57,104
2,39.931093,116.306342,0.000000,2008-08-16 07:48:00,104
3,39.930950,116.306313,0.000000,2008-08-16 07:48:05,104
4,39.930963,116.306383,0.000000,2008-08-16 07:48:10,104
...,...,...,...,...,...
48021,39.886283,116.296767,180.446194,2007-10-02 08:24:05,105
48022,39.885450,116.296783,180.446194,2007-10-02 08:24:19,105
48023,39.883533,116.296717,177.165354,2007-10-02 08:24:49,105
48024,39.882467,116.296683,177.165354,2007-10-02 08:25:06,105


In [11]:
all_data.user_id.value_counts()

104    38572
102     6678
105     1977
161      799
Name: user_id, dtype: int64

**Try on the whole data**

In [None]:
# Initialize an empty DataFrame
all_data = pd.DataFrame()

# Define the path to the FILTERED_DATA folder
filtered_data_path = "FILTERED_DATA"

# Iterate through each folder (user) in FILTERED_DATA
for user_folder in os.listdir(filtered_data_path):
    user_folder_path = os.path.join(filtered_data_path, user_folder)
    
    # Check if it's a directory
    if os.path.isdir(user_folder_path):
        trajectory_folder = os.path.join(user_folder_path, "Trajectory")
        
        # Check if the Trajectory folder exists
        if os.path.exists(trajectory_folder):
            # Iterate through each .plt file in the Trajectory folder
            for plt_file in os.listdir(trajectory_folder):
                if plt_file.endswith(".plt"):
                    plt_file_path = os.path.join(trajectory_folder, plt_file)
                    
                    # Read the .plt file using the read_plt function
                    points = read_plt(plt_file_path)
                    
                    # Add a user_id column to the DataFrame
                    points["user_id"] = user_folder
                    
                    # Append the DataFrame to the main DataFrame
                    all_data = pd.concat([all_data, points], ignore_index=True)

In [None]:
all_data

### Try to filter the data which is not bus, car, or taxi (if the taxi mode even exists)

**Try on a sample first**

In [21]:
# Initialize an empty DataFrame
all_data = pd.DataFrame()

# Define the path to the FILTERED_DATA folder
filtered_data_path = "FILTERED_DATA"

# Iterate through each folder (user) in FILTERED_DATA
i = 0
for user_folder in os.listdir(filtered_data_path):
    i += 1
    if i == 5:
        break
    user_folder_path = os.path.join(filtered_data_path, user_folder)
    # Check if it's a directory
    if os.path.isdir(user_folder_path):
        trajectory_folder = os.path.join(user_folder_path, "Trajectory")
        labels_file = os.path.join(user_folder_path, "labels.txt")
        
        # Check if the Trajectory folder exists
        if os.path.exists(trajectory_folder):
            user_data = pd.DataFrame()
            
            # Iterate through each .plt file in the Trajectory folder
            for plt_file in os.listdir(trajectory_folder):
                if plt_file.endswith(".plt"):
                    plt_file_path = os.path.join(trajectory_folder, plt_file)
                    
                    # Read the .plt file using the read_plt function
                    points = read_plt(plt_file_path)
                    
                    # Add a user_id column to the DataFrame
                    points["user_id"] = user_folder
                    
                    # Append the DataFrame to the user's data
                    user_data = pd.concat([user_data, points], ignore_index=True)
            
            # If labels.txt exists, filter the data
            if os.path.exists(labels_file):
                # Read the labels.txt file
                labels = pd.read_csv(labels_file, sep="\t")
                labels.columns = ["start_time", "end_time", "transport_mode"]
                
                # Convert start_time and end_time to datetime
                labels["start_time"] = pd.to_datetime(labels["start_time"])
                labels["end_time"] = pd.to_datetime(labels["end_time"])
                
                # Filter for transportation modes: car, bus, or taxi
                valid_modes = ["car", "bus", "taxi"]
                labels = labels[labels["transport_mode"].isin(valid_modes)]
                
                # Filter the user's data based on the labels
                filtered_data = pd.DataFrame()
                for _, row in labels.iterrows():
                    mask = (user_data["time"] >= row["start_time"]) & (user_data["time"] <= row["end_time"])
                    filtered_data = pd.concat([filtered_data, user_data[mask]], ignore_index=True)
                
                # Update the user's data with the filtered data
                user_data = filtered_data
            
            # Append the filtered user's data to the main DataFrame
            all_data = pd.concat([all_data, user_data], ignore_index=True)

In [None]:
# much less data than it was before filtering
all_data

Unnamed: 0,lat,lon,alt,time,user_id
0,39.966107,116.340690,0.000000,2008-01-01 09:42:31,104
1,39.966100,116.341192,0.000000,2008-01-01 09:42:34,104
2,39.966090,116.341647,0.000000,2008-01-01 09:42:37,104
3,39.966098,116.342063,0.000000,2008-01-01 09:42:40,104
4,39.966103,116.342487,0.000000,2008-01-01 09:42:43,104
...,...,...,...,...,...
3221,39.975083,116.330000,177.165354,2007-10-10 11:39:42,105
3222,39.975017,116.329800,180.446194,2007-10-10 11:40:19,105
3223,39.975000,116.329567,180.446194,2007-10-10 11:40:49,105
3224,39.975000,116.329367,187.007874,2007-10-10 11:41:20,105


**Do on the whole data**

In [7]:
# Initialize an empty DataFrame
all_data = pd.DataFrame()

# Define the path to the FILTERED_DATA folder
filtered_data_path = "FILTERED_DATA"

# Iterate through each folder (user) in FILTERED_DATA
for user_folder in os.listdir(filtered_data_path):
    user_folder_path = os.path.join(filtered_data_path, user_folder)
    # Check if it's a directory
    if os.path.isdir(user_folder_path):
        trajectory_folder = os.path.join(user_folder_path, "Trajectory")
        labels_file = os.path.join(user_folder_path, "labels.txt")
        
        # Check if the Trajectory folder exists
        if os.path.exists(trajectory_folder):
            user_data = pd.DataFrame()
            
            # Iterate through each .plt file in the Trajectory folder
            for plt_file in os.listdir(trajectory_folder):
                if plt_file.endswith(".plt"):
                    plt_file_path = os.path.join(trajectory_folder, plt_file)
                    
                    # Read the .plt file using the read_plt function
                    points = read_plt(plt_file_path)
                    
                    # Add a user_id column to the DataFrame
                    points["user_id"] = user_folder
                    
                    # Append the DataFrame to the user's data
                    user_data = pd.concat([user_data, points], ignore_index=True)
            
            # If labels.txt exists, filter the data
            if os.path.exists(labels_file):
                # Read the labels.txt file
                labels = pd.read_csv(labels_file, sep="\t")
                labels.columns = ["start_time", "end_time", "transport_mode"]
                
                # Convert start_time and end_time to datetime
                labels["start_time"] = pd.to_datetime(labels["start_time"])
                labels["end_time"] = pd.to_datetime(labels["end_time"])
                
                # Filter for transportation modes: car, bus, or taxi
                valid_modes = ["car", "bus", "taxi"]
                labels = labels[labels["transport_mode"].isin(valid_modes)]
                
                # Filter the user's data based on the labels
                filtered_data = pd.DataFrame()
                for _, row in labels.iterrows():
                    mask = (user_data["time"] >= row["start_time"]) & (user_data["time"] <= row["end_time"])
                    filtered_data = pd.concat([filtered_data, user_data[mask]], ignore_index=True)
                
                # Update the user's data with the filtered data
                user_data = filtered_data
            
            # Append the filtered user's data to the main DataFrame
            all_data = pd.concat([all_data, user_data], ignore_index=True)

In [9]:
all_data['time'] = pd.to_datetime(all_data['time'])
all_data

Unnamed: 0,lat,lon,alt,time,user_id
0,39.966107,116.340690,0.000000,2008-01-01 09:42:31,104
1,39.966100,116.341192,0.000000,2008-01-01 09:42:34,104
2,39.966090,116.341647,0.000000,2008-01-01 09:42:37,104
3,39.966098,116.342063,0.000000,2008-01-01 09:42:40,104
4,39.966103,116.342487,0.000000,2008-01-01 09:42:43,104
...,...,...,...,...,...
2044977,39.991850,116.216700,226.377953,2007-10-19 05:50:43,114
2044978,39.991733,116.215333,226.377953,2007-10-19 05:51:01,114
2044979,39.992400,116.210750,252.624672,2007-10-19 05:51:46,114
2044980,39.992950,116.199233,337.926509,2007-10-19 05:53:30,114


In [10]:
all_data = all_data.sort_values(by=["user_id", "time"])
all_data

Unnamed: 0,lat,lon,alt,time,user_id
1414240,41.741415,86.186028,-777.0,2008-03-31 16:00:08,010
1414241,41.737063,86.179470,-777.0,2008-03-31 16:01:07,010
1414242,41.734105,86.172823,-777.0,2008-03-31 16:02:07,010
1414243,41.739110,86.166563,-777.0,2008-03-31 16:03:06,010
1414244,41.744368,86.159987,-777.0,2008-03-31 16:04:05,010
...,...,...,...,...,...
2022724,40.070186,116.314153,-45.0,2008-11-29 02:01:31,179
2022725,40.070193,116.314041,-48.0,2008-11-29 02:01:33,179
2022726,40.070224,116.313923,-51.0,2008-11-29 02:01:35,179
2022727,40.070227,116.313843,-56.0,2008-11-29 02:01:37,179


### Data Analysis (assuming there are no mistakes in the data loading, which we will need to check; hopefully there are no mistakes)

In [11]:
all_data.groupby("user_id").size().sort_values(ascending=False)

user_id
068    267199
128    242748
062    201119
085    201022
153    151708
084    122251
167    110376
115     99785
010     94389
052     80663
126     75290
141     73833
065     62041
179     45995
163     44918
020     23539
125     22227
081     19543
078     18217
089     15550
064     15409
112     10834
067     10810
144      7478
082      3965
069      2857
129      2834
111      2051
106      1985
102      1603
096      1479
101      1273
076      1155
021      1071
105      1022
058       971
073       642
154       637
092       582
110       549
086       488
098       452
080       401
139       328
104       320
161       281
108       193
174       179
056       163
175       163
091       139
075       131
053        89
114        26
100         9
dtype: int64

**What are the three users having less than 100 observations?**

In [12]:
rare_users = all_data.loc[all_data['user_id'].isin(['053', '114', '100'])]
rare_users

Unnamed: 0,lat,lon,alt,time,user_id
724990,39.978567,116.330283,183.727034,2008-04-30 13:39:34,053
724991,39.970667,116.346817,183.727034,2008-04-30 13:46:02,053
724992,39.966250,116.351050,187.007874,2008-04-30 13:48:31,053
724993,39.816983,119.477367,59.055118,2008-05-02 06:57:41,053
724994,39.826250,119.501367,124.671916,2008-05-02 07:09:48,053
...,...,...,...,...,...
2044977,39.991850,116.216700,226.377953,2007-10-19 05:50:43,114
2044978,39.991733,116.215333,226.377953,2007-10-19 05:51:01,114
2044979,39.992400,116.210750,252.624672,2007-10-19 05:51:46,114
2044980,39.992950,116.199233,337.926509,2007-10-19 05:53:30,114


Check how much time passed from the first observation to the last observation for each user

In [13]:
time_diff = rare_users.groupby('user_id')['time'].agg(lambda x: (x.max() - x.min()).total_seconds() / 60)
time_diff_df = time_diff.reset_index(name='time_in_minutes')
print(time_diff_df)
# based on this, filter user 100

  user_id  time_in_minutes
0     053      9326.650000
1     100         0.133333
2     114        29.383333


In [14]:
all_data = all_data.loc[all_data['user_id'] != '100']
all_data

Unnamed: 0,lat,lon,alt,time,user_id
1414240,41.741415,86.186028,-777.0,2008-03-31 16:00:08,010
1414241,41.737063,86.179470,-777.0,2008-03-31 16:01:07,010
1414242,41.734105,86.172823,-777.0,2008-03-31 16:02:07,010
1414243,41.739110,86.166563,-777.0,2008-03-31 16:03:06,010
1414244,41.744368,86.159987,-777.0,2008-03-31 16:04:05,010
...,...,...,...,...,...
2022724,40.070186,116.314153,-45.0,2008-11-29 02:01:31,179
2022725,40.070193,116.314041,-48.0,2008-11-29 02:01:33,179
2022726,40.070224,116.313923,-51.0,2008-11-29 02:01:35,179
2022727,40.070227,116.313843,-56.0,2008-11-29 02:01:37,179


In [15]:
pd.set_option('display.float_format', '{:.2f}'.format)
time_diff = all_data.groupby('user_id')['time'].agg(lambda x: (x.max() - x.min()).total_seconds() / 60)
time_diff_df = time_diff.reset_index(name='time_in_minutes')
time_diff_df['time_in_minutes'].describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])

count        54.00
mean     202423.58
std      350515.19
min           3.85
1%           17.38
5%           83.74
10%        3907.29
25%       11188.33
50%       82855.05
75%      205278.68
90%      489168.64
95%      873576.32
99%     1595356.98
max     1792833.75
Name: time_in_minutes, dtype: float64

**Why there were 69 folders, but now there are 54? Seems like some users were not using a bus or a car at all!**

In [16]:
all_data.to_csv('Geolife_all_data.csv', index=False)

### Some further analysis

**What are the time differences between the consecutive observations**

In [None]:
all_data = pd.read_csv('Geolife_all_data.csv')
# for some reason after importing the data, the time column is not datetime. So we need to convert it once again
all_data['time'] = pd.to_datetime(all_data['time'])
all_data.dtypes

lat               float64
lon               float64
alt               float64
time       datetime64[ns]
user_id             int64
dtype: object

In [20]:
all_data['time_diff'] = all_data.groupby('user_id')['time'].diff().dt.total_seconds() / 60  # Convert to minutes

# Drop rows where time_diff is NaN (first observation for each user)
time_differences = all_data['time_diff'].dropna()

# Use describe with many percentiles to get a detailed picture
time_differences.describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])

count   2044919.00
mean          5.35
std         788.07
min           0.00
1%            0.00
5%            0.00
10%           0.02
25%           0.03
50%           0.03
75%           0.03
90%           0.08
95%           0.10
99%           0.40
max      428887.58
Name: time_diff, dtype: float64

**We see that, on average, very few time passes between two consecutive obervations for a user**