In [1]:
import pandas as pd

df = pd.read_csv('raw\\train.csv')

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1710670 entries, 0 to 1710669
Data columns (total 9 columns):
 #   Column        Dtype  
---  ------        -----  
 0   TRIP_ID       int64  
 1   CALL_TYPE     object 
 2   ORIGIN_CALL   float64
 3   ORIGIN_STAND  float64
 4   TAXI_ID       int64  
 5   TIMESTAMP     int64  
 6   DAY_TYPE      object 
 7   MISSING_DATA  bool   
 8   POLYLINE      object 
dtypes: bool(1), float64(2), int64(3), object(3)
memory usage: 106.0+ MB


In [3]:
taxi_ids = df['TAXI_ID'].unique()
print(len(taxi_ids))

448


In [4]:
missing_data_counts = df['MISSING_DATA'].value_counts()
print("Number of records with MISSING_DATA = True:", missing_data_counts.get(True, 0))
print("Number of records with MISSING_DATA = False:", missing_data_counts.get(False, 0))

Number of records with MISSING_DATA = True: 10
Number of records with MISSING_DATA = False: 1710660


In [5]:
df_clean = df[df['MISSING_DATA'] == False].copy()
df_clean.reset_index(drop=True, inplace=True)

In [6]:
missing_data_counts = df_clean['MISSING_DATA'].value_counts()
print("Number of records with MISSING_DATA = True:", missing_data_counts.get(True, 0))
print("Number of records with MISSING_DATA = False:", missing_data_counts.get(False, 0))

Number of records with MISSING_DATA = True: 0
Number of records with MISSING_DATA = False: 1710660


In [7]:
taxi_dfs = {taxi_id: df_clean[df_clean['TAXI_ID'] == taxi_id] for taxi_id in taxi_ids}
print(len(taxi_dfs))
for taxi_id, taxi_df in taxi_dfs.items():
    print(f'Taxi ID: {taxi_id}, Number of records: {len(taxi_df)}')
print("Total number of entries:", sum(len(taxi_df) for taxi_df in taxi_dfs.values()))

448
Taxi ID: 20000589, Number of records: 4744
Taxi ID: 20000596, Number of records: 3978
Taxi ID: 20000320, Number of records: 6146
Taxi ID: 20000520, Number of records: 4960
Taxi ID: 20000337, Number of records: 5731
Taxi ID: 20000231, Number of records: 3056
Taxi ID: 20000456, Number of records: 2131
Taxi ID: 20000011, Number of records: 5963
Taxi ID: 20000403, Number of records: 9238
Taxi ID: 20000233, Number of records: 6313
Taxi ID: 20000571, Number of records: 4201
Taxi ID: 20000497, Number of records: 4177
Taxi ID: 20000570, Number of records: 3883
Taxi ID: 20000005, Number of records: 6093
Taxi ID: 20000089, Number of records: 7267
Taxi ID: 20000423, Number of records: 5147
Taxi ID: 20000657, Number of records: 4562
Taxi ID: 20000309, Number of records: 3942
Taxi ID: 20000161, Number of records: 3930
Taxi ID: 20000178, Number of records: 3236
Taxi ID: 20000235, Number of records: 6410
Taxi ID: 20000653, Number of records: 6070
Taxi ID: 20000009, Number of records: 4775
Taxi ID

In [8]:
import ast

def process_polyline(polyline, timestamp):
    coords = ast.literal_eval(polyline)
    delta_seconds = 15
    data = []
    for i, (lon, lat) in enumerate(coords):
        data.append({'latitude': lat, 'longitude': lon, 'timestamp': timestamp + i * delta_seconds})
    trajectory_df = pd.DataFrame(data)
    return trajectory_df
    
    

In [10]:
processed_data = {}
for taxi_id, taxi_df in taxi_dfs.items():
    processed_data[taxi_id] = {}
    for _, row in taxi_df.iterrows():
        timestamp = int(row['TIMESTAMP'])
        polyline = str(row['POLYLINE'])
        tripid = str(row['TRIP_ID'])
        trajectory = process_polyline(polyline, timestamp)

        processed_data[taxi_id][tripid] = trajectory

In [11]:
import pickle

with open('processed/porto_data.pkl', 'wb') as f:
    pickle.dump(processed_data, f)

In [1]:
import pickle

with open('processed/porto_data.pkl', 'rb') as f:
    processed_data = pickle.load(f)

In [2]:
import folium

# Center the map on Porto
m = folium.Map(location=[41.15, -8.61], zoom_start=12, tiles='cartodbpositron')

for taxi_id in list(processed_data.keys())[:1]:
    print(taxi_id)
    trips = processed_data[taxi_id]
    for i, (trip_id, traj) in enumerate(trips.items()):
        print(trip_id)
        if i >= 4:
            break
        if not traj.empty:
            points = traj[['latitude', 'longitude']].values.tolist()
            folium.PolyLine(points, color='blue', weight=2, opacity=0.2).add_to(m)

m

20000589
1372636858620000589
1372637345620000589
1372661951620000589
1372667894620000589
1372670054620000589


In [3]:
m = folium.Map(location=[41.15, -8.61], zoom_start=12, tiles='cartodbpositron')
traj = processed_data[20000589]['1372667894620000589']
points = traj[['latitude', 'longitude']].values.tolist()
folium.PolyLine(points, color='blue', weight=2, opacity=0.2).add_to(m)
m

In [4]:
from geopy.distance import geodesic

traj = processed_data[20000589]['1372667894620000589']
distances = [0.0]
for i in range(1, len(traj)):
    prev_point = (traj.iloc[i-1]['latitude'], traj.iloc[i-1]['longitude'])
    curr_point = (traj.iloc[i]['latitude'], traj.iloc[i]['longitude'])
    distances.append(geodesic(prev_point, curr_point).meters)

traj_with_dist = traj.copy()
traj_with_dist['distance_diff'] = distances
print(traj_with_dist)

     latitude  longitude   timestamp  distance_diff
0   41.154102  -8.649342  1372667894       0.000000
1   41.154201  -8.649936  1372667909      51.057515
2   41.153769  -8.650485  1372667924      66.523328
3   41.153283  -8.650971  1372667939      67.656185
4   41.152581  -8.651763  1372667954     102.458571
5   41.152257  -8.653365  1372667969     139.204459
6   41.152212  -8.655048  1372667984     141.361582
7   41.151627  -8.656290  1372667999     122.841784
8   41.151033  -8.657784  1372668014     141.701875
9   41.150790  -8.659431  1372668029     140.863359
10  41.150673  -8.660088  1372668044      56.660693
11  41.150457  -8.661213  1372668059      97.435434
12  41.150475  -8.661483  1372668074      22.752747
13  41.150484  -8.661483  1372668089       0.999511
14  41.150484  -8.661492  1372668104       0.755492
15  41.150502  -8.662608  1372668119      93.702300
16  41.150718  -8.664903  1372668134     194.137781
17  41.151609  -8.667648  1372668149     250.770840
18  41.15269

In [5]:
import pickle

with open('processed/porto_data_cleaned.pkl', 'rb') as f:
    clean_data = pickle.load(f)

In [None]:
print(len(clean_data))

count = 0
for taxi_id, trips in clean_data.items():
    count += len(trips)
print(count)

448
1710579


In [8]:
import folium

# Center the map on Porto
m = folium.Map(location=[41.15, -8.61], zoom_start=12, tiles='cartodbpositron')

for taxi_id in list(clean_data.keys())[:1]:
    trips = clean_data[taxi_id]
    for i, (trip_id, traj) in enumerate(trips.items()):
        if i >= 4:
            break
        if not traj.empty:
            points = traj[['latitude', 'longitude']].values.tolist()
            folium.PolyLine(points, color='blue', weight=2, opacity=0.2).add_to(m)

m

In [9]:
# Filter out trajectories with less than 3 entries
filtered_clean_data = {}
for taxi_id, trips in clean_data.items():
    filtered_trips = {}
    for trip_id, traj in trips.items():
        if len(traj) >= 3:
            filtered_trips[trip_id] = traj
    if filtered_trips:  # Only add taxi if it has at least one valid trip
        filtered_clean_data[taxi_id] = filtered_trips

clean_data = filtered_clean_data

In [14]:
print(len(filtered_clean_data))

count = 0
for taxi_id, trips in filtered_clean_data.items():
    count += len(trips)
print(count)

442
1666247


In [16]:
# Center the map on Porto
m = folium.Map(location=[41.15, -8.61], zoom_start=12, tiles='cartodbpositron')

for taxi_id in list(filtered_clean_data.keys())[:10]:
    trips = filtered_clean_data[taxi_id]
    for i, (trip_id, traj) in enumerate(trips.items()):
        if i >= 10:
            break
        points = traj[['latitude', 'longitude']].values.tolist()
        folium.PolyLine(points, color='blue', weight=2, opacity=0.2).add_to(m)

m

In [15]:
import pickle

with open('processed/porto_filtered_clean_data.pkl', 'wb') as f:
    pickle.dump(filtered_clean_data, f)