In [13]:
import pandas as pd

# Load the end-point table (presumably trip end points, going by the file
# name). The clustering cell reads its 'latitude' and 'longitude' columns.
df_endpoints = pd.read_csv('porto_end_points.csv')

In [16]:
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt

# Cluster the end-point coordinates into 3400 groups. The fixed random_state
# keeps the centroid numbering repeatable between runs.
endpoint_coords = df_endpoints[['latitude', 'longitude']]
kmeans = KMeans(n_init=10, random_state=42, n_clusters=3400)
# Kept as the cell's final expression so the label array is displayed.
kmeans.fit_predict(endpoint_coords)

array([ 160,  395, 1639, ..., 3329,  713,  817])

In [17]:
# Persist the fitted cluster centres as a two-column table. The row position
# is the cluster index, because cluster_centers_ is ordered by cluster label.
centroids_df = pd.DataFrame(kmeans.cluster_centers_, columns=['latitude', 'longitude'])
# Derive the "k" in the file name from the fitted model so the name cannot
# drift from the actual number of clusters (still k3400 for this model).
# NOTE(review): later cells read this file from a `new_porto_data` folder,
# so it is presumably moved there by hand — confirm.
centroids_df.to_csv(f'end_point_centroids_k{kmeans.n_clusters}.csv', index=False)

In [5]:
import pandas as pd

# Load the saved cluster centroids (columns: latitude, longitude).
# A forward slash is used because it works on Windows and POSIX alike; the
# previous backslash separator only resolved on Windows.
centroids_df = pd.read_csv('new_porto_data/end_point_centroids_k3400.csv')

In [6]:
import folium

# Create a folium map centered on Porto
porto_map = folium.Map(
    location=[41.1579, -8.6291],  # Porto center coordinates
    zoom_start=12,
    tiles='OpenStreetMap'
)

# Add every centroid as a small red dot; the popup shows its row index,
# which is the centroid id used elsewhere in the notebook.
for idx, row in centroids_df.iterrows():
    folium.CircleMarker(
        location=[row['latitude'], row['longitude']],
        radius=2,
        popup=f'Centroid {idx}',
        color='red',
        fill=True,
        # folium documents this option as snake_case `fill_opacity`; the
        # camelCase spelling used before is not the documented keyword, so
        # the requested opacity was most likely never applied.
        fill_opacity=0.6,
    ).add_to(porto_map)

# Display the map
porto_map

In [7]:
import pickle

# Read the pickled trip collection into memory.
# NOTE: unpickling can execute arbitrary code — only load files you trust.
with open('porto_filtered_clean_data.pkl', mode='rb') as pkl_file:
    data = pickle.load(pkl_file)

In [11]:
# Number of top-level entries; a later cell iterates these keys as taxi ids.
print(len(data))
# One trip table, looked up by an integer first-level key and a string
# second-level key.
print(data[20000589]['1372636858620000589'])

442
     latitude  longitude   timestamp
0   41.141412  -8.618643  1372636858
1   41.141376  -8.618499  1372636873
2   41.142510  -8.620326  1372636888
3   41.143815  -8.622153  1372636903
4   41.144373  -8.623953  1372636918
5   41.144778  -8.626680  1372636933
6   41.144697  -8.627373  1372636948
7   41.145210  -8.630226  1372636963
8   41.146920  -8.632746  1372636978
9   41.148225  -8.631738  1372636993
10  41.150385  -8.629938  1372637008
11  41.151213  -8.629110  1372637023
12  41.151240  -8.629128  1372637038
13  41.152203  -8.628786  1372637053
14  41.152374  -8.628687  1372637068
15  41.152518  -8.628759  1372637083
16  41.152680  -8.630838  1372637098
17  41.153022  -8.632323  1372637113
18  41.154489  -8.631144  1372637128
19  41.154507  -8.630829  1372637143
20  41.154516  -8.630829  1372637158
21  41.154498  -8.630829  1372637173
22  41.154489  -8.630838  1372637188


In [12]:
# Load the saved cluster centroids. The forward slash keeps the path valid on
# both Windows and POSIX; the previous backslash form was Windows-only.
centroids_df = pd.read_csv('new_porto_data/end_point_centroids_k3400.csv')

In [13]:
from sklearn.neighbors import NearestNeighbors
# Index the centroid coordinates so any point can be mapped to its single
# closest centroid. Distances are plain Euclidean on raw lat/lon values.
centroid_matrix = centroids_df[['latitude', 'longitude']].values
nn_model = NearestNeighbors(metric='euclidean', n_neighbors=1)
# Kept as the cell's final expression so the fitted estimator is displayed.
nn_model.fit(centroid_matrix)

0,1,2
,n_neighbors,1
,radius,1.0
,algorithm,'auto'
,leaf_size,30
,metric,'euclidean'
,p,2
,metric_params,
,n_jobs,


In [15]:
import os

# Hold out every trip of the last 12 taxi ids (in dict order) as the test split.
test_taxi_ids = list(data.keys())[-12:]
test_data = []

for taxi_id in test_taxi_ids:
    for trip_df in data[taxi_id].values():
        # Target: the trip's final point and the index of its nearest centroid.
        last_point = trip_df.iloc[-1]
        point = [last_point['latitude'], last_point['longitude']]
        _, indices = nn_model.kneighbors([point])
        end_centroid_idx = indices[0][0]

        # Calendar features come from the first row's timestamp, read as
        # epoch seconds. No timezone is applied, so `hour` is whatever zone
        # the raw timestamps are in — presumably UTC; confirm.
        start_date = pd.to_datetime(trip_df.iloc[0]['timestamp'], unit='s')
        metadata = [start_date.month, start_date.dayofweek, start_date.hour]

        # Model input is every point except the last one (the target).
        seq = trip_df[['latitude', 'longitude']].iloc[:-1].values.tolist()
        # format is [sequence], [metadata], [y real], y_centroid
        test_data.append([seq, metadata, point, end_centroid_idx])

# open(..., 'wb') does not create missing folders; make sure the output
# directory exists so the cell also works on a fresh checkout.
os.makedirs('meta_porto_data', exist_ok=True)
with open('meta_porto_data/test_data.pkl', 'wb') as file:
    pickle.dump(test_data, file)


In [17]:
# Peek at the first ten extracted records.
# NOTE(review): this prints full coordinate sequences, so the output is very
# long; printing a single record would be easier to read.
print(test_data[:10])

[[[[41.158989, -8.644779], [41.158989, -8.644806], [41.158998, -8.644815], [41.159016, -8.644806], [41.159025, -8.644788], [41.159034, -8.64477]], [8, 1, 13], [41.159025, -8.644734], 1476], [[[41.158737, -8.644743], [41.158728, -8.644743], [41.15889, -8.644761], [41.158827, -8.644725]], [8, 1, 14], [41.158719, -8.644698], 1476], [[[41.159043, -8.644743], [41.158989, -8.644689], [41.158971, -8.644653], [41.15889, -8.644581], [41.15898, -8.644743], [41.159043, -8.644815], [41.159106, -8.644914], [41.15916, -8.64495], [41.159196, -8.645004], [41.159214, -8.645004], [41.15925, -8.645031], [41.159214, -8.64495], [41.159115, -8.644887]], [8, 2, 13], [41.159061, -8.64486], 1476], [[[41.15871, -8.644716], [41.1588, -8.644716], [41.158782, -8.644734], [41.158899, -8.644743], [41.15898, -8.644779], [41.159097, -8.644752], [41.158989, -8.644743], [41.158962, -8.644752], [41.158791, -8.644761], [41.158728, -8.644743], [41.1588, -8.644743], [41.158899, -8.644716], [41.158953, -8.644716], [41.158854

In [16]:
import os

# Every taxi id except the last 12 (the slice the test-split cell takes) is
# training data, written out as num_parts separate pickles.
remaining_taxi_ids = list(data.keys())[:-12]
num_parts = 10
part_size = len(remaining_taxi_ids) // num_parts

# open(..., 'wb') does not create missing folders; make sure the output
# directory exists before the first part is written.
os.makedirs('meta_porto_data', exist_ok=True)

for part_idx in range(num_parts):
    start_idx = part_idx * part_size
    # The final part also absorbs the remainder left by the integer division.
    if part_idx == num_parts - 1:
        end_idx = len(remaining_taxi_ids)
    else:
        end_idx = start_idx + part_size

    part_taxi_ids = remaining_taxi_ids[start_idx:end_idx]
    part_data = []

    for taxi_id in part_taxi_ids:
        for trip_df in data[taxi_id].values():
            # Target: the trip's final point and its nearest centroid index.
            last_point = trip_df.iloc[-1]
            point = [last_point['latitude'], last_point['longitude']]
            _, indices = nn_model.kneighbors([point])
            end_centroid_idx = indices[0][0]

            # Calendar features from the first row's timestamp (epoch seconds).
            start_date = pd.to_datetime(trip_df.iloc[0]['timestamp'], unit='s')
            metadata = [start_date.month, start_date.dayofweek, start_date.hour]

            # Model input is every point except the last one (the target).
            seq = trip_df[['latitude', 'longitude']].iloc[:-1].values.tolist()
            # Record layout: [sequence, metadata, real end point, centroid index]
            part_data.append([seq, metadata, point, end_centroid_idx])

    with open(f'meta_porto_data/train_data_part_{part_idx}_{num_parts}.pkl', 'wb') as file:
        pickle.dump(part_data, file)

    print(f"Part {part_idx}: {len(part_taxi_ids)} taxi IDs, {len(part_data)} trips")

Part 0: 43 taxi IDs, 228035 trips
Part 1: 43 taxi IDs, 209718 trips
Part 2: 43 taxi IDs, 170033 trips
Part 3: 43 taxi IDs, 171139 trips
Part 4: 43 taxi IDs, 143244 trips
Part 5: 43 taxi IDs, 156734 trips
Part 6: 43 taxi IDs, 149209 trips
Part 7: 43 taxi IDs, 154194 trips
Part 8: 43 taxi IDs, 144008 trips
Part 9: 43 taxi IDs, 125658 trips


In [4]:
import pickle
import pandas as pd
# Reload the saved test examples and the centroid table.
# NOTE: unpickling can execute arbitrary code — only load files you trust.
with open('new_porto_data/new_porto_test_data.pkl', mode='rb') as pkl_file:
    test_data = pickle.load(pkl_file)

centroids_df = pd.read_csv(filepath_or_buffer='new_porto_data/end_point_centroids_k3400.csv')

In [105]:
import folium
import random

print(len(centroids_df))

# Pick one test record at random and compare its true end point with the
# centroid it was assigned to.
# NOTE(review): slots 1 and 2 are read as (end point, centroid index), which
# matches the printed output of this cell but differs from the 4-field layout
# written to meta_porto_data/test_data.pkl — confirm which file format applies.
random_index = random.randrange(len(test_data))
sample_record = test_data[random_index]
first_y, first_centroid_index = sample_record[1], sample_record[2]
print(first_y)
print(first_centroid_index)

centroid_row = centroids_df.iloc[first_centroid_index]
print(centroid_row)

# Map centred on the true end point.
map_center = folium.Map(tiles='OpenStreetMap', zoom_start=15, location=first_y)

# Blue marker = actual end point, red marker = assigned centroid.
centroid_coords = [centroid_row['latitude'], centroid_row['longitude']]
for marker_location, marker_label, marker_colour in (
    (first_y, 'Actual Endpoint', 'blue'),
    (centroid_coords, f'Centroid {first_centroid_index}', 'red'),
):
    folium.Marker(
        location=marker_location,
        popup=marker_label,
        icon=folium.Icon(color=marker_colour, icon='info-sign')
    ).add_to(map_center)

map_center

3400
[41.155164, -8.638407]
3389
latitude     41.154707
longitude    -8.638212
Name: 3389, dtype: float64


In [1]:
import pickle
# Read the pickled trip collection (outer mapping -> inner mapping -> trip table).
# NOTE: unpickling can execute arbitrary code — only load files you trust.
with open('porto_filtered_clean_data.pkl', mode='rb') as pkl_file:
    porto_data = pickle.load(pkl_file)

In [2]:
# Flatten the coordinates of every trip into two parallel Python lists.
all_latitudes, all_longitudes = [], []
for trips in porto_data.values():
    for trip_df in trips.values():
        all_latitudes += trip_df['latitude'].tolist()
        all_longitudes += trip_df['longitude'].tolist()

In [3]:
# Sanity check: the two flattened coordinate lists should be the same length.
print(len(all_latitudes), len(all_longitudes))

83286453 83286453


In [4]:
import numpy as np

# Mean and standard deviation over every recorded point. np.std is called
# with its defaults, i.e. the population standard deviation.
lat_mean, lat_std = np.mean(all_latitudes), np.std(all_latitudes)
lon_mean, lon_std = np.mean(all_longitudes), np.std(all_longitudes)

print(f"Latitude - Mean: {lat_mean}, Std: {lat_std}")
print(f"Longitude - Mean: {lon_mean}, Std: {lon_std}")

Latitude - Mean: 41.15940314187152, Std: 0.07367446333478521
Longitude - Mean: -8.616175170744098, Std: 0.05705206609122599


In [1]:
import pandas as pd

# Load the test table; later cells read its TRIP_ID, TIMESTAMP and POLYLINE
# columns.
df = pd.read_csv('test.csv')

In [2]:
import ast

def process_polyline(polyline):
    """Parse a POLYLINE string into a list of [latitude, longitude] pairs.

    Parameters
    ----------
    polyline : str
        Python/JSON-style literal of coordinate pairs, each written as
        ``[longitude, latitude]``, e.g. ``"[[-8.58,41.14],[-8.59,41.15]]"``.

    Returns
    -------
    list[list[float]]
        The same points in their original order with each pair swapped to
        ``[latitude, longitude]``. An empty polyline (``"[]"``) gives ``[]``.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when the text is not a valid
        literal.
    """
    coords = ast.literal_eval(polyline)
    # The source stores (lon, lat); the rest of the notebook uses (lat, lon).
    return [[lat, lon] for lon, lat in coords]

def process_meta(timestamp):
    """Turn an epoch-seconds timestamp into ``[month, day_of_week, hour]``.

    The timestamp is converted with ``pd.to_datetime(..., unit='s')`` and no
    timezone handling, so the fields are in whatever zone the raw value
    encodes. ``day_of_week`` is pandas' ``dayofweek`` (Monday = 0).
    """
    moment = pd.to_datetime(timestamp, unit='s')
    return [moment.month, moment.dayofweek, moment.hour]

In [3]:
# Distinct trip identifiers found in the TRIP_ID column, and how many there are.
trip_ids = df['TRIP_ID'].unique()
print(len(trip_ids))

320


In [4]:
# Map each trip id to the rows of `df` that belong to it. A single groupby
# pass replaces one full boolean scan of the frame per trip id; sort=False
# keeps the ids in order of first appearance.
# NOTE: rows whose TRIP_ID is missing are not included in any group.
trip_df = {trip_id: trip_rows for trip_id, trip_rows in df.groupby('TRIP_ID', sort=False)}


In [5]:
# Check the number of per-trip entries, then preview the rows stored under
# the first trip id.
print(len(trip_df))
print(trip_df[trip_ids[0]].head())

320
  TRIP_ID CALL_TYPE  ORIGIN_CALL  ORIGIN_STAND   TAXI_ID   TIMESTAMP DAY_TYPE  \
0      T1         B          NaN          15.0  20000542  1408039037        A   

   MISSING_DATA                                           POLYLINE  
0         False  [[-8.585676,41.148522],[-8.585712,41.148639],[...  


In [7]:
# For every trip keep the parsed coordinate sequence and the calendar
# features, both taken from the first row stored for that trip id.
test_data = {
    trip_id: {
        "seq": process_polyline(t_df['POLYLINE'].iloc[0]),
        "meta": process_meta(t_df['TIMESTAMP'].iloc[0]),
    }
    for trip_id, t_df in trip_df.items()
}

In [9]:
# Spot-check the parsed record stored under trip id 'T1'.
print(test_data['T1'])

{'seq': [[41.148522, -8.585676], [41.148639, -8.585712], [41.148855, -8.585685], [41.148927, -8.58573], [41.148963, -8.585982], [41.148954, -8.586396], [41.14872, -8.586072], [41.147847, -8.586324], [41.14746, -8.586999], [41.147154, -8.586576], [41.146623, -8.584884]], 'meta': [8, 3, 17]}


In [10]:
# Coordinate statistics used for z-scoring. The values are hard-coded copies
# of the mean/std figures printed by the statistics cell of this notebook.
stats = {
    "lat_mean": 41.15940314187152,
    "lat_std": 0.07367446333478521,
    "lon_mean": -8.616175170744098,
    "lon_std": 0.05705206609122599,
}

In [13]:
import numpy as np

def standardize(x_seq, norm_stats=None):
    """Z-score a sequence of ``[latitude, longitude]`` pairs.

    Parameters
    ----------
    x_seq : sequence of pairs
        Points as ``[latitude, longitude]``. May be empty.
    norm_stats : mapping, optional
        Must provide ``"lat_mean"``, ``"lat_std"``, ``"lon_mean"`` and
        ``"lon_std"``. When omitted, the module-level ``stats`` dict is used,
        which is the original behaviour.

    Returns
    -------
    numpy.ndarray
        A new floating-point array in which column 0 is
        ``(lat - lat_mean) / lat_std`` and column 1 is
        ``(lon - lon_mean) / lon_std``. Empty input yields shape ``(0, 2)``.
        The input is never modified.
    """
    if norm_stats is None:
        norm_stats = stats

    x_seq_array = np.array(x_seq)
    # Integer input would otherwise be truncated by the in-place column
    # assignments below, so promote anything non-floating to float64.
    if not np.issubdtype(x_seq_array.dtype, np.floating):
        x_seq_array = x_seq_array.astype(np.float64)
    # An empty sequence becomes a 1-D array, which cannot be column-indexed.
    if x_seq_array.size == 0:
        return x_seq_array.reshape(0, 2)

    x_seq_array[:, 0] = (x_seq_array[:, 0] - norm_stats["lat_mean"]) / norm_stats["lat_std"]
    x_seq_array[:, 1] = (x_seq_array[:, 1] - norm_stats["lon_mean"]) / norm_stats["lon_std"]

    return x_seq_array

def normalize(meta):
    """Rescale the first three metadata slots and return a float32 array.

    Slot 0 becomes ``(x - 1) / 11``, slot 1 ``(x - 1) / 30`` and slot 2
    ``x / 23``. Any further slots are left as they are.

    NOTE(review): ``(x - 1) / 30`` suits a 1–31 day-of-month, but the callers
    in this notebook put pandas ``dayofweek`` (0–6) in slot 1. Confirm this
    matches the scaling the model was trained with before changing it.
    """
    meta_array = np.array(meta, dtype=np.float32)

    # (offset, divisor) applied to slots 0, 1 and 2 respectively.
    scaling = ((1, 11), (1, 30), (0, 23))
    for slot, (offset, divisor) in enumerate(scaling):
        meta_array[slot] = (meta_array[slot] - offset) / divisor

    return meta_array

In [14]:
# Apply coordinate z-scoring and metadata scaling to every trip, storing the
# results as plain lists.
standardized_test = {
    trip_id: {
        'seq': standardize(record['seq']).tolist(),
        'meta': normalize(record['meta']).tolist(),
    }
    for trip_id, record in test_data.items()
}

In [15]:
import pickle

# Write the standardized test set to disk for the prediction step.
with open('test_standardized.pkl', mode='wb') as out_file:
    pickle.dump(standardized_test, out_file)