In [1]:
import csv
import torch
import numpy as np
import pandas as pd
import copy
import networkx as nx
import random
import matplotlib.pyplot as plt
from math import sqrt 
import os




In [2]:
# Load the CSV file into a DataFrame
data = pd.read_csv("train.csv")

# Display the first five rows of the DataFrame
print(data.shape)
data.head()

(1458644, 11)


Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [3]:
print(data['trip_duration'].nlargest(5000))

978383     3526282
924150     2227612
680594     2049578
355003     1939736
1234291      86392
            ...   
1152568       4593
1156474       4593
1407454       4593
224047        4592
273828        4592
Name: trip_duration, Length: 5000, dtype: int64


In [4]:
# Remove trips with trip_duration larger than 10000
data = data[data['trip_duration'] <= 10000]

In [5]:
print(data.shape)


(1456521, 11)


In [6]:
# Convert the pickup and dropoff timestamps to datetime objects
data['pickup_datetime'] = pd.to_datetime(data['pickup_datetime'])
data['dropoff_datetime'] = pd.to_datetime(data['dropoff_datetime'])

In [7]:
# Remove rides with pickup or dropoff latitude outside the range [40.695, 40.830]
data = data[(data['pickup_latitude'] >= 40.695) & (data['pickup_latitude'] <= 40.830) & (data['dropoff_latitude'] >= 40.695) & (data['dropoff_latitude'] <= 40.830)]
data = data[(data['pickup_longitude'] >= -74.022) & (data['pickup_longitude'] <= -73.900) & (data['dropoff_longitude'] >= -74.022) & (data['dropoff_longitude'] <= -73.900)]

In [8]:
print(data.shape)

(1277659, 11)


In [9]:
data2_5=copy.deepcopy(data)
data2_5['pickup_longitude'] = data2_5['pickup_longitude'].apply(lambda x: round(x / 0.005) * 0.005)
data2_5['pickup_latitude'] = data2_5['pickup_latitude'].apply(lambda x: round(x / 0.005) * 0.005)
data2_5['dropoff_longitude'] = data2_5['dropoff_longitude'].apply(lambda x: round(x / 0.005) * 0.005)
data2_5['dropoff_latitude'] = data2_5['dropoff_latitude'].apply(lambda x: round(x / 0.005) *0.005)


In [10]:
import pandas as pd

# Define the start and end dates
start_date = pd.to_datetime('2016-01-04')
end_date = pd.to_datetime('2016-01-30')

# Create an empty DataFrame to store the time slots
time_slots = pd.DataFrame()

data_all_slots = {}
slot = 0
# Loop through each day
for day in pd.date_range(start=start_date, end=end_date, freq='D'):
    # Generate 12 time slots for each day
    slots = pd.date_range(start=day, periods=12, freq='2H')
    
    for timestamp in slots:
        start_time = timestamp
        end_time = timestamp + pd.Timedelta(hours=3)
        copy = data2_5[(data2_5['pickup_datetime'] >= start_time) & (data2_5['pickup_datetime'] < end_time)].copy()
        data_all_slots[slot]=(copy)
        slot += 1

In [11]:
print (len(data_all_slots))

324


In [12]:
def calc_air_distance(lat1, lon1, lat2, lon2):
    # Assuming 1 degree of latitude and longitude is approximately 111 kilometers
    lat_km = 111.0
    lon_km = 90.0

    # Calculate the difference in coordinates
    delta_lat = lat2 - lat1
    delta_lon = lon2 - lon1

    # Calculate the distance using Euclidean distance formula
    distance = sqrt((delta_lat * lat_km)**2 + (delta_lon * lon_km)**2)
    return distance

In [13]:
# 40.695  40.830  -74.022  -73.900
LongMatrix = np.zeros((123, 136))
# LongMatrix = np.array(LongMatrix)


LatMatrix = np.zeros((123, 136))
# LatMatrix = np.array(LatMatrix)
print(LongMatrix.shape)

(123, 136)


In [32]:
# Create a dictionary to store the directed graphs
x0 = -74022
y0 = 40695
graphs = {}
max_nodes = 0
# Loop through each data in data_all_slots
for key, data in data_all_slots.items():
    # Create a directed graph
    G = nx.DiGraph()
    selfloops = 0
    # Add edges to the graph
    for row in data.itertuples():
        source = (round(row.pickup_longitude, 3),round(row.pickup_latitude,3))
        destination = (round(row.dropoff_longitude,3),round( row.dropoff_latitude,3))
        distance = calc_air_distance(row.dropoff_latitude, row.dropoff_longitude, row.pickup_latitude, row.pickup_longitude)
        speed = 3600 * distance / row.trip_duration
        weight = speed
        if source != destination:
            G.add_edge(source, destination, weight=weight)
            x1 = int((1000 * source[0]) - x0)
            y1 = int((1000 * source[1]) - y0)
            x2 = int((1000 * destination[0]) - x0)
            y2 = int((1000 * destination[1]) - y0)
            LongMatrix[x1][y1] =round(source[0], 3)
            LongMatrix[x2][y2] = round(destination[0],3)
            LatMatrix[x1][y1] = round(source[1], 3)
            LatMatrix[x2][y2] = round(destination[1],3)
        else:
            selfloops += 1
    print(f'num of self loops in data = {selfloops}')
    print(f'num of edges in data = {G.number_of_edges()}')
    print(f'num of nodes in data = {G.number_of_nodes()}')
    if G.number_of_nodes() > max_nodes:
        max_nodes = G.number_of_nodes()
    # Add the graph to the dictionary
    graphs[key] = G
print (max_nodes)

num of self loops in data = 3
num of edges in data = 181
num of nodes in data = 144
num of self loops in data = 5
num of edges in data = 88
num of nodes in data = 96
num of self loops in data = 7
num of edges in data = 234
num of nodes in data = 144
num of self loops in data = 10
num of edges in data = 755
num of nodes in data = 196
num of self loops in data = 11
num of edges in data = 855
num of nodes in data = 191
num of self loops in data = 14
num of edges in data = 731
num of nodes in data = 186
num of self loops in data = 21
num of edges in data = 794
num of nodes in data = 192
num of self loops in data = 19
num of edges in data = 830
num of nodes in data = 189
num of self loops in data = 16
num of edges in data = 992
num of nodes in data = 205
num of self loops in data = 16
num of edges in data = 1120
num of nodes in data = 223
num of self loops in data = 13
num of edges in data = 852
num of nodes in data = 230
num of self loops in data = 11
num of edges in data = 487
num of node

In [15]:
# # Create a dictionary to store the directed graphs
# graphs = {}
# max_nodes = 0
# # Loop through each data in data_all_slots
# for key, data in data_all_slots.items():
#     # Create a directed graph
#     G = nx.DiGraph()
#     selfloops = 0
#     # Add edges to the graph
#     for row in data.itertuples():
#         source = (row.pickup_longitude, row.pickup_latitude, row.pickup_datetime.hour, row.pickup_datetime.dayofweek)
#         destination = (row.dropoff_longitude, row.dropoff_latitude, row.dropoff_datetime.hour, row.dropoff_datetime.dayofweek)
#         distance = calc_air_distance(row.dropoff_latitude, row.dropoff_longitude, row.pickup_latitude, row.pickup_longitude)
#         speed = 3600 * distance / row.trip_duration
#         weight = speed
#         print(source[0],source[1])
#         if (source[0] != destination[0] )and (source[1] != destination[1]):
#             G.add_edge(source, destination, weight=weight)
#         else:
#             selfloops += 1
#     print(f'num of self loops in data = {selfloops}')
#     print(f'num of edges in data = {G.number_of_edges()}')
#     print(f'num of nodes in data = {G.number_of_nodes()}')
#     if G.number_of_nodes() > max_nodes:
#         max_nodes = G.number_of_nodes()
#     # Add the graph to the dictionary
#     graphs[key] = G
# print (max_nodes)

In [33]:
non_zero_count = np.count_nonzero(LongMatrix)
print(non_zero_count)
non_zero_count = np.count_nonzero(LatMatrix)
print(non_zero_count)

512
512


In [34]:
for key, graph in graphs.items():
    for i in range (123):
        for j in range (136):
            if LongMatrix[i][j] != 0:
                node = (LongMatrix[i][j], LatMatrix[i][j])
                if not graph.has_node(node):
                    graph.add_node(node)

In [35]:
for key, graph in graphs.items():
    print(f'num of nodes in data = {graph.number_of_nodes()}')

num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
n

In [49]:
import networkx as nx

day_in_week = 2
hour = 1

# Iterate over each graph
for graph in graphs.values():
    # Iterate over each node in the graph
    for node in graph.nodes:
        graph.nodes[node]['day_in_week'] = day_in_week
        graph.nodes[node]['hour'] = hour
        print(graph.nodes[node])
    hour += 2
    if hour == 23:
        hour = 1
        day_in_week += 1
    if day_in_week == 8:
        day_in_week = 1
    # # Replace the node with new node
    # for graph in graphs.values():
    #     for node in list(graph.nodes):
    #         new_node = (node[0], node[1], day_in_week, hour)
    #         neighbors = list(graph.neighbors(node))
    #         graph.remove_node(node)
    #         graph.add_node(new_node)
    #         for neighbor in neighbors:
    #             graph.add_edge(new_node, neighbor)
    #     hour += 2
    #     if hour == 23:
    #         hour = 1
    #         day_in_week += 1
    #     if day_in_week == 8:
    #         day_in_week = 1

{'day_in_week': 2, 'hour': 1}
{'day_in_week': 2, 'hour': 1}
{'day_in_week': 2, 'hour': 1}
{'day_in_week': 2, 'hour': 1}
{'day_in_week': 2, 'hour': 1}
{'day_in_week': 2, 'hour': 1}
{'day_in_week': 2, 'hour': 1}
{'day_in_week': 2, 'hour': 1}
{'day_in_week': 2, 'hour': 1}
{'day_in_week': 2, 'hour': 1}
{'day_in_week': 2, 'hour': 1}
{'day_in_week': 2, 'hour': 1}
{'day_in_week': 2, 'hour': 1}
{'day_in_week': 2, 'hour': 1}
{'day_in_week': 2, 'hour': 1}
{'day_in_week': 2, 'hour': 1}
{'day_in_week': 2, 'hour': 1}
{'day_in_week': 2, 'hour': 1}
{'day_in_week': 2, 'hour': 1}
{'day_in_week': 2, 'hour': 1}
{'day_in_week': 2, 'hour': 1}
{'day_in_week': 2, 'hour': 1}
{'day_in_week': 2, 'hour': 1}
{'day_in_week': 2, 'hour': 1}
{'day_in_week': 2, 'hour': 1}
{'day_in_week': 2, 'hour': 1}
{'day_in_week': 2, 'hour': 1}
{'day_in_week': 2, 'hour': 1}
{'day_in_week': 2, 'hour': 1}
{'day_in_week': 2, 'hour': 1}
{'day_in_week': 2, 'hour': 1}
{'day_in_week': 2, 'hour': 1}
{'day_in_week': 2, 'hour': 1}
{'day_in_w

In [50]:
import networkx as nx

# Create an empty graph
big_graph = nx.DiGraph()

# Connect all graphs in the graphs dictionary
for graph in graphs.values():
    big_graph = nx.compose_all([big_graph, graph])

# Print the number of nodes and edges in the big graph
print(f"Number of nodes in the big graph: {big_graph.number_of_nodes()}")
print(f"Number of edges in the big graph: {big_graph.number_of_edges()}")

Number of nodes in the big graph: 512
Number of edges in the big graph: 27667


In [64]:
i = 0
A = {}
for graph in graphs.values():
    A[i] = nx.adjacency_matrix(graph, weight='weight').toarray()
    i += 1

In [67]:
from scipy.sparse import lil_matrix

# Calculate the total size of the block matrix
total_rows = sum(matrix.shape[0] for matrix in A.values())
total_cols = sum(matrix.shape[1] for matrix in A.values())

# Create a sparse matrix of the appropriate size
block_matrix = lil_matrix((total_rows, total_cols))

# Place each matrix in the correct position in the block matrix
current_row = 0
current_col = 0
for matrix in A.values():
    rows, cols = matrix.shape
    block_matrix[current_row:current_row + rows, current_col:current_col + cols] = matrix
    current_row += rows
    current_col += cols

# Convert to a more efficient format for further operations if needed
block_matrix = block_matrix.tocsr()

print(block_matrix)

  (0, 1)	9.981232388839581
  (0, 50)	19.600838126752603
  (1, 2)	29.565944351525367
  (1, 46)	26.429520228520825
  (1, 63)	34.56426767117028
  (2, 112)	22.438386661425458
  (3, 4)	18.9473684210354
  (3, 26)	25.696972193353304
  (3, 71)	13.815038893690286
  (3, 75)	28.060769002734993
  (3, 134)	19.113284531307603
  (3, 137)	22.616028580524304
  (4, 13)	21.435309654867975
  (4, 39)	18.409090909091606
  (4, 92)	10.872483221482073
  (5, 6)	12.459146611262545
  (5, 52)	19.636363636345777
  (6, 3)	17.241054513721487
  (7, 1)	23.960745367907876
  (7, 8)	26.11580475247742
  (7, 74)	26.70538925523486
  (7, 91)	13.258954425678372
  (8, 98)	18.844164586580167
  (9, 10)	18.9135085189986
  (9, 126)	21.114442163387366
  :	:
  (165571, 165576)	11.249999999989768
  (165574, 165418)	20.91249722424965
  (165577, 165567)	25.520541158304354
  (165578, 165561)	8.756756756773681
  (165590, 165591)	22.154468446083047
  (165592, 165407)	27.20014384718297
  (165594, 165398)	19.57347569433174
  (165596, 165571)

In [78]:
import numpy as np

# Define the number of nodes in the graph
num_nodes = 512 * len(graphs)
print(num_nodes)

# Create an empty features matrix
features_matrix = np.zeros((num_nodes, 4))

for j, graph in enumerate(graphs.values()):
    # Loop through each node
    for i, node in enumerate(graph.nodes()):
        # Get the node attributes
        longitude = node[0]
        latitude = node[1]
        hour =  graph.nodes[node]['hour']
        day_in_week =  graph.nodes[node]['day_in_week']
        
        # Assign the features to the features matrix
        features_matrix[i+j*512] = [longitude, latitude, hour, day_in_week]

# Print the features matrix
print(features_matrix)

165888
[[-73.995  40.76    1.      2.   ]
 [-73.99   40.75    1.      2.   ]
 [-73.98   40.76    1.      2.   ]
 ...
 [-73.9    40.815   9.      3.   ]
 [-73.9    40.82    9.      3.   ]
 [-73.9    40.83    9.      3.   ]]


In [82]:
np.savez('dataset/traffic/temporal/traffic.npz', adj=block_matrix, features=features_matrix )