In [1]:
import csv
import torch
import numpy as np
import pandas as pd
import copy
import networkx as nx
import random
import matplotlib.pyplot as plt
from math import sqrt 
import os




In [2]:
# Read the raw trip records into a DataFrame
data = pd.read_csv("train.csv")

# Report the frame's dimensions, then preview its first five rows
n_rows, n_cols = data.shape
print((n_rows, n_cols))
data.head()

(1458644, 11)


Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [3]:
# List the 5000 largest trip_duration values (largest first) to see where the outliers start
longest_trips = data['trip_duration'].nlargest(5000)
print(longest_trips)

978383     3526282
924150     2227612
680594     2049578
355003     1939736
1234291      86392
            ...   
1152568       4593
1156474       4593
1407454       4593
224047        4592
273828        4592
Name: trip_duration, Length: 5000, dtype: int64


In [4]:
# Keep only the trips whose trip_duration is at most 10000
data = data[data['trip_duration'].le(10000)]

In [5]:
# (rows, columns) remaining in `data` at this point
print((len(data.index), len(data.columns)))


(1456521, 11)


In [6]:
# Convert the pickup and dropoff timestamp columns to datetime values.
# `data` was produced by boolean filtering, so writing columns into it in place is
# the chained-assignment pattern pandas can flag with SettingWithCopyWarning.
# assign() returns a new frame with the two columns replaced (same column order),
# which sidesteps that and keeps the cell safe to re-run.
data = data.assign(
    pickup_datetime=pd.to_datetime(data['pickup_datetime']),
    dropoff_datetime=pd.to_datetime(data['dropoff_datetime']),
)

In [7]:
# Keep only rides whose pickup AND dropoff both fall inside the bounding box
# latitude [40.695, 40.830] x longitude [-74.022, -73.900] (bounds inclusive).
lat_ok = data['pickup_latitude'].between(40.695, 40.830) & data['dropoff_latitude'].between(40.695, 40.830)
lon_ok = data['pickup_longitude'].between(-74.022, -73.900) & data['dropoff_longitude'].between(-74.022, -73.900)
data = data[lat_ok & lon_ok]

In [8]:
# (rows, columns) remaining in `data` at this point
print((len(data.index), len(data.columns)))

(1277659, 11)


In [9]:
# Work on an independent copy so `data` keeps the unrounded coordinates.
# DataFrame.copy() is a deep copy by default, so the `copy` module is not needed here.
data2_5 = data.copy()

# Snap every pickup/dropoff coordinate to the nearest multiple of 0.005 degrees.
# Vectorised form of round(x / 0.005) * 0.005: Series.round() rounds halves to even,
# like Python's round(), so the values match the element-wise version without
# calling a Python lambda once per row and column.
for coord_col in ('pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude'):
    data2_5[coord_col] = (data2_5[coord_col] / 0.005).round() * 0.005



In [10]:
# Define the start and end dates (both days are included in the loop below)
start_date = pd.to_datetime('2016-01-04')
end_date = pd.to_datetime('2016-01-30')

# NOTE(review): this frame is never filled or read in the visible cells --
# confirm nothing later depends on it, then remove.
time_slots = pd.DataFrame()

# Maps a running slot index to the trips picked up inside that slot's window
data_all_slots = {}
slot = 0
# Loop through each day
for day in pd.date_range(start=start_date, end=end_date, freq='D'):
    # 12 window starts per day, two hours apart. The step is passed as a Timedelta
    # instead of the '2H' alias, which newer pandas releases deprecate.
    slots = pd.date_range(start=day, periods=12, freq=pd.Timedelta(hours=2))

    for timestamp in slots:
        start_time = timestamp
        # NOTE(review): windows are 3 hours long but start every 2 hours, so
        # consecutive slots share one hour of trips and the last slot of a day
        # runs past midnight. Confirm the overlap is intended.
        end_time = timestamp + pd.Timedelta(hours=3)
        in_window = (data2_5['pickup_datetime'] >= start_time) & (data2_5['pickup_datetime'] < end_time)
        # Deliberately not named `copy`: that would rebind the imported `copy`
        # module to a DataFrame and break any later copy.deepcopy(...) call.
        slot_trips = data2_5[in_window].copy()
        data_all_slots[slot] = slot_trips
        slot += 1

In [11]:
print (len(data_all_slots))

324


In [12]:
def calc_air_distance(lat1, lon1, lat2, lon2):
    """Approximate straight-line distance between two coordinates.

    Each coordinate difference is scaled by a fixed factor (111.0 per degree of
    latitude, 90.0 per degree of longitude; kilometres, per the constant names)
    and the two scaled legs are combined with the Euclidean formula.

    Args:
        lat1, lon1: latitude and longitude of the first point, in degrees.
        lat2, lon2: latitude and longitude of the second point, in degrees.

    Returns:
        The distance as a float, in the same unit as the scale factors.
    """
    km_per_deg_lat = 111.0
    km_per_deg_lon = 90.0

    # Scaled north-south and east-west legs of the trip
    north_south = (lat2 - lat1) * km_per_deg_lat
    east_west = (lon2 - lon1) * km_per_deg_lon

    return sqrt(north_south**2 + east_west**2)

In [63]:
# Bounding box: latitude 40.695 .. 40.830, longitude -74.022 .. -73.900.
# One cell per 0.001 degree: 123 cells along the first axis, 136 along the second.
grid_shape = (123, 136)

# Zero-filled grids; a cell left at 0 means no coordinate has been stored there
LongMatrix = np.zeros(grid_shape)
LatMatrix = np.zeros_like(LongMatrix)
print(LongMatrix.shape)

(123, 136)


In [64]:
# Build one directed graph per time slot and record, in LongMatrix/LatMatrix,
# the coordinates of every grid cell that appears as a trip endpoint.

# Grid origin in thousandths of a degree: longitude -74.022, latitude 40.695
x0 = -74022
y0 = 40695
# Create a dictionary to store the directed graphs (slot index -> DiGraph)
graphs = {}
max_nodes = 0


def _grid_index(coord, origin):
    """Return the integer grid offset of `coord` (degrees) from `origin` (thousandths of a degree)."""
    # 1000 * coord is rarely exact in floating point (it can come out as
    # 64.99999999999 instead of 65), and int() truncates toward zero, which
    # would shift the index down by one. Round to the nearest integer instead.
    return int(round(1000 * coord - origin))


# The loop variable is not called `data`, so the trips DataFrame of the
# earlier cells is not overwritten by the last slot's frame.
for key, slot_trips in data_all_slots.items():
    # Create a directed graph
    G = nx.DiGraph()
    selfloops = 0
    # Add edges to the graph
    for row in slot_trips.itertuples():
        source = (round(row.pickup_longitude, 3), round(row.pickup_latitude, 3))
        destination = (round(row.dropoff_longitude, 3), round(row.dropoff_latitude, 3))
        if source == destination:
            # Pickup and dropoff fall in the same grid cell: count it, add no edge
            selfloops += 1
            continue
        if row.trip_duration <= 0:
            # A non-positive duration has no meaningful speed (and would divide by zero)
            continue
        distance = calc_air_distance(row.dropoff_latitude, row.dropoff_longitude, row.pickup_latitude, row.pickup_longitude)
        # trip_duration is divided into 3600 * distance, i.e. distance per hour
        # if the duration is in seconds -- TODO confirm the unit.
        speed = 3600 * distance / row.trip_duration
        # NOTE(review): a later trip between the same two cells overwrites the
        # weight, so each edge keeps the speed of the last trip seen.
        G.add_edge(source, destination, weight=speed)
        x1, y1 = _grid_index(source[0], x0), _grid_index(source[1], y0)
        x2, y2 = _grid_index(destination[0], x0), _grid_index(destination[1], y0)
        # Remember the coordinates of both endpoint cells
        LongMatrix[x1][y1] = source[0]
        LatMatrix[x1][y1] = source[1]
        LongMatrix[x2][y2] = destination[0]
        LatMatrix[x2][y2] = destination[1]
    print(f'num of self loops in data = {selfloops}')
    print(f'num of edges in data = {G.number_of_edges()}')
    print(f'num of nodes in data = {G.number_of_nodes()}')
    # Track the largest node count seen across all slots
    max_nodes = max(max_nodes, G.number_of_nodes())
    # Add the graph to the dictionary
    graphs[key] = G
print (max_nodes)

num of self loops in data = 3
num of edges in data = 181
num of nodes in data = 144
num of self loops in data = 5
num of edges in data = 88
num of nodes in data = 96
num of self loops in data = 7
num of edges in data = 234
num of nodes in data = 144
num of self loops in data = 10
num of edges in data = 755
num of nodes in data = 196
num of self loops in data = 11
num of edges in data = 855
num of nodes in data = 191
num of self loops in data = 14
num of edges in data = 731
num of nodes in data = 186
num of self loops in data = 21
num of edges in data = 794
num of nodes in data = 192
num of self loops in data = 19
num of edges in data = 830
num of nodes in data = 189
num of self loops in data = 16
num of edges in data = 992
num of nodes in data = 205
num of self loops in data = 16
num of edges in data = 1120
num of nodes in data = 223
num of self loops in data = 13
num of edges in data = 852
num of nodes in data = 230
num of self loops in data = 11
num of edges in data = 487
num of node

In [63]:
# NOTE(review): every line of this cell is commented out, so running it defines nothing.
# It is a variant of the graph-building loop whose node keys also carry the
# pickup/dropoff hour and day of week. Its edge condition joins the longitude and
# latitude comparisons with `and`, so a trip would be counted as a self loop
# whenever EITHER coordinate matches -- confirm that is intended before reviving it.
# # Create a dictionary to store the directed graphs
# graphs = {}
# max_nodes = 0
# # Loop through each data in data_all_slots
# for key, data in data_all_slots.items():
#     # Create a directed graph
#     G = nx.DiGraph()
#     selfloops = 0
#     # Add edges to the graph
#     for row in data.itertuples():
#         source = (row.pickup_longitude, row.pickup_latitude, row.pickup_datetime.hour, row.pickup_datetime.dayofweek)
#         destination = (row.dropoff_longitude, row.dropoff_latitude, row.dropoff_datetime.hour, row.dropoff_datetime.dayofweek)
#         distance = calc_air_distance(row.dropoff_latitude, row.dropoff_longitude, row.pickup_latitude, row.pickup_longitude)
#         speed = 3600 * distance / row.trip_duration
#         weight = speed
#         print(source[0],source[1])
#         if (source[0] != destination[0] )and (source[1] != destination[1]):
#             G.add_edge(source, destination, weight=weight)
#         else:
#             selfloops += 1
#     print(f'num of self loops in data = {selfloops}')
#     print(f'num of edges in data = {G.number_of_edges()}')
#     print(f'num of nodes in data = {G.number_of_nodes()}')
#     if G.number_of_nodes() > max_nodes:
#         max_nodes = G.number_of_nodes()
#     # Add the graph to the dictionary
#     graphs[key] = G
# print (max_nodes)

-73.995 40.76
-73.99 40.75
-73.985 40.765
-74.005 40.75
-73.955 40.77
-73.98 40.785000000000004
-73.99 40.725
-73.965 40.755
-74.005 40.74
-74.005 40.74
-73.96000000000001 40.76
-73.97 40.755
-73.985 40.735
-73.98 40.775
-73.97500000000001 40.725
-74.0 40.745
-73.99 40.72
-73.97500000000001 40.78
-73.95 40.725
-73.935 40.715
-73.985 40.77
-73.97500000000001 40.785000000000004
-74.0 40.745
-73.96000000000001 40.765
-73.95 40.785000000000004
-74.01 40.74
-74.005 40.735
-73.965 40.76
-74.01 40.745
-73.985 40.77
-74.0 40.745
-73.955 40.78
-73.955 40.775
-73.955 40.77
-74.0 40.72
-73.965 40.79
-73.99 40.730000000000004
-73.995 40.745
-73.99 40.745
-73.995 40.74
-73.99 40.75
-74.01 40.705
-73.995 40.72
-74.0 40.74
-73.97 40.755
-73.97500000000001 40.76
-74.005 40.730000000000004
-73.955 40.775
-74.005 40.74
-73.98 40.775
-73.985 40.765
-73.995 40.755
-73.97500000000001 40.75
-73.955 40.81
-73.99 40.725
-73.955 40.77
-74.0 40.75
-73.99 40.72
-73.93 40.695
-73.97 40.795
-73.985 40.725
-73.955 

In [65]:
# Print how many cells of each grid hold a non-zero value (longitude grid first)
for coordinate_grid in (LongMatrix, LatMatrix):
    non_zero_count = np.count_nonzero(coordinate_grid)
    print(non_zero_count)

512
512


In [67]:
# Give every slot graph the same node set: one node per grid cell whose stored
# longitude is non-zero.
# The occupied cells do not depend on the graph, so they are collected once
# instead of rescanning the whole grid for every graph. np.nonzero walks the
# grid in row-major order, the same order as a nested i/j loop.
occupied_rows, occupied_cols = np.nonzero(LongMatrix)
grid_nodes = [
    (LongMatrix[i, j], LatMatrix[i, j])
    for i, j in zip(occupied_rows, occupied_cols)
]

for key, graph in graphs.items():
    for node in grid_nodes:
        # add_node leaves an already-present node in place, so no has_node check is needed
        graph.add_node(node)

In [70]:
# Print the node count of every slot graph, in slot order
for slot_graph in graphs.values():
    node_total = slot_graph.number_of_nodes()
    print(f'num of nodes in data = {node_total}')

num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
num of nodes in data = 512
n