In [1]:
import pandas as pd
import requests 
from xml.etree import ElementTree
from datetime import datetime

In [26]:
environments = ["local", "google_colab"]
# please select one of the environments
selected_env = environments[0]
chunk_size = 12 ** 6
TrafficEvents = None
WeatherEvents = None

max_distance_from_node = 50 # Meter
number_of_nearest_node = 5 # count

OSRM_BASIC_URL = "https://usaeta.bluebitsoft.com/"
OSM_BASIC_URL = "https://api.openstreetmap.org/api/0.6/"
if selected_env == "local":
    TrafficEvents = {
        "name": "TrafficEvents",
        # "file_name": "TrafficEvents_Aug16_Dec19_Publish_Road_Construction.csv",
        "file_name": "All_Constructions_June2020_Combined_TO_StartEnd_30_100.csv",
        "path": "/home/amin/CETI/RoadConstruction/TrafficEventData/",
    }
    WeatherEvents = {
        "name": "WeatherEvent",
        "file_name": "WeatherEvents_Aug16_Dec19_Publish.csv",
        "path": "/home/amin/CETI/RoadConstruction/WeatherEventData/",
    }

else:
    TrafficEvents = {
        "name": "TrafficEvents",
        "file_name": "TrafficEvents_Aug16_Dec19_Publish.csv",
        "id": "1uGLJS4uIbiUf7PjVGtQxo2afjClQT8JY",
    }
    WeatherEvents = {
        "name": "WeatherEvent",
        "file_name": "WeatherEvents_Aug16_Dec19_Publish.csv",
        "id": "1iNUrJYyxBxmklh0e6Iy61eXL34UJo6X8",
    }
    
load_data_type = ["complete", "part"]
selected_load_data_type = load_data_type[0]

chunk_size_each_part = 500
start_load_part_id = 0
end_load_part = 100


must_split_data = True


In [3]:
if selected_env == "google_colab":
    !pip install -U -q PyDrive
    !pip install --upgrade -q gspread

In [4]:
if selected_env == "google_colab":
    from pydrive.auth import GoogleAuth
    from pydrive.drive import GoogleDrive
    from google.colab import auth
    from oauth2client.client import GoogleCredentials

    auth.authenticate_user()
    gauth = GoogleAuth()
    gauth.credentials = GoogleCredentials.get_application_default()
    drive = GoogleDrive(gauth)

    traffic_events_downloaded = drive.CreateFile(TrafficEvents)
    traffic_events_downloaded.GetContentFile(TrafficEvents["file_name"])

    weather_events_downloaded = drive.CreateFile(WeatherEvents)
    weather_events_downloaded.GetContentFile(WeatherEvents["file_name"])

In [5]:

def read_big_data_by_filter_with_key_values(path, filters):
    data = None
    print("method {} is started ".format("read_big_data_filter_with_key_values"))
    print("read csv data is started with file {}".format(path))
    print("chunk size is {}".format(chunk_size))
    for key, value in zip(filters['keys'], filters['values']):
        print("the key is {} and the value is {}".format(filters['keys'], filters['values']))
    for num, df in enumerate(pd.read_csv(path, chunksize=chunk_size), start=1):
        print("continue reading file page num is {}".format(num))
        for key, value in zip(filters['keys'], filters['values']):
            df = df[df[key] == value]
        data = df.append(data)
    print("method {} finished ".format("read_big_data_filter_with_key_values"))
    return data

In [6]:
weather_file_path = WeatherEvents["path"] + WeatherEvents["file_name"]
traffic_file_path = TrafficEvents["path"] + TrafficEvents["file_name"]

if selected_env == "google_colab":
    traffic_file_path = TrafficEvents["file_name"]
    weather_file_path = WeatherEvents["file_name"]
traffic_filters = {
    "keys": ["Type"],
    "values": ["Construction"],
    }
weather_filters = {
    "keys": [],
    "values": [],
}

traffic_events_data = read_big_data_by_filter_with_key_values(traffic_file_path, traffic_filters)
weather_events_data = read_big_data_by_filter_with_key_values(weather_file_path, weather_filters)


method read_big_data_filter_with_key_values is started 
read csv data is started with file /home/amin/CETI/RoadConstruction/TrafficEventData/All_Constructions_June2020_Combined_TO_StartEnd_30_100.csv
chunk size is 2985984
the key is ['Type'] and the value is ['Construction']


  if (await self.run_code(code, result,  async_=asy)):


continue reading file page num is 1
method read_big_data_filter_with_key_values finished 
method read_big_data_filter_with_key_values is started 
read csv data is started with file /home/amin/CETI/RoadConstruction/WeatherEventData/WeatherEvents_Aug16_Dec19_Publish.csv
chunk size is 2985984
continue reading file page num is 1
continue reading file page num is 2
method read_big_data_filter_with_key_values finished 


In [29]:
if must_split_data:
    print(traffic_events_data.shape[0])
    print(traffic_events_data.shape)

2046493
(2046493, 58)


In [7]:
traffic_events_data.info(verbose=True)

weather_events_data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2046493 entries, 0 to 2050890
Data columns (total 58 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   Id                       object 
 1   Source                   object 
 2   Type                     object 
 3   EventCode                float64
 4   Severity                 int64  
 5   StartPoint_Lat           float64
 6   StartPoint_Lng           float64
 7   EndPoint_Lat             float64
 8   EndPoint_Lng             float64
 9   Distance(mi)             float64
 10  StartTime                object 
 11  EndTime                  object 
 12  ImpactingRoad            object 
 13  DelayFromTypical(mins)   float64
 14  DelayFromFreeFlow(mins)  float64
 15  ShortDescription         object 
 16  FullDescription          object 
 17  AdditionalDescription    object 
 18  Number                   float64
 19  Street                   object 
 20  Side                     object 
 21  City    

In [8]:
traffic_events_data.head(10)

Unnamed: 0,Id,Source,Type,EventCode,Severity,StartPoint_Lat,StartPoint_Lng,EndPoint_Lat,EndPoint_Lng,Distance(mi),...,Junction,Noexit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Circle,Turning_Loop
0,B-1,Bing,Construction,,3,38.250703,-85.750555,38.257121,-85.741728,0.652697,...,False,False,False,False,False,False,False,False,False,False
1,B-3,Bing,Construction,,4,38.5591,-85.76444,38.57591,-85.78586,1.639514,...,False,False,False,False,False,False,False,False,False,False
2,B-4,Bing,Construction,,3,38.25214,-85.7485,38.25133,-85.74661,0.116827,...,False,False,False,False,True,False,False,True,False,False
3,B-5,Bing,Construction,,3,41.09217,-80.64598,41.096203,-80.635086,0.632006,...,False,False,False,False,False,False,False,False,False,False
4,B-6,Bing,Construction,,3,38.25258,-85.74636,38.25053,-85.746758,0.143278,...,True,False,False,False,True,False,False,True,False,False
5,B-7,Bing,Construction,,4,41.73895,-84.23415,41.73899,-84.22262,0.594453,...,False,False,False,False,False,False,False,False,False,False
6,B-9,Bing,Construction,,2,39.5,-84.73031,39.489353,-84.721373,0.876482,...,False,False,False,False,False,False,False,False,False,False
7,B-10,Bing,Construction,,3,38.255166,-85.741822,38.256585,-85.739618,0.154635,...,False,False,False,False,False,False,False,False,False,False
8,B-13,Bing,Construction,,3,39.094616,-84.864515,39.095992,-84.858633,0.329429,...,False,False,False,False,False,False,False,False,False,False
9,B-14,Bing,Construction,,3,41.67975,-81.30335,41.690332,-81.305615,0.740429,...,False,False,False,False,False,False,False,True,False,False


In [9]:
weather_events_data.head(10)

Unnamed: 0,EventId,Type,Severity,StartTime(UTC),EndTime(UTC),TimeZone,LocationLat,LocationLng,AirportCode,City,County,State,ZipCode
2985984,W-3381383,Precipitation,UNK,2019-05-03 19:35:00,2019-05-03 20:15:00,US/Central,35.9499,-96.7731,KCUH,Cushing,Payne,OK,74023.0
2985985,W-3381384,Rain,Light,2019-05-03 20:35:00,2019-05-03 20:55:00,US/Central,35.9499,-96.7731,KCUH,Cushing,Payne,OK,74023.0
2985986,W-3381385,Rain,Moderate,2019-05-03 21:15:00,2019-05-03 21:55:00,US/Central,35.9499,-96.7731,KCUH,Cushing,Payne,OK,74023.0
2985987,W-3381386,Rain,Heavy,2019-05-03 21:55:00,2019-05-03 22:15:00,US/Central,35.9499,-96.7731,KCUH,Cushing,Payne,OK,74023.0
2985988,W-3381387,Rain,Moderate,2019-05-03 22:15:00,2019-05-03 22:55:00,US/Central,35.9499,-96.7731,KCUH,Cushing,Payne,OK,74023.0
2985989,W-3381388,Rain,Heavy,2019-05-03 22:55:00,2019-05-03 23:15:00,US/Central,35.9499,-96.7731,KCUH,Cushing,Payne,OK,74023.0
2985990,W-3381389,Rain,Light,2019-05-03 23:15:00,2019-05-04 00:35:00,US/Central,35.9499,-96.7731,KCUH,Cushing,Payne,OK,74023.0
2985991,W-3381390,Rain,Light,2019-05-06 05:35:00,2019-05-06 05:55:00,US/Central,35.9499,-96.7731,KCUH,Cushing,Payne,OK,74023.0
2985992,W-3381391,Rain,Light,2019-05-06 06:15:00,2019-05-06 06:35:00,US/Central,35.9499,-96.7731,KCUH,Cushing,Payne,OK,74023.0
2985993,W-3381392,Precipitation,UNK,2019-05-06 06:55:00,2019-05-06 08:15:00,US/Central,35.9499,-96.7731,KCUH,Cushing,Payne,OK,74023.0


In [10]:
# write to file

# traffic_events_data.to_csv (r'/home/amin/CETI/RoadConstruction/TrafficEventData/export_dataframe.csv', index = False, header=True)

In [11]:
class Location(object):
    def __init__(self, osrm_data):
        self.longitude = osrm_data[0]
        self.latitude = osrm_data[1]

In [12]:
class WayPoint(object):
    def __init__(self, json_data):
        self.nodes = json_data["nodes"]
        self.hint = json_data["hint"]
        self.distance = json_data["distance"]
        self.name = json_data["name"]
        self.location = Location(json_data["location"])
    
    def validate_way_point(self):
        if self.distance < max_distance_from_node:
            return True
        return False

In [13]:
class OSM(object):
    @staticmethod
    def get_node_details_by_node_id(node_id):
        URL = OSM_BASIC_URL + "node/{}".format(node_id)
        response = requests.get(url = URL)
        result_tree = ElementTree.fromstring(response.content)
        return result_tree, response.content
    
    @staticmethod
    def get_all_ways_contain_node_by_node_id(node_id):
        URL = OSM_BASIC_URL + "node/{}/ways".format(node_id)
        response = requests.get(url = URL)
        result_tree = ElementTree.fromstring(response.content)
        return result_tree, response.content
    

In [14]:
class OSRM(object):
    @staticmethod
    def get_all_nearest_nodes_of_location(location, number=1):
        params = {
            "number": number
        }
        URL = OSRM_BASIC_URL + "nearest/v1/driving/{},{}".format(location.longitude, location.latitude)
        response = requests.get(url = URL, params = params)
        return response.json() 

In [20]:
nearst_nodes_of_start_point = []
nearst_nodes_of_end_point = []

nearst_node_ids_of_start_validate_point = []
nearst_node_ids_of_end_validate_point = []

In [24]:
print(len(nearst_nodes_of_start_point))
print(len(nearst_nodes_of_end_point))
print(len(nearst_node_ids_of_start_validate_point))
print(len(nearst_node_ids_of_end_validate_point))

for index, data in traffic_events_data.iterrows():
    start_location = Location([data["StartPoint_Lng"], data["StartPoint_Lat"]])
    if "EndPoint_Lng" in data and "EndPoint_Lat" in data:
        end_location = Location([data["EndPoint_Lng"], data["EndPoint_Lat"]])
    else:
        end_location = Location([data["StartPoint_Lng"], data["StartPoint_Lat"]])

    nearst_nodes_start_location = OSRM.get_all_nearest_nodes_of_location(start_location, number_of_nearest_node)
    nearst_nodes_end_location = OSRM.get_all_nearest_nodes_of_location(end_location, number_of_nearest_node)
    
    
    nearst_way_points_start_location = []
    start_ids = []
    for point in nearst_nodes_start_location["waypoints"]:
        way_point = WayPoint(point)
        if way_point.validate_way_point():
            nearst_way_points_start_location.append(way_point)
            start_ids.extend(way_point.nodes)
    
    nearst_way_points_end_location = []
    end_ids = []
    for point in nearst_nodes_end_location["waypoints"]:
        way_point = WayPoint(point)
        if way_point.validate_way_point():
            nearst_way_points_end_location.append(way_point)
            end_ids.extend(way_point.nodes)
            
    
    nearst_nodes_of_start_point.append(nearst_nodes_start_location)
    nearst_nodes_of_end_point.append(nearst_nodes_end_location)
    
    nearst_node_ids_of_start_validate_point.append(start_ids)
    nearst_node_ids_of_end_validate_point.append(end_ids)
    
    if len(nearst_node_ids_of_start_validate_point) % 100 == 1:
        print(datetime.now())
        print(len(nearst_nodes_of_start_point))
        print(len(nearst_nodes_of_end_point))
        print(len(nearst_node_ids_of_start_validate_point))
        print(len(nearst_node_ids_of_end_validate_point))
        print("=======")
    
#     if len(nearst_node_ids_of_end_validate_point) >= 3:
#         break
    

1070
1070
1070
1070
2020-08-20 19:04:49.024999
1101
1101
1101
1101
2020-08-20 19:06:44.783192
1201
1201
1201
1201
2020-08-20 19:08:53.426751
1301
1301
1301
1301
2020-08-20 19:11:02.332098
1401
1401
1401
1401
2020-08-20 19:12:57.253299
1501
1501
1501
1501
2020-08-20 19:14:56.620285
1601
1601
1601
1601
2020-08-20 19:17:02.755979
1701
1701
1701
1701
2020-08-20 19:19:06.697226
1801
1801
1801
1801


KeyboardInterrupt: 

In [None]:
traffic_events_data["nearst_nodes_of_start_point"] = nearst_nodes_of_start_point
traffic_events_data["nearst_nodes_of_end_point"] = nearst_nodes_of_end_point
traffic_events_data["nearst_node_ids_of_start_validate_point"] = nearst_node_ids_of_start_validate_point
traffic_events_data["nearst_node_ids_of_end_validate_point"] = nearst_node_ids_of_end_validate_point


In [None]:
traffic_events_data.to_csv (r'/home/amin/CETI/RoadConstruction/TrafficEventData/export_dataframe_with_node_ids.csv', index = False, header=True)