# Importing libraries

In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error


## Data Collection (HERE and TfL APIs → Google Cloud Storage)

In [37]:
import os
import requests
import json
import time
import pandas as pd
from google.cloud import storage
from datetime import datetime
import schedule

# Configuration.
# The HERE API key is a secret and must not be stored in the notebook: it is
# read from the HERE_API_KEY environment variable instead.
# NOTE(review): any key that was ever committed in this cell should be
# treated as leaked and rotated in the HERE developer portal.
API_KEY = os.environ.get("HERE_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "HERE_API_KEY environment variable is not set; "
        "export it before running this cell."
    )

# Destination bucket for every upload performed by this notebook.
BUCKET_NAME = 'traffic_intelligent_platform'

# Shared Google Cloud Storage client used by all upload helpers below.
storage_client = storage.Client()

# Ensure directory exists
def ensure_dir(directory):
    """Create ``directory`` (including missing parents) if it is not there yet.

    Uses ``exist_ok=True`` instead of a separate existence check so that the
    call stays safe when another process creates the directory between the
    check and the creation.

    Args:
        directory: Path of the directory that must exist afterwards.

    Returns:
        None.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory.
        OSError: For any other failure reported by ``os.makedirs``.
    """
    os.makedirs(directory, exist_ok=True)

# Save data to Google Cloud Storage in JSON and CSV formats
def save_data_to_gcs(data, gcs_folder, file_prefix):
    """Upload ``data`` to the project bucket as JSON and, if possible, as CSV.

    The JSON object is always written. When ``data`` contains a ``"results"``
    key, those records are additionally flattened with ``pd.json_normalize``
    and written as a CSV next to the JSON file. Both object names share one
    timestamp with one-second resolution.

    Args:
        data: JSON-serialisable payload to store.
        gcs_folder: Folder (object-name prefix) inside ``BUCKET_NAME``.
        file_prefix: Leading part of the generated object names.

    Returns:
        None.
    """
    stamp = time.strftime('%Y%m%d_%H%M%S')

    def _blob_for(object_name):
        # Resolve the target object inside the configured bucket/folder.
        return storage_client.bucket(BUCKET_NAME).blob(f"{gcs_folder}/{object_name}")

    # Raw payload, pretty-printed.
    raw_name = f"{file_prefix}_data_{stamp}.json"
    _blob_for(raw_name).upload_from_string(
        json.dumps(data, indent=4), content_type="application/json"
    )
    print(f"Data saved to GCS as {raw_name} in folder {gcs_folder}")

    # Nothing to flatten unless the payload carries a 'results' entry.
    if "results" not in data:
        return

    flattened = pd.json_normalize(data["results"])
    flat_name = f"{file_prefix}_data_{stamp}.csv"
    _blob_for(flat_name).upload_from_string(
        flattened.to_csv(index=False), content_type="text/csv"
    )
    print(f"Flattened data saved to GCS as {flat_name} in folder {gcs_folder}")

# HERE API Functions
def get_here_data(bbox, data_type="flow", timeout=30):
    """Fetch one data set from the HERE Traffic API v7 for a bounding box.

    Args:
        bbox: Bounding-box string, inserted verbatim after ``in=bbox:`` in the
            request URL (the notebook passes e.g. "-0.15,51.50,-0.10,51.52").
        data_type: API resource to query; the notebook uses "flow" and
            "incidents".
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The decoded JSON payload when the request succeeds and the payload has
        a non-empty "results" entry; otherwise ``None``. Failures are reported
        with ``print`` rather than raised.
    """
    url = f"https://data.traffic.hereapi.com/v7/{data_type}?locationReferencing=shape&in=bbox:{bbox}&apiKey={API_KEY}"

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Print only the exception type: the exception message usually embeds
        # the request URL, which contains the API key.
        print(f"Error fetching {data_type} data: {type(exc).__name__}")
        return None

    if response.status_code != 200:
        print(f"Error fetching {data_type} data: {response.status_code}")
        return None

    try:
        data = response.json()
    except ValueError:
        # Body was not valid JSON (requests' JSON errors subclass ValueError).
        print(f"Error fetching {data_type} data: response body is not valid JSON")
        return None

    # An empty or missing "results" list is treated the same as "no data".
    return data if isinstance(data, dict) and data.get("results") else None

def filter_flow_data(data, jam_threshold=7):
    """Keep only the flow records that indicate heavy congestion or a closure.

    A record is kept when its ``currentFlow.jamFactor`` is strictly greater
    than ``jam_threshold`` or its ``currentFlow.traversability`` equals
    "closed". Records with a missing or null ``currentFlow`` / ``jamFactor``
    are treated as jam factor 0 and traversability "open".

    Args:
        data: Mapping with a "results" list of flow records (dicts).
        jam_threshold: Jam-factor cut-off; defaults to 7, the value this
            notebook has always used.

    Returns:
        A new list containing the matching records, in their original order.
    """
    selected = []
    for result in data.get("results") or []:
        # `or {}` / `or 0` also cover keys that are present but set to None.
        flow = result.get("currentFlow") or {}
        jam_factor = flow.get("jamFactor") or 0
        traversability = flow.get("traversability", "open")
        if jam_factor > jam_threshold or traversability == "closed":
            selected.append(result)
    return selected

def fetch_and_save_here_data(bbox):
    """Download HERE flow and incident data for ``bbox`` and store it in GCS.

    Flow data is saved in full and, when any record passes
    ``filter_flow_data``, a second time as a filtered subset. Incident data
    is saved as-is. Each data set is skipped when ``get_here_data`` returns
    nothing.

    Args:
        bbox: Bounding-box string forwarded to ``get_here_data``.

    Returns:
        None.
    """
    flow_folder = "traffic_data/real_time/here/flow"
    incident_folder = "traffic_data/real_time/here/incidents"

    # Traffic flow: raw payload first, then the congested/closed subset.
    flow_payload = get_here_data(bbox, "flow")
    if flow_payload:
        save_data_to_gcs(flow_payload, flow_folder, "here_flow")
        congested = filter_flow_data(flow_payload)
        if congested:
            save_data_to_gcs({"results": congested}, flow_folder, "here_filtered_flow")

    # Traffic incidents.
    incident_payload = get_here_data(bbox, "incidents")
    if incident_payload:
        save_data_to_gcs(incident_payload, incident_folder, "here_incident")

# TFL API Functions
def fetch_tfl_data(endpoint, timeout=30):
    """Fetch and decode one endpoint of the TfL Unified API.

    Network failures are caught and reported instead of raised: this function
    runs inside scheduled jobs, and an exception escaping a job would stop the
    polling loop that calls ``schedule.run_pending()``.

    Args:
        endpoint: Path (optionally with query string) appended to
            ``https://api.tfl.gov.uk/``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The decoded JSON body on HTTP 200, otherwise ``None``.
    """
    base_url = "https://api.tfl.gov.uk"
    full_url = f"{base_url}/{endpoint}"

    try:
        response = requests.get(full_url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error fetching data from {endpoint}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error fetching data from {endpoint}: {response.status_code}")
        return None

    try:
        return response.json()
    except ValueError:
        # Body was not valid JSON (requests' JSON errors subclass ValueError).
        print(f"Error fetching data from {endpoint}: response body is not valid JSON")
        return None

def upload_to_gcs(bucket_name, data, destination_blob_name):
    """Serialise ``data`` as indented JSON and upload it to a GCS object.

    Args:
        bucket_name: Name of the destination bucket.
        data: JSON-serialisable payload.
        destination_blob_name: Full object name inside the bucket.

    Returns:
        None.
    """
    target = storage_client.bucket(bucket_name).blob(destination_blob_name)
    serialized = json.dumps(data, indent=4)
    target.upload_from_string(serialized, content_type="application/json")
    print(f"Data successfully uploaded to GCS: gs://{bucket_name}/{destination_blob_name}")

def job_tfl(data_type, endpoint, gcs_folder):
    """Fetch one TfL endpoint and upload the result as a timestamped JSON.

    Nothing is uploaded when the fetch returns no data.

    Args:
        data_type: Short label used as the prefix of the object name.
        endpoint: TfL API path passed to ``fetch_tfl_data``.
        gcs_folder: Folder (object-name prefix) inside ``BUCKET_NAME``.

    Returns:
        None.
    """
    payload = fetch_tfl_data(endpoint)
    if not payload:
        return

    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    upload_to_gcs(BUCKET_NAME, payload, f"{gcs_folder}/{data_type}_tfl_data_{stamp}.json")

def schedule_tfl_jobs():
    """Register the recurring TfL download jobs (one every 5 minutes each).

    The jobs are tagged "tfl", and any job carrying that tag is removed
    before registering. ``schedule`` keeps its jobs in a module-level
    scheduler, so without this step every re-run of the notebook cell would
    add another copy of each job and produce duplicate uploads.

    Returns:
        None.
    """
    job_tag = "tfl"
    schedule.clear(job_tag)

    # (data_type label, TfL endpoint, destination folder in the bucket)
    feeds = (
        ("road", "Road", "traffic_data/real_time/tfl/road"),
        ("disruption", "Road/all/Disruption", "traffic_data/real_time/tfl/disruption"),
        ("jamcam", "Place/Type/JamCam?activeOnly=true", "traffic_data/real_time/tfl/jamcam"),
    )
    for data_type, endpoint, gcs_folder in feeds:
        schedule.every(5).minutes.do(job_tfl, data_type, endpoint, gcs_folder).tag(job_tag)

# Bounding box for central London.
# The string is passed unchanged to the HERE request built in get_here_data.
bbox = "-0.15,51.50,-0.10,51.52"

# Fetch and save HERE data.
# This happens once, when the cell runs; HERE fetching is not put on the schedule.
fetch_and_save_here_data(bbox)

# Schedule TFL data fetching (registers the jobs only; nothing is fetched here).
schedule_tfl_jobs()

# Run scheduled jobs in a loop.
# The loop polls the scheduler every 15 seconds and never exits on its own, so
# this cell blocks the kernel until it is interrupted (KeyboardInterrupt).
try:
    while True:
        schedule.run_pending()
        time.sleep(15)
except KeyboardInterrupt:
    print("Script execution stopped by user.")



Data saved to GCS as here_flow_data_20241028_202754.json in folder traffic_data/real_time/here/flow
Flattened data saved to GCS as here_flow_data_20241028_202754.csv in folder traffic_data/real_time/here/flow
Data saved to GCS as here_filtered_flow_data_20241028_202800.json in folder traffic_data/real_time/here/flow
Flattened data saved to GCS as here_filtered_flow_data_20241028_202800.csv in folder traffic_data/real_time/here/flow
Data saved to GCS as here_incident_data_20241028_202801.json in folder traffic_data/real_time/here/incidents
Flattened data saved to GCS as here_incident_data_20241028_202801.csv in folder traffic_data/real_time/here/incidents
Starting job for road_data at endpoint: Road
Data successfully uploaded to GCS: gs://traffic_intelligent_platform/traffic_data/road_data/road_data_20241028_202802.json
Starting job for disruption_data at endpoint: Road/all/Disruption
Data successfully uploaded to GCS: gs://traffic_intelligent_platform/traffic_data/disruption_data/disru

## Importing data from Google Cloud Storage

In [20]:
import pandas as pd
import gcsfs
import json

# Initialize the GCS file system
fs = gcsfs.GCSFileSystem()

# Define file paths (adjusted for your structure).
# Each path points at one specific timestamped snapshot in the bucket, so this
# cell always loads the same files regardless of newer uploads.
here_flow = 'gs://traffic_intelligent_platform/traffic_data/real_time/here/flow/here_filtered_flow_data_20241028_193451.json'
here_incident = 'gs://traffic_intelligent_platform/traffic_data/real_time/here/incidents/here_incident_data_20241028_193453.csv'
tfl_disruption = 'gs://traffic_intelligent_platform/traffic_data/real_time/tfl/disruption/disruption_tfl_data_20241028_193900.json'
tfl_road = 'gs://traffic_intelligent_platform/traffic_data/real_time/tfl/road/road_tfl_data_20241028_193900.json'

# Load HERE flow JSON data (the filtered snapshot).
# NOTE(review): the .info()/.head() output for this frame shows a single
# object column named `results` holding nested records, i.e. the payload is
# not flattened here; pd.json_normalize on that column is probably needed
# before analysis - confirm against later cells.
with fs.open(here_flow) as f:
    here_flow_df = pd.read_json(f)

# Load HERE incident CSV data (the already-flattened CSV export, unlike the
# other three inputs, which are read from JSON).
with fs.open(here_incident) as f:
    here_incident_df = pd.read_csv(f)

# Load TFL disruption JSON data
with fs.open(tfl_disruption) as f:
    tfl_disruption_df = pd.read_json(f)

# Load TFL road JSON data
with fs.open(tfl_road) as f:
    tfl_road_df = pd.read_json(f)



## Data Exploration 

In [38]:
# Column names, non-null counts and dtypes of the TfL road frame.
tfl_road_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23 entries, 0 to 22
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   $type                      23 non-null     object
 1   id                         23 non-null     object
 2   displayName                23 non-null     object
 3   statusSeverity             23 non-null     object
 4   statusSeverityDescription  23 non-null     object
 5   bounds                     23 non-null     object
 6   envelope                   23 non-null     object
 7   url                        23 non-null     object
 8   group                      6 non-null      object
dtypes: object(9)
memory usage: 1.7+ KB


In [39]:
# Column names, non-null counts and dtypes of the TfL disruption frame.
tfl_disruption_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93 entries, 0 to 92
Data columns (total 26 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   $type                      93 non-null     object
 1   id                         93 non-null     object
 2   url                        93 non-null     object
 3   point                      93 non-null     object
 4   severity                   93 non-null     object
 5   ordinal                    93 non-null     int64 
 6   category                   93 non-null     object
 7   subCategory                93 non-null     object
 8   comments                   93 non-null     object
 9   currentUpdate              93 non-null     object
 10  currentUpdateDateTime      93 non-null     object
 11  corridorIds                93 non-null     object
 12  startDateTime              93 non-null     object
 13  endDateTime                93 non-null     object
 14  lastModified

In [40]:
# Structure of the HERE flow frame. In the captured output it has a single
# object column, `results`, so the nested records still need flattening.
here_flow_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   results  11 non-null     object
dtypes: object(1)
memory usage: 216.0+ bytes


In [42]:
# Column names, non-null counts and dtypes of the HERE incident frame
# (loaded from the flattened CSV, hence the dotted column names).
here_incident_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 20 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   location.length                         84 non-null     float64
 1   location.shape.links                    84 non-null     object 
 2   incidentDetails.id                      84 non-null     int64  
 3   incidentDetails.hrn                     84 non-null     object 
 4   incidentDetails.originalId              84 non-null     int64  
 5   incidentDetails.originalHrn             84 non-null     object 
 6   incidentDetails.startTime               84 non-null     object 
 7   incidentDetails.endTime                 84 non-null     object 
 8   incidentDetails.entryTime               84 non-null     object 
 9   incidentDetails.roadClosed              84 non-null     bool   
 10  incidentDetails.criticality             84 non-null     object 


In [21]:
# Preview the first rows of the TfL road frame.
tfl_road_df.head()


TFL Road Data:


Unnamed: 0,$type,id,displayName,statusSeverity,statusSeverityDescription,bounds,envelope,url,group
0,"Tfl.Api.Presentation.Entities.RoadCorridor, Tf...",a1,A1,Good,No Exceptional Delays,"[[-0.25616,51.5319],[-0.10234,51.6562]]","[[-0.25616,51.5319],[-0.25616,51.6562],[-0.102...",/Road/a1,
1,"Tfl.Api.Presentation.Entities.RoadCorridor, Tf...",a10,A10,Good,No Exceptional Delays,"[[-0.08703,51.52719],[-0.04999,51.68256]]","[[-0.08703,51.52719],[-0.08703,51.68256],[-0.0...",/Road/a10,
2,"Tfl.Api.Presentation.Entities.RoadCorridor, Tf...",a12,A12,Good,No Exceptional Delays,"[[-0.07183,51.51187],[0.28532,51.60844]]","[[-0.07183,51.51187],[-0.07183,51.60844],[0.28...",/Road/a12,
3,"Tfl.Api.Presentation.Entities.RoadCorridor, Tf...",a13,A13,Good,No Exceptional Delays,"[[-0.07183,51.49995],[0.22918,51.53299]]","[[-0.07183,51.49995],[-0.07183,51.53299],[0.22...",/Road/a13,
4,"Tfl.Api.Presentation.Entities.RoadCorridor, Tf...",a2,A2,Good,No Exceptional Delays,"[[-0.0857,51.44091],[0.17118,51.49438]]","[[-0.0857,51.44091],[-0.0857,51.49438],[0.1711...",/Road/a2,


In [25]:
# Preview the first rows of the TfL disruption frame.
tfl_disruption_df.head()

Unnamed: 0,$type,id,url,point,severity,ordinal,category,subCategory,comments,currentUpdate,...,location,status,geography,isProvisional,hasClosures,roadDisruptionLines,roadDisruptionImpactAreas,recurringSchedules,geometry,streets
0,"Tfl.Api.Presentation.Entities.RoadDisruption, ...",TIMS-204604,/Road/All/Disruption/TIMS-204604,"[0.140113,51.448509]",Moderate,1,Works,TfL works,[A2] East Rochester Way (Both directions) betw...,Traffic is flowing well.,...,"[A2] EAST ROCHESTER WAY (DA5,DA6) (Bexley)",Active,"{'type': 'Point', 'coordinates': [0.140113, 51...",False,False,[],[],[],,
1,"Tfl.Api.Presentation.Entities.RoadDisruption, ...",TIMS-203815,/Road/All/Disruption/TIMS-203815,"[-0.000708,51.458658]",Moderate,2,Works,Utility works,[A20] Lee High Road (Both directions) between ...,Delays are possible.,...,"[A20] LEE HIGH ROAD (SE12,SE13,SE3) (Lewisham)",Active,"{'type': 'Point', 'coordinates': [-0.000708, 5...",False,False,[],[],[],,
2,"Tfl.Api.Presentation.Entities.RoadDisruption, ...",TIMS-199964,/Road/All/Disruption/TIMS-199964,"[-0.16044,51.435373]",Moderate,3,Works,Utility works,[A24] Upper Tooting Road (Both directions) bet...,Traffic is flowing well.,...,[A24] UPPER TOOTING ROAD (SW17) (Wandsworth),Active,"{'type': 'Point', 'coordinates': [-0.16044, 51...",False,False,[],[],[],,
3,"Tfl.Api.Presentation.Entities.RoadDisruption, ...",TIMS-204859,/Road/All/Disruption/TIMS-204859,"[-0.290523,51.467115]",Moderate,4,Works,TfL works,[A316] Lower Mortlake Road (Eastbound) at the ...,Traffic is flowing well.,...,[A316] LOWER MORTLAKE ROAD (TW9) (Richmond upo...,Active,"{'type': 'Point', 'coordinates': [-0.290523, 5...",False,False,[],[],[],"{'type': 'Polygon', 'coordinates': [[[-0.29130...",[{'$type': 'Tfl.Api.Presentation.Entities.Stre...
4,"Tfl.Api.Presentation.Entities.RoadDisruption, ...",TIMS-181068,/Road/All/Disruption/TIMS-181068,"[-0.288237,51.467669]",Moderate,5,Works,TfL works,[A316] Lower Richmond Road (Both directions) a...,Delays are possible.,...,[A316] LOWER RICHMOND ROAD (TW9) (Richmond upo...,Active,"{'type': 'Point', 'coordinates': [-0.288236999...",False,False,[],[],[],"{'type': 'Polygon', 'coordinates': [[[-0.29130...",[{'$type': 'Tfl.Api.Presentation.Entities.Stre...


In [23]:

# Preview the first rows of the HERE flow frame (each row is one nested
# record inside the `results` column).
here_flow_df.head()

Unnamed: 0,results
0,{'location': {'description': 'B521/Holborn Cir...
1,"{'location': {'length': 220.0, 'shape': {'link..."
2,"{'location': {'length': 564.0, 'shape': {'link..."
3,"{'location': {'description': 'Fitzrovia', 'len..."
4,"{'location': {'description': 'Bloomsbury', 'le..."


In [24]:
# Preview the first rows of the HERE incident frame.
here_incident_df.head()

Unnamed: 0,location.length,location.shape.links,incidentDetails.id,incidentDetails.hrn,incidentDetails.originalId,incidentDetails.originalHrn,incidentDetails.startTime,incidentDetails.endTime,incidentDetails.entryTime,incidentDetails.roadClosed,incidentDetails.criticality,incidentDetails.type,incidentDetails.codes,incidentDetails.description.value,incidentDetails.description.language,incidentDetails.summary.value,incidentDetails.summary.language,incidentDetails.comment,incidentDetails.junctionTraversability,incidentDetails.vehicleRestrictions
0,143.0,"[{'points': [{'lat': 51.51182, 'lng': -0.12713...",2527057899314780784,here:traffic:incident:2527057899314780784,2527057899314780784,here:traffic:incident:2527057899314780784,2024-10-22T07:00:00Z,2024-12-17T17:00:00Z,2024-10-24T17:55:01Z,False,major,construction,[701],Road construction,en-US,Road construction,en-US,Roadworks scheduled 22/10/2024 - 17/12/2024 (W...,,
1,71.0,"[{'points': [{'lat': 51.51367, 'lng': -0.14882...",2815420157553621314,here:traffic:incident:2815420157553621314,2815420157553621301,here:traffic:incident:2815420157553621301,2024-10-25T09:19:20Z,2024-10-29T15:19:20Z,2024-10-25T09:19:20Z,True,critical,roadClosure,[401],Closed,en-US,Closed,en-US,,intermediateClosedEdgeOpen,
2,20.0,"[{'points': [{'lat': 51.51721, 'lng': -0.10336...",1842692596620803022,here:traffic:incident:1842692596620803022,3667547261699565395,here:traffic:incident:3667547261699565395,2024-10-14T07:00:00Z,2024-11-15T23:59:00Z,2024-10-28T19:18:26Z,False,critical,construction,[703],Road maintenance operations,en-US,Road maintenance operations,en-US,,,
3,230.0,"[{'points': [{'lat': 51.507, 'lng': -0.12811},...",4279916361528991598,here:traffic:incident:4279916361528991598,4279916361528991598,here:traffic:incident:4279916361528991598,2023-07-19T05:00:00Z,2026-12-24T23:59:00Z,2024-10-08T13:44:55Z,True,critical,roadClosure,"[401, 802]",Closed,en-US,Closed,en-US,,intermediateClosedEdgeOpen,
4,31.0,"[{'points': [{'lat': 51.51718, 'lng': -0.13835...",350241208415054898,here:traffic:incident:350241208415054898,225366863386674685,here:traffic:incident:225366863386674685,2024-10-22T23:00:00Z,2025-11-10T23:59:00Z,2024-10-28T19:20:42Z,False,critical,construction,[802],Long-term road construction,en-US,Long-term road construction,en-US,,,


In [36]:
# Per-column dtypes of the TfL road frame (the same dtypes are also listed
# by tfl_road_df.info()).
tfl_road_df.dtypes

$type                        object
id                           object
displayName                  object
statusSeverity               object
statusSeverityDescription    object
bounds                       object
envelope                     object
url                          object
group                        object
dtype: object

In [30]:
# Begin by investigating summaries and data types for each DataFrame.
# describe(include='all') summarises every column, not only the numeric ones.
# The results are only stored in variables here; nothing is displayed by
# these assignments.
# NOTE(review): several columns hold nested dict/list values (e.g.
# here_flow_df['results']); describe() on such object columns may raise
# "TypeError: unhashable type" - confirm on a fresh kernel.
tfl_road_summary = tfl_road_df.describe(include='all')
tfl_disruption_summary = tfl_disruption_df.describe(include='all')
here_flow_summary = here_flow_df.describe(include='all')
here_incident_summary = here_incident_df.describe(include='all')

# Collect the per-column dtypes of each frame (no unique-value computation
# happens in these lines).
tfl_road_dtypes = tfl_road_df.dtypes
tfl_disruption_dtypes = tfl_disruption_df.dtypes
here_flow_dtypes = here_flow_df.dtypes
here_incident_dtypes = here_incident_df.dtypes