In [1]:
%pip install pandas geopandas osmnx shapely requests numpy pyproj


Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import geopandas as gpd
import numpy as np
import requests
import osmnx as ox
from shapely.geometry import Point




In [3]:
TOMTOM_API_KEY = "JpflhXSm3GcxsvR2vqAhrItPmiIPO9HZ"
OPENWEATHER_API_KEY = "296144434b35a6dc13d768c4c33739f5"


In [4]:
delhi_boundary = ox.geocode_to_gdf("Delhi, India")
delhi_boundary



Unnamed: 0,geometry,bbox_west,bbox_south,bbox_east,bbox_north,place_id,osm_type,osm_id,lat,lon,class,type,place_rank,importance,addresstype,name,display_name
0,"POLYGON ((77.05037 28.55871, 77.05275 28.55734...",77.050371,28.481221,77.255139,28.645684,421167494,relation,2763541,28.613895,77.209006,boundary,administrative,9,0.702638,city,New Delhi,"New Delhi, Delhi, India"


In [5]:
G = ox.graph_from_polygon(
    delhi_boundary.geometry.iloc[0],
    network_type="drive"
)

nodes, roads = ox.graph_to_gdfs(G)
roads = roads.to_crs("EPSG:4326")
roads.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,osmid,highway,name,oneway,reversed,length,geometry,lanes,maxspeed,bridge,ref,junction,width,access,tunnel,area,service,landuse
u,v,key,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
58056528,301037382,0,590609974,primary,Nelson Mandela Marg,True,False,31.509355,"LINESTRING (77.171 28.55939, 77.17125 28.55957)",,,,,,,,,,,
58056528,7612915570,0,"[558065085, 558065079]",secondary,Palam Marg,True,False,325.63717,"LINESTRING (77.171 28.55939, 77.17018 28.56011...",,,,,,,,,,,
60890393,6436786516,0,1121966690,residential,,False,True,17.640539,"LINESTRING (77.17495 28.58062, 77.17511 28.58069)",,,,,,,,,,,
60890393,6110396131,0,25744598,tertiary,,True,False,156.010558,"LINESTRING (77.17495 28.58062, 77.17504 28.580...",2.0,,,,,,,,,,
60890393,280740684,0,"[583917363, 1312224221, 44317159]",primary,Moti Bagh Flyover,True,False,621.717997,"LINESTRING (77.17495 28.58062, 77.17517 28.580...",,60.0,yes,,,,,,,,


In [6]:
minx, miny, maxx, maxy = delhi_boundary.total_bounds

grid_size = 0.05  # ~5km
lats = np.arange(miny, maxy, grid_size)
lons = np.arange(minx, maxx, grid_size)

grid_points = [
    {"lat": lat, "lon": lon}
    for lat in lats
    for lon in lons
]

grid_df = pd.DataFrame(grid_points)
grid_df.head()


Unnamed: 0,lat,lon
0,28.481221,77.050371
1,28.481221,77.100371
2,28.481221,77.150371
3,28.481221,77.200371
4,28.481221,77.250371


In [7]:
def get_tomtom_traffic(lat, lon):
    url = "https://api.tomtom.com/traffic/services/4/flowSegmentData/absolute/10/json"
    params = {
        "key": TOMTOM_API_KEY,
        "point": f"{lat},{lon}",
        "unit": "KMPH"
    }
    
    r = requests.get(url, params=params)
    
    if r.status_code == 200:
        d = r.json()["flowSegmentData"]
        return {
            "speed": d["currentSpeed"],
            "free_flow": d["freeFlowSpeed"],
            "confidence": d["confidence"]
        }
    else:
        return None


In [8]:
traffic_rows = []

for _, row in grid_df.iterrows():
    traffic = get_tomtom_traffic(row.lat, row.lon)
    if traffic:
        traffic_rows.append({
            "lat": row.lat,
            "lon": row.lon,
            "timestamp": pd.to_datetime("now"),
            **traffic
        })

traffic_df = pd.DataFrame(traffic_rows)
traffic_df.head()


Unnamed: 0,lat,lon,timestamp,speed,free_flow,confidence
0,28.481221,77.050371,2026-01-20 20:15:21.232713,30,39,1.0
1,28.481221,77.100371,2026-01-20 20:15:21.936674,18,26,1.0
2,28.481221,77.150371,2026-01-20 20:15:22.798480,28,28,1.0
3,28.481221,77.200371,2026-01-20 20:15:23.471538,26,31,1.0
4,28.481221,77.250371,2026-01-20 20:15:24.169139,32,38,0.996782


In [9]:
traffic_gdf = gpd.GeoDataFrame(
    traffic_df,
    geometry=gpd.points_from_xy(
        traffic_df.lon,
        traffic_df.lat
    ),
    crs="EPSG:4326"
)

traffic_gdf = gpd.clip(traffic_gdf, delhi_boundary)
traffic_gdf.head()


Unnamed: 0,lat,lon,timestamp,speed,free_flow,confidence,geometry
6,28.531221,77.100371,2026-01-20 20:15:25.772553,19,34,1.0,POINT (77.10037 28.53122)
7,28.531221,77.150371,2026-01-20 20:15:26.445123,30,38,1.0,POINT (77.15037 28.53122)
11,28.581221,77.100371,2026-01-20 20:15:29.231827,31,36,1.0,POINT (77.10037 28.58122)
12,28.581221,77.150371,2026-01-20 20:15:29.937840,42,56,1.0,POINT (77.15037 28.58122)
13,28.581221,77.200371,2026-01-20 20:15:30.749360,24,35,1.0,POINT (77.20037 28.58122)


In [10]:
def get_weather(lat, lon):
    url = "https://api.openweathermap.org/data/2.5/weather"
    params = {
        "lat": lat,
        "lon": lon,
        "appid": OPENWEATHER_API_KEY,
        "units": "metric"
    }
    r = requests.get(url, params=params).json()
    
    return {
        "temp": r["main"]["temp"],
        "humidity": r["main"]["humidity"],
        "pressure": r["main"]["pressure"],
        "wind_speed": r["wind"]["speed"]
    }


In [11]:
weather_rows = []

for _, row in grid_df.iterrows():
    w = get_weather(row.lat, row.lon)
    weather_rows.append({
        "lat": row.lat,
        "lon": row.lon,
        "timestamp": pd.to_datetime("now"),
        **w
    })

weather_df = pd.DataFrame(weather_rows)
weather_df.head()


Unnamed: 0,lat,lon,timestamp,temp,humidity,pressure,wind_speed
0,28.481221,77.050371,2026-01-20 20:15:35.102166,15.99,67,1017,0
1,28.481221,77.100371,2026-01-20 20:15:35.347226,15.86,63,1017,0
2,28.481221,77.150371,2026-01-20 20:15:35.582812,15.76,63,1017,0
3,28.481221,77.200371,2026-01-20 20:15:35.793850,15.73,63,1017,0
4,28.481221,77.250371,2026-01-20 20:15:36.030338,15.82,63,1017,0


In [12]:
weather_gdf = gpd.GeoDataFrame(
    weather_df,
    geometry=gpd.points_from_xy(
        weather_df["lon"],
        weather_df["lat"]
    ),
    crs="EPSG:4326"
)

# Clip (still fine)
weather_gdf = gpd.clip(weather_gdf, delhi_boundary)



In [13]:
traffic_gdf = traffic_gdf.to_crs(3857)
roads = roads.to_crs(3857)

traffic_roads = gpd.sjoin_nearest(
    traffic_gdf,
    roads,
    how="left",
    distance_col="road_dist"
)

traffic_roads.head()
traffic_roads["time_hr"] = traffic_roads["timestamp"].dt.floor("h")
traffic_roads[["timestamp", "time_hr"]].head()



Unnamed: 0,timestamp,time_hr
6,2026-01-20 20:15:25.772553,2026-01-20 20:00:00
7,2026-01-20 20:15:26.445123,2026-01-20 20:00:00
11,2026-01-20 20:15:29.231827,2026-01-20 20:00:00
11,2026-01-20 20:15:29.231827,2026-01-20 20:00:00
12,2026-01-20 20:15:29.937840,2026-01-20 20:00:00


In [14]:
traffic_roads["time_hr"] = traffic_roads["timestamp"].dt.floor("h")
weather_df["time_hr"] = pd.to_datetime(weather_df["timestamp"]).dt.floor("h")




In [15]:
final_df = traffic_roads.merge(
    weather_df,
    on="time_hr",
    how="left"
)

final_df.head()



Unnamed: 0,lat_x,lon_x,timestamp_x,speed,free_flow,confidence,geometry,u,v,key,...,landuse,road_dist,time_hr,lat_y,lon_y,timestamp_y,temp,humidity,pressure,wind_speed
0,28.531221,77.100371,2026-01-20 20:15:25.772553,19,34,1.0,POINT (8582774.017 3316115.187),3742399917,4218627516,0,...,,59.801376,2026-01-20 20:00:00,28.481221,77.050371,2026-01-20 20:15:35.102166,15.99,67,1017,0
1,28.531221,77.100371,2026-01-20 20:15:25.772553,19,34,1.0,POINT (8582774.017 3316115.187),3742399917,4218627516,0,...,,59.801376,2026-01-20 20:00:00,28.481221,77.100371,2026-01-20 20:15:35.347226,15.86,63,1017,0
2,28.531221,77.100371,2026-01-20 20:15:25.772553,19,34,1.0,POINT (8582774.017 3316115.187),3742399917,4218627516,0,...,,59.801376,2026-01-20 20:00:00,28.481221,77.150371,2026-01-20 20:15:35.582812,15.76,63,1017,0
3,28.531221,77.100371,2026-01-20 20:15:25.772553,19,34,1.0,POINT (8582774.017 3316115.187),3742399917,4218627516,0,...,,59.801376,2026-01-20 20:00:00,28.481221,77.200371,2026-01-20 20:15:35.793850,15.73,63,1017,0
4,28.531221,77.100371,2026-01-20 20:15:25.772553,19,34,1.0,POINT (8582774.017 3316115.187),3742399917,4218627516,0,...,,59.801376,2026-01-20 20:00:00,28.481221,77.250371,2026-01-20 20:15:36.030338,15.82,63,1017,0


In [18]:

print(final_df.columns)




Index(['lat_x', 'lon_x', 'timestamp_x', 'speed', 'free_flow', 'confidence',
       'geometry', 'u', 'v', 'key', 'osmid', 'highway', 'name', 'oneway',
       'reversed', 'length', 'lanes', 'maxspeed', 'bridge', 'ref', 'junction',
       'width', 'access', 'tunnel', 'area', 'service', 'landuse', 'road_dist',
       'time_hr', 'lat_y', 'lon_y', 'timestamp_y', 'temp', 'humidity',
       'pressure', 'wind_speed'],
      dtype='object')


In [19]:
final_df["traffic_volume"] = (
    final_df["free_flow"] - final_df["speed"]
)


In [20]:
# Speed filters
final_df = final_df[final_df["speed"] > 0]
final_df = final_df[final_df["speed"] <= 120]

# Traffic volume filter
final_df = final_df[final_df["traffic_volume"] >= 0]


In [21]:
major_roads = [
    "motorway", "trunk", "primary",
    "secondary", "tertiary"
]

final_df = final_df[
    final_df["highway"].isin(major_roads)
]


In [22]:
final_df = final_df[final_df["confidence"] >= 0.7]


In [23]:
final_df.isna().sum()


lat_x               0
lon_x               0
timestamp_x         0
speed               0
free_flow           0
confidence          0
geometry            0
u                   0
v                   0
key                 0
osmid               0
highway             0
name               60
oneway              0
reversed            0
length              0
lanes             120
maxspeed          120
bridge            120
ref               120
junction          120
width             120
access            100
tunnel            120
area              120
service           120
landuse           120
road_dist           0
time_hr             0
lat_y               0
lon_y               0
timestamp_y         0
temp                0
humidity            0
pressure            0
wind_speed          0
traffic_volume      0
dtype: int64

In [24]:
final_df = final_df.dropna(
    subset=["speed", "free_flow", "temp", "humidity"]
)


In [25]:
Q1 = final_df["traffic_volume"].quantile(0.25)
Q3 = final_df["traffic_volume"].quantile(0.75)
IQR = Q3 - Q1

final_df = final_df[
    (final_df["traffic_volume"] >= Q1 - 1.5 * IQR) &
    (final_df["traffic_volume"] <= Q3 + 1.5 * IQR)
]


In [29]:
final_df["hour"] = final_df["time_hr"].dt.hour
final_df["day"] = final_df["time_hr"].dt.dayofweek

final_df[["time_hr", "hour", "day"]].head()




Unnamed: 0,time_hr,hour,day
0,2026-01-20 20:00:00,20,1
1,2026-01-20 20:00:00,20,1
2,2026-01-20 20:00:00,20,1
3,2026-01-20 20:00:00,20,1
4,2026-01-20 20:00:00,20,1


In [30]:
final_df = final_df[
    (final_df["hour"] >= 6) &
    (final_df["hour"] <= 23)
]


In [33]:
ml_df = final_df[[
    "speed",
    "free_flow",
    "traffic_volume",
    "confidence",
    "lanes",
    "maxspeed",
    "temp",
    "humidity",
    "pressure",
    "wind_speed",
    "hour",
    "day",
    "highway"
]].copy()


In [34]:
ml_df["highway"] = ml_df["highway"].astype("category").cat.codes


In [35]:
ml_df.to_csv("delhi_traffic_filtered.csv", index=False)
print("Filtered dataset saved ✅")


Filtered dataset saved ✅
