In [2]:
#! pip install pyarrow

### Check the NYC dataset

Can be obtained from https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page?spm=5aebb161.2ef5001f.0.0.14b05171AFZ96Z&file=tlc-trip-record-data.page

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
df = pd.read_parquet('yellow_tripdata_2024-01.parquet')
df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,2,2024-01-01 00:57:55,2024-01-01 01:17:43,1.0,1.72,1.0,N,186,79,2,17.70,1.00,0.5,0.00,0.00,1.0,22.70,2.5,0.0
1,1,2024-01-01 00:03:00,2024-01-01 00:09:36,1.0,1.80,1.0,N,140,236,1,10.00,3.50,0.5,3.75,0.00,1.0,18.75,2.5,0.0
2,1,2024-01-01 00:17:06,2024-01-01 00:35:01,1.0,4.70,1.0,N,236,79,1,23.30,3.50,0.5,3.00,0.00,1.0,31.30,2.5,0.0
3,1,2024-01-01 00:36:38,2024-01-01 00:44:56,1.0,1.40,1.0,N,79,211,1,10.00,3.50,0.5,2.00,0.00,1.0,17.00,2.5,0.0
4,1,2024-01-01 00:46:51,2024-01-01 00:52:57,1.0,0.80,1.0,N,211,148,1,7.90,3.50,0.5,3.20,0.00,1.0,16.10,2.5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2964619,2,2024-01-31 23:45:59,2024-01-31 23:54:36,,3.18,,,107,263,0,15.77,0.00,0.5,2.00,0.00,1.0,21.77,,
2964620,1,2024-01-31 23:13:07,2024-01-31 23:27:52,,4.00,,,114,236,0,18.40,1.00,0.5,2.34,0.00,1.0,25.74,,
2964621,2,2024-01-31 23:19:00,2024-01-31 23:38:00,,3.33,,,211,25,0,19.97,0.00,0.5,0.00,0.00,1.0,23.97,,
2964622,2,2024-01-31 23:07:23,2024-01-31 23:25:14,,3.06,,,107,13,0,23.88,0.00,0.5,5.58,0.00,1.0,33.46,,


In [5]:
zones = pd.read_csv('taxi_zone_lookup.csv')
zones

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone
...,...,...,...,...
260,261,Manhattan,World Trade Center,Yellow Zone
261,262,Manhattan,Yorkville East,Yellow Zone
262,263,Manhattan,Yorkville West,Yellow Zone
263,264,Unknown,,


#### Not the best dataset for several reasons:

+ The dataset does not contain the precise location of the pickup and dropoff points. Instead, it uses the taxi zone ID, which is not very precise. Ideally, we need the latitude and longitude coordinates of the pickup and dropoff points to accurately analyze the data.

+ The dataset contains information only about the pickup and dropoff, which is insufficient to analyze the entire trip. Ideally, we need to know the checkpoints (at least) to build the root and analyze the traffic

### Check the California vehicle information dataset

Available at https://data.transportation.gov/Automobiles/Next-Generation-Simulation-NGSIM-Vehicle-Trajector/8ect-6jqj/about_data

**Btw, there are some video file datasets**

Check it: https://data.transportation.gov/Automobiles/Next-Generation-Simulation-NGSIM-Program-I-80-Vide/2577-gpny

**Column description:**

Vehicle_ID Vehicle identification number (ascending by time of entry into section). REPEATS ARE NOT ASSOCIATED.

Frame_ID Frame Identification number (ascending by start time)

Total_Frames Total number of frames in which the vehicle appears in this data set

Global_Time Elapsed time in milliseconds since Jan 1, 1970.

Local_X Lateral (X) coordinate of the front center of the vehicle in feet with respect to the left-most edge of the section in the direction of travel.

Local_Y Longitudinal (Y) coordinate of the front center of the vehicle in feet with respect to the entry edge of the section in the direction of travel.

Global_X X Coordinate of the front center of the vehicle in feet based on CA State Plane III in NAD83. Attribute Domain Val

Global_Y Y Coordinate of the front center of the vehicle in feet based on CA State Plane III in NAD83.

v_length Length of vehicle in feet

v_Width Width of vehicle in feet

v_Class Vehicle type: 1 - motorcycle, 2 - auto, 3 - truck

v_Vel Instantaneous velocity of vehicle in feet/second.

v_Acc Instantaneous acceleration of vehicle in feet/second square.

Lane_ID Current lane position of vehicle. Lane 1 is farthest left lane; lane 5 is farthest right lane. Lane 6 is the auxiliary lane between Ventura Boulevard on-ramp and the Cahuenga Boulevard off-ramp. Lane 7 is the on-ramp at Ventura Boulevard, and Lane 8 is the off-ramp at Cahuenga Boulevard.

O_Zone Origin zones of the vehicles, i.e., the place where the vehicles enter the tracking system. There are 11 origins in the study area, numbered from 101 through 111. Please refer to the data analysis report for more detailed information.


In [6]:
pd.set_option('display.max_columns', None)

In [7]:
df = pd.read_csv('Next_Generation_Simulation__NGSIM__Vehicle_Trajectories_and_Supporting_Data_20250301.csv', 
                      nrows=1000000)

In [8]:
df.head()

Unnamed: 0,Vehicle_ID,Frame_ID,Total_Frames,Global_Time,Local_X,Local_Y,Global_X,Global_Y,v_length,v_Width,v_Class,v_Vel,v_Acc,Lane_ID,O_Zone,D_Zone,Int_ID,Section_ID,Direction,Movement,Preceding,Following,Space_Headway,Time_Headway,Location
0,515,2330,1123,1118848075000,30.034,188.062,6451203.729,1873252.549,13.0,6.9,2,23.31,2.05,3,,,,,,,500,523,119.1,5.11,us-101
1,515,2330,1123,1118848075000,30.034,188.062,6451203.729,1873252.549,13.0,6.9,2,23.31,2.05,3,,,,,,,500,523,119.1,5.11,us-101
2,2224,6548,1902,1113437421700,41.429,472.901,6042814.264,2133542.012,14.3,6.9,2,26.54,-0.76,4,,,,,,,2208,2211,53.34,2.01,i-80
3,2127,6459,567,1118847624800,19.632,1775.614,6452425.122,1872172.475,13.5,6.9,2,37.52,11.2,2,,,,,,,2124,2132,48.92,1.3,us-101
4,1033,4827,592,1118848324700,6.202,1701.144,6452347.673,1872258.452,13.5,4.4,2,41.99,0.1,1,,,,,,,1029,1040,38.81,0.92,us-101


In [9]:
df.Location.value_counts()

us-101        538247
i-80          202301
lankershim    168228
peachtree      91224
Name: Location, dtype: int64

**REPRESENT TIME NORMALLY**

In [10]:
df['datetime_LA'] = pd.to_datetime(df['Global_Time'], unit='ms')
df['datetime_LA'] = df['datetime_LA'].dt.tz_localize('UTC')
df['datetime_LA'] = df['datetime_LA'].dt.tz_convert('America/Los_Angeles')
df['Global_Time_seconds'] = df['Global_Time'] / 1000
df = df.drop(columns=['Global_Time'])
df.head()

Unnamed: 0,Vehicle_ID,Frame_ID,Total_Frames,Local_X,Local_Y,Global_X,Global_Y,v_length,v_Width,v_Class,v_Vel,v_Acc,Lane_ID,O_Zone,D_Zone,Int_ID,Section_ID,Direction,Movement,Preceding,Following,Space_Headway,Time_Headway,Location,datetime_LA,Global_Time_seconds
0,515,2330,1123,30.034,188.062,6451203.729,1873252.549,13.0,6.9,2,23.31,2.05,3,,,,,,,500,523,119.1,5.11,us-101,2005-06-15 08:07:55-07:00,1118848000.0
1,515,2330,1123,30.034,188.062,6451203.729,1873252.549,13.0,6.9,2,23.31,2.05,3,,,,,,,500,523,119.1,5.11,us-101,2005-06-15 08:07:55-07:00,1118848000.0
2,2224,6548,1902,41.429,472.901,6042814.264,2133542.012,14.3,6.9,2,26.54,-0.76,4,,,,,,,2208,2211,53.34,2.01,i-80,2005-04-13 17:10:21.700000-07:00,1113437000.0
3,2127,6459,567,19.632,1775.614,6452425.122,1872172.475,13.5,6.9,2,37.52,11.2,2,,,,,,,2124,2132,48.92,1.3,us-101,2005-06-15 08:00:24.800000-07:00,1118848000.0
4,1033,4827,592,6.202,1701.144,6452347.673,1872258.452,13.5,4.4,2,41.99,0.1,1,,,,,,,1029,1040,38.81,0.92,us-101,2005-06-15 08:12:04.700000-07:00,1118848000.0


### The coordinate system looks pretty weird. I guess the only way to move one is to try to convert it to lat and lon.

### Try to convert it

Use two methods to see if they lead to the same answer

Refefences:

https://epsg.io/2227

https://epsg.io/6420

In [11]:
df1 = df.copy()
df1 = df1[['Global_X', 'Global_Y', 'Location']]
df1.columns = ['easting', 'northing', 'Location']
df1

Unnamed: 0,easting,northing,Location
0,6451203.729,1873252.549,us-101
1,6451203.729,1873252.549,us-101
2,6042814.264,2133542.012,i-80
3,6452425.122,1872172.475,us-101
4,6452347.673,1872258.452,us-101
...,...,...,...
999995,6452256.994,1872291.450,us-101
999996,2230843.984,1377146.651,peachtree
999997,6452024.672,1872479.969,us-101
999998,6042794.502,2133702.375,i-80


In [12]:
# Grok
import pyproj

def convert_to_latlon(df):
    source_crs = pyproj.CRS("EPSG:6420")
    target_crs = pyproj.CRS("EPSG:4326")
    transformer = pyproj.Transformer.from_crs(source_crs, target_crs, always_xy=True)
    
    lon, lat = transformer.transform(df['easting'], df['northing'])
    df['longitude_grok'] = lon
    df['latitude_grok'] = lat
    return df

In [13]:
# DeepSeek
from pyproj import Transformer

def converter(df):
    transformer = Transformer.from_crs("EPSG:2227", "EPSG:4326")

    # Convert coordinates using the transformer
    df['latitude_deepseek'], df['longitude_deepseek'] = transformer.transform(
        df['easting'].values,
        df['northing'].values
    )
    
    return df

In [14]:
df1 = convert_to_latlon(df1)
df1 = converter(df1)
df1 = df1[['Location', 'longitude_grok', 'latitude_grok', 'longitude_deepseek', 'latitude_deepseek', 'easting', 'northing']]
df1

Unnamed: 0,Location,longitude_grok,latitude_grok,longitude_deepseek,latitude_deepseek,easting,northing
0,us-101,-120.878952,37.138848,-120.878952,37.138848,6451203.729,1873252.549
1,us-101,-120.878952,37.138848,-120.878952,37.138848,6451203.729,1873252.549
2,i-80,-122.296902,37.840647,-122.296902,37.840647,6042814.264,2133542.012
3,us-101,-120.874747,37.135895,-120.874747,37.135895,6452425.122,1872172.475
4,us-101,-120.875014,37.136131,-120.875014,37.136131,6452347.673,1872258.452
...,...,...,...,...,...,...,...
999995,us-101,-120.875326,37.136220,-120.875326,37.136220,6452256.994,1872291.450
999996,peachtree,-134.975401,34.855817,-134.975401,34.855817,2230843.984,1377146.651
999997,us-101,-120.876125,37.136736,-120.876125,37.136736,6452024.672,1872479.969
999998,i-80,-122.296981,37.841087,-122.296981,37.841087,6042794.502,2133702.375


**Two ways which were assumed to be correct match exactly, therefore, might be correct**

In [15]:
df2 = df1.loc[df1['Location'] == 'us-101']
df2 = df2.iloc[:10, 0:]
df2

Unnamed: 0,Location,longitude_grok,latitude_grok,longitude_deepseek,latitude_deepseek,easting,northing
0,us-101,-120.878952,37.138848,-120.878952,37.138848,6451203.729,1873252.549
1,us-101,-120.878952,37.138848,-120.878952,37.138848,6451203.729,1873252.549
3,us-101,-120.874747,37.135895,-120.874747,37.135895,6452425.122,1872172.475
4,us-101,-120.875014,37.136131,-120.875014,37.136131,6452347.673,1872258.452
5,us-101,-120.875014,37.136131,-120.875014,37.136131,6452347.673,1872258.452
6,us-101,-120.877397,37.137612,-120.877397,37.137612,6451655.238,1872800.663
7,us-101,-120.878199,37.13827,-120.878199,37.13827,6451422.353,1873041.018
8,us-101,-120.878199,37.13827,-120.878199,37.13827,6451422.353,1873041.018
15,us-101,-120.875233,37.136115,-120.875233,37.136115,6452284.002,1872253.037
16,us-101,-120.875233,37.136115,-120.875233,37.136115,6452284.002,1872253.037


### Visualize the data to see if it is accurate

In [16]:
import folium
center_lat = df2['latitude_grok'].mean()
center_lng = df2['longitude_grok'].mean()
center = [center_lat, center_lng]

m = folium.Map(location=center, zoom_start=5, tiles="Cartodb Positron")
for index, row in df2.iterrows():
    folium.Marker(location=[row['latitude_grok'], row['longitude_grok']]).add_to(m)
m

**!! THE PROBLEM: these points are not on the road, they are in the field!!**

### Hypothesis

Conversion types in different locations (we have four) can be different. Hence, try to ask AI (unfortunately, could not find any relevant and suitable information anywhere else) to give the appropriate transformations

In [17]:
df = pd.read_csv('Next_Generation_Simulation__NGSIM__Vehicle_Trajectories_and_Supporting_Data_20250301.csv', 
                      nrows=1000000)

In [18]:
df['datetime_LA'] = pd.to_datetime(df['Global_Time'], unit='ms')
df['datetime_LA'] = df['datetime_LA'].dt.tz_localize('UTC')
df['datetime_LA'] = df['datetime_LA'].dt.tz_convert('America/Los_Angeles')
df['Global_Time_seconds'] = df['Global_Time'] / 1000
df = df.drop(columns=['Global_Time'])
df.head()

Unnamed: 0,Vehicle_ID,Frame_ID,Total_Frames,Local_X,Local_Y,Global_X,Global_Y,v_length,v_Width,v_Class,v_Vel,v_Acc,Lane_ID,O_Zone,D_Zone,Int_ID,Section_ID,Direction,Movement,Preceding,Following,Space_Headway,Time_Headway,Location,datetime_LA,Global_Time_seconds
0,515,2330,1123,30.034,188.062,6451203.729,1873252.549,13.0,6.9,2,23.31,2.05,3,,,,,,,500,523,119.1,5.11,us-101,2005-06-15 08:07:55-07:00,1118848000.0
1,515,2330,1123,30.034,188.062,6451203.729,1873252.549,13.0,6.9,2,23.31,2.05,3,,,,,,,500,523,119.1,5.11,us-101,2005-06-15 08:07:55-07:00,1118848000.0
2,2224,6548,1902,41.429,472.901,6042814.264,2133542.012,14.3,6.9,2,26.54,-0.76,4,,,,,,,2208,2211,53.34,2.01,i-80,2005-04-13 17:10:21.700000-07:00,1113437000.0
3,2127,6459,567,19.632,1775.614,6452425.122,1872172.475,13.5,6.9,2,37.52,11.2,2,,,,,,,2124,2132,48.92,1.3,us-101,2005-06-15 08:00:24.800000-07:00,1118848000.0
4,1033,4827,592,6.202,1701.144,6452347.673,1872258.452,13.5,4.4,2,41.99,0.1,1,,,,,,,1029,1040,38.81,0.92,us-101,2005-06-15 08:12:04.700000-07:00,1118848000.0


In [19]:
df1 = df.copy()
df1 = df1[['Global_X', 'Global_Y', 'Location']]
df1.columns = ['easting', 'northing', 'Location']
df1

Unnamed: 0,easting,northing,Location
0,6451203.729,1873252.549,us-101
1,6451203.729,1873252.549,us-101
2,6042814.264,2133542.012,i-80
3,6452425.122,1872172.475,us-101
4,6452347.673,1872258.452,us-101
...,...,...,...
999995,6452256.994,1872291.450,us-101
999996,2230843.984,1377146.651,peachtree
999997,6452024.672,1872479.969,us-101
999998,6042794.502,2133702.375,i-80


In [20]:
import pyproj

def convert_to_latlon(df):
    location = df['Location'].unique()[0]  # Ensure all rows have the same location
    crs_map = {
        'us-101': 'EPSG:2229',
        'lankershim': 'EPSG:2229',
        'i-80': 'EPSG:2227',
        'peachtree': 'EPSG:2264'
    }
    source_crs = pyproj.CRS(crs_map[location])
    target_crs = pyproj.CRS("EPSG:4326")
    transformer = pyproj.Transformer.from_crs(source_crs, target_crs, always_xy=True)
    lon, lat = transformer.transform(df['easting'], df['northing'])
    df['longitude_grok'] = lon
    df['latitude_grok'] = lat
    return df

In [21]:
df1 = convert_to_latlon(df1)
df1 = df1[['Location', 'longitude_grok', 'latitude_grok', 'easting', 'northing']]
df1

Unnamed: 0,Location,longitude_grok,latitude_grok,easting,northing
0,us-101,-118.365050,34.139234,6451203.729,1873252.549
1,us-101,-118.365050,34.139234,6451203.729,1873252.549
2,i-80,-119.729304,34.842749,6042814.264,2133542.012
3,us-101,-118.361001,34.136278,6452425.122,1872172.475
4,us-101,-118.361258,34.136513,6452347.673,1872258.452
...,...,...,...,...,...
999995,us-101,-118.361558,34.136603,6452256.994,1872291.450
999996,peachtree,-131.991497,31.947541,2230843.984,1377146.651
999997,us-101,-118.362328,34.137119,6452024.672,1872479.969
999998,i-80,-119.729379,34.843189,6042794.502,2133702.375


In [22]:
df2 = df1.loc[df1['Location'] == 'us-101']
df2 = df2.iloc[:10, 0:]
df2

Unnamed: 0,Location,longitude_grok,latitude_grok,easting,northing
0,us-101,-118.36505,34.139234,6451203.729,1873252.549
1,us-101,-118.36505,34.139234,6451203.729,1873252.549
3,us-101,-118.361001,34.136278,6452425.122,1872172.475
4,us-101,-118.361258,34.136513,6452347.673,1872258.452
5,us-101,-118.361258,34.136513,6452347.673,1872258.452
6,us-101,-118.363553,34.137996,6451655.238,1872800.663
7,us-101,-118.364325,34.138655,6451422.353,1873041.018
8,us-101,-118.364325,34.138655,6451422.353,1873041.018
15,us-101,-118.361468,34.136498,6452284.002,1872253.037
16,us-101,-118.361468,34.136498,6452284.002,1872253.037


### US-101

In [23]:
center_lat = df2['latitude_grok'].mean()
center_lng = df2['longitude_grok'].mean()
center = [center_lat, center_lng]

m = folium.Map(location=center, control_scale=True, tiles="Cartodb Positron")
for index, row in df2.iterrows():
    folium.Marker(location=[row['latitude_grok'], row['longitude_grok']]).add_to(m)
#m.save('map.html')
m

**This is fine, but let's check others**

### I-80

In [24]:
df2 = df1.loc[df1['Location'] == 'i-80']
df2 = df2.iloc[:10, 0:]

In [25]:
# not sure if it makes sense, no idea where that is
center_lat = df2['latitude_grok'].mean()
center_lng = df2['longitude_grok'].mean()
center = [center_lat, center_lng]

m = folium.Map(location=center, control_scale=True, tiles="Cartodb Positron")
for index, row in df2.iterrows():
    folium.Marker(location=[row['latitude_grok'], row['longitude_grok']]).add_to(m)
#m.save('map.html')
m

### peachtree

In [26]:
df2 = df1.loc[df1['Location'] == 'peachtree']
df2 = df2.iloc[:10, 0:]

In [27]:
# does not make sense at all, points are in the ocean!!!
center_lat = df2['latitude_grok'].mean()
center_lng = df2['longitude_grok'].mean()
center = [center_lat, center_lng]

m = folium.Map(location=center, control_scale=True, tiles="Cartodb Positron")
for index, row in df2.iterrows():
    folium.Marker(location=[row['latitude_grok'], row['longitude_grok']]).add_to(m)
#m.save('map.html')
m

### IN THE OCEAN!!! Therefore, need to seriously think about how to convert it properly

# Two problems:

+ These are highways. Not sure if they are suitable for analysis

+ The coordinates might not be accurate enough, as cars are in the fields, not on the roads

+ Hence, the dataset is not applicable unless we can convert the locations to a good enough precision and prove that this converstion is correct. In that case, the dataset is quite good

### PEMS dataset

Obtained from this repository: https://github.com/guoshnBJTU/ASTGNN/tree/main/data

Some notebooks related to the analysis of this dataset:

https://www.kaggle.com/code/jvthunder/pems08-traffic-flow-prediction

https://www.kaggle.com/code/elmahy/astgcn-for-traffic-flow-forecasting

So far found nothing really interesting, hence, use only for supplementary purposes probably

In [28]:
data = np.load('PEMS03.npz')
data.files

['data']

In [29]:
data['data'].shape

(26208, 358, 1)

In [30]:
data['data']

array([[[ 20.],
        [ 20.],
        [182.],
        ...,
        [ 63.],
        [115.],
        [ 63.]],

       [[ 22.],
        [ 22.],
        [174.],
        ...,
        [ 63.],
        [109.],
        [ 62.]],

       [[ 22.],
        [ 22.],
        [183.],
        ...,
        [ 57.],
        [115.],
        [ 57.]],

       ...,

       [[ 43.],
        [ 39.],
        [144.],
        ...,
        [124.],
        [104.],
        [ 47.]],

       [[ 27.],
        [ 27.],
        [115.],
        ...,
        [123.],
        [112.],
        [ 39.]],

       [[ 48.],
        [ 52.],
        [108.],
        ...,
        [134.],
        [115.],
        [ 38.]]])

In [31]:
df = pd.read_csv('PEMS03.csv')
df

Unnamed: 0,from,to,distance
0,317842,318711,0.872
1,318721,315955,1.322
2,315927,318236,1.222
3,318711,318721,0.233
4,318236,317843,1.220
...,...,...,...
542,314548,318135,0.586
543,314548,318736,0.586
544,318126,318052,5.382
545,318135,318383,2.183


### Don't really know what to do with that, does not seem useful for our purposes

### GeoLife GPS Trajectories

Available at https://www.microsoft.com/en-us/download/details.aspx?id=52367 

Useful code at https://www.kaggle.com/code/lehongquan/final-datamining-dataset

In [32]:
def read_plt(plt_file):
    points = pd.read_csv(plt_file, skiprows=6, header=None)

    # for clarity rename columns
    points.rename(inplace=True, columns={0: 'lat', 1: 'lon', 3: 'alt', 5: 'day', 6: 'hour'})
    
    date_format = '%Y-%m-%d %H:%M:%S'
    points['time'] = pd.to_datetime(points['day'] + ' ' + points['hour'], format=date_format)

    # remove unused columns
    points.drop(inplace=True, columns=[2, 4, 'day', 'hour'])

    return points

plt_file = "Geolife Trajectories 1.3/Data/000/Trajectory/20081023025304.plt"
df_plt = read_plt(plt_file)
df_plt

Unnamed: 0,lat,lon,alt,time
0,39.984702,116.318417,492,2008-10-23 02:53:04
1,39.984683,116.318450,492,2008-10-23 02:53:10
2,39.984686,116.318417,492,2008-10-23 02:53:15
3,39.984688,116.318385,492,2008-10-23 02:53:20
4,39.984655,116.318263,492,2008-10-23 02:53:25
...,...,...,...,...
903,40.009172,116.321211,88,2008-10-23 11:10:52
904,40.009204,116.321130,86,2008-10-23 11:10:57
905,40.009243,116.321050,85,2008-10-23 11:11:02
906,40.009269,116.320978,84,2008-10-23 11:11:07


In [33]:
df2 = df_plt.iloc[:50, 0:]

In [34]:
center_lat = df2['lat'].mean()
center_lng = df2['lon'].mean()
center = [center_lat, center_lng]

m = folium.Map(location=center, control_scale=True, tiles="Cartodb Positron")
for index, row in df2.iterrows():
    folium.Marker(location=[row['lat'], row['lon']]).add_to(m)
#m.save('map.html')
m

# I think this is by far the best dataset:

+ Coordinates in lat, lon

+ Seems like everything is updates every five seconds, hence, can built the routes and estimate the heaviness of the traffic

+ Although the data is old, the dataset was published in 2024, and by Microsoft (hence, should be realiable and make sense)

+ It is mainly Beijing, hence, makes sense to analyze the disctict we are currently at

### Other possible datasets (still need to check them):

https://utd19.ethz.ch/ - requested data

https://datarade.ai/data-categories/car-traffic-data