### Load Required Libraries
This cell imports all necessary libraries for data processing, geospatial operations, and numerical computations.

In [3]:
import pandas as pd
import geopandas as gpd
from shapely.wkt import loads
from shapely.geometry import Point, LineString
from scipy.spatial import cKDTree
import numpy as np


ModuleNotFoundError: No module named 'pandas'

### Load the Datasets
Reads the DFT (Traffic Data) and OXM (Road Network Data) datasets into pandas DataFrames.

In [12]:
# Load datasets
dft_path = "data/dft_reduced.csv"
oxm_path = "data/oxm_reduced.csv"

dft_df = pd.read_csv(dft_path)
oxm_df = pd.read_csv(oxm_path)


### Feature Selection
Selects only the relevant columns from each dataset that will be useful for analysis.

In [4]:
# Selecting relevant columns
dft_selected_cols = ["count_point_id", "year", "road_name", "latitude", "longitude", "all_motor_vehicles", "cars_and_taxis", "buses_and_coaches", "lgvs", "all_hgvs"]
oxm_selected_cols = ["name", "highway", "maxspeed", "length", "geometry"]

dft_filtered = dft_df[dft_selected_cols]
oxm_filtered = oxm_df[oxm_selected_cols]


### Handling Missing Values
Drops rows with missing road names to ensure consistency during merging.

In [5]:
# Handling missing values
dft_filtered.dropna(subset=["road_name"], inplace=True)
oxm_filtered.dropna(subset=["name"], inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dft_filtered.dropna(subset=["road_name"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oxm_filtered.dropna(subset=["name"], inplace=True)


### Data Type Conversion
- Converts 'maxspeed' from string format to numeric values.
- Converts road geometries from WKT format to Shapely objects for spatial operations.

In [6]:
# Convert 'maxspeed' to numeric
oxm_filtered["maxspeed"] = oxm_filtered["maxspeed"].astype(str).str.extract("(\\d+)").astype(float)

# Convert geometry column from WKT format to Shapely LineString objects
oxm_filtered["geometry"] = oxm_filtered["geometry"].apply(lambda x: loads(x) if isinstance(x, str) else x)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oxm_filtered["maxspeed"] = oxm_filtered["maxspeed"].astype(str).str.extract("(\\d+)").astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oxm_filtered["geometry"] = oxm_filtered["geometry"].apply(lambda x: loads(x) if isinstance(x, str) else x)


### Geospatial Processing
- Converts latitude/longitude from the DFT dataset into a GeoDataFrame.
- Converts road geometries in OXM into a GeoDataFrame.
- Reprojects datasets to a metric-based coordinate system for accurate distance calculations.

In [7]:
# Convert DFT latitude/longitude into a GeoDataFrame
dft_gdf = gpd.GeoDataFrame(dft_filtered, geometry=gpd.points_from_xy(dft_filtered.longitude, dft_filtered.latitude), crs="EPSG:4326")

# Convert OXM dataframe into a GeoDataFrame
oxm_gdf = gpd.GeoDataFrame(oxm_filtered, geometry=oxm_filtered["geometry"], crs="EPSG:4326")

# Reproject to metric-based CRS for accurate distance calculations
dft_gdf = dft_gdf.to_crs(epsg=3857)
oxm_gdf = oxm_gdf.to_crs(epsg=3857)

# Extract centroid of road segments in OXM for proximity matching
oxm_gdf["centroid"] = oxm_gdf.geometry.centroid


### Nearest-Neighbor Matching
- Uses KDTree to find the nearest road segment for each traffic count point.
- Associates traffic data with the closest road segment based on geospatial proximity.

In [8]:
# Build KDTree for nearest-neighbor search
oxm_tree = cKDTree(np.vstack([oxm_gdf.centroid.x, oxm_gdf.centroid.y]).T)

# Find nearest road segment for each traffic count point
distances, indices = oxm_tree.query(np.vstack([dft_gdf.geometry.x, dft_gdf.geometry.y]).T)

# Attach nearest road data to DFT
dft_gdf["nearest_road"] = oxm_gdf.iloc[indices].name.values
dft_gdf["nearest_road_length"] = oxm_gdf.iloc[indices].length.values
dft_gdf["nearest_road_maxspeed"] = oxm_gdf.iloc[indices].maxspeed.values


### Merge Traffic Data with Road Network
Merges the DFT dataset with the OXM dataset based on the nearest matched road segment.

In [9]:
# Merge DFT with matched OXM data
merged_geo_df = dft_gdf.merge(oxm_gdf, left_on="nearest_road", right_on="name", how="left")

# Convert back to standard DataFrame
merged_df_final = pd.DataFrame(merged_geo_df.drop(columns=['geometry', 'centroid'], errors='ignore'))


### Save Processed Dataset


In [11]:
# Save final merged dataset
merged_df_final.to_csv("D:/coding project/DV project/data/final-data.csv", index=False)


### new data with time stamp 


In [None]:
import pandas as pd

#loading the data
file_path = "data/Original/dft_traffic_counts_raw_counts.csv" 
df = pd.read_csv(file_path)

# Filter data where region_name is "London"
df_london = df[df["region_name"] == "London"]

# Randomly select 100,000 rows from London data
df_sample = df_london.sample(n=100000, random_state=42)


  df = pd.read_csv(file_path)


### Removing missing rows for columns "start_junction_road_name" and "end_junction_road_name"

In [15]:
# Remove rows where "start_junction_road_name" or "end_junction_road_name" are missing
df_filtered = df_sample.dropna(subset=["start_junction_road_name", "end_junction_road_name"])

#saving the reduced dataset
output_path = "D:/Coding Projects/Traffic-management/data/traffic-df.csv"
df_filtered.to_csv(output_path, index= False)


In [None]:
import pandas as pd

# Load the dataset
file_path = "data/traffic-df.csv"
df = pd.read_csv(file_path)

# Filter data for London region only
df_london = df[df["region_name"] == "London"]

# Convert 'count_date' to datetime format
df_london['count_date'] = pd.to_datetime(df_london['count_date'], dayfirst=True, errors='coerce')

# Create 'day_type' to classify weekdays and weekends
df_london['day_type'] = df_london['count_date'].dt.dayofweek.apply(lambda x: 'Weekend' if x >= 5 else 'Weekday')

# Count occurrences of weekdays and weekends
day_type_counts = df_london['day_type'].value_counts()

# Print results
print(day_type_counts)


day_type
Weekday    55871
Name: count, dtype: int64


  df_london['count_date'] = pd.to_datetime(df_london['count_date'], dayfirst=True, errors='coerce')
