In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import pickle


In [2]:
# ============================================
# 1. Load cleaned dataset
# ============================================
# Read the cleaned CSV into `df`, then report its dimensions and column
# names so later cells can be sanity-checked against them.
print("Loading cleaned dataset...")
df = pd.read_csv("cleaned_us101_final_clean.csv")

dataset_shape = df.shape
column_names = list(df.columns)
print("Dataset shape:", dataset_shape)
print("Columns:", column_names)

Loading cleaned dataset...
Dataset shape: (33536, 17)
Columns: ['Vehicle_ID', 'Frame_ID', 'Global_Time', 'Local_X', 'Local_Y', 'v_Vel', 'v_Acc', 'Lane_ID', 'Preceding', 'Following', 'delta_x', 'delta_y', 'delta_t', 'speed_calc', 'accel_calc', 'heading', 'lane_change']


In [3]:

# ============================================
# 2. Convert units (ft → meter)
# ============================================
# The listed columns are multiplied by 0.3048 (feet → metres), overwriting
# the values in `df`.
#
# Because `df` is overwritten in place, running this cell a second time would
# apply the factor again. A marker stored in `df.attrs` makes the cell safe to
# re-run; reloading the CSV produces a fresh frame without the marker, so the
# conversion is applied again as expected.
#
# NOTE(review): delta_x / delta_y are not converted here — confirm whether
# they are already metric or intentionally left in their original unit.
feet_to_meter = 0.3048
convert_cols = ["Local_X", "Local_Y", "v_Vel", "v_Acc",
                "speed_calc", "accel_calc"]

if df.attrs.get("units_converted_to_meter", False):
    print("Unit conversion already applied; skipping.")
else:
    # Single vectorised multiply over every column that needs converting.
    df[convert_cols] = df[convert_cols] * feet_to_meter
    df.attrs["units_converted_to_meter"] = True
    print("Unit conversion complete (ft → m).")


Unit conversion complete (ft → m).


In [4]:
# ============================================
# 3. Check for missing values
# ============================================
# Report the NaN count of every column, then the number of distinct vehicles.
nan_counts = df.isnull().sum()
print("\nMissing values:")
print(nan_counts)

n_vehicles = df["Vehicle_ID"].nunique()
print("\nUnique Vehicle_ID count:", n_vehicles)


Missing values:
Vehicle_ID     0
Frame_ID       0
Global_Time    0
Local_X        0
Local_Y        0
v_Vel          0
v_Acc          0
Lane_ID        0
Preceding      0
Following      0
delta_x        0
delta_y        0
delta_t        0
speed_calc     0
accel_calc     0
heading        0
lane_change    0
dtype: int64

Unique Vehicle_ID count: 488


In [5]:

# ============================================
# 4. Define feature and target columns
# ============================================
# Columns used as model inputs.
feature_cols = [
    "Local_X",
    "Local_Y",
    "v_Vel",
    "v_Acc",
    "speed_calc",
    "accel_calc",
]

# Columns to predict; both of them also appear in feature_cols.
target_cols = [
    "Local_X",
    "Local_Y",
]

print("\nFeature columns:", feature_cols)
print("Target columns:", target_cols)


Feature columns: ['Local_X', 'Local_Y', 'v_Vel', 'v_Acc', 'speed_calc', 'accel_calc']
Target columns: ['Local_X', 'Local_Y']


In [6]:
# ============================================
# 5. Train/Val/Test split by Vehicle_ID (no leakage)
# ============================================
# The split is done over vehicle IDs rather than rows, so every row of a
# given vehicle ends up in exactly one of the three subsets.
print("\nSplitting dataset by Vehicle_ID...")

vehicle_ids = df["Vehicle_ID"].unique()

# Hold out 30% of the vehicles, then halve that hold-out into val and test.
train_ids, temp_ids = train_test_split(vehicle_ids, test_size=0.30, random_state=42)
val_ids, test_ids = train_test_split(temp_ids, test_size=0.50, random_state=42)

# Select the rows of each ID group and give each subset a fresh 0..n-1 index.
vehicle_column = df["Vehicle_ID"]
train_df = df.loc[vehicle_column.isin(train_ids)].reset_index(drop=True)
val_df = df.loc[vehicle_column.isin(val_ids)].reset_index(drop=True)
test_df = df.loc[vehicle_column.isin(test_ids)].reset_index(drop=True)

for split_label, split_frame in (
    ("Train size:", train_df),
    ("Val size:", val_df),
    ("Test size:", test_df),
):
    print(split_label, split_frame.shape, "Vehicles:", split_frame["Vehicle_ID"].nunique())

# No overlap check: the three ID sets must be pairwise disjoint.
train_id_set, val_id_set, test_id_set = set(train_ids), set(val_ids), set(test_ids)
assert train_id_set.isdisjoint(val_id_set)
assert train_id_set.isdisjoint(test_id_set)
assert val_id_set.isdisjoint(test_id_set)

print("Vehicle ID split verified. No leakage.")



Splitting dataset by Vehicle_ID...
Train size: (23449, 17) Vehicles: 341
Val size: (5019, 17) Vehicles: 73
Test size: (5068, 17) Vehicles: 74
Vehicle ID split verified. No leakage.


In [7]:
# ============================================
# 6. Fit MinMaxScaler on TRAIN only
# ============================================
# The scaler is fit on the training split only and then applied to the train,
# validation and test splits, overwriting the scaled columns in place.
#
# Because the splits are overwritten, running this cell twice on the same
# frames would re-fit the scaler on already-scaled data, leaving a scaler that
# no longer encodes the original value ranges (and that is what the save cell
# would then pickle). A marker in `train_df.attrs` prevents that; re-running
# the split cell creates fresh frames without the marker.
#
# NOTE(review): feature_cols + target_cols lists Local_X and Local_Y twice, so
# the scaler is fit on 8 columns, two of them duplicates. This is kept as-is so
# the saved scaler keeps its current shape — confirm whether downstream code
# depends on that shape before de-duplicating.
scale_cols = feature_cols + target_cols

if train_df.attrs.get("minmax_scaled", False):
    print("\nSplits are already scaled; skipping re-fit. "
          "Re-run the split cell to rescale from unscaled data.")
else:
    print("\nFitting MinMaxScaler on training set...")

    scaler = MinMaxScaler()
    scaler.fit(train_df[scale_cols])

    # Apply the train-fitted scaling to every split.
    for split_df in (train_df, val_df, test_df):
        split_df[scale_cols] = scaler.transform(split_df[scale_cols])

    train_df.attrs["minmax_scaled"] = True
    print("Scaling complete.")

print("\nSample of scaled train_df:")
print(train_df.head())


Fitting MinMaxScaler on training set...
Scaling complete.

Sample of scaled train_df:
   Vehicle_ID  Frame_ID    Global_Time   Local_X   Local_Y     v_Vel  \
0         229      1221  1118847964100  0.016298  0.000000  0.437667   
1         229      1231  1118847965100  0.027930  0.012258  0.446000   
2         229      1241  1118847966100  0.037263  0.024933  0.454333   
3         229      1251  1118847967100  0.051285  0.037638  0.454167   
4         229      1261  1118847968100  0.066020  0.050319  0.450667   

      v_Acc  Lane_ID  Preceding  Following  delta_x  delta_y  delta_t  \
0  0.500000        1        223          0    0.513   26.264      1.0   
1  0.644645        1        223          0    0.506   26.293      1.0   
2  0.500000        1        223          0    0.406   27.189      1.0   
3  0.500000        1        223          0    0.610   27.254      1.0   
4  0.527027        1        223          0    0.641   27.200      1.0   

   speed_calc  accel_calc    heading  lan

In [8]:
# ============================================
# 7. Save scaler + datasets
# ============================================
# Pickle the fitted scaler, then write each split to its own CSV file
# without the index column.
with open("scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

for split_frame, csv_name in (
    (train_df, "train_us101.csv"),
    (val_df, "val_us101.csv"),
    (test_df, "test_us101.csv"),
):
    split_frame.to_csv(csv_name, index=False)

# List every file written above, one per line.
print("\nSaved outputs:")
print("scaler.pkl", "train_us101.csv", "val_us101.csv", "test_us101.csv", sep="\n")
print("\nProcessing complete.")


Saved outputs:
scaler.pkl
train_us101.csv
val_us101.csv
test_us101.csv

Processing complete.
