In [None]:
# Clean the raw traffic-volume CSV: keep records dated 2023 or later, drop
# unused columns and incomplete rows, one-hot encode classWeight, split the
# date into day/month/year features, shrink numeric dtypes, then save and
# preview the result.

# 1. Import libraries
import pandas as pd

# 2. Load dataset
# low_memory=False parses the file in one pass instead of in chunks, which
# avoids per-chunk mixed-type inference on the columns.
df = pd.read_csv('traffic_volume_dataset.csv', low_memory=False)

# 3. Convert startDate to datetime
# Ambiguous dates are read day-first; values that cannot be parsed become NaT.
df['startDate'] = pd.to_datetime(df['startDate'], dayfirst=True, errors='coerce')

# 4. Drop rows with invalid or missing dates
df = df.dropna(subset=['startDate'])

# 5. Filter for data from 2023 onwards
df = df[df['startDate'].dt.year >= 2023]

# 6. Drop irrelevant or high-cardinality columns
df = df.drop(['regionName', 'siteDescription', 'siteReference'], axis=1)

# 7. Convert trafficCount to numeric and drop missing
# Non-numeric entries are coerced to NaN and then removed with the other gaps.
df['trafficCount'] = pd.to_numeric(df['trafficCount'], errors='coerce')
df = df.dropna(subset=['trafficCount'])

# 8. Drop rows with missing key features
df = df.dropna(subset=['laneNumber', 'flowDirection', 'classWeight'])

# 9. Encode classWeight (low cardinality)
# drop_first=True omits the first category's indicator column.
df = pd.get_dummies(df, columns=['classWeight'], drop_first=True)

# 10. Extract date features
df['day'] = df['startDate'].dt.day
df['month'] = df['startDate'].dt.month
df['year'] = df['startDate'].dt.year

# 11. Drop original startDate column
df = df.drop(['startDate'], axis=1)

# 12. Downcast numeric columns to reduce memory usage
# Integer and float columns are downcast within their own kind: applying
# downcast='float' to an int64 column would turn whole numbers into float32
# and make them appear as e.g. "2023.0" in the saved CSV.
for col in df.select_dtypes(include=['int64']).columns:
    df[col] = pd.to_numeric(df[col], downcast='integer')
for col in df.select_dtypes(include=['float64']).columns:
    df[col] = pd.to_numeric(df[col], downcast='float')

# 13. Optional: Save cleaned dataset
df.to_csv('traffic_volume_cleaned_2023.csv', index=False)

# 14. Preview cleaned data
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
df.info()
print(df.head())