In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the raw traffic counts and immediately discard the two road-segment
# columns: they are almost entirely missing (only ~5k rows carry a value) and,
# going by the dataset dictionary, are not expected to add useful insight.
df = (
    pd.read_csv("traffic-count-vehicle-classification-2014-2017.csv")
    .drop(columns=["road_segment_1", "road_segment_2"])
)

In [2]:
# Drop the rows that have no usable measurements. The explicit .copy() makes
# df1 an independent frame, so the column assignments below act on df1 itself
# and never on a view of df.
df1 = (
    df
    # Remove 112 samples where no vehicle data was captured
    .dropna(subset=["vehicle_class_1"])
    # Remove ~5k rows (about 8% of the data) with missing speed values.
    # TODO: consider KNN imputation instead of dropping these rows.
    .dropna(subset=["average_speed", "85th_percentile_speed"])
    .copy()
)

# Treat missing motorcycle/bike counts as 0 vehicles
df1[["motorcycle", "bike"]] = df1[["motorcycle", "bike"]].fillna(0)

# Convert date and time from str to datetime.
# NOTE: 'time' is parsed from HH:MM only, so every value carries the
# placeholder date 1900-01-01 (visible in the printed preview).
df1['date'] = pd.to_datetime(df1['date'], format='%Y-%m-%d')
df1['time'] = pd.to_datetime(df1['time'], format='%H:%M')

# maximum_speed uses "-" as a missing-value marker: turn it into NaN, convert
# the column to numeric, then fill the remaining gaps with the column MEDIAN.
# No existence check is needed: a missing column raises KeyError right here.
df1['maximum_speed'] = pd.to_numeric(df1['maximum_speed'].replace('-', np.nan))
max_speed_median = df1['maximum_speed'].median()
df1['maximum_speed'] = df1['maximum_speed'].fillna(max_speed_median)

# Preview the cleaned frame before the string columns are label-encoded
print(df1.head())

# Encode string features into integer labels; the encoder is re-fitted for
# each column, so every column gets its own independent label mapping.
categorical_cols = ["road_name", "location", "suburb", "direction"]
df1[categorical_cols] = df1[categorical_cols].apply(LabelEncoder().fit_transform)

        date        road_name                    location      suburb  \
0 2014-10-11  Wolseley Parade  North East of Derby Street  Kensington   
1 2014-10-11  Wolseley Parade  North East of Derby Street  Kensington   
2 2014-10-11  Wolseley Parade  North East of Derby Street  Kensington   
3 2014-11-11  Wolseley Parade  North East of Derby Street  Kensington   
5 2014-11-11  Wolseley Parade  North East of Derby Street  Kensington   

   speed_limit direction                time  vehicle_class_1  \
0           50         W 1900-01-01 16:00:00             24.0   
1           50         W 1900-01-01 17:00:00             42.0   
2           50         W 1900-01-01 18:00:00             33.0   
3           50         W 1900-01-01 02:00:00              3.0   
5           50         W 1900-01-01 05:00:00              1.0   

   vehicle_class_2  vehicle_class_3  ...  vehicle_class_10  vehicle_class_11  \
0              0.0              0.0  ...               0.0               0.0   
1         

In [4]:
# Persist the cleaned frame; index=False keeps the row index out of the file.
df1.to_csv(
    path_or_buf="traffic_count_cleaned_data.csv",
    index=False,
)

In [3]:
# Sanity check after cleaning: report the Python type of the first value in
# every column to confirm the conversions above took effect.
for column, values in df1.items():
    first_value_type = type(values.iloc[0])
    print(f"Column: {column}, Type of first element: {first_value_type}")


Column: date, Type of first element: <class 'pandas._libs.tslibs.timestamps.Timestamp'>
Column: road_name, Type of first element: <class 'numpy.int64'>
Column: location, Type of first element: <class 'numpy.int64'>
Column: suburb, Type of first element: <class 'numpy.int64'>
Column: speed_limit, Type of first element: <class 'numpy.int64'>
Column: direction, Type of first element: <class 'numpy.int64'>
Column: time, Type of first element: <class 'pandas._libs.tslibs.timestamps.Timestamp'>
Column: vehicle_class_1, Type of first element: <class 'numpy.float64'>
Column: vehicle_class_2, Type of first element: <class 'numpy.float64'>
Column: vehicle_class_3, Type of first element: <class 'numpy.float64'>
Column: vehicle_class_4, Type of first element: <class 'numpy.float64'>
Column: vehicle_class_5, Type of first element: <class 'numpy.float64'>
Column: vehicle_class_6, Type of first element: <class 'numpy.float64'>
Column: vehicle_class_7, Type of first element: <class 'numpy.float64'>
Co