In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the raw 2014-2017 traffic counts and immediately discard the two extra
# road-segment columns: they are mostly missing (only about 5k rows carry data)
# and, per the dataset's data dictionary, are not believed to add useful insight.
df = pd.read_csv("traffic-count-vehicle-classification-2014-2017.csv").drop(
    columns=["road_segment_1", "road_segment_2"]
)

Cleaning of "traffic-count-vehicle-classification-2014-2017" data

In [5]:
###Code by Logan Guilding###

#Remove 112 samples where no vehicle data was captured, then the rows with
#missing values in the speed related columns (about 5k rows, roughly 8% of data).
#Should we use KNN instead and generate values?
#.copy() gives df1 its own data, so the column assignments below cannot raise
#pandas' SettingWithCopyWarning for writing into a frame derived from df.
df1 = (
    df.dropna(subset = ["vehicle_class_1"])
      .dropna(subset = ["average_speed", "85th_percentile_speed"])
      .copy()
)

#Replacing nan values in these columns with 0 vehicles
df1[["motorcycle", "bike"]] = df1[["motorcycle", "bike"]].fillna(0)

#Convert date and time from str to datetime
df1['date'] = pd.to_datetime(df1['date'], format='%Y-%m-%d')
df1['time'] = pd.to_datetime(df1['time'], format='%H:%M').dt.time

#Max speed: replace "-" values with nan, convert the column to numeric,
#then fill the remaining nans with the column median (not the mean).
maximum_speed = pd.to_numeric(df1['maximum_speed'].replace('-', np.nan))
df1['maximum_speed'] = maximum_speed.fillna(maximum_speed.median())

#Preview the cleaned frame before the string columns are replaced by integer labels
print(df1.head())

#Encoding string features into labels.
#One fitted encoder is kept per column in label_encoders so the integer codes
#can be mapped back later via label_encoders[column].inverse_transform(...).
label_encoders = {}
for column in ["road_name", "location", "suburb", "direction"]:
    label_encoders[column] = LabelEncoder()
    df1[column] = label_encoders[column].fit_transform(df1[column])

        date        road_name                    location      suburb  \
0 2014-10-11  Wolseley Parade  North East of Derby Street  Kensington   
1 2014-10-11  Wolseley Parade  North East of Derby Street  Kensington   
2 2014-10-11  Wolseley Parade  North East of Derby Street  Kensington   
3 2014-11-11  Wolseley Parade  North East of Derby Street  Kensington   
5 2014-11-11  Wolseley Parade  North East of Derby Street  Kensington   

   speed_limit direction      time  vehicle_class_1  vehicle_class_2  \
0           50         W  16:00:00             24.0              0.0   
1           50         W  17:00:00             42.0              0.0   
2           50         W  18:00:00             33.0              0.0   
3           50         W  02:00:00              3.0              0.0   
5           50         W  05:00:00              1.0              0.0   

   vehicle_class_3  ...  vehicle_class_10  vehicle_class_11  vehicle_class_12  \
0              0.0  ...               0.0      

Cleaning of "road-corridors" data

In [6]:
###Code by Nauman Abid###

# Load the road-corridor segments, discard rows that have no segment
# description ('seg_descr'), and drop the 'Geo Shape' column.
# Done as one chain so re-running the cell always starts from the file on disk.
df2 = (
    pd.read_csv("road-corridors.csv")
    .dropna(subset=['seg_descr'])
    .drop(columns=['Geo Shape'])
)


# Function to split 'Geo Point' into latitude and longitude
def split_geo_point(geo_point):
    """Split a "latitude,longitude" string into a pair of floats.

    Parameters
    ----------
    geo_point : str
        Text containing exactly two comma-separated numbers.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` as floats, or ``(pd.NA, pd.NA)`` when the
        value cannot be parsed: non-numeric text, a wrong number of
        comma-separated parts, or a non-string value such as NaN or None.
    """
    try:
        latitude, longitude = map(float, geo_point.split(','))
    except (ValueError, AttributeError):
        # ValueError: a part is not a number, or there are not exactly two parts.
        # AttributeError: geo_point has no .split (e.g. a missing value read as
        # float NaN), which would otherwise abort the whole column apply.
        return pd.NA, pd.NA
    return latitude, longitude

# Parse every 'Geo Point' with split_geo_point and build both coordinate columns
# from the resulting (latitude, longitude) tuples in one DataFrame, rather than
# constructing a separate pd.Series for each row inside apply().
geo_coordinates = pd.DataFrame(
    df2['Geo Point'].map(split_geo_point).tolist(),
    index=df2.index,  # keep row alignment with df2 after the earlier dropna
    columns=['Latitude', 'Longitude'],
)
df2[['Latitude', 'Longitude']] = geo_coordinates

# Remove the 'Geo Point' column and the others as we have street info on the other dataset
df2 = df2.drop(columns=['Geo Point', 'str_type', 'seg_descr'])

# Display the updated DataFrame
df2.head()

Unnamed: 0,seg_id,dtupdate,status_id,poly_area,gisid,street_id,seg_part,Latitude,Longitude
0,21428,20210923,1,901,1136,0,1,-37.807205,144.950775
1,20395,20210923,3,481,3329,1116,1,-37.818487,144.96164
2,20734,20210923,3,424,1538,1128,1,-37.806575,144.966704
3,30258,20210923,4,78,192,1555,1,-37.79587,144.965727
4,21429,20210923,2,6118,1137,1089,1,-37.807839,144.94973


Combining of cleaned data

In [7]:
# Left-join the road-corridor attributes onto the cleaned traffic counts:
# df1.road_segment is matched against df2.seg_id, and every traffic row is kept
# even when no corridor record matches.
merged_df = df1.merge(
    df2,
    how='left',
    left_on='road_segment',
    right_on='seg_id',
)

merged_df.head()

Unnamed: 0,date,road_name,location,suburb,speed_limit,direction,time,vehicle_class_1,vehicle_class_2,vehicle_class_3,...,road_segment,seg_id,dtupdate,status_id,poly_area,gisid,street_id,seg_part,Latitude,Longitude
0,2014-10-11,110,105,5,50,3,16:00:00,24.0,0.0,0.0,...,21727,21727,20210923,2,5235,2301,1186,1,-37.796238,144.929383
1,2014-10-11,110,105,5,50,3,17:00:00,42.0,0.0,0.0,...,21727,21727,20210923,2,5235,2301,1186,1,-37.796238,144.929383
2,2014-10-11,110,105,5,50,3,18:00:00,33.0,0.0,1.0,...,21727,21727,20210923,2,5235,2301,1186,1,-37.796238,144.929383
3,2014-11-11,110,105,5,50,3,02:00:00,3.0,0.0,0.0,...,21727,21727,20210923,2,5235,2301,1186,1,-37.796238,144.929383
4,2014-11-11,110,105,5,50,3,05:00:00,1.0,0.0,0.0,...,21727,21727,20210923,2,5235,2301,1186,1,-37.796238,144.929383


In [None]:
# Write the merged frame to disk; index=False keeps the row index out of the CSV.
merged_df.to_csv(path_or_buf='merged_traffic_data.csv', index=False)