### Import the necessary libraries

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from helpers.utils import load_data

### Load the datasets

In [2]:
# load_data is the project helper imported from helpers.utils. Its return value is
# unpacked into three objects: `traffic` (used below as a DataFrame), `stations`,
# and `mappings` (used below as a dict-like of code -> name lookups).
traffic, stations, mappings = load_data()

### Data Cleaning

#### Let's check for duplicate rows in the traffic dataset

In [3]:
# Flag every row that repeats an earlier row across all of its columns
is_duplicate = traffic.duplicated()
duplicates = traffic.loc[is_duplicate]

print(f"Traffic dataset contains {len(duplicates)} duplicate rows")
duplicates.head()

Traffic dataset contains 750678 duplicate rows


Unnamed: 0,date,day_of_data,day_of_week,direction_of_travel,fips_state_code,functional_classification,month_of_data,station_id,traffic_volume_counted_after_0000_to_0100,traffic_volume_counted_after_0100_to_0200,...,traffic_volume_counted_after_1400_to_1500,traffic_volume_counted_after_1500_to_1600,traffic_volume_counted_after_1600_to_1700,traffic_volume_counted_after_1700_to_1800,traffic_volume_counted_after_1800_to_1900,traffic_volume_counted_after_1900_to_2000,traffic_volume_counted_after_2000_to_2100,traffic_volume_counted_after_2100_to_2200,traffic_volume_counted_after_2200_to_2300,traffic_volume_counted_after_2300_to_2400
9440,2015-11-02,2,2,1,54,3R,11,16,25,23,...,136,157,176,238,210,130,77,53,45,28
12434,2015-01-21,21,4,5,29,2U,1,4913,49,26,...,351,432,887,1407,1546,1015,377,243,193,115
14835,2015-08-09,9,1,7,54,4U,8,45,23,20,...,260,300,200,187,219,159,136,108,45,34
14888,2015-04-25,25,7,1,13,3U,4,222,27,20,...,252,258,265,258,213,172,122,93,75,50
16694,2015-04-06,6,2,3,39,1R,4,155,184,121,...,893,923,932,898,731,573,476,373,323,230


Given that it is very unlikely for a data row to be exactly identical to another across all of its 32 columns (including the traffic volume for every time interval), it is highly likely that these rows are genuine duplicate entries arising from errors in the data collection/collation process. As such, let's drop these duplicate rows from our subsequent analysis.

In [4]:
# Keep the first occurrence of each row and renumber the index from 0
traffic = traffic.drop_duplicates(keep="first", ignore_index=True)

rows_remaining = len(traffic)
print(f"Number of rows left in unduplicated traffic dataset: {rows_remaining}")

Number of rows left in unduplicated traffic dataset: 6389713


#### Let's also check for invalid values in the traffic volume columns (i.e. negative values)

In [5]:
# First and last traffic volume columns; every column between them (inclusive)
# is selected with a label-based slice.
start_column = "traffic_volume_counted_after_0000_to_0100"
end_column = "traffic_volume_counted_after_2300_to_2400"

# True for each row that has at least one negative value in the volume columns
has_negative = traffic.loc[:, start_column:end_column].lt(0).any(axis=1)
invalid = traffic[has_negative]

print(f"There are {len(invalid)} rows containing negative values for traffic volume")
invalid.head()

There are 851 rows containing negative values for traffic volume


Unnamed: 0,date,day_of_data,day_of_week,direction_of_travel,fips_state_code,functional_classification,month_of_data,station_id,traffic_volume_counted_after_0000_to_0100,traffic_volume_counted_after_0100_to_0200,...,traffic_volume_counted_after_1400_to_1500,traffic_volume_counted_after_1500_to_1600,traffic_volume_counted_after_1600_to_1700,traffic_volume_counted_after_1700_to_1800,traffic_volume_counted_after_1800_to_1900,traffic_volume_counted_after_1900_to_2000,traffic_volume_counted_after_2000_to_2100,traffic_volume_counted_after_2100_to_2200,traffic_volume_counted_after_2200_to_2300,traffic_volume_counted_after_2300_to_2400
22298,2015-03-08,8,1,3,51,5R,3,130018,10,7,...,247,270,270,238,162,113,49,43,19,2
32468,2015-03-08,8,1,3,51,4R,3,150326,14,5,...,96,78,58,71,75,63,60,21,25,16
43542,2015-03-08,8,1,5,51,1R,3,781229,8,-1,...,40,50,44,36,22,26,28,18,16,3
45564,2015-03-08,8,1,3,51,1U,3,40766,667,-1,...,2733,2899,2886,2759,2709,2161,1698,1295,877,602
55491,2015-03-08,8,1,3,51,1U,3,90122,427,-1,...,689,682,704,678,586,530,451,294,210,135


#### Let's inspect what the negative values are and their respective counts

In [9]:
# Volume columns of the invalid rows as a plain array
invalid_traffic_volume = invalid.loc[:, start_column:end_column].to_numpy()

# Distinct negative values and how often each one occurs
is_negative = invalid_traffic_volume < 0
negative_values, counts = np.unique(invalid_traffic_volume[is_negative], return_counts=True)

print("Value: Count")
for value, count in zip(negative_values, counts):
    # Right-align the value in a 5-character field
    print(f"{value!s:>5}: {count}")

Value: Count
-3061: 1
   -1: 1448


In most cases (1448 out of 1449), the negative value was -1. These negative values could be because the sensors were down for that particular time interval. Let's assume that all of the negative values (including the single -3061) were indeed due to sensor downtime. In such a scenario, no traffic would have been recorded for that interval (i.e. traffic volume = 0). As such, let's replace the negative values with 0.

In [10]:
# Select the traffic volume columns by label (start_column .. end_column) instead of
# by position: `traffic.columns[-24:]` only points at the volume columns while they
# happen to be the last 24 columns, which stops being true once columns are appended
# to `traffic` and this cell is re-run.
volume_columns = traffic.loc[:, start_column:end_column].columns

# Floor every count at 0: negative values become 0, all other values are unchanged.
traffic[volume_columns] = traffic[volume_columns].clip(lower=0)

# Sanity check: no negative traffic volume may remain after the cleaning step.
assert not (traffic[volume_columns] < 0).any().any(), "Negative traffic volumes remain."

#### Let's add new columns to house the full name representation of the categorical variables (e.g. fips_state_code)

In [39]:
def add_new_map_column(key: str, source: str) -> None:
    """
        Creates a new column in traffic dataframe to house the full name representations (obtained from mappings)
        of a categorical variable.

        The new column is named after key, or f"{key}_map" when a column named key already exists.
        
        Args
        ----------
        key: str
            key of the mapping in mappings
            
        source: str
            source column to map from
            
        Returns
        ----------
        None
            the traffic dataframe is modified in place

        Raises
        ----------
        KeyError
            if key is not in mappings, or if a value of the source column has no entry in the mapping

        AssertionError
            if the column to be created already exists
    """
    # Fail early with a readable message instead of letting a missing mapping
    # surface later as "'NoneType' object is not subscriptable".
    if key not in mappings:
        raise KeyError(f"No mapping named {key!r}; available mappings: {list(mappings.keys())}")
    mapping = mappings[key]
    
    newcol = key if key not in traffic.columns else f"{key}_map"
    assert newcol not in traffic.columns, f"The column {newcol} already exists."
    
    codes = traffic[source]
    # Look up each distinct code once (raising KeyError for a code without an entry),
    # then let pandas broadcast the result instead of calling a lambda per row.
    lookup = {code: mapping[code] for code in codes.unique().tolist()}
    traffic[newcol] = codes.map(lookup)
    print(f"The column {newcol} was successsfully added to the traffic dataframe.")

In [40]:
# Pair every mapping key with the traffic column that holds its codes explicitly,
# rather than relying on the iteration order of `mappings` lining up with a
# separate list (zip would also silently truncate on a length mismatch).
column_sources = {
    "fips_state_abb": "fips_state_code",
    "fips_state_full": "fips_state_code",
    "direction_of_travel": "direction_of_travel",
    "functional_classification": "functional_classification",
}

# Kept under its original name in case another cell reads it.
source_columns = list(column_sources.values())

# Add the new mapping columns
for key, source in column_sources.items():
    add_new_map_column(key, source)

The column fips_state_abb was successsfully added to the traffic dataframe.
The column fips_state_full was successsfully added to the traffic dataframe.
The column direction_of_travel_map was successsfully added to the traffic dataframe.
The column functional_classification_map was successsfully added to the traffic dataframe.


### Metric to measure traffic volume

#### Let's find out the number of days of data that each state has with a box plot

In [42]:
columns = ["date", "fips_state_full"]

# Count the non-null dates recorded for each state
num_days = (
    traffic[columns]
    .groupby("fips_state_full")["date"]
    .count()
    .rename("num_days")
    .reset_index()
)

# Box plot of the per-state counts; every state is drawn as a point and
# hovering over a point reveals the state's name.
px.box(
    num_days,
    y="num_days",
    points="all",
    hover_data=["fips_state_full"],
    labels={"num_days": "Number of days of data"},
)

Based on the box plot above, it could be observed that the number of days of data that each state has varies greatly. (Each row of the traffic dataset is counted as one "day of data", which is why a state can have far more than 365 of them.) On the low end, we have District of Columbia with only 2,795 days of data, while on the high end, we have Florida (fips_state_code = 12) with 551,220 days of data. Magnitude-wise, the highest number of days of data is approximately 197 times the lowest. <br>

As such, total traffic volume would not be a fair metric to use for comparing the traffic volume across each state. This is because states with more days of data are more likely to also have higher total traffic volume, just by virtue of having more days of data. <br>

#### With that in mind, let's use average daily traffic volume instead to measure traffic volume, since it is normalized by the number of days of data and hence resolves the aforementioned issue.

### Top 5 Patterns

#### Let's see if there is any pattern between the average traffic volume and the geographical location of the state

In [49]:
# Find total daily traffic volume in each row
traffic["total_daily_traffic_volume"] = traffic \
                                        .loc[:, start_column: end_column] \
                                        .sum(axis=1)

columns = ["fips_state_abb", "fips_state_full", "total_daily_traffic_volume"]
group_columns = ["fips_state_abb", "fips_state_full"]

# Average by state
traffic_state = traffic[columns].groupby(group_columns) \
                                .mean() \
                                .reset_index() \
                                .rename(columns = {"total_daily_traffic_volume": "average_daily_traffic_volume"})

traffic_state.average_daily_traffic_volume = traffic_state.average_daily_traffic_volume.astype(int)

#### Visualize the average daily traffic volume by state with a choropleth map

In [55]:
# Create choropleth map of the US states and their average daily traffic volumes
fig = go.Figure()

fig.add_trace(go.Choropleth(
    locations=traffic_state.fips_state_abb,
    z=traffic_state.average_daily_traffic_volume,
    locationmode="USA-states",
    colorscale="Reds",
    autocolorscale=False,
    colorbar_title="Traffic Vol",
    text=traffic_state.fips_state_full,
    marker_line_color="white",
))

fig.update_layout(
    title="2015 US Average Daily Traffic Volume by State",
    geo = dict(
        scope="usa", # Limit map scope to USA
        projection=go.layout.geo.Projection(type = 'albers usa'),
        showlakes=True, # lakes
        lakecolor="rgb(255, 255, 255)"),
    )

### Pattern 1:
From the choropleth map above, it could be observed that in general, the coastal/border states of conterminous US have higher average daily traffic volume as compared to the inland states.

#### Let's see if there is any pattern between the average daily traffic volume and the time of the day

In [78]:
# Volume columns plus the state name used for grouping
columns = [*traffic.loc[:, start_column:end_column].columns, "fips_state_full"]

# Mean of every volume column per state, truncated to whole numbers
state_interval_means = traffic[columns].groupby("fips_state_full").mean().astype(int)

# Transpose so that each row is a time interval and each column is a state.
# The interval's column name becomes time_of_day_name, and its position
# (0 for traffic_volume_counted_after_0000_to_0100, 1 for
# traffic_volume_counted_after_0100_to_0200, ...) becomes time_of_day.
traffic_time = (
    state_interval_means.T
    .rename_axis("time_of_day_name")
    .reset_index()
    .rename_axis("time_of_day")
    .reset_index()
)

# Drop the leftover "fips_state_full" label on the column axis
traffic_time.columns.name = None

#### Visualize the average daily traffic volume by state and time of the day with a line plot

In [88]:
# Every column of traffic_time other than the two time-of-day columns is a state
time_columns = {"time_of_day", "time_of_day_name"}
state_columns = [name for name in traffic_time.columns if name not in time_columns]

# One line per state
fig = go.Figure(data=[
    go.Scatter(
        x=traffic_time["time_of_day"],
        y=traffic_time[state],
        mode="lines",
        name=state,
    )
    for state in state_columns
])

# Set number of ticks to 24 for x axis
fig.update_xaxes(nticks=24)

fig.update_layout(
    title="2015 US Average Daily Traffic Volume by State and Time of the day",
    xaxis_title="Time of the day",
    yaxis_title="Average Daily Traffic Volume",
    legend_title_text="States",
)

fig.show()

print("To view the line plot for a specific state, double click on that state in the legend.")

# Lookup table from the numeric x-axis value to the interval it stands for
traffic_time[["time_of_day", "time_of_day_name"]]

To view the line plot for a specific state, double click on that state in the legend.


Unnamed: 0,time_of_day,time_of_day_name
0,0,traffic_volume_counted_after_0000_to_0100
1,1,traffic_volume_counted_after_0100_to_0200
2,2,traffic_volume_counted_after_0200_to_0300
3,3,traffic_volume_counted_after_0300_to_0400
4,4,traffic_volume_counted_after_0400_to_0500
5,5,traffic_volume_counted_after_0500_to_0600
6,6,traffic_volume_counted_after_0600_to_0700
7,7,traffic_volume_counted_after_0700_to_0800
8,8,traffic_volume_counted_after_0800_to_0900
9,9,traffic_volume_counted_after_0900_to_1000


Interpretation of the line plot above should be based on the time_of_day mapping above (i.e. 0 represents traffic_volume_counted_after_0000_to_0100, 1 represents traffic_volume_counted_after_0100_to_0200, and so on). This means that when we look at the average daily traffic volume at time_of_day = 0, we are actually looking at the average traffic volume from 0000 hrs to 0100 hrs.

### Pattern 2:
In general, the following traffic patterns were observed across all the US states:
<br>
<br>&emsp;1) Average traffic volume starts increasing at around 4am. This increasing trend persists until around 8am.
<br>&emsp;2) Average traffic volume starts increasing again at around 10am. This increasing trend persists until around 5pm.
<br>&emsp;3) Average traffic volume then starts decreasing at around 5pm. This decreasing trend persists until around 3am.

#### Let's see if there is any pattern between the average daily traffic volume and the month