In [None]:
import dash
from dash import html
from dash import dcc
from dash.dependencies import Input, Output
import altair as alt
from IPython.display import IFrame
import pandas as pd
import numpy as np
import os
import glob
import zipfile
import shutil
from haversine import haversine, Unit
import matplotlib.pyplot as plt
import dash_bootstrap_components as dbc
from datetime import datetime
import plotly.express as px
import vegafusion

Process Data

In [None]:
def calculate_distance(row):
    """Return the haversine distance of one ride in metres, rounded to 2 decimals.

    `row` must expose 'start_lat', 'start_lng', 'end_lat' and 'end_lng' by key
    (for example a DataFrame row handed over by `DataFrame.apply(..., axis=1)`).
    """
    origin = (row['start_lat'], row['start_lng'])
    destination = (row['end_lat'], row['end_lng'])
    # The distance is requested in kilometres, then scaled to metres before rounding.
    kilometres = haversine(origin, destination, unit=Unit.KILOMETERS)
    return round(kilometres*1000, 2)

# --- Load every raw trip archive into one DataFrame ---------------------------
zip_path = '../data/raw/*.zip'
files = glob.glob(zip_path)

# Scratch directory for the extracted CSVs. It is named before the loop so the
# code below still works (and fails with a clear message) when no archive matches.
temp_dir = 'temp_extracted'
try:
    for zip_file in files:
        with zipfile.ZipFile(zip_file, 'r') as zip_ref:
            zip_ref.extractall(temp_dir)

    path = os.path.join(temp_dir, '*.csv')
    # Sorted so the row order of `data` does not depend on directory listing order.
    csv_files = sorted(glob.glob(path))
    frames = [
        pd.read_csv(file, parse_dates=['started_at','ended_at'])
        for file in csv_files
    ]
finally:
    # Always remove the scratch directory, even if extraction or parsing failed.
    shutil.rmtree(temp_dir, ignore_errors=True)

if not frames:
    raise FileNotFoundError(f"No CSV files found in any archive matching {zip_path!r}")

# Concatenate once instead of growing the frame inside the loop.
data = pd.concat(frames, ignore_index=True)

# --- Clean and derive ride metrics ---------------------------------------------
# Rides without an end coordinate cannot be given a distance.
data.dropna(subset=['end_lat', 'end_lng'], inplace=True)
data['ride_duration'] = round((data['ended_at'] - data['started_at']).dt.total_seconds()/60, 2) # in minutes
data['ride_distance'] = data.apply(calculate_distance, axis=1) # in meters
# Drop rides whose end timestamp precedes their start timestamp.
data.drop(data[data['ride_duration'] < 0].index, inplace=True)

In [None]:
# Count of missing values in each column of the cleaned data.
data.isnull().sum()

In [None]:
# Preview the first five rows of the cleaned data.
data.head(5)

In [None]:
# Number of rides that started in February 2023. The interval is half-open,
# [Feb 1, Mar 1), so rides on Feb 28 and a ride at exactly midnight on Feb 1 are counted.
len(data[(data['started_at'] >= '2023-02-01') & (data['started_at'] < '2023-03-01')])

In [None]:
# Rides per start hour over the whole dataset (columns: 'started_at' = hour, 'count').
# Built with reset_index(name=...) so the result does not depend on what name
# value_counts() happens to give the Series in the installed pandas version.
hourly_data = (
    data['started_at'].dt.hour
    .value_counts()
    .sort_index()
    .rename_axis('started_at')
    .reset_index(name='count')
)

alt.data_transformers.enable('vegafusion')
data['started_at'] = pd.to_datetime(data['started_at'])
data['ended_at'] = pd.to_datetime(data['ended_at'])

# January 2023 only, as a half-open interval so rides on Jan 31 are included.
df = data[(data['started_at'] >= '2023-01-01') & (data['started_at'] < '2023-02-01')]

start_date = df['started_at'].min()
end_date = df['ended_at'].max()

# Compare against the timestamps directly; no round trip through str() is needed.
filtered_df = df[(df['started_at'] >= start_date) & (df['ended_at'] <= end_date)]

# Ride counts by hour of day, restricted to electric and classic bikes.
by_hour = alt.Chart(filtered_df).mark_bar().encode(
    x=alt.X('hours(started_at):O', title=None),
    y=alt.Y('count():Q', axis=alt.Axis(ticks=False), title=None),
    color=alt.Color('rideable_type:N', title=None, scale=alt.Scale(scheme='greenblue')),
    tooltip=['rideable_type:N', 'count():Q']).transform_filter(
        alt.FieldOneOfPredicate(field='rideable_type', oneOf=['electric_bike', 'classic_bike']))

# Ride counts by day of week, all bike types.
by_day = alt.Chart(filtered_df).mark_bar().encode(
    x=alt.X('day(started_at):O', title=None),
    y=alt.Y('count():Q', axis=alt.Axis(ticks=False), title=None),
    color=alt.Color('rideable_type:N')
    )

alt.concat(by_hour | by_day).configure_view(stroke=None)



# by_month = alt.Chart(data).mark_bar().encode(
#     x=alt.X('month(started_at):O'),
#     y=alt.Y('mean(ride_duration):Q'),
#     color=alt.Color('rideable_type:N')
#     )

# by_duration = alt.Chart(data).mark_bar(clip=True).encode(
#     x=alt.X('ride_duration:Q', bin=alt.Bin(maxbins=50,extent=[0,140])),
#     y=alt.Y('count():Q'),
#     color=alt.Color('rideable_type:N'),
#     tooltip=['ride_duration:Q', 'count()']
# ).interactive()


### Map of bike stations

In [None]:
import folium  # IFrame is already imported in the notebook's import cell

number_of_stations = 50 # Number of bike stations we want to show.
df = data # Currently showing all data; this could be narrowed to a single month.

# Ride counts per start station, busiest first.
top_stations = df['start_station_name'].value_counts().head(number_of_stations)
# One row per top station; the first ride seen for a station supplies its coordinates.
top_locations = (
    df[df['start_station_name'].isin(top_stations.index)]
    .drop_duplicates('start_station_name')
    [['start_station_name', 'start_lat', 'start_lng']]
)

# 1-based popularity rank of every top station, computed once instead of per marker.
station_rank = {name: rank for rank, name in enumerate(top_stations.index, start=1)}

# (latitude, longitude, station name, ride count)
points = [
    (lat, lng, name, top_stations[name])
    for name, lat, lng in top_locations.itertuples(index=False, name=None)
]

map_chicago = folium.Map(location=[41.8781, -87.6298], zoom_start=12)

for lat, lng, name, _ride_count in points:
    tooltip_text = f"Rank: {station_rank[name]} - {name}"  # Rank and station name
    folium.Marker(
        location=[lat, lng],
        tooltip=tooltip_text
    ).add_to(map_chicago)

# Saved to disk because the Dash layout later embeds this HTML file.
map_chicago.save('map_chicago.html')
IFrame(src='map_chicago.html', width=800, height=600)


In [None]:
# Set up the app with the Bootstrap theme
alt.data_transformers.enable('vegafusion')
# January 2023 only, as a half-open interval so rides on Jan 31 are included.
df = data[(data['started_at'] >= '2023-01-01') & (data['started_at'] < '2023-02-01')]
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.MINTY])

# Define the layout with Dash Bootstrap Components

# Read the saved station map once, closing the file afterwards. The encoding is
# given explicitly so the read does not depend on the platform default.
with open('map_chicago.html', 'r', encoding='utf-8') as map_file:
    map_html = map_file.read()

app.layout = html.Div(
    dbc.Container(
        [
            html.H1("Bike Sharing Analysis", className="text-primary", style={'text-align': 'center'}),
            # Date range picker, limited to the span covered by `df`
            dbc.Row(
                [
                    dbc.Col(
                        [ 
                            html.Div(
                                [
                                    dcc.DatePickerRange(
                                        id='date-picker-range',
                                        start_date=df['started_at'].min(),
                                        end_date=df['ended_at'].max(),
                                        display_format='YYYY-MM-DD',
                                        className="mt-3",
                                        min_date_allowed=df['started_at'].min(), 
                                        max_date_allowed=df['ended_at'].max()
                                    )
                                ]
                            )
                        ],
                        style={'display': 'flex', 'flexDirection': 'column',  'alignItems': 'center', 'border': '1px solid black'}
                    )
                ]
            ),
            # Section banner: "Rider Trends"
            dbc.Row(
                [
                    html.Div(
                        [
                            dbc.Table(
                                # Table header
                                children=[
                                    html.Tbody(
                                        [
                                            html.Tr(
                                                [
                                                    html.Th(scope='row', className='table-dark', children="Rider Trends"),
                                                ]
                                            ),
                                        ]
                                    )
                                ], 
                                style={'text-align':'center', 'padding':'0vh', 'margin':'0vh'}
                            )
                        ]
                    )
                ]
            ),
            # Chart iframe (filled by the plot_rider_trend callback) next to the bike-type checklist
            dbc.Row(
                [
                    dbc.Col(
                        [
                            html.Iframe(
                                id='rider-trend-bar',
                                srcDoc='',
                                style={'width': '100%', 'height':'100%', 'margin': 'auto', 'border': '1px solid black'}
                            )
                        ], 
                        width=10, 
                        style={'height':'45vh','margin-left': '0px', 'margin-right': '0px', 'paddingRight': '0px'}
                    ),
                    dbc.Col(
                        [
                            dcc.Checklist(
                                id='rider-trend-box',
                                options=[
                                    {'label': "Classic Bike", 'value': 'classic_bike'},
                                    {'label': "Docked Bike", 'value': 'docked_bike'},
                                    {'label': "Electric Bike", 'value': 'electric_bike'}
                                ],
                                value=['classic_bike', 'docked_bike', 'electric_bike'],
                                labelStyle={'display': 'block', 'margin-bottom': '5px'},
                                inputClassName="form-check-input",
                                labelClassName="form-check-label",
                                className="form-check",
                                style={'text-align': 'left', 'width': 'fit-content', 'border': '1px solid black'}
                            )
                        ], 
                        width=2, 
                        style={'margin': 'auto', 'paddingLeft': '0px', 'text-align': 'left'}
                    )
                ]
            ),
            # Metric selector; the option values are Vega-Lite aggregate expressions
            dbc.Row(
                [
                    dcc.RadioItems(
                        id='rider-trend-radio',
                        options=[
                            {'label': "Number of Rides", 'value': 'count()'},
                            {'label': "Average Duration", 'value': 'mean(ride_duration)'}
                        ],
                        value='count()',
                        labelStyle={'display': 'inline-block', 'margin-left': '20px', 'margin-right': '20px'},
                        inputClassName="form-check-input",
                        labelClassName="form-check-label",
                        className="form-check",
                        style={'text-align': 'center', 'margin': 'auto', 'width': 'fit-content', 'border': '1px solid black'}
                    )
                ], 
                style={'text-align': 'center'}
            ),
            # Empty spacer between the two sections. dash_bootstrap_components has
            # no `div` component, so the plain HTML Div is used here.
            html.Div(),
            # Section banner: "Map of Stations"
            dbc.Row(
                [
                    html.Div(
                        [
                            dbc.Table(
                                # Table header
                                children=[
                                    html.Tbody(
                                        [
                                            html.Tr(
                                                [
                                                    html.Th(scope='row', className='table-dark', children="Map of Stations"),
                                                ]
                                            ),
                                        ]
                                    )
                                ], 
                                style={'text-align':'center', 'padding':'0vh', 'margin':'0vh'}
                            )
                        ]
                    )
                ]
            ),
            # Station map embedded from the saved HTML file
            dbc.Row(
                [
                    dbc.Col(
                        [
                            html.Iframe(
                                id='map-iframe',
                                srcDoc=map_html,
                                style={'width': '100%', 'height':'45vh', 'border': '1px solid black'}
                            )
                        ], 
                        width=10, 
                        style={'height':'45vh','margin-left': '0px', 'margin-right': '0px', 'paddingRight': '0px'}
                    )
                ]
            )
        ]
    )
)

@app.callback(
    Output('rider-trend-bar', 'srcDoc'),
    [Input('rider-trend-radio', 'value'),
     Input('rider-trend-box', 'value'),
     Input('date-picker-range', 'start_date'),
     Input('date-picker-range', 'end_date')]
)
def plot_rider_trend(func, cat, start_date, end_date):
    """Render the rider-trend bar charts as an HTML document for the iframe.

    Parameters
    ----------
    func : str
        Vega-Lite aggregate expression plotted on the y axis; the radio buttons
        supply 'count()' or 'mean(ride_duration)'.
    cat : list of str
        `rideable_type` values to keep, from the checklist.
    start_date, end_date : str or None
        Date-picker values. Both days are treated as inclusive.

    Returns
    -------
    str
        A standalone HTML page that embeds the chart with vega-embed, or
        `dash.no_update` while either date is cleared.
    """
    # A cleared picker sends None; keep the current chart instead of filtering on 'None'.
    if start_date is None or end_date is None:
        return dash.no_update

    # The picker sends bare dates once the user picks one. Comparing against the
    # start of the following day keeps rides that end on the chosen end day.
    first_day = pd.to_datetime(start_date).normalize()
    day_after_last = pd.to_datetime(end_date).normalize() + pd.Timedelta(days=1)
    filtered_df = df[(df['started_at'] >= first_day) & (df['ended_at'] < day_after_last)]

    # Pieces shared by both charts.
    selected_types = alt.FieldOneOfPredicate(field='rideable_type', oneOf=cat or [])
    y_axis = alt.Y(f'{func}:Q', axis=alt.Axis(ticks=False), title=None)
    hover = [
        alt.Tooltip('rideable_type:N', title='Ride Type'),
        alt.Tooltip(f'{func}:Q', title=None)
    ]

    by_hour = alt.Chart(filtered_df).mark_bar().encode(
        x=alt.X('hours(started_at):O', title=None),
        y=y_axis,
        color=alt.Color('rideable_type:N', title=None, scale=alt.Scale(scheme='viridis')),
        tooltip=hover
    ).transform_filter(selected_types)

    by_day = alt.Chart(filtered_df).mark_bar().encode(
        x=alt.X('day(started_at):O', title=None),
        y=y_axis,
        color=alt.Color('rideable_type:N', scale=alt.Scale(scheme='viridis')),
        tooltip=hover
    ).transform_filter(selected_types)

    chart_1 = alt.concat(by_hour | by_day).configure_view(stroke=None)
    # format='vega' asks for a compiled Vega spec rather than a Vega-Lite one.
    vega_spec_json = chart_1.to_json(format='vega')

    chart_obj = f"""
        <!DOCTYPE html>
        <html>
        <head>
            <script src="https://cdn.jsdelivr.net/npm/vega@5"></script>
            <script src="https://cdn.jsdelivr.net/npm/vega-lite@5"></script>
            <script src="https://cdn.jsdelivr.net/npm/vega-embed@6"></script>
        </head>
        <body>
        <center>
            <div id="altair-chart-container"></div>
            <script>
                var spec = {vega_spec_json};
                vegaEmbed('#altair-chart-container', spec);
            </script>
        </center>
        </body>
        </html>
    """
    return chart_obj


# Start the Dash server for the rider-trend dashboard, with debug mode enabled.
if __name__ == '__main__':
    app.run_server(debug=True)


Jacob's Heat Map

In [None]:
# Set up the app with the Bootstrap theme
alt.data_transformers.enable('vegafusion')
# January 2023 only, as a half-open interval so rides on Jan 31 are included.
df = data[(data['started_at'] >= '2023-01-01') & (data['started_at'] < '2023-02-01')]
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.MINTY])

# Names of the 20 start stations with the most rides in `df`, busiest first.
top20_start = df['start_station_name'].value_counts().head(20).index.values

# Page layout: the heat-map frame on top, followed by its two controls.

# Frame whose srcDoc is filled in by the plot_altair callback.
heat_frame = html.Iframe(
    id='heat',
    style={'border-width': '0', 'width': '100%', 'height': '400px'},
)

# dropdown to select the stations to visualize
station_options = [{"label": name, "value": name} for name in top20_start]
station_dropdown = dcc.Dropdown(
    id="station-select",
    options=station_options,
    value=top20_start[:10],
    multi=True,
)

# Which view of the stations the heat map shows.
# (The Bootstrap form-check class names are intentionally not applied here.)
heat_mode_radio = dcc.RadioItems(
    id='heatmap-radio',
    options=[
        {'label': "Start Station", 'value': 'start'},
        {'label': "End Station", 'value': 'end'},
        {'label': "Net Bikes", 'value': 'net'},
    ],
    value='start',
    labelStyle={'display': 'inline-block', 'margin-left': '20px', 'margin-right': '20px'},
    style={'text-align': 'center', 'margin': 'auto', 'width': 'fit-content', 'border': '1px solid black'},
)

app.layout = html.Div(children=[heat_frame, station_dropdown, heat_mode_radio])

# Set up callbacks/backend
@app.callback(
    Output('heat', 'srcDoc'),
    Input('station-select', 'value'),
    Input('heatmap-radio', 'value')
    )
def plot_altair(stations, heat_type):
    """Render the station-by-hour heat map as an HTML document for the iframe.

    Parameters
    ----------
    stations : list of str or None
        Station names chosen in the dropdown.
    heat_type : str
        'start' counts rides by start station and start hour, 'end' counts rides
        by end station and end hour, 'net' shows arrivals minus departures per
        station and hour.

    Returns
    -------
    str
        A standalone HTML page that embeds the chart with vega-embed, or a short
        HTML notice when no station is selected.

    Raises
    ------
    ValueError
        If `heat_type` is not one of the three supported values.
    """
    # Nothing selected: there is nothing to chart, and the 'net' branch would
    # otherwise build a colour domain from an empty frame.
    if not stations:
        return "<!DOCTYPE html><html><body><center>Select at least one station.</center></body></html>"

    # All three views read from `df`, the same frame the station list was built from.
    if heat_type == 'start':

        heatmap = alt.Chart(df[df['start_station_name'].isin(stations)]).mark_rect().encode(
            x=alt.X('hours(started_at):O', title=None),
            y='start_station_name:N',
            color='count()',
            tooltip=['start_station_name:N','hours(started_at):O', 'count():Q']
            )

    elif heat_type == 'end':

        heatmap = alt.Chart(df[df['end_station_name'].isin(stations)]).mark_rect().encode(
            x=alt.X('hours(ended_at):O', title=None),
            y='end_station_name:N',
            color='count()',
            tooltip=['end_station_name:N','hours(ended_at):O', 'count():Q']
            )

    elif heat_type == 'net':

        # Departures and arrivals per (hour of day, station).
        starts = df[df['start_station_name'].isin(stations)]
        ends = df[df['end_station_name'].isin(stations)]
        start_values = (
            starts.groupby([starts['started_at'].dt.hour, starts['start_station_name']])
            .size()
            .rename_axis(['time', 'station_name'])
        )
        end_values = (
            ends.groupby([ends['ended_at'].dt.hour, ends['end_station_name']])
            .size()
            .rename_axis(['time', 'station_name'])
        )

        # fill_value=0 keeps hours that only have arrivals or only departures;
        # a plain subtraction would turn those cells into NaN.
        net_bikes = end_values.sub(start_values, fill_value=0).rename('value').reset_index()

        # get min/max values for the diverging colour scale
        diff_min = float(net_bikes['value'].min())
        diff_max = float(net_bikes['value'].max())

        # make heatmap
        heatmap = alt.Chart(net_bikes).mark_rect().encode(
            x=alt.X('time:O', title=None),
            y='station_name:N',
            color=alt.Color('value:Q', scale=alt.Scale(range=['#D4322C', 'white', '#4A74B4'], domainMid=0, domain=[diff_min, diff_max])).title("Net Bikes at Station"),
            # The hour lives in the 'time' column of net_bikes.
            tooltip= [alt.Tooltip('station_name:N', title="Station Name"), alt.Tooltip('time:O', title="Time of Day"), alt.Tooltip('value:Q', title='Net Bikes')]
        )

    else:
        raise ValueError(f"Unknown heat_type: {heat_type!r}")

    # format='vega' asks for a compiled Vega spec rather than a Vega-Lite one.
    vega_spec_json = heatmap.to_json(format='vega')

    chart_obj = f"""
        <!DOCTYPE html>
        <html>
        <head>
            <script src="https://cdn.jsdelivr.net/npm/vega@5"></script>
            <script src="https://cdn.jsdelivr.net/npm/vega-lite@5"></script>
            <script src="https://cdn.jsdelivr.net/npm/vega-embed@6"></script>
        </head>
        <body>
        <center>
            <div id="altair-chart-container"></div>
            <script>
                var spec = {vega_spec_json};
                vegaEmbed('#altair-chart-container', spec);
            </script>
        </center>
        </body>
        </html>
    """
    return chart_obj


# Start the Dash server for the heat-map dashboard, with debug mode disabled.
if __name__ == '__main__':
    app.run_server(debug=False)
