In [28]:
%%capture
# Import the needed libraries 
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
pd.set_option('display.max_columns', 999)
import matplotlib.ticker as ticker
import matplotlib.animation as animation
from IPython.display import HTML
import plotly.express as px
from datetime import datetime
from bokeh.plotting import figure, output_file, show
import seaborn as sns
from scipy import stats
from IPython.display import HTML
pd.options.display.float_format = '{:.4f}'.format
pd.set_option('display.max_rows', 400)

# Read needed DFs from CSV files generated in 'Preparing the data' script.
# The paths are relative, so the files are resolved against the kernel's
# current working directory.
# NOTE(review): bike_location_data is not referenced by any other cell
# visible in this notebook - confirm it is still needed.
bike_journey_data=pd.read_csv('bike_journey_data.csv')
weather_data=pd.read_csv('weather_data_27_mar_2020.csv')
bike_location_data=pd.read_csv('bike_location_data.csv')

In [29]:
%%capture
# Constant helper column: every row gets c = 1, so summing 'c' after a
# groupby gives the number of rows (journeys) in each group. The function
# defined right after this collapses the data by 'Month'; the per-hour
# collapse happens further down in this cell.
bike_journey_data['c'] = 1
def collapse_dataset(bike_journey_data, renamed_count):
    """Sum the helper column 'c' for each value of 'Month'.

    Parameters
    ----------
    bike_journey_data : DataFrame
        Must contain the columns 'Month' and 'c'.
    renamed_count : object
        Name given to the summed 'c' column (converted with ``str``).

    Returns
    -------
    DataFrame
        One row per 'Month' (sorted by 'Month', fresh integer index) with the
        columns ``['Month', str(renamed_count)]``.
    """
    monthly_totals = (
        bike_journey_data[['Month', 'c']]
        .groupby('Month', as_index=False)
        .sum()
    )
    return monthly_totals.rename(columns={'c': str(renamed_count)})

# Journeys per month.
# NOTE(review): this rebinds the name `collapse_dataset` from the function to
# its result, so the function cannot be called again without re-running its
# definition. The binding is kept so anything reading the table under this
# name keeps working; a separate variable name would be cleaner.
collapse_dataset = collapse_dataset(bike_journey_data, 'c')

print(collapse_dataset)

# Journeys per hour of the day. 'Day' is summed along with 'c' so the
# resulting table keeps the same columns as before ('Hours', 'Day', 'Count');
# only 'Hours' and 'Count' are used by the chart.
period_collapse = (
    bike_journey_data[['Day', 'c', 'Hours']]
    .groupby('Hours', as_index=False)
    .sum()
    .rename(columns={'c': 'Count'})
)

fig = px.bar(period_collapse, x="Hours", y="Count", orientation='v')

fig.show()

In [30]:
%%capture
# Density of the journey hour ('Hours'), one shaded curve per day of the week.
# A single Axes is created explicitly and passed to every plotting call.
f, axes = plt.subplots(figsize=(15, 8))
sns.despine(ax=axes, left=True)

# One filtered frame per 'Week Day' code 0..6, bound to Monday..Sunday in
# that order (same code-to-name mapping the cell has always used).
Monday, Tuesday, Wednesday, Thursday, Friday, Saturday, Sunday = (
    bike_journey_data.loc[bike_journey_data['Week Day'] == day_code]
    for day_code in range(7)
)

# (legend label, journeys of that day, curve colour)
day_curves = [
    ('Monday', Monday, 'g'),
    ('Tuesday', Tuesday, 'b'),
    ('Wednesday', Wednesday, 'y'),
    ('Thursday', Thursday, 'm'),
    ('Friday', Friday, 'c'),
    ('Saturday', Saturday, 'r'),
    ('Sunday', Sunday, 'k'),
]
for day_label, day_journeys, curve_color in day_curves:
    sns.distplot(day_journeys[['Hours']], hist=False, color=curve_color,
                 label=day_label, kde_kws={"shade": True}, ax=axes)

axes.set_xlabel("Hour of the day", fontsize=16)
axes.set_ylabel("Density of bike trips", fontsize=16)
axes.set_xlim(0, 24)
axes.set_xticks([0, 4, 8, 12, 16, 20, 24])
axes.tick_params(axis='both', labelsize=15)
axes.legend(fontsize=15)

# Save before showing: once the figure has been shown, pyplot may no longer
# hold it as the current figure and a later savefig can write an empty image.
f.savefig("distribution.png", format='png')
plt.show()

In [31]:
%%capture
# Flag every journey as weekday ('Week Day' 0-4) or weekend ('Week Day' 5-6)
# and compare the hourly distribution of the two groups.
# Both flags are 0/1 integers. The weekend flag covers codes 5 AND 6; the
# earlier expression `(x >= 5) & (x == 6)` matched code 6 only.
bike_journey_data["weekday"] = bike_journey_data['Week Day'].between(0, 4).astype(int)
bike_journey_data["weekend"] = bike_journey_data['Week Day'].between(5, 6).astype(int)

f, axes = plt.subplots(figsize=(13, 8))
sns.despine(ax=axes, left=True)

Weekday = bike_journey_data.loc[bike_journey_data['weekday'] == 1]
Weekend = bike_journey_data.loc[bike_journey_data['weekend'] == 1]

# (legend label, journeys in the group, curve colour)
for group_label, group_journeys, curve_color in (('Weekday', Weekday, 'c'),
                                                 ('Weekend', Weekend, 'y')):
    sns.distplot(group_journeys[['Hours']], hist=False, kde=True,
                 color=curve_color, label=group_label,
                 kde_kws=dict(linewidth=3), ax=axes)

axes.set_xlabel("Time (am/pm)", fontsize=16)
axes.set_ylabel("Density", fontsize=16)
axes.set_title("Bike Usage by Day of the Week and Hour of the Day", fontsize=20, color="k")
axes.set_xlim(0, 24)
axes.set_xticks([0, 4, 8, 12, 16, 20, 24])
axes.tick_params(axis='both', labelsize=15)
axes.legend(['Weekday', 'Weekend'], fontsize=15)

f.savefig("distribution_bike_trips.png", format='png')

In [32]:
%%capture
# Number of bike trips per 'id': sum the per-row counter 'c' within each 'id'
# (the notebook treats one 'id' as one day).
daily_bike_journeys = (
    bike_journey_data.groupby('id', as_index=False)['c']
    .sum()
    .rename(columns={'c': 'daily_bike_trips'})
)

# Mean temperature, precipitation intensity and humidity per 'id', computed
# in one groupby pass instead of three separate ones.
daily_weather_means = weather_data.groupby('id')[['temperature', 'precipIntensity', 'humidity']].mean()
temperature_by_day = daily_weather_means[['temperature']].copy()
rain_by_day = daily_weather_means[['precipIntensity']]
humidity_by_day = daily_weather_means[['humidity']]

# Assemble the weather table used for the merge; plain Series are assigned
# (not one-column DataFrames) so each new column is unambiguous.
temperature_by_day['id_new'] = temperature_by_day.index
temperature_by_day['rain_by_day'] = rain_by_day['precipIntensity']
temperature_by_day['humidity'] = humidity_by_day['humidity']

# Merging bike journey with weather data (inner join: only ids present in both)
bike_weather_data = daily_bike_journeys.merge(right=temperature_by_day,
                                              left_on='id',
                                              right_on='id_new')

# Label each 'id' as 'rain' when its mean precipitation intensity exceeds the threshold.
# NOTE(review): the threshold is hard-coded; the original comment describes it
# as the mean precipitation intensity - confirm it still matches the data.
RAIN_THRESHOLD = 0.0810
bike_weather_data["rain"] = np.where(bike_weather_data['rain_by_day'] > RAIN_THRESHOLD,
                                     'rain', 'no rain')
rain_dataset = bike_weather_data.loc[bike_weather_data['rain'] == 'rain']
no_rain_dataset = bike_weather_data.loc[bike_weather_data['rain'] == 'no rain']
df1 = bike_weather_data[['id', 'temperature', 'daily_bike_trips']]

In [33]:
%%capture
# Grouping bike journeys by summing it by bike station, weekday, weekend, and truncating it to largest 30 values
# Constant helper column: every row gets 1; the function defined right after
# this counts it per 'StartStation Id'.
bike_journey_data['bike_trips_by_station'] = 1
def bike_trips_by_station(bike_journey_data, renamed_count):
    """Count the non-null 'bike_trips_by_station' entries per 'StartStation Id'.

    Parameters
    ----------
    bike_journey_data : DataFrame
        Must contain the columns 'StartStation Id' and 'bike_trips_by_station'.
    renamed_count : object
        Name given to the count column (converted with ``str``).

    Returns
    -------
    DataFrame
        One row per 'StartStation Id' (sorted, fresh integer index) with the
        columns ``['StartStation Id', str(renamed_count)]``.
    """
    per_station = (
        bike_journey_data[['StartStation Id', 'bike_trips_by_station']]
        .groupby('StartStation Id', as_index=False)
        .count()
    )
    return per_station.rename(columns={'bike_trips_by_station': str(renamed_count)})
# Per-station totals. From here on the name `bike_trips_by_station` refers to
# the resulting table, no longer to the function.
bike_trips_by_station = bike_trips_by_station(bike_journey_data, 'bike_trips_by_station')

# Attach the station total to every journey. Both frames carry a
# 'bike_trips_by_station' column, so the merge suffixes them:
# _x (journey side) and _y (station-total side).
bike_count_station = pd.merge(bike_journey_data, bike_trips_by_station,
                              on='StartStation Id')

# Journeys per start station, split by the 0/1 'weekday' flag
station_columns = ['StartStation Name', 'bike_trips_by_station_y']
weekday = (
    bike_count_station.loc[bike_count_station['weekday'] == 1, station_columns]
    .groupby('StartStation Name')
    .count()
)
weekend = (
    bike_count_station.loc[bike_count_station['weekday'] == 0, station_columns]
    .groupby('StartStation Name')
    .count()
)

# Side by side: the shared column name gets a second suffix, giving
# ..._y_x (weekday counts) and ..._y_y (weekend counts).
df_merge = weekday.merge(weekend, on='StartStation Name')

# Keep the 30 stations with the most weekday journeys
df_merge = df_merge.sort_values(by='bike_trips_by_station_y_x', ascending=False).head(30)
df_merge['Station'] = df_merge.index
df_merge = df_merge.rename(columns={'bike_trips_by_station_y_x': 'Weekday, Count',
                                    'bike_trips_by_station_y_y': 'Weekend, Count'})

colors = ["#cab3ff", "green"]

x = df_merge[['Weekday, Count', 'Weekend, Count', 'Station']]
y = x.set_index('Station')
z = y.groupby('Station').mean().sort_values(by='Weekday, Count', ascending=False)

# Stacked bars: weekday count with weekend count on top, one bar per station
z.plot.bar(stacked=True, color=colors, figsize=(15, 7))

plt.legend(['Weekday, Count', 'Weekend, Count'], fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel("", fontsize=16)
plt.ylabel("Bike trips", fontsize=16)

plt.show()


In [34]:
%%capture
# NOTE(review): this cell computes the same tables, under the same names, as
# the earlier `In [32]` cell; one of the two can probably be dropped.

# Number of bike trips per 'id': sum the per-row counter 'c' within each 'id'
# (the notebook treats one 'id' as one day).
daily_bike_journeys = (
    bike_journey_data.groupby('id', as_index=False)['c']
    .sum()
    .rename(columns={'c': 'daily_bike_trips'})
)

# Mean temperature, precipitation intensity and humidity per 'id', computed
# in one groupby pass instead of three separate ones.
daily_weather_means = weather_data.groupby('id')[['temperature', 'precipIntensity', 'humidity']].mean()
temperature_by_day = daily_weather_means[['temperature']].copy()
rain_by_day = daily_weather_means[['precipIntensity']]
humidity_by_day = daily_weather_means[['humidity']]

# Assemble the weather table used for the merge; plain Series are assigned
# (not one-column DataFrames) so each new column is unambiguous.
temperature_by_day['id_new'] = temperature_by_day.index
temperature_by_day['rain_by_day'] = rain_by_day['precipIntensity']
temperature_by_day['humidity'] = humidity_by_day['humidity']

# Merging bike journey with weather data (inner join: only ids present in both)
bike_weather_data = daily_bike_journeys.merge(right=temperature_by_day,
                                              left_on='id',
                                              right_on='id_new')

# Label each 'id' as 'rain' when its mean precipitation intensity exceeds the threshold.
# NOTE(review): the threshold is hard-coded; the original comment describes it
# as the mean precipitation intensity - confirm it still matches the data.
RAIN_THRESHOLD = 0.0810
bike_weather_data["rain"] = np.where(bike_weather_data['rain_by_day'] > RAIN_THRESHOLD,
                                     'rain', 'no rain')
rain_dataset = bike_weather_data.loc[bike_weather_data['rain'] == 'rain']
no_rain_dataset = bike_weather_data.loc[bike_weather_data['rain'] == 'no rain']
df1 = bike_weather_data[['id', 'temperature', 'daily_bike_trips']]

In [35]:
%%capture
# Regression scatterplot between average daily temperature and bike trips.
# The overall linear fit is stored in slope/intercept/r_value/p_value/std_err;
# the figure itself fits one regression line per 'rain' group (hue="rain").
slope, intercept, r_value, p_value, std_err = stats.linregress(
    bike_weather_data['temperature'],
    bike_weather_data['daily_bike_trips'],
)

sns.lmplot(
    data=bike_weather_data,
    x="temperature",
    y="daily_bike_trips",
    hue="rain",
    palette="Set1",
    markers=['^', 'o'],
    height=7,
    aspect=1.6,
    fit_reg=True,
    ci=95,
    x_ci='ci',
    legend=False,  # the legend is drawn explicitly below
)

plt.xlabel("Temperature (°C)", fontsize=16)
plt.ylabel("Bike trips per day", fontsize=16)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.legend(fontsize=15, loc='center right')
plt.show()

In [36]:
%%capture
# Join every journey to the weather rows sharing its key: 'id_hours' on the
# journey side, 'id_Hours' on the weather side (inner join).
bike_weather = pd.merge(bike_journey_data, weather_data,
                        left_on='id_hours', right_on='id_Hours')

# Constant helper column; it is counted per 'id_x' right after this.
bike_weather['bike_trips_hour'] = 1
def bike_trips_hour(bike_weather, renamed_count):
    """Count the non-null 'bike_trips_hour' entries per 'id_x'.

    Parameters
    ----------
    bike_weather : DataFrame
        Must contain the columns 'id_x' and 'bike_trips_hour'.
    renamed_count : object
        Name given to the count column (converted with ``str``).

    Returns
    -------
    DataFrame
        One row per 'id_x' (sorted, fresh integer index) with the columns
        ``['id_x', str(renamed_count)]``.
    """
    trips_per_id = (
        bike_weather[['id_x', 'bike_trips_hour']]
        .groupby('id_x', as_index=False)
        .count()
    )
    # The count column is called 'bike_trips_hour' here; renaming by that key
    # is what makes `renamed_count` take effect.
    return trips_per_id.rename(columns={'bike_trips_hour': str(renamed_count)})
# Row counts per 'id_x'. From here on the name `bike_trips_hour` refers to
# the resulting table, no longer to the function.
bike_trips_hour = bike_trips_hour(bike_weather, 'bike_trips_hour')

# Attach the count to every merged row. Both frames carry a
# 'bike_trips_hour' column, so the merge suffixes them: _x (constant helper)
# and _y (the count per 'id_x').
bike_weather_2 = pd.merge(bike_weather, bike_trips_hour, on='id_x')

In [37]:
%%capture
# Bike usage and weather conditions / daily picture
fig, ax = plt.subplots(figsize=(15, 8))

# Left y-axis: trip count per 'id_x'
ax.plot(bike_weather_2.id_x, bike_weather_2.bike_trips_hour_y, color="#b3cfff", marker='o',
        linewidth=0.5, markersize=2)
ax.set_xlabel("Time", fontsize=16)
ax.set_ylabel("Number of bike usage per day", color="k", fontsize=16)
ax.grid(False)

# Right y-axis: temperature, on a twin Axes sharing the same x-axis
ax2 = ax.twinx()
ax2.plot(bike_weather_2.id_x, bike_weather_2.temperature, color="#90d595", marker='o',
         linewidth=0.5, markersize=2)
ax2.set_ylabel("Average daily temperature (°C)", color="k", fontsize=16)
ax2.grid(False)

# Configure the x-axis on the primary Axes: the twin created by twinx() has
# its x-axis hidden, so tick-label styling applied there is not displayed.
ax.set_xlim(0, 365)
ax.set_xticks([1, 40, 80, 120, 160, 200, 240, 280, 320, 350, 365])
ax.tick_params(axis='x', labelsize=18)

plt.show()

In [38]:
%%capture
# Collapsing bike journeys by source bike station and hour of the day:
# one row per ('Hours', 'Station') with the number of journeys in 'Count'
period_collapse_start = (
    bike_journey_data.groupby(['Hours', 'StartStation Name'], as_index=False)['c']
    .sum()
    .rename(columns={'c': 'Count', 'StartStation Name': 'Station'})
)

# Cutting the (hour, station) counts into ten quantile bins
bin_labels_5 = ['0-quantile'] + [f'.{tenth}-quantile' for tenth in range(1, 10)]
period_collapse_start['quantile'] = pd.qcut(
    period_collapse_start['Count'],
    q=[0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1],
    labels=bin_labels_5,
)

# One bar colour per quantile label
colors = dict(zip(
    bin_labels_5,
    ['#adb0ff', '#b8ffb3', '#ffefb3', '#ffc6b3', '#b3cfff',
     '#ffb3ff', '#90d595', '#cab3ff', '#e48381', '#aafbff'],
))

# Station -> quantile lookup.
# NOTE(review): 'Station' repeats once per hour in this table, so the dict
# can hold only one of a station's quantile labels.
group_lk = period_collapse_start.set_index('Station')['quantile'].to_dict()

# Plotting top 10 largest source bike stations by time of the day using a bar race chart
fig, ax = plt.subplots(figsize=(15, 8))
def draw_barchart(current_time_1):
    """Draw the ten busiest start stations for one hour on the module-level `ax`.

    Reads the module-level `period_collapse_start` table and `colors` mapping,
    clears `ax` and redraws it, so it can be used directly as a
    `FuncAnimation` frame callback.

    Parameters
    ----------
    current_time_1 : value compared against the 'Hours' column; also printed
        as the large frame label.
    """
    df = (
        period_collapse_start[period_collapse_start['Hours'].eq(current_time_1)]
        .sort_values(by='Count', ascending=True)
        .tail(10)
    )
    ax.clear()
    # Colour and quantile label come from the displayed row itself, i.e. from
    # the (hour, station) pair being drawn, not from a per-station lookup that
    # can only remember one hour per station.
    ax.barh(df['Station'], df['Count'], color=[colors[q] for q in df['quantile']])
    dx = df['Count'].max() / 200  # small horizontal offset for the labels
    for i, (Count, Station, quantile_label) in enumerate(
            zip(df['Count'], df['Station'], df['quantile'])):
        ax.text(Count-dx, i,     Station,          size=14, weight=600, ha='right', va='bottom')
        ax.text(Count-dx, i-.25, quantile_label,   size=10, color='#444444', ha='right', va='baseline')
        ax.text(Count+dx, i,     f'{Count:,.0f}',  size=14, ha='left',  va='center')
    # ... polished styles
    ax.text(1, 0.4, current_time_1, transform=ax.transAxes, color='#777777', size=46, ha='right', weight=800)
    ax.text(0, 1.06, 'Number of bike trips', transform=ax.transAxes, size=12, color='#777777')
    ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
    ax.xaxis.set_ticks_position('top')
    ax.tick_params(axis='x', colors='#777777', labelsize=12)
    ax.set_yticks([])
    ax.margins(0, 0.01)
    ax.grid(which='major', axis='x', linestyle='-')
    ax.set_axisbelow(True)
    ax.text(0, 1.12, '',
            transform=ax.transAxes, size=24, weight=600, ha='left')
    ax.text(1, 0, 'by @albegjonbalaj; credit @pratapvardhan @jburnmurdoch', transform=ax.transAxes, ha='right',
            color='#777777', bbox=dict(facecolor='white', alpha=0.8, edgecolor='white'))
    # Remove the frame of the axes being drawn (explicit instead of "current axes")
    ax.set_frame_on(False)
    
# Single-frame preview (Hours == 17) drawn on the figure created above
draw_barchart(17)

# Fresh figure for the animation. draw_barchart looks up the module-level
# `ax` when it is called, so rebinding `fig, ax` here redirects the drawing
# to this new figure.
fig, ax = plt.subplots(figsize=(15, 8))
animator = animation.FuncAnimation(fig, draw_barchart, frames=range(0, 24))

# One frame per hour 0..23, written as a GIF through the 'imagemagick' writer
animator.save('animation_v3.gif', writer='imagemagick', fps=40)

In [40]:
%%capture
# Collapsing bike journeys by destination bike station and hour of the day:
# one row per ('Hours', 'Station') with the number of journeys in 'Count'
period_collapse_end = (
    bike_journey_data.groupby(['Hours', 'EndStation Name'], as_index=False)['c']
    .sum()
    .rename(columns={'c': 'Count', 'EndStation Name': 'Station'})
)

# Cutting the (hour, station) counts into ten quantile bins
bin_labels_5 = ['0-quantile', '.1-quantile', '.2-quantile', '.3-quantile', '.4-quantile',
                '.5-quantile', '.6-quantile', '.7-quantile', '.8-quantile', '.9-quantile']
period_collapse_end['quantile'] = pd.qcut(
    period_collapse_end['Count'],
    q=[0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1],
    labels=bin_labels_5,
)

# One bar colour per quantile label (keys reuse the label list above)
colors = dict(zip(
    bin_labels_5,
    ['#adb0ff', '#b8ffb3', '#ffefb3', '#ffc6b3', '#b3cfff',
     '#ffb3ff', '#90d595', '#cab3ff', '#e48381', '#aafbff'],
))

# Station -> quantile lookup.
# NOTE(review): 'Station' repeats once per hour in this table, so the dict
# can hold only one of a station's quantile labels.
group_lk = period_collapse_end.set_index('Station')['quantile'].to_dict()

# Plotting top 10 largest destination bike stations by time of the day using a bar race chart
fig, ax = plt.subplots(figsize=(15, 8))
# NOTE(review): this reuses the name `draw_barchart` already defined in the
# start-station cell earlier in the notebook; after this cell runs, the name
# refers to this destination-station version.
def draw_barchart(current_time_1):
    """Draw the ten busiest destination stations for one hour on the module-level `ax`.

    Reads the module-level `period_collapse_end` table and `colors` mapping,
    clears `ax` and redraws it, so it can be used directly as a
    `FuncAnimation` frame callback.

    Parameters
    ----------
    current_time_1 : value compared against the 'Hours' column; also printed
        as the large frame label.
    """
    df = (
        period_collapse_end[period_collapse_end['Hours'].eq(current_time_1)]
        .sort_values(by='Count', ascending=True)
        .tail(10)
    )
    ax.clear()
    # Colour and quantile label come from the displayed row itself, i.e. from
    # the (hour, station) pair being drawn, not from a per-station lookup that
    # can only remember one hour per station.
    ax.barh(df['Station'], df['Count'], color=[colors[q] for q in df['quantile']])
    dx = df['Count'].max() / 200  # small horizontal offset for the labels
    for i, (Count, Station, quantile_label) in enumerate(
            zip(df['Count'], df['Station'], df['quantile'])):
        ax.text(Count-dx, i,     Station,          size=14, weight=600, ha='right', va='bottom')
        ax.text(Count-dx, i-.25, quantile_label,   size=10, color='#444444', ha='right', va='baseline')
        ax.text(Count+dx, i,     f'{Count:,.0f}',  size=14, ha='left',  va='center')
    # ... polished styles
    ax.text(1, 0.4, current_time_1, transform=ax.transAxes, color='#777777', size=46, ha='right', weight=800)
    ax.text(0, 1.06, 'Number of bike trips', transform=ax.transAxes, size=12, color='#777777')
    ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
    ax.xaxis.set_ticks_position('top')
    ax.tick_params(axis='x', colors='#777777', labelsize=12)
    ax.set_yticks([])
    ax.margins(0, 0.01)
    ax.grid(which='major', axis='x', linestyle='-')
    ax.set_axisbelow(True)
    ax.text(0, 1.12, '',
            transform=ax.transAxes, size=24, weight=600, ha='left')
    ax.text(1, 0, 'by @albegjonbalaj; credit @pratapvardhan @jburnmurdoch', transform=ax.transAxes, ha='right',
            color='#777777', bbox=dict(facecolor='white', alpha=0.8, edgecolor='white'))
    # Remove the frame of the axes being drawn (explicit instead of "current axes")
    ax.set_frame_on(False)
    
# Single-frame preview (Hours == 17) drawn on the figure created above
draw_barchart(17)

# Fresh figure for the animation; draw_barchart picks up the rebound
# module-level `ax` when it is called.
fig, ax = plt.subplots(figsize=(15, 8))
animator2 = animation.FuncAnimation(fig, draw_barchart, frames=range(24))

# One frame per hour 0..23, written as a GIF through the 'imagemagick' writer
animator2.save('animation2.gif', writer='imagemagick', fps=60)