# Creating a dataset to analyze bike sharing demand in Washington, D.C.

In [2]:
import pandas as pd
import glob
from datetime import datetime


In [3]:
# show up to 500 columns and 500 rows whenever a dataframe is displayed
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 500)


## Bike sharing demand dataset

In [4]:
# merge the bike sharing demand csv files into one dataframe for Washington, D.C.
# The folder is given relative to the working directory, like the other datasets
# read in this notebook, instead of an absolute path on one user's machine.
# NOTE(review): assumes the notebook is run from the project root that contains `datasets/` - confirm
path = 'datasets/bike-sharing'

# glob returns matches in arbitrary order; sorting keeps the row order of the
# merged dataframe reproducible across machines and runs
all_files = sorted(glob.glob(path + "/*.csv"))

# fail early with a clear message instead of pd.concat's generic
# "No objects to concatenate" when the folder is missing or empty
if not all_files:
    raise FileNotFoundError('No csv files found in ' + path)

df_list = [pd.read_csv(file) for file in all_files]

# stack all files below each other and renumber the rows from 0
df_bike = pd.concat(df_list, axis=0, ignore_index=True)


In [5]:
# get overview of dataframe: print its (rows, columns) shape, then display the first rows
print(df_bike.shape)
df_bike.head()


(22544730, 9)


Unnamed: 0,Duration,Start date,End date,Start station number,Start station,End station number,End station,Bike number,Member type
0,3548,2011-01-01 00:01:29,2011-01-01 01:00:37,31620,5th & F St NW,31620,5th & F St NW,W00247,Member
1,346,2011-01-01 00:02:46,2011-01-01 00:08:32,31105,14th & Harvard St NW,31101,14th & V St NW,W00675,Casual
2,562,2011-01-01 00:06:13,2011-01-01 00:15:36,31400,Georgia & New Hampshire Ave NW,31104,Adams Mill & Columbia Rd NW,W00357,Member
3,434,2011-01-01 00:09:21,2011-01-01 00:16:36,31111,10th & U St NW,31503,Florida Ave & R St NW,W00970,Member
4,233,2011-01-01 00:28:26,2011-01-01 00:32:19,31104,Adams Mill & Columbia Rd NW,31106,Calvert & Biltmore St NW,W00346,Casual


In [8]:
# read in dataset that contains location information on bike stations
# (the path is relative to the notebook's working directory)
bike_station_df = pd.read_csv('datasets/bike-stations/Capital_Bike_Share_Locations.csv')
# preview the first rows
bike_station_df.head()


Unnamed: 0,OBJECTID,ID,ADDRESS,TERMINAL_NUMBER,LATITUDE,LONGITUDE,INSTALLED,LOCKED,INSTALL_DATE,REMOVAL_DATE,TEMPORARY_INSTALL,NUMBER_OF_BIKES,NUMBER_OF_EMPTY_DOCKS,X,Y,SE_ANNO_CAD_DATA,OWNER
0,292811730,479,Columbus Ave & Gramercy Blvd,32069,39.12333,-77.16486,YES,NO,,,NO,8,3,385744.186615,161710.152819,,
1,292811731,480,Reston Pkwy & Spectrum Dr,32225,38.964208,-77.35428,YES,NO,,,NO,5,10,369296.122237,144092.525801,,
2,292811732,481,Key West Ave & Great Seneca Hwy,32070,39.105642,-77.20386,YES,NO,,,NO,3,7,382367.38936,159753.369415,,
3,292811733,482,Virginia Ave & 25th St NW,31297,38.8995,-77.054155,YES,NO,,,NO,12,7,395302.353355,136851.190868,,DC
4,292811734,483,15th & M St NW,31298,38.905424,-77.034674,YES,NO,,,NO,10,9,396992.472005,137507.980428,,DC


In [9]:
# remove the station columns that are irrelevant for the analysis
irrelevant_station_cols = [
    'OBJECTID',
    'ID',
    'INSTALLED',
    'LOCKED',
    'INSTALL_DATE',
    'REMOVAL_DATE',
    'TEMPORARY_INSTALL',
    'NUMBER_OF_BIKES',
    'NUMBER_OF_EMPTY_DOCKS',
    'X',
    'Y',
    'SE_ANNO_CAD_DATA',
    'OWNER',
]
bike_station_df_drop = bike_station_df.drop(columns=irrelevant_station_cols)

# preview the reduced station dataframe
bike_station_df_drop.head()


Unnamed: 0,ADDRESS,TERMINAL_NUMBER,LATITUDE,LONGITUDE
0,Columbus Ave & Gramercy Blvd,32069,39.12333,-77.16486
1,Reston Pkwy & Spectrum Dr,32225,38.964208,-77.35428
2,Key West Ave & Great Seneca Hwy,32070,39.105642,-77.20386
3,Virginia Ave & 25th St NW,31297,38.8995,-77.054155
4,15th & M St NW,31298,38.905424,-77.034674


In [11]:
# read in the neighborhood cluster demographics dataset
# (the path is relative to the notebook's working directory)
neighborhood_df = pd.read_csv('datasets/bike-stations/DC_Neighborhood_Cluster_Demographics.csv')

# TODO: drop all irrelevant columns - no columns are dropped in this cell yet




Unnamed: 0,neighborhood_cluster,Cluster_num,Short_Name,Full_Name,lon_ctr,lat_ctr,Population,"Population,_1980","Population,_1990","Population,_2000","Population,_2010","Pct_chng_pop,_1980_to_1990","Pct_chng_pop,_1990_to_1900","Pct_chng_pop,_2000_to_2010",Children,"Pct_children,_1980","Pct_children,_1990","Pct_children,_2000","Pct_children,_2010","Pct_chng_child_pop,_1980_to_90","Pct_chng_child_pop,_1990_to_00","Pct_chng_child_pop,_2000_to_10",Race/Ethnicity,"Pct_black_non-Hispanic,_1990","Pct_black_non-Hispanic,_2000","Pct_black_non-Hispanic,_2010","Pct_white_non-Hispanic,_1990","Pct_white_non-Hispanic,_2000","Pct_white_non-Hispanic,_2010","Pct_Hispanic,_1990","Pct_Hispanic,_2000","Pct_Hispanic,_2010",Pct_Asian/PI_non-Hispanic_1990,Pct_Asian/PI_non-Hispanic_2000,Pct_Asian/PI_non-Hispanic_2010,Foreign-Born,"Pct_foreign_born,_1980","Pct_foreign_born,_1990","Pct_foreign_born,_2000","Pct_foreign_born,_2007-11",Families,Pct_female-head_fams_w_chld_90,Pct_female-head_fams_w_chld_00,Pct_feml-head_fam_w_chld_07-11,Low_weight_births,Pct_low_weight_births_1998,Pct_low_weight_births_1999,Pct_low_weight_births_2000,Pct_low_weight_births_2001,Pct_low_weight_births_2002,Pct_low_weight_births_2003,Pct_low_weight_births_2004,Pct_low_weight_births_2005,Pct_low_weight_births_2006,Pct_low_weight_births_2007,Pct_low_weight_births_2008,Pct_low_weight_births_2009,Pct_low_weight_births_2010,Pct_low_weight_births_2011,Births_to_teen_mothers,Pct_birth_to_teen_mothers_1998,Pct_birth_to_teen_mothers_1999,Pct_birth_to_teen_mothers_2000,Pct_birth_to_teen_mothers_2001,Pct_birth_to_teen_mothers_2002,Pct_birth_to_teen_mothers_2003,Pct_birth_to_teen_mothers_2004,Pct_birth_to_teen_mothers_2005,Pct_birth_to_teen_mothers_2006,Pct_birth_to_teen_mothers_2007,Pct_birth_to_teen_mothers_2008,Pct_birth_to_teen_mothers_2009,Pct_birth_to_teen_mothers_2010,Pct_birth_to_teen_mothers_2011,Poverty,"Poverty_rate_(Pct),_1980","Poverty_rate_(Pct),_1990","Poverty_rate_(Pct),_2000","Poverty_rate_(Pct),_2007-11","Pct_c
hildren_in_poverty,_1990","Pct_children_in_poverty,_2000",Pct_child_in_poverty_2007-11,Employment,"Unemployment_rate_(Pct),_1980","Unemployment_rate_(Pct),_1990","Unemployment_rate_(Pct),_2000","Unemployment_rate,_2007-11",Pct_pop_16+_yrs_employed_1980,Pct_pop_16+_yrs_employed_1990,Pct_pop_16+_yrs_employed_2000,Pct_pop_16+_yrs_employed_07-11,Education,Pct_persons_wo_HS_diploma_1980,Pct_persons_wo_HS_diploma_1990,Pct_persons_wo_HS_diploma_2000,Pct_persons_wo_HS_dip_2007-11,Isolation,"Pct_HHs_with_a_phone,_2000","Pct_HHs_with_a_phone,_2007-11","Pct_HHs_with_a_car,_2000","Pct_HHs_with_a_car,_2007-11",Family_income_(2010_$),"Avg._family_income,_1979","Avg._family_income,_1989","Avg._family_income,_1999","Avg._family_income,_2007-11",Pct_chng_avg_fam_inc_1980-90,Pct_chng_avg_fam_inc_1990-00,Pct_chng_avg_fam_inc_90to07-11,Food_stamps,Person_receiv_food_stamps_2000,Person_receiv_food_stamps_2001,Person_receiv_food_stamps_2002,Person_receiv_food_stamps_2003,Person_receiv_food_stamps_2004,Person_receiv_food_stamps_2005,Person_receiv_food_stamps_2006,Person_receiv_food_stamps_2007,Person_receiv_food_stamps_2008,Person_receiv_food_stamps_2009,Person_receiv_food_stamps_2010,Person_receiv_food_stamps_2011,Person_receiv_food_stamps_2012,Person_receiv_food_stamps_2013,TANF,"Persons_receiving_TANF,_2000","Persons_receiving_TANF,_2001","Persons_receiving_TANF,_2002","Persons_receiving_TANF,_2003","Persons_receiving_TANF,_2004","Persons_receiving_TANF,_2005","Persons_receiving_TANF,_2006","Persons_receiving_TANF,_2007","Persons_receiving_TANF,_2008","Persons_receiving_TANF,_2009","Persons_receiving_TANF,_2010","Persons_receiving_TANF,_2011","Persons_receiving_TANF,_2012","Persons_receiving_TANF,_2013","Violent_Crimes_(per_1,000_pop)","Violent_crimes,_2000","Violent_crimes,_2001","Violent_crimes,_2002","Violent_crimes,_2003","Violent_crimes,_2004","Violent_crimes,_2005","Violent_crimes,_2006","Violent_crimes,_2007","Violent_crimes,_2008","Violent_crimes,_2009","Violent_crimes,_2010","
Violent_crimes,_2011",Property_Crimes_(per_1000_pop),"Property_crimes,_2000","Property_crimes,_2001","Property_crimes,_2002","Property_crimes,_2003","Property_crimes,_2004","Property_crimes,_2005","Property_crimes,_2006","Property_crimes,_2007","Property_crimes,_2008","Property_crimes,_2009","Property_crimes,_2010","Property_crimes,_2011",Housing_Units,"Occupied_housing_units,_1980","Occupied_housing_units,_1990","Occupied_housing_units,_2000","Occupied_housing_units,_2010",Mobility,Pct_same_house_5_yrs_ago_1990,Pct_same_house_5_yrs_ago_2000,Rental_Vacancy,Rental_vacancy_rate_(Pct)_1980,Rental_vacancy_rate_(Pct)_1990,Rental_vacancy_rate_(Pct)_2000,Rental_vacanc_rate_Pct_2007-11,Homeownership,"Homeownership_rate_(Pct),_1980","Homeownership_rate_(Pct),_1990","Homeownership_rate_(Pct),_2000",Homeownrshp_rate_(Pct)_2007-11,Sales_of_Single-Family_Homes,"Number_of_sales,_1995","Number_of_sales,_1996","Number_of_sales,_1997","Number_of_sales,_1998","Number_of_sales,_1999","Number_of_sales,_2000","Number_of_sales,_2001","Number_of_sales,_2002","Number_of_sales,_2003","Number_of_sales,_2004","Number_of_sales,_2005","Number_of_sales,_2006","Number_of_sales,_2007","Number_of_sales,_2008","Number_of_sales,_2009","Number_of_sales,_2010","Number_of_sales,_2011","Number_of_sales,_2012",Price_of_SingFam_Homes_($2012),"Median_sales_price,_1995","Median_sales_price,_1996","Median_sales_price,_1997","Median_sales_price,_1998","Median_sales_price,_1999","Median_sales_price,_2000","Median_sales_price,_2001","Median_sales_price,_2002","Median_sales_price,_2003","Median_sales_price,_2004","Median_sales_price,_2005","Median_sales_price,_2006","Median_sales_price,_2007","Median_sales_price,_2008","Median_sales_price,_2009","Median_sales_price,_2010","Median_sales_price,_2011","Median_sales_price,_2012",Pct_ann_change_med_price_02-12,Pct_ann_change_med_price_07-12,Pct_ann_change_med_price_11-12,Mrtge_Lend_(Home_Purch_Loans),Loans_Per1000_housing_units_97,Loans_Per1000_housing_units_98,Loans_
Per1000_housing_units_99,Loans_Per1000_housing_units_00,Loans_Per1000_housing_units_01,Loans_Per1000_housing_units_02,Loans_Per1000_housing_units_03,Loans_Per1000_housing_units_04,Loans_Per1000_housing_units_05,Loans_Per1000_housing_units_06,Mortgage_Borrower_Incom($2006),"Median_borrower_income,_1997","Median_borrower_income,_1998","Median_borrower_income,_1999","Median_borrower_income,_2000","Median_borrower_income,_2001","Median_borrower_income,_2002","Median_borrower_income,_2003","Median_borrower_income,_2004","Median_borrower_income,_2005","Median_borrower_income,_2006",Subprime_Lending,"Pct_subprime_loans,_1997","Pct_subprime_loans,_1998","Pct_subprime_loans,_1999","Pct_subprime_loans,_2000","Pct_subprime_loans,_2001","Pct_subprime_loans,_2002","Pct_subprime_loans,_2003","Pct_subprime_loans,_2004","Pct_subprime_loans,_2005","Pct_subprime_loans,_2006",Home_Receiv_Foreclosure_Notice,Home_receiv_foreclos_notice_95,Home_receiv_foreclos_notice_96,Home_receiv_foreclos_notice_97,Home_receiv_foreclos_notice_98,Home_receiv_foreclos_notice_99,Home_receiv_foreclos_notice_00,Home_receiv_foreclos_notice_01,Home_receiv_foreclos_notice_02,Home_receiv_foreclos_notice_03,Home_receiv_foreclos_notice_04,Home_receiv_foreclos_notice_05,Home_receiv_foreclos_notice_06,Home_receiv_foreclos_notice_07,Home_receiv_foreclos_notice_08,Home_receiv_foreclos_notice_09,Home_receiv_foreclos_notice_10,Home_receiv_foreclos_notice_11,Home_receiv_foreclos_notice_12,Foreclos_Notice_per_1000_Homes,"Foreclosure_notice_rate,_1995","Foreclosure_notice_rate,_1996","Foreclosure_notice_rate,_1997","Foreclosure_notice_rate,_1998","Foreclosure_notice_rate,_1999","Foreclosure_notice_rate,_2000","Foreclosure_notice_rate,_2001","Foreclosure_notice_rate,_2002","Foreclosure_notice_rate,_2003","Foreclosure_notice_rate,_2004","Foreclosure_notice_rate,_2005","Foreclosure_notice_rate,_2006","Foreclosure_notice_rate,_2007","Foreclosure_notice_rate,_2008","Foreclosure_notice_rate,_2009","Foreclosure_notice_rate,_2010
","Foreclosure_notice_rate,_2011","Foreclosure_notice_rate,_2012",Home_Foreclosure_Completion,Foreclosure_completion_1995,Foreclosure_completion_1996,Foreclosure_completion_1997,Foreclosure_completion_1998,Foreclosure_completion_1999,Foreclosure_completion_2000,Foreclosure_completion_2001,Foreclosure_completion_2002,Foreclosure_completion_2003,Foreclosure_completion_2004,Foreclosure_completion_2005,Foreclosure_completion_2006,Foreclosure_completion_2007,Foreclosure_completion_2008,Foreclosure_completion_2009,Foreclosure_completion_2010,Foreclosure_completion_2011,Foreclosure_completion_2012,Trustee_Deed_Sale_per1000_Home,"Trustee_deed_sale_rate,_1995","Trustee_deed_sale_rate,_1996","Trustee_deed_sale_rate,_1997","Trustee_deed_sale_rate,_1998","Trustee_deed_sale_rate,_1999","Trustee_deed_sale_rate,_2000","Trustee_deed_sale_rate,_2001","Trustee_deed_sale_rate,_2002","Trustee_deed_sale_rate,_2003","Trustee_deed_sale_rate,_2004","Trustee_deed_sale_rate,_2005","Trustee_deed_sale_rate,_2006","Trustee_deed_sale_rate,_2007","Trustee_deed_sale_rate,_2008","Trustee_deed_sale_rate,_2009","Trustee_deed_sale_rate,_2010","Trustee_deed_sale_rate,_2011","Trustee_deed_sale_rate,_2012",Number_of_Schools,"Number_of_schools,_2000","Number_of_schools,_2001","Number_of_schools,_2002","Number_of_schools,_2003","Number_of_schools,_2004","Number_of_schools,_2005","Number_of_schools,_2006","Number_of_schools,_2007","Number_of_schools,_2008","Number_of_schools,_2009","Number_of_schools,_2010","Number_of_schools,_2011","Number_of_schools,_2012",Number_of_DCPS_Schools,"Number_of_DCPS_schools,_2000","Number_of_DCPS_schools,_2001","Number_of_DCPS_schools,_2002","Number_of_DCPS_schools,_2003","Number_of_DCPS_schools,_2004","Number_of_DCPS_schools,_2005","Number_of_DCPS_schools,_2006","Number_of_DCPS_schools,_2007","Number_of_DCPS_schools,_2008","Number_of_DCPS_schools,_2009","Number_of_DCPS_schools,_2010","Number_of_DCPS_schools,_2011","Number_of_DCPS_schools,_2012",Number_of_Charter_Schools,Numbe
r_of_charter_schools_2000,Number_of_charter_schools_2001,Number_of_charter_schools_2002,Number_of_charter_schools_2003,Number_of_charter_schools_2004,Number_of_charter_schools_2005,Number_of_charter_schools_2006,Number_of_charter_schools_2007,Number_of_charter_schools_2008,Number_of_charter_schools_2009,Number_of_charter_schools_2010,Number_of_charter_schools_2011,Number_of_charter_schools_2012,Total_Audited_School_Enrollment,"Total_school_enrollment,_2001","Total_school_enrollment,_2002","Total_school_enrollment,_2003","Total_school_enrollment,_2004","Total_school_enrollment,_2005","Total_school_enrollment,_2006","Total_school_enrollment,_2007","Total_school_enrollment,_2008","Total_school_enrollment,_2009","Total_school_enrollment,_2010","Total_school_enrollment,_2011","Total_school_enrollment,_2012",DCPS_Audited_School_Enrollment,"DCPS_school_enrollment,_2001","DCPS_school_enrollment,_2002","DCPS_school_enrollment,_2003","DCPS_school_enrollment,_2004","DCPS_school_enrollment,_2005","DCPS_school_enrollment,_2006","DCPS_school_enrollment,_2007","DCPS_school_enrollment,_2008","DCPS_school_enrollment,_2009","DCPS_school_enrollment,_2010","DCPS_school_enrollment,_2011","DCPS_school_enrollment,_2012",Charter_Audited_School_Enrlmnt,Charter_school_enrollment_2001,Charter_school_enrollment_2002,Charter_school_enrollment_2003,Charter_school_enrollment_2004,Charter_school_enrollment_2005,Charter_school_enrollment_2006,Charter_school_enrollment_2007,Charter_school_enrollment_2008,Charter_school_enrollment_2009,Charter_school_enrollment_2010,Charter_school_enrollment_2011,Charter_school_enrollment_2012,CAS_math_prof,CAS_read_prof
0,Cluster 1,1,Adams Morgan,"Kalorama Heights, Adams Morgan, Lanier Heights",-77.047702,38.919379,,18149,17812,18167,18338,-1.9,2.0,0.9,,13.0,11.0,9.0,7.2,-21.0,-15.0,-20.0,,26.0,18.0,11.0,56.0,60.0,69.0,15.0,16.0,13.0,2.5,6.2,6.8,,19.0,23.0,25.0,21.0,,38.0,26.0,6.0,,8.3,7.6,12.0,8.2,3.0,9.5,7.8,7.2,5.9,6.7,8.1,7.7,6.9,7.9,,5.0,7.1,9.5,5.4,6.7,3.6,3.9,3.3,4.8,5.8,5.6,3.2,1.4,2.8,,15.0,10.0,10.0,5.6,21.0,26.0,5.8,,6.0,3.6,2.7,5.4,73,78,79,81,,20.0,15.0,11.0,4.5,,99,96,61,60,,103602,121128,160609,162891,17.0,33.0,1.4,,546,526,537,551,566,534,527,517,465,562,656,791,857,866,,247.0,241.0,234.0,212.0,179,155.0,129.0,122.0,119.0,137.0,146.0,164,134,151,,9.0,12.0,15.0,13.0,13.0,12.0,11.0,12.0,11.0,9.7,8.9,9.3,,52,59,58,62,42,43,49,48,41,39,36,41,,9942,10279,10979,10992,,40,37,,6.0,5.8,2.4,4.8,,25.0,31.0,34.0,38,,34,37,50,64,54,49,44,38,40,49,38,27,28,16,25,19,26,14,,407000.0,362000.0,498000.0,533000.0,507000.0,702000.0,767000.0,927000.0,1395000.0,1039000.0,1055000.0,1073000.0,1053000.0,1258000.0,1100000.0,1152000.0,1076000.0,970000.0,0.5,-1.6,-9.9,,72.0,122.0,141.0,119,132,153,141,142,170,133,,109825,102733,102147,110781,119742,123689,122658,127530,132321,140907,,0.3,0.2,0.6,2.2,1.2,0.6,1.2,1.1,2.0,1.6,,10,20,14,19,21,10,20,13,15,11,8,12,18,25,39,31,3,0,,2.7,5.4,3.8,5.1,5.6,2.7,5.4,3.4,3.9,2.8,2.0,2.9,4.3,5.9,9.1,7.1,0.7,0.0,,0,1,0,4,2,0,0,0,1,1,2,2,4,5,14,10,1,1,,0.0,0.3,0.0,1.1,0.5,0.0,0.0,0.0,0.3,0.3,0.5,0.5,0.9,1.2,3.3,2.3,0.2,0.2,,4,4,4,5,2,2,4,4,4,4,4,5,4,,3,3,3,3,2,2,2,2,2,3,3,3,3,,1,1,1,2,0,0,2,2,2,1,1,2,1,,1913,1972,2140,649,548,666,757,813,1156,1273,1589,1520,,1184,1185,1072,649,548,543,599,626,930,1034,1074,1084,,729,787,1068,0,0,123,158,187,226,239,515,436,0.680769,0.634615
1,Cluster 2,2,Columbia Heights,"Columbia Heights, Mt. Pleasant, Pleasant Plain...",-77.031769,38.9287,,44081,45728,46779,47378,3.7,2.3,1.3,,23.0,22.0,22.0,15.0,0.5,2.0,-32.0,,66.0,53.0,38.0,11.0,13.0,31.0,21.0,30.0,27.0,1.6,3.2,4.4,,12.0,26.0,33.0,24.0,,54.0,48.0,50.0,,8.8,13.0,9.8,9.4,11.0,7.9,11.0,8.8,10.0,8.9,7.4,7.5,9.2,8.6,,15.0,16.0,12.0,14.0,14.0,8.9,9.0,11.0,12.0,14.0,12.0,9.4,8.6,6.7,,24.0,24.0,26.0,17.0,33.0,35.0,24.0,,8.3,8.6,9.7,8.0,55,61,58,71,,44.0,42.0,42.0,21.0,,94,95,48,57,,54778,56232,63766,87204,2.7,13.0,37.0,,6456,6179,5968,6056,6053,6262,5953,5740,5686,6680,8075,9180,9400,9775,,3423.0,3448.0,3373.0,3198.0,3172,3058.0,2640.0,2466.0,2394.0,2557.0,2723.0,2664,2578,2576,,16.0,15.0,18.0,21.0,16.0,17.0,17.0,17.0,17.0,15.0,13.0,13.0,,43,45,49,47,41,35,44,43,44,44,42,47,,17039,17275,17458,19514,,51,45,,8.7,9.0,4.9,3.9,,24.0,26.0,26.0,32,,231,260,259,391,395,319,305,306,304,362,291,183,152,97,108,143,165,73,,162000.0,152000.0,168000.0,172000.0,171000.0,200000.0,222000.0,298000.0,336000.0,408000.0,535000.0,578000.0,557000.0,550000.0,471000.0,516000.0,510000.0,460000.0,4.4,-3.8,-9.8,,35.0,46.0,61.0,78,78,86,94,111,128,142,,81630,74332,73289,87251,93799,93110,95294,109082,115267,119018,,7.8,16.0,6.0,5.5,5.1,3.7,4.0,6.5,11.0,6.2,,78,97,124,150,144,135,86,84,78,60,41,84,94,152,230,177,3,4,,14.0,18.0,23.0,27.0,26.0,25.0,16.0,15.0,13.0,9.9,6.6,12.0,13.0,20.0,30.0,23.0,0.4,0.5,,37,35,42,69,82,42,37,19,3,8,6,10,33,53,72,44,13,1,,6.7,6.4,7.6,13.0,15.0,7.6,6.7,3.4,0.5,1.3,1.0,1.4,4.5,7.0,9.4,5.6,1.6,0.1,,12,12,12,12,20,22,22,22,19,17,17,16,18,,9,9,9,9,9,10,9,8,6,6,6,6,6,,3,3,3,3,11,12,13,14,13,11,11,10,12,,5077,4500,4607,5833,6051,6524,6720,6677,7062,7313,6430,7606,,4705,4047,4063,3855,4025,4101,4006,3737,3617,3657,3504,3614,,372,453,544,1978,2026,2423,2714,2940,3445,3656,2926,3992,0.582751,0.529604
2,Cluster 3,3,Shaw,"Howard University, Le Droit Park, Cardozo/Shaw",-77.023415,38.921257,,11335,10925,10128,12174,-3.6,-7.3,20.0,,18.0,15.0,14.0,7.8,-17.0,-14.0,-34.0,,78.0,67.0,44.0,13.0,18.0,41.0,8.4,12.0,9.4,0.7,1.7,5.0,,5.5,9.8,16.0,15.0,,62.0,56.0,27.0,,8.2,12.0,13.0,15.0,18.0,9.7,13.0,6.3,11.0,15.0,6.4,16.0,9.5,8.6,,17.0,19.0,19.0,11.0,9.5,4.3,9.3,8.1,9.8,8.1,5.0,9.0,7.4,2.5,,27.0,22.0,26.0,17.0,24.0,43.0,15.0,,9.1,8.1,8.5,7.5,45,51,55,61,,45.0,36.0,30.0,15.0,,97,94,48,66,,50089,61652,72581,101583,23.0,18.0,40.0,,1283,1163,1141,1139,1153,1102,1011,970,951,1057,1254,1352,1383,1365,,661.0,571.0,546.0,534.0,579,529.0,396.0,403.0,373.0,346.0,348.0,296,291,308,,28.0,29.0,25.0,24.0,22.0,24.0,26.0,23.0,22.0,19.0,17.0,22.0,,113,103,104,112,96,91,93,84,89,88,68,75,,3742,4030,3957,5461,,46,39,,7.4,9.2,5.0,3.7,,28.0,31.0,33.0,42,,55,58,58,122,143,119,201,146,118,112,98,74,60,60,54,60,60,35,,146000.0,156000.0,148000.0,180000.0,156000.0,202000.0,352000.0,395000.0,482000.0,593000.0,631000.0,758000.0,639000.0,665000.0,673000.0,667000.0,625000.0,650000.0,5.1,0.4,4.0,,39.0,56.0,79.0,125,159,134,120,107,235,200,,78726,80125,84454,99208,87660,100812,118199,126060,118655,120877,,6.1,10.0,5.2,5.6,2.8,3.1,4.7,8.3,5.9,1.5,,25,33,36,35,43,29,25,29,23,20,20,17,32,51,81,50,2,2,,12.0,16.0,18.0,17.0,21.0,14.0,12.0,14.0,11.0,9.7,8.5,6.6,12.0,17.0,27.0,17.0,0.7,0.6,,7,11,16,20,15,12,8,5,2,1,4,1,8,10,9,13,2,1,,3.5,5.5,8.0,10.0,7.5,6.0,4.0,2.5,1.0,0.5,1.7,0.4,3.0,3.4,3.1,4.3,0.7,0.3,,9,8,8,8,9,11,12,12,9,7,6,6,9,,4,3,3,3,4,4,4,4,3,2,2,3,3,,5,5,5,5,5,7,8,8,6,5,4,3,6,,1891,2037,2128,2502,2446,2404,2332,2093,1924,1790,951,2216,,961,927,910,1209,1141,1043,925,794,562,481,708,709,,930,1110,1218,1293,1305,1361,1407,1299,1362,1309,243,1507,0.550725,0.531401
3,Cluster 4,4,Georgetown,"Georgetown, Burleith/Hillandale",-77.066409,38.909686,,16071,17919,18741,20464,11.0,4.6,9.2,,7.0,6.1,6.1,8.1,-3.5,4.4,46.0,,4.1,3.7,3.7,86.0,85.0,81.0,5.6,4.2,6.4,4.4,6.6,8.4,,11.0,15.0,14.0,15.0,,21.0,10.0,12.0,,3.3,3.1,6.4,4.5,3.5,4.7,8.5,5.5,8.1,3.3,4.4,4.5,4.8,7.0,,0.8,0.0,0.6,0.0,0.0,0.6,0.0,0.0,0.6,0.5,0.0,0.0,0.0,0.5,,16.0,16.0,15.0,11.0,9.9,6.9,1.0,,2.0,2.4,2.1,2.9,68,68,66,54,,6.4,4.4,2.0,2.0,,100,95,83,80,,180692,228758,272708,304995,27.0,19.0,12.0,,24,29,31,27,26,36,30,45,40,59,68,87,100,83,,10.0,11.0,11.0,9.0,12,12.0,,8.0,8.0,12.0,13.0,14,10,7,,6.6,6.2,6.0,5.6,5.5,4.7,5.4,6.0,3.9,3.8,5.0,3.9,,54,55,44,44,45,46,41,45,48,45,41,44,,6629,7004,7446,7375,,30,26,,5.3,5.9,2.6,9.0,,42.0,48.0,52.0,55,,165,188,266,356,315,248,225,236,242,207,203,174,160,149,121,151,160,88,,484000.0,435000.0,491000.0,493000.0,565000.0,619000.0,769000.0,898000.0,881000.0,940000.0,1116000.0,1014000.0,1008000.0,1083000.0,1062000.0,1056000.0,996000.0,1063000.0,1.7,1.1,6.6,,74.0,102.0,99.0,83,83,86,79,94,90,64,,152972,140517,106934,145035,167044,170269,190590,183188,191889,206642,,2.1,1.0,0.9,2.3,2.5,1.0,0.9,1.8,2.6,1.6,,18,12,16,22,15,11,9,11,10,8,7,8,11,19,40,25,0,0,,3.5,2.3,3.1,4.3,2.9,2.1,1.7,2.1,1.9,1.5,1.3,1.5,2.1,3.5,7.4,4.7,0.0,0.0,,4,2,5,2,3,0,0,1,1,1,1,1,1,4,4,6,1,2,,0.8,0.4,1.0,0.4,0.6,0.0,0.0,0.2,0.2,0.2,0.2,0.2,0.2,0.7,0.7,1.1,0.2,0.4,,3,3,3,3,3,2,3,3,3,3,3,3,3,,3,3,3,3,3,2,2,2,3,3,3,3,3,,0,0,0,0,0,0,1,1,0,0,0,0,0,,1003,1069,1049,1001,566,698,725,971,1145,1300,1237,1267,,1003,1069,1049,1001,566,645,639,971,1145,1300,1237,1267,,0,0,0,0,0,53,86,0,0,0,0,0,0.859155,0.887324
4,Cluster 5,5,Foggy Bottom,"West End, Foggy Bottom, GWU",-77.048975,38.902386,,10731,11104,11723,16160,3.5,5.6,38.0,,1.8,1.4,1.2,1.4,-17.0,-8.4,62.0,,5.1,6.1,6.7,82.0,75.0,75.0,4.7,5.3,5.7,7.2,13.0,12.0,,15.0,17.0,19.0,19.0,,4.7,15.0,4.1,,4.4,7.3,9.5,9.6,16.0,12.0,6.6,13.0,4.2,6.0,8.8,5.3,4.7,11.0,,3.2,4.1,0.0,0.0,2.8,0.0,0.0,4.8,0.0,2.6,0.0,0.0,0.7,0.4,,13.0,21.0,28.0,29.0,9.2,10.0,11.0,,2.9,4.3,28.0,3.6,63,59,49,39,,6.2,5.0,5.6,2.3,,100,96,44,37,,155597,224514,259442,211064,44.0,16.0,-19.0,,62,49,42,30,38,36,47,77,85,73,94,108,110,107,,,,,,10,,8.0,25.0,31.0,19.0,27.0,33,30,24,,3.8,4.6,4.3,4.1,3.1,2.5,1.7,2.0,3.1,3.6,3.0,3.0,,54,39,39,37,32,29,26,26,28,24,22,26,,6324,5787,5811,6413,,30,22,,7.8,9.7,3.3,11.0,,23.0,30.0,29.0,31,,6,7,10,16,12,11,7,10,17,12,9,14,2,3,7,7,4,2,,,,353000.0,413000.0,415000.0,474000.0,,576000.0,710000.0,712000.0,,802000.0,,,,,,,,,,,49.0,83.0,107.0,99,117,114,124,123,170,119,,113495,112209,77822,102506,117479,94658,103931,100428,116454,119534,,0.0,0.4,0.5,1.2,0.7,0.9,0.9,0.1,0.9,0.7,,12,10,7,4,6,3,0,5,4,3,6,14,12,10,16,12,2,0,,4.4,3.7,2.6,1.5,2.2,1.1,0.0,1.8,1.5,1.1,2.2,4.5,3.8,3.2,4.9,3.6,0.6,0.0,,0,1,0,0,1,1,0,0,0,0,0,3,5,4,5,5,0,0,,0.0,0.4,0.0,0.0,0.4,0.4,0.0,0.0,0.0,0.0,0.0,1.0,1.6,1.3,1.5,1.5,0.0,0.0,,1,1,1,1,1,1,1,0,0,1,1,1,1,,1,1,1,1,1,1,1,0,0,1,1,1,1,,0,0,0,0,0,0,0,0,0,0,0,0,0,,328,326,323,338,338,347,0,0,458,468,527,548,,328,326,323,338,338,347,0,0,458,468,527,548,,0,0,0,0,0,0,0,0,0,0,0,0,0.666667,0.75


In [7]:
# drop columns that are irrelevant
df_bike_drop = df_bike.drop(columns=['Duration', 'Start station', 'End station number',
                                     'End station', 'Bike number', 'End date'])

# create new column that only contains the date: the first 10 characters
# ("YYYY-MM-DD") of the "Start date" timestamp string.
# The vectorized .str accessor replaces a Python lambda call per row and
# leaves missing values as NaN instead of raising a TypeError.
df_bike_drop['date'] = df_bike_drop['Start date'].str[:10]

# drop the Start date column (re-assigned instead of inplace=True so the step
# reads as a plain transformation of the dataframe)
df_bike_drop = df_bike_drop.drop(columns=['Start date'])




In [None]:
# Leftover one-hot-encoding experiment removed: it used `OneHotEncoder`, `enc` and `X`,
# none of which are imported or defined in the visible part of this notebook, so the
# cell raised a NameError on "Restart & Run All".
# Preview the reduced trip dataframe instead of dumping all rows.
df_bike_drop.head()

In [38]:
# create two new 0/1 indicator columns for casual and registered customers.
# The vectorized comparison replaces a Python lambda call per row; a missing
# member type compares unequal and therefore counts as 0 in both columns.
df_bike_drop['casual'] = (df_bike_drop['Member type'] == 'Casual').astype('int64')
df_bike_drop['registered'] = (df_bike_drop['Member type'] == 'Member').astype('int64')

# create new dataframe with date, casual, registered and total customers:
# a single groupby pass sums both indicator columns per day
cust_list = ['casual', 'registered']
df_bike_cust = df_bike_drop.groupby('date')[cust_list].sum().reset_index()

# add new column containing total number of customers
# (only rides counted as casual or registered contribute to the total)
df_bike_cust['total_cust'] = df_bike_cust['casual'] + df_bike_cust['registered']
df_bike_cust



Unnamed: 0,date,casual,registered,total_cust
0,2011-01-01,330,629,959
1,2011-01-02,130,651,781
2,2011-01-03,120,1181,1301
3,2011-01-04,107,1429,1536
4,2011-01-05,82,1489,1571
5,2011-01-06,88,1485,1573
6,2011-01-07,148,1345,1493
7,2011-01-08,68,871,939
8,2011-01-09,54,748,802
9,2011-01-10,41,1257,1298


## Holiday dataset

In [39]:
# Holidays in Washington, D.C., entered by hand from the dchr.dc.gov holiday schedules:
#   https://dchr.dc.gov/page/holiday-schedules-2018
#   https://dchr.dc.gov/page/holiday-schedules-2016-and-2017
#   https://dchr.dc.gov/page/holiday-schedules-2014-and-2015
#   https://dchr.dc.gov/page/holiday-schedules-2012-and-2013
#   https://dchr.dc.gov/page/holiday-schedules-2010-and-2011
dates = [
    # 2018
    '2018-01-01', '2018-01-15', '2018-02-19', '2018-04-16', '2018-05-28', '2018-07-04',
    '2018-09-03', '2018-10-08', '2018-11-12', '2018-11-22', '2018-12-25',
    # 2017
    '2017-01-02', '2017-01-16', '2017-01-20', '2017-02-20', '2017-04-17', '2017-05-29',
    '2017-07-04', '2017-09-04', '2017-10-09', '2017-11-10', '2017-11-23', '2017-12-25',
    # 2016
    '2016-01-01', '2016-01-18', '2016-02-15', '2016-04-15', '2016-05-30', '2016-07-04',
    '2016-09-05', '2016-10-10', '2016-11-11', '2016-11-24', '2016-12-26',
    # 2015
    '2015-01-01', '2015-01-19', '2015-02-16', '2015-04-16', '2015-05-25', '2015-07-03',
    '2015-09-07', '2015-10-12', '2015-11-11', '2015-11-26', '2015-12-25',
    # 2014
    '2014-01-01', '2014-01-20', '2014-02-17', '2014-04-16', '2014-05-26', '2014-07-04',
    '2014-09-01', '2014-10-13', '2014-11-11', '2014-11-27', '2014-12-25',
    # 2013
    '2013-01-01', '2013-01-21', '2013-01-20', '2013-02-18', '2013-04-16', '2013-05-27',
    '2013-07-04', '2013-09-02', '2013-10-14', '2013-11-11', '2013-11-28', '2013-12-25',
    # 2012
    '2012-01-02', '2012-01-16', '2012-02-20', '2012-04-16', '2012-05-28', '2012-07-04',
    '2012-09-03', '2012-10-08', '2012-11-12', '2012-11-22', '2012-12-25',
    # 2011
    '2011-01-17', '2011-02-21', '2011-04-15', '2011-05-30', '2011-07-04', '2011-09-05',
    '2011-10-10', '2011-11-11', '2011-11-24', '2011-12-26',
]

# one row per listed date; the constant 1 flags every listed date as a holiday
df_holiday = pd.DataFrame({'date': dates, 'holiday': 1})
df_holiday.head()


Unnamed: 0,date,holiday
0,2018-01-01,1
1,2018-01-15,1
2,2018-02-19,1
3,2018-04-16,1
4,2018-05-28,1


## Weather dataset

In [71]:
# read in the weather data for Washington, D.C.
# The folder is given relative to the working directory, like the other datasets
# read in this notebook, instead of an absolute path on one user's machine.
# NOTE(review): assumes the notebook is run from the project root that contains `datasets/` - confirm
path = 'datasets/weather-data'

# glob returns matches in arbitrary order; sorting keeps the row order of the
# merged dataframe reproducible across machines and runs
all_files = sorted(glob.glob(path + "/*.csv"))

# fail early with a clear message instead of pd.concat's generic
# "No objects to concatenate" when the folder is missing or empty
if not all_files:
    raise FileNotFoundError('No csv files found in ' + path)

# low_memory=False reads each file in one pass so column dtypes are inferred from the whole file
df_list_weather = [pd.read_csv(file, low_memory=False) for file in all_files]

# stack all files below each other; sort=False keeps the columns in order of first appearance
df_weather = pd.concat(df_list_weather, axis=0, ignore_index=True, sort=False)


In [72]:
# number of rows and columns in weather dataframe, shown as a (rows, columns) tuple
df_weather.shape


(268114, 54)

In [73]:
# get an idea of the datatypes in the weather dataframe (one dtype per column)
df_weather.dtypes


STATION             object
NAME                object
LATITUDE           float64
LONGITUDE          float64
ELEVATION          float64
DATE                object
AWND               float64
AWND_ATTRIBUTES     object
PRCP               float64
PRCP_ATTRIBUTES     object
TAVG               float64
TAVG_ATTRIBUTES     object
TMAX               float64
TMAX_ATTRIBUTES     object
TMIN               float64
TMIN_ATTRIBUTES     object
TOBS               float64
TOBS_ATTRIBUTES     object
WT01               float64
WT01_ATTRIBUTES     object
WT02               float64
WT02_ATTRIBUTES     object
WT03               float64
WT03_ATTRIBUTES     object
WT04               float64
WT04_ATTRIBUTES     object
WT05               float64
WT05_ATTRIBUTES     object
WT06               float64
WT06_ATTRIBUTES     object
WT08               float64
WT08_ATTRIBUTES     object
WT09               float64
WT09_ATTRIBUTES     object
WT11               float64
WT11_ATTRIBUTES     object
WT13               float64
W

In [74]:
# summary statistics for every column; include='all' covers the non-numeric columns as well
df_weather.describe(include='all')

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,AWND,AWND_ATTRIBUTES,PRCP,PRCP_ATTRIBUTES,TAVG,TAVG_ATTRIBUTES,TMAX,TMAX_ATTRIBUTES,TMIN,TMIN_ATTRIBUTES,TOBS,TOBS_ATTRIBUTES,WT01,WT01_ATTRIBUTES,WT02,WT02_ATTRIBUTES,WT03,WT03_ATTRIBUTES,WT04,WT04_ATTRIBUTES,WT05,WT05_ATTRIBUTES,WT06,WT06_ATTRIBUTES,WT08,WT08_ATTRIBUTES,WT09,WT09_ATTRIBUTES,WT11,WT11_ATTRIBUTES,WT13,WT13_ATTRIBUTES,WT14,WT14_ATTRIBUTES,WT16,WT16_ATTRIBUTES,WT17,WT17_ATTRIBUTES,WT18,WT18_ATTRIBUTES,WT21,WT21_ATTRIBUTES,WT22,WT22_ATTRIBUTES,WT15,WT15_ATTRIBUTES,WT19,WT19_ATTRIBUTES
count,268114,268114,268114.0,268114.0,268114.0,268114,11506.0,11506,263107.0,263107,6150.0,6150,42469.0,42469,42456.0,42456,31298.0,31298,3911.0,3911,328.0,328,1959.0,1959,295.0,295,58.0,58,266.0,266,985.0,985,12.0,12,346.0,346,800.0,800,229.0,229,1032.0,1032,9.0,9,178.0,178,41.0,41,10.0,10,9.0,9,1.0,1
unique,223,222,,,,2922,,3,,44,,1,,7,,7,,13,,4,,2,,4,,5,,4,,3,,2,,1,,3,,1,,1,,1,,1,,1,,1,,1,,1,,1
top,USW00093721,"WASHINGTON REAGAN NATIONAL AIRPORT, VA US",,,,2017-07-29,,",,W",,",,N",,"H,,S",,",,7",,",,7",,",,7,0700",,",,W",,",,W",,",,7",,",,7",,",,7",,",,7",,",,W",,",,W",,",,7",,",,X",,",,X",,",,X",,",,X",,",,X",,",,X",,",,X",,",,X",,",,X"
freq,2922,2922,,,,118,,10776,,206670,,6150,,28426,,28411,,12594,,2992,,240,,1026,,159,,29,,207,,797,,12,,295,,800,,229,,1032,,9,,178,,41,,10,,9,,1
mean,,,38.971833,-76.989676,81.557438,,3.166974,,3.721575,,14.31252,,18.989263,,8.445051,,11.00015,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,
std,,,0.194212,0.252576,49.302938,,1.510937,,10.440575,,9.635069,,10.127129,,9.679169,,9.750956,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,0.0,,,
min,,,38.4674,-77.497598,0.0,,0.2,,0.0,,-12.9,,-11.7,,-20.0,,-18.3,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,
25%,,,38.8472,-77.18306,43.9,,2.1,,0.0,,6.5,,10.6,,0.6,,2.8,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,
50%,,,38.991,-77.010883,83.2,,2.9,,0.0,,15.3,,20.0,,8.3,,11.7,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,
75%,,,39.1147,-76.7843,115.8,,4.0,,2.0,,23.0,,27.8,,17.2,,19.4,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,,1.0,


In [75]:
# candidate columns: everything from position 12 onwards in the weather dataframe
weathersit_list = df_weather.columns[12:].tolist()

# keep the weather situation codes themselves (names containing 'WT' that are shorter
# than 5 characters), which leaves out the longer '<code>_ATTRIBUTES' columns
weathersit = [col for col in weathersit_list if 'WT' in col and len(col) < 5]
weathersit


['WT01',
 'WT02',
 'WT03',
 'WT04',
 'WT05',
 'WT06',
 'WT08',
 'WT09',
 'WT11',
 'WT13',
 'WT14',
 'WT16',
 'WT17',
 'WT18',
 'WT21',
 'WT22',
 'WT15',
 'WT19']

In [76]:
# create a new data frame that contains only average temperature, windspeed, precipitation and weathersituation
# per day across all stations in Washington, D.C.
# measurements that are averaged over all rows of a day
mean_vars = ['TAVG', 'TMIN', 'TMAX', 'TOBS', 'PRCP', 'AWND']

# column -> aggregation. The dict order fixes the column order of the result:
# first the daily means, then the weather situation flags in `weathersit` order.
# Each weather situation flag is aggregated with max, so a day carries the flag
# as soon as at least one row of that day has it.
agg_spec = {var: 'mean' for var in mean_vars}
agg_spec.update({sit: 'max' for sit in weathersit})

# one groupby pass instead of a separate groupby per column
df_weather_aggr = df_weather.groupby('DATE').agg(agg_spec)
df_weather_aggr


Unnamed: 0_level_0,TAVG,TMIN,TMAX,TOBS,PRCP,AWND,WT01,WT02,WT03,WT04,WT05,WT06,WT08,WT09,WT11,WT13,WT14,WT16,WT17,WT18,WT21,WT22,WT15,WT19
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
2011-01-01,,-1.566667e+00,11.973333,2.772727,0.069333,2.575000,1.0,,,,,,1.0,,,1.0,,1.0,,,,,,
2011-01-02,,8.800000e-01,13.806667,7.327273,1.037349,3.925000,1.0,1.0,,,,,,,,1.0,1.0,1.0,,,,,,
2011-01-03,,-3.442857e+00,7.464286,-3.060000,1.878824,3.625000,,,,,,,,,,,,,,,,,,
2011-01-04,,-5.957143e+00,4.642857,-3.100000,0.000000,1.800000,,,,,,,,,,,,,,,,,,
2011-01-05,,-4.293333e+00,6.113333,-1.772727,0.000000,2.950000,,,,,,,,,,,,,,,,,,
2011-01-06,,-4.993333e+00,4.233333,-4.245455,0.000000,1.600000,,,,,,,,,,,,,,1.0,,,,
2011-01-07,,-3.885714e+00,2.857143,-1.570000,0.101205,2.550000,1.0,,,,,,1.0,,,1.0,,,,1.0,,,,
2011-01-08,,-3.980000e+00,0.486667,-2.472727,0.285057,4.575000,1.0,1.0,,,,,1.0,,,1.0,,,,1.0,,1.0,,
2011-01-09,,-6.293333e+00,0.226667,-6.072727,0.283529,6.425000,,,,,,,,,1.0,,,,,,,,,
2011-01-10,,-6.238462e+00,0.507692,-5.177778,0.000000,2.975000,,,,,,,,,1.0,,,,,,,,,


In [77]:
# Turn the 'DATE' index into a regular column and give every column a readable name.

# Only reset while 'DATE' is still the index: resetting a second time (e.g. when this
# cell is re-run) would insert a spurious 'index' column into the frame.
if df_weather_aggr.index.name == 'DATE':
    df_weather_aggr = df_weather_aggr.reset_index()

# original column code -> descriptive column name
weather_col_names = {
    'DATE': 'date',
    'TAVG': 'temp_avg',
    'TMAX': 'temp_max',
    'TMIN': 'temp_min',
    'TOBS': 'temp_observ',
    'PRCP': 'precip',
    'AWND': 'wind',
    'WT01': 'wt_fog',
    'WT02': 'wt_heavy_fog',
    'WT03': 'wt_thunder',
    'WT04': 'wt_sleet',
    'WT05': 'wt_hail',
    'WT06': 'wt_glaze',
    'WT08': 'wt_haze',
    'WT09': 'wt_drift_snow',
    'WT11': 'wt_high_wind',
    'WT13': 'wt_mist',
    'WT14': 'wt_drizzle',
    'WT15': 'wt_freeze_drizzle',
    'WT16': 'wt_rain',
    'WT17': 'wt_freeze_rain',
    'WT18': 'wt_snow',
    'WT19': 'wt_unknown',
    'WT21': 'wt_ground_fog',
    'WT22': 'wt_ice_fog',
}

# rename ignores keys that are not present, so this step is safe to repeat
df_weather_aggr = df_weather_aggr.rename(columns=weather_col_names)
df_weather_aggr.head()


Unnamed: 0,date,temp_avg,temp_min,temp_max,temp_observ,precip,wind,wt_fog,wt_heavy_fog,wt_thunder,wt_sleet,wt_hail,wt_glaze,wt_haze,wt_drift_snow,wt_high_wind,wt_mist,wt_drizzle,wt_rain,wt_freeze_rain,wt_snow,wt_ground_fog,wt_ice_fog,wt_freeze_drizzle,wt_unknown
0,2011-01-01,,-1.566667,11.973333,2.772727,0.069333,2.575,1.0,,,,,,1.0,,,1.0,,1.0,,,,,,
1,2011-01-02,,0.88,13.806667,7.327273,1.037349,3.925,1.0,1.0,,,,,,,,1.0,1.0,1.0,,,,,,
2,2011-01-03,,-3.442857,7.464286,-3.06,1.878824,3.625,,,,,,,,,,,,,,,,,,
3,2011-01-04,,-5.957143,4.642857,-3.1,0.0,1.8,,,,,,,,,,,,,,,,,,
4,2011-01-05,,-4.293333,6.113333,-1.772727,0.0,2.95,,,,,,,,,,,,,,,,,,


In [81]:
# sanity check: (number of rows, number of columns) of the aggregated daily weather dataframe
df_weather_aggr.shape


(2922, 25)

## Combination of all three separate datasets

In [80]:
# left-join the holiday information onto the bike customer dataframe via the 'date' column;
# every row of df_bike_cust is kept
df_bike_holiday = pd.merge(df_bike_cust, df_holiday, on='date', how='left')

# (rows, columns) of the joined dataframe
df_bike_holiday.shape


(2918, 5)

In [82]:
# combine weather with the bike/holiday data; the weather dataframe is the left table,
# so every weather date is kept and dates without a match in df_bike_holiday get NaN
df_bike_holiday_weather = pd.merge(df_weather_aggr, df_bike_holiday, on='date', how='left')

# (rows, columns) of the combined dataframe
df_bike_holiday_weather.shape


(2922, 29)

In [83]:
# preview the first rows of the combined weather / bike / holiday dataframe
df_bike_holiday_weather.head()


Unnamed: 0,date,temp_avg,temp_min,temp_max,temp_observ,precip,wind,wt_fog,wt_heavy_fog,wt_thunder,wt_sleet,wt_hail,wt_glaze,wt_haze,wt_drift_snow,wt_high_wind,wt_mist,wt_drizzle,wt_rain,wt_freeze_rain,wt_snow,wt_ground_fog,wt_ice_fog,wt_freeze_drizzle,wt_unknown,casual,registered,total_cust,holiday
0,2011-01-01,,-1.566667,11.973333,2.772727,0.069333,2.575,1.0,,,,,,1.0,,,1.0,,1.0,,,,,,,330.0,629.0,959.0,
1,2011-01-02,,0.88,13.806667,7.327273,1.037349,3.925,1.0,1.0,,,,,,,,1.0,1.0,1.0,,,,,,,130.0,651.0,781.0,
2,2011-01-03,,-3.442857,7.464286,-3.06,1.878824,3.625,,,,,,,,,,,,,,,,,,,120.0,1181.0,1301.0,
3,2011-01-04,,-5.957143,4.642857,-3.1,0.0,1.8,,,,,,,,,,,,,,,,,,,107.0,1429.0,1536.0,
4,2011-01-05,,-4.293333,6.113333,-1.772727,0.0,2.95,,,,,,,,,,,,,,,,,,,82.0,1489.0,1571.0,


In [84]:
# persist the combined dataframe as a comma-separated file, leaving out the row index
output_csv = 'bike_sharing_dataset.csv'
df_bike_holiday_weather.to_csv(output_csv, index=False, sep=',')

