In [108]:
import plotly.express as px
import plotly.graph_objects as go 
import pandas as pd
import matplotlib.pyplot as plt 

In [2]:
# Load the daily bike-sharing records from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="bike_sharing_daily_data.csv")

# Preprocessing (you can skip ahead to the Analysis section below)

## Before starting to analyze the data, we need to make sure it has good characteristics. To check that, we can look at the six principles of good data:
- ### Accuracy
- ### Completeness
- ### Consistency
- ### Timeliness
- ### Uniqueness
- ### Validity

### 1. Accuracy refers to how accurately the data reflects what we need for our objective.
### 2. Completeness refers to how well a particular piece of data captures the actual event that happened. For this bike-sharing data, I believe it already captures the important event: how many bikes were rented each day, grouped into casual and registered users.
### 3. Consistency refers to whether a certain piece of data has the same format as the others. For example, if we expect dates in the format %m-%d-%Y, we should check whether the entire date column follows this format.
### 4. Timeliness refers to whether the information is available right when it matters. In this case, we may need to know the performance of the bike share at the end of the year, so we should already have the data before the year ends.
### 5. Validity refers to whether the data holds valid information. For example, we expect a body-weight column to contain a person's weight, not some other measurement.
### 6. Uniqueness refers to whether each record appears only once, i.e. there are no duplicated entries.

### Let's take a glimpse at our data

In [3]:
# Fixed seed so the same five rows are shown on every Restart & Run All.
df.sample(5, random_state=42)

Unnamed: 0,date,season,holiday,weekday,workingday,weather,temp,atemp,hum,windspeed,casual,registered,trips
318,2020-11-14,Winter,0,2,1,Misty and Cloudy,0.53,0.507579,0.68875,0.199633,449,3746,4195
580,2021-08-03,Fall,0,5,1,Misty and Cloudy,0.765833,0.722867,0.6425,0.215792,1328,5847,7175
206,2020-07-25,Fall,0,2,1,Clear/Partly Cloudy,0.771667,0.696979,0.540833,0.200258,750,3840,4590
303,2020-10-30,Winter,0,1,1,Clear/Partly Cloudy,0.34,0.356063,0.703333,0.10635,362,3307,3669
719,2021-12-20,Winter,0,4,1,Misty and Cloudy,0.33,0.335217,0.667917,0.132463,314,3814,4128


### Let's inspect for null values first, because if we find any we will discard them.

In [4]:
# Column dtypes and non-null counts; used here to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   date        731 non-null    object 
 1   season      731 non-null    object 
 2   holiday     731 non-null    int64  
 3   weekday     731 non-null    int64  
 4   workingday  731 non-null    int64  
 5   weather     731 non-null    object 
 6   temp        731 non-null    float64
 7   atemp       731 non-null    float64
 8   hum         731 non-null    float64
 9   windspeed   731 non-null    float64
 10  casual      731 non-null    int64  
 11  registered  731 non-null    int64  
 12  trips       731 non-null    int64  
dtypes: float64(4), int64(6), object(3)
memory usage: 74.4+ KB


### It looks like there are no null values. But we should inspect the date column, because I believe it is stored as a string rather than as a pandas datetime (this matters when we filter for entries in 2020 alone). It will probably matter for visualization too.

In [5]:
# Parse the YYYY-MM-DD date strings into pandas datetimes so that date
# comparisons and time-axis plots behave correctly.
date_strings = df['date']
df['date'] = pd.to_datetime(date_strings, format="%Y-%m-%d")

In [6]:
# Re-check the dtypes to confirm that `date` is now datetime64.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   date        731 non-null    datetime64[ns]
 1   season      731 non-null    object        
 2   holiday     731 non-null    int64         
 3   weekday     731 non-null    int64         
 4   workingday  731 non-null    int64         
 5   weather     731 non-null    object        
 6   temp        731 non-null    float64       
 7   atemp       731 non-null    float64       
 8   hum         731 non-null    float64       
 9   windspeed   731 non-null    float64       
 10  casual      731 non-null    int64         
 11  registered  731 non-null    int64         
 12  trips       731 non-null    int64         
dtypes: datetime64[ns](1), float64(4), int64(6), object(2)
memory usage: 74.4+ KB


In [7]:
# Fixed seed so the preview is reproducible across runs.
df.sample(5, random_state=42)

Unnamed: 0,date,season,holiday,weekday,workingday,weather,temp,atemp,hum,windspeed,casual,registered,trips
185,2020-07-04,Fall,0,2,1,Clear/Partly Cloudy,0.746667,0.696338,0.590417,0.126258,1031,3634,4665
541,2021-06-25,Fall,0,1,1,Clear/Partly Cloudy,0.715833,0.654042,0.504167,0.300383,1139,5640,6779
530,2021-06-14,Summer,0,4,1,Clear/Partly Cloudy,0.648333,0.624383,0.569583,0.253733,1180,6183,7363
111,2020-04-21,Summer,0,5,1,Misty and Cloudy,0.336667,0.321954,0.729583,0.219521,177,1506,1683
368,2021-01-03,Spring,0,3,1,Misty and Cloudy,0.1075,0.119337,0.414583,0.1847,95,2273,2368


In [8]:
# Try filtering on a date: keep every row dated on or before 2020-12-31.
on_or_before_cutoff = df['date'] <= '2020-12-31'
df[on_or_before_cutoff]

Unnamed: 0,date,season,holiday,weekday,workingday,weather,temp,atemp,hum,windspeed,casual,registered,trips
0,2020-01-01,Spring,0,6,0,Misty and Cloudy,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2020-01-02,Spring,0,0,0,Misty and Cloudy,0.363478,0.353739,0.696087,0.248539,131,670,801
2,2020-01-03,Spring,0,1,1,Clear/Partly Cloudy,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,2020-01-04,Spring,0,2,1,Clear/Partly Cloudy,0.200000,0.212122,0.590435,0.160296,108,1454,1562
4,2020-01-05,Spring,0,3,1,Clear/Partly Cloudy,0.226957,0.229270,0.436957,0.186900,82,1518,1600
...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,2020-12-27,Spring,0,3,1,Clear/Partly Cloudy,0.299130,0.279974,0.503913,0.293961,255,2047,2302
362,2020-12-28,Spring,0,4,1,Clear/Partly Cloudy,0.248333,0.263892,0.574167,0.119412,254,2169,2423
363,2020-12-29,Spring,0,5,1,Clear/Partly Cloudy,0.311667,0.318812,0.636667,0.134337,491,2508,2999
364,2020-12-30,Spring,0,6,0,Clear/Partly Cloudy,0.410000,0.414121,0.615833,0.220154,665,1820,2485


# Analysis Starts Here

## Let's see the trend in 2020
### Trend of overall trips in 2020

In [9]:
# Keep only rows whose date falls in calendar year 2020. Comparing on the year
# (rather than only an upper bound) also excludes any earlier dates.
# Requires `date` to be a datetime column.
df_2020 = df[df['date'].dt.year == 2020]
px.line(df_2020, x='date', y='trips', title='Trend of bikeshare in 2020')

### Trend of casual and registered in 2020

In [10]:
# Daily trips in 2020, one line per user type.
user_type_columns = ['casual', 'registered']
px.line(
    df_2020,
    x='date',
    y=user_type_columns,
    title='Bikeshare trend among casual and registered users in 2020',
)

## Does weather affect the trip count?

In [11]:
# Total 2020 trips per weather category.
# Select `trips` BEFORE aggregating: summing every column would also try to sum
# the datetime `date` column, which raises under pandas >= 2.0.
trips_by_weather = df_2020.groupby('weather')['trips'].sum().reset_index()
px.bar(trips_by_weather, x='weather', y='trips', title='Total trips in 2020 by weather')

### Let's break it down by membership type

In [12]:
# Total 2020 trips per weather category, split by user type.
# Select the two count columns BEFORE aggregating: summing every column would
# also try to sum the datetime `date` column, which raises under pandas >= 2.0.
user_trips_by_weather = (
    df_2020.groupby('weather')[['casual', 'registered']].sum().reset_index()
)
px.bar(
    user_trips_by_weather,
    x='weather',
    y=['casual', 'registered'],
    barmode='group',
    title='Casual vs registered trips in 2020 by weather',
)

### It looks like people don't want to use the bikeshare in bad weather. And of course, when the weather is good, people tend to use it even more to enjoy the day.

## Let's see the difference between each membership type
### Overall trend of trips by casual and registered users

In [13]:
# Daily 2020 trips for casual and registered users on a shared time axis.
membership_columns = ['casual', 'registered']
px.line(df_2020, x='date', y=membership_columns)

### According to the data, a registered user is someone who has subscribed to the bike share for daily use, while a casual user is someone who uses the bikeshare less often, e.g. only on certain occasions.
### Day of week : 
#### 0 -> Sunday
#### 1 -> Monday, etc.

In [14]:
# Sum casual and registered trips per day of the week, then plot the two
# totals side by side for each weekday.
membership_columns = ['casual', 'registered']
weekday_totals = df_2020.groupby('weekday')[membership_columns].sum().reset_index()
px.bar(
    weekday_totals,
    x='weekday',
    y=membership_columns,
    barmode='group',
)

### We can see that registered users usually ride most on normal working days (Monday - Friday), while casual users peak on Saturday and Sunday (probably they are tourists).

## Let's see the effect of holidays on our bikeshare service
### We will compare the number of trips on each holiday with the 7 days before and the 7 days after

In [16]:
# Show every 2020 row flagged as a holiday.
is_holiday = df_2020['holiday'] == 1
df_2020[is_holiday]

Unnamed: 0,date,season,holiday,weekday,workingday,weather,temp,atemp,hum,windspeed,casual,registered,trips
16,2020-01-17,Spring,1,1,0,Misty and Cloudy,0.175833,0.176771,0.5375,0.194017,117,883,1000
51,2020-02-21,Spring,1,1,0,Misty and Cloudy,0.303333,0.284075,0.605,0.307846,195,912,1107
104,2020-04-14,Summer,1,5,0,Clear/Partly Cloudy,0.446667,0.441913,0.67125,0.226375,642,2484,3126
149,2020-05-29,Summer,1,1,0,Clear/Partly Cloudy,0.733333,0.671092,0.685,0.131225,1549,2549,4098
184,2020-07-03,Fall,1,1,0,Misty and Cloudy,0.726667,0.665417,0.637917,0.081479,3065,2978,6043
247,2020-09-04,Fall,1,1,0,Misty and Cloudy,0.673333,0.625646,0.790417,0.212696,1236,2115,3351
282,2020-10-09,Winter,1,1,0,Clear/Partly Cloudy,0.570833,0.542925,0.73375,0.042304,1514,3603,5117
314,2020-11-10,Winter,1,5,0,Clear/Partly Cloudy,0.324167,0.306817,0.44625,0.314675,440,2928,3368
327,2020-11-23,Winter,1,4,0,Clear/Partly Cloudy,0.373333,0.372471,0.549167,0.167304,560,935,1495
359,2020-12-25,Spring,1,1,0,Clear/Partly Cloudy,0.321739,0.315535,0.506957,0.239465,430,887,1317


In [116]:
def plot_compare_holiday(current_df: pd.DataFrame, df_index: int, window_days: int = 7):
    """Plot daily trips around a holiday as a bar chart.

    Selects the rows whose index labels lie within ``window_days`` of
    ``df_index`` (inclusive on both sides) and draws one bar per row,
    coloured by the ``holiday`` column.

    Assumes ``current_df`` has a sorted integer index with one row per
    consecutive day, so that an offset in index labels equals an offset in
    days. Labels that fall outside the frame are simply not selected.

    Parameters
    ----------
    current_df : pd.DataFrame
        Frame with ``date`` (datetime), ``trips`` and ``holiday`` columns.
    df_index : int
        Index label of the holiday row.
    window_days : int, default 7
        Number of rows to include before and after the holiday row.

    Returns
    -------
    The figure object returned by ``px.bar``.
    """
    # DataFrame.append was removed in pandas 2.0. One inclusive label slice
    # selects the same rows as the two adjacent slices that were appended.
    window_df = current_df.loc[df_index - window_days : df_index + window_days, :]

    # Look the holiday's date up once and format it without zero padding.
    holiday_date = current_df.loc[df_index, 'date']
    title_date = f"{holiday_date.year}-{holiday_date.month}-{holiday_date.day}"

    return px.bar(
        window_df,
        x='date',
        y='trips',
        color='holiday',
        title=f"Bikeshare before and after holiday at {title_date}",
    )

In [117]:
# Render one before/after comparison chart for every holiday in 2020.
holiday_labels = df_2020[df_2020['holiday'] == 1].index
for holiday_label in holiday_labels:
    plot_compare_holiday(df_2020, holiday_label).show()