In [None]:
import numpy as np 
import pandas as pd 


# Problem Statement
Factors affecting the booking of cycles. The goal is to find the relation between season, weather, or working day and the number of rentals.

# Understanding the data

- ****timestamp**** - timestamp field for grouping the data
- ****cnt**** - the count of a new bike shares
- ****t1**** - real temperature in C
- ****t2**** - temperature in C "feels like"
- ****hum**** - humidity in percentage
- ****wind_speed**** - wind speed in km/h
- ****weather_code**** - category of the weather
    - 1 = Clear ; mostly clear but have some values with haze/fog/patches of fog/ fog in vicinity 
    - 2 = scattered clouds / few clouds 
    - 3 = Broken clouds 
    - 4 = Cloudy 
    - 7 = Rain/ light Rain shower/ Light rain 
    - 10 = rain with thunderstorm 
    - 26 = snowfall
    
    
- ****is_holiday**** - boolean field - 1 holiday / 0 non holiday
- ****is_weekend**** - boolean field - 1 if the day is weekend
- ****season**** - category(0-spring ; 1-summer; 2-fall; 3-winter)


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import boxcox,yeojohnson

In [None]:
# Read the merged London bike-sharing CSV into a DataFrame; the path is
# relative to the notebook's working directory.
data = pd.read_csv('../input/london-bike-sharing-dataset/london_merged.csv')

In [None]:
data.head()

In [None]:
data.shape

****We have around 17K booking records.****

In [None]:
data.info()

****Most of the data is numerical instead of timestamp****

In [None]:
data.describe().T

### From the above data we can infer - 
- Season, Holiday, Weekend, Weather despite being numerical is categorical in nature
- Avg real temp is around 12.5° Celsius
- Avg feels like temperature is around 11.5° Celsius
- Riders count average is around 1143 but max value is 7860 so there is a hint of outliers

# Adding Month and Year column for better understanding of data

In [None]:
data['timestamp'] = pd.to_datetime(data['timestamp'])

In [None]:
# Derive the calendar year from the parsed timestamp as a new column.
data = data.assign(year=data['timestamp'].dt.year)

In [None]:
# Derive the calendar month (1-12) from the parsed timestamp as a new column.
data = data.assign(month=data['timestamp'].dt.month)

In [None]:
data.head()

# Non Graphical Analysis

In [None]:
data.isnull().sum()

****There is no null data.****

In [None]:
data['season'].value_counts(ascending=False)

****Almost all the four seasons share the equal amount of records****

In [None]:
data['is_holiday'].value_counts()

****More booking data is from Non Holiday than a holiday****

In [None]:
data['weather_code'].value_counts(ascending=False)

### Dropping weather code 26 and 10 as they have very less number of records.

In [None]:
# Keep every row whose weather code is neither 10 nor 26.
data = data[~data['weather_code'].isin([10, 26])]

In [None]:
data['is_weekend'].value_counts()

****More booking data is from Non Weekend than a weekends****

In [None]:
# Mean rental count for each weather code.
data.groupby('weather_code')['cnt'].agg('mean')

****Most booking data is from weather type 1 followed by type 2 and type 3.****

# Univariate Analysis

In [None]:
# Box plot (top, 15% of the height) stacked over a histogram with KDE (bottom),
# both sharing the t1 axis.
sns.set(style="darkgrid")
f, (ax_box, ax_hist) = plt.subplots(
    nrows=2,
    sharex=True,
    gridspec_kw={"height_ratios": (.15, .85)},
)
sns.boxplot(x='t1', data=data, ax=ax_box)
sns.histplot(x="t1", data=data, kde=True, ax=ax_hist)
ax_box.set_xlabel('')  # the shared x label is shown on the histogram only
plt.show()

# Treating outliers

In [None]:
# Tukey fences for t1: 1.5 * IQR beyond the first and third quartiles.
Q1 = data['t1'].quantile(0.25)
Q3 = data['t1'].quantile(0.75)
IQR = Q3 - Q1
whisker = 1.5 * IQR
lower = Q1 - whisker
upper = Q3 + whisker
(upper, lower)

In [None]:
# Keep rows whose t1 lies strictly inside the (lower, upper) fences.
data = data.loc[data['t1'].gt(lower) & data['t1'].lt(upper)]

## Plotting graph to check data after outlier treatment

In [None]:
# Same stacked box plot / histogram view of t1, now on the filtered frame.
sns.set(style="darkgrid")
f, (ax_box, ax_hist) = plt.subplots(
    nrows=2,
    sharex=True,
    gridspec_kw={"height_ratios": (.15, .85)},
)
sns.boxplot(x='t1', data=data, ax=ax_box)
sns.histplot(x="t1", data=data, kde=True, ax=ax_hist)
ax_box.set_xlabel('')  # the shared x label is shown on the histogram only
plt.show()

### From the above graph we can infer - 
- Avg real temperature is around 13° Celsius
- 25% of temperature is around 9.5° Celsius
- 75% of temperature is around 15.5° Celsius

In [None]:
# Box plot stacked over a histogram with KDE for the "feels like" temperature t2.
sns.set(style="darkgrid")
f, (ax_box, ax_hist) = plt.subplots(
    nrows=2,
    sharex=True,
    gridspec_kw={"height_ratios": (.15, .85)},
)
sns.boxplot(x='t2', data=data, ax=ax_box)
sns.histplot(x="t2", data=data, kde=True, ax=ax_hist)
ax_box.set_xlabel('')  # the shared x label is shown on the histogram only
plt.show()

### From the above graph we can infer - 
- Avg feels like temperature is around 13° Celsius
- 25% of feels like temperature is around 5° Celsius
- 75% of feels like temperature is around 16° Celsius

In [None]:
# Box plot stacked over a histogram with KDE for the rental count cnt.
sns.set(style="darkgrid")
f, (ax_box, ax_hist) = plt.subplots(
    nrows=2,
    sharex=True,
    gridspec_kw={"height_ratios": (.15, .85)},
)
sns.boxplot(x='cnt', data=data, ax=ax_box)
sns.histplot(x="cnt", data=data, kde=True, ax=ax_hist)
ax_box.set_xlabel('')  # the shared x label is shown on the histogram only
plt.show()

## There are a lot of outliers, so let's transform the data.

In [None]:
# Replace cnt with its Yeo-Johnson transform (the fitted lambda, element [1] of
# the result, is discarded). `data` is the result of boolean filtering at this
# point, so `data['cnt'] = ...` would write into a slice and trigger pandas'
# SettingWithCopyWarning; `assign` builds a new frame instead.
# Every later cell that reads `cnt` sees the transformed values, not raw counts.
data = data.assign(cnt=yeojohnson(data['cnt'])[0])

# Box plot stacked over a histogram with KDE for the transformed cnt.
sns.set(style="darkgrid")
f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)})
sns.boxplot(data=data, x='cnt', ax=ax_box)
sns.histplot(data=data, x="cnt", ax=ax_hist, kde=True)
ax_box.set(xlabel='')
plt.show()

### From the above graph we can infer - 
- Avg cnt is around 2000
- 25% of cnt is around 1500
- 75% of cnt is around 2200

In [None]:
sns.countplot(data=data,x='season')
plt.show()

****As already inferred, all four seasons share an almost equal amount of data****

In [None]:
# Share of records (in %) per weather code.
type_value_count = data['weather_code'].value_counts(normalize=True)*100
# Take the wedge labels from the Series index so each label always matches its
# wedge; a hard-coded list silently mislabels wedges if the frequency order
# changes (e.g. after a different filtering step).
plt.pie(type_value_count, labels=type_value_count.index.astype(int), autopct='%1.2f%%')
plt.title('Number of booking in each weather condition')
plt.show()

****Most booking data is from weather type 1 followed by type 2 and type 3.****

In [None]:
# Box plot stacked over a histogram with KDE for wind_speed.
sns.set(style="darkgrid")
f, (ax_box, ax_hist) = plt.subplots(
    nrows=2,
    sharex=True,
    gridspec_kw={"height_ratios": (.15, .85)},
)
sns.boxplot(x='wind_speed', data=data, ax=ax_box)
sns.histplot(x="wind_speed", data=data, kde=True, ax=ax_hist)
ax_box.set_xlabel('')  # the shared x label is shown on the histogram only
plt.show()

# Treating outliers

In [None]:
# Tukey fences for wind_speed: 1.5 * IQR beyond the first and third quartiles.
Q1 = data['wind_speed'].quantile(0.25)
Q3 = data['wind_speed'].quantile(0.75)
IQR = Q3 - Q1
whisker = 1.5 * IQR
lower = Q1 - whisker
upper = Q3 + whisker
(upper, lower)

In [None]:
# Keep rows whose wind_speed lies strictly inside the (lower, upper) fences.
data = data.loc[data['wind_speed'].gt(lower) & data['wind_speed'].lt(upper)]

In [None]:
# Same stacked box plot / histogram view of wind_speed, on the filtered frame.
sns.set(style="darkgrid")
f, (ax_box, ax_hist) = plt.subplots(
    nrows=2,
    sharex=True,
    gridspec_kw={"height_ratios": (.15, .85)},
)
sns.boxplot(x='wind_speed', data=data, ax=ax_box)
sns.histplot(x="wind_speed", data=data, kde=True, ax=ax_hist)
ax_box.set_xlabel('')  # the shared x label is shown on the histogram only
plt.show()

## From the above graphs, we can say - 
 - Windspeed is normally in the range of 0 to 35
 - Avg wind speed is 15
 - 25% of wind speed is under 12
 - 75% of wind speed is under 19

In [None]:
# Share of records (in %) per holiday flag.
type_value_count = data['is_holiday'].value_counts(normalize=True)*100
# Map each flag value to its label instead of assuming value_counts() returns
# the non-holiday class first; the labels then stay correct whatever the order.
holiday_labels = type_value_count.index.map({0: 'Non Holiday', 1: 'Holiday'})
plt.pie(type_value_count, labels=holiday_labels, autopct='%1.2f%%')
plt.title('Number of booking on Holiday/ Non Holiday')
plt.show()

****Most of the booking data is from Non Holiday than a Holiday.****

In [None]:
# Box plot stacked over a histogram with KDE for humidity (hum).
sns.set(style="darkgrid")
f, (ax_box, ax_hist) = plt.subplots(
    nrows=2,
    sharex=True,
    gridspec_kw={"height_ratios": (.15, .85)},
)
sns.boxplot(x='hum', data=data, ax=ax_box)
sns.histplot(x="hum", data=data, kde=True, ax=ax_hist)
ax_box.set_xlabel('')  # the shared x label is shown on the histogram only
plt.show()

In [None]:
# Tukey fences for hum: 1.5 * IQR beyond the first and third quartiles.
Q1 = data['hum'].quantile(0.25)
Q3 = data['hum'].quantile(0.75)
IQR = Q3 - Q1
whisker = 1.5 * IQR
lower = Q1 - whisker
upper = Q3 + whisker
(upper, lower)

In [None]:
# Keep rows whose hum lies strictly inside the (lower, upper) fences.
data = data.loc[data['hum'].gt(lower) & data['hum'].lt(upper)]

In [None]:
# Same stacked box plot / histogram view of hum, on the filtered frame.
sns.set(style="darkgrid")
f, (ax_box, ax_hist) = plt.subplots(
    nrows=2,
    sharex=True,
    gridspec_kw={"height_ratios": (.15, .85)},
)
sns.boxplot(x='hum', data=data, ax=ax_box)
sns.histplot(x="hum", data=data, kde=True, ax=ax_hist)
ax_box.set_xlabel('')  # the shared x label is shown on the histogram only
plt.show()

# Bi-Variate Analysis

In [None]:
# Distribution of cnt within each season, drawn as violins.
ax = sns.violinplot(data=data, x='season', y='cnt')
ax.set_title('Count of booking across each season')
plt.show()

In [None]:
# Distribution of cnt within each weather code, drawn as box plots.
ax = sns.boxplot(data=data, x='weather_code', y='cnt')
ax.set_title('Count of booking across each weather')
plt.show()

### From the above graph we can infer - 
- Most count of booking came from season 2. 
- Season 1 sees less number of count of booking.

In [None]:
# Number of records per season, split by weather code.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=data, x='season', hue='weather_code', ax=ax)
ax.set_title('Count of booking across each season and weather')
plt.show()

### From the above graph we can infer - 
- Almost all the weather condition is same across the seasons
- Weather 1,2 and 3 sees less booking compared to Weather 4 and 7 across all season

In [None]:
# cnt against real temperature.
ax = sns.scatterplot(data=data, x='t1', y='cnt')
ax.set_title('Count of booking around the temperature')
ax.set_xlabel('Temperature')
plt.show()

****There is a uniform distribution of count of booking across all the temperature.****

In [None]:
# cnt against "feels like" temperature.
ax = sns.scatterplot(data=data, x='t2', y='cnt')
ax.set_title('Count of booking around feels like temperature')
ax.set_xlabel('Feels like Temperature')
plt.show()

****There is a uniform distribution of count of booking across all the feels like temperature.****

In [None]:
# cnt against humidity.
ax = sns.scatterplot(data=data, x='hum', y='cnt')
ax.set_title('Count of booking around humidity')
plt.show()

****There are more bookings when the humidity is lower.****

In [None]:
# cnt against wind speed. The title previously said "humidity" (copied from the
# humidity plot) although the x axis is wind_speed.
sns.scatterplot(x='wind_speed', y='cnt', data=data)
plt.title('Count of booking around wind speed')
plt.show()

****There is a uniform distribution of count of booking for all wind speeds.****

In [None]:
# Total of cnt per year as a two-column frame (year, cnt), shown as bars.
year_data = data.groupby('year', as_index=False)['cnt'].sum()
ax = sns.barplot(data=year_data, x='year', y='cnt')
ax.set_title('Count of booking per year')
plt.show()

****Booking from 2017 is nearly zero so far.****

In [None]:
# Total of cnt per calendar month as a two-column frame (month, cnt), shown as bars.
month_data = data.groupby('month', as_index=False)['cnt'].sum()
ax = sns.barplot(data=month_data, x='month', y='cnt')
ax.set_title('Count of booking per month')
plt.show()

****Almost all the months have same number of bookings.****

In [None]:
# Total of cnt per (year, month) as a flat frame with columns year, month, cnt.
mon_year_data = data.groupby(['year', 'month'], as_index=False)['cnt'].sum()
# Month x year matrix for the heatmap; combinations with no rows become 0.
# pivot() arguments are passed by keyword: positional use was deprecated in
# pandas 1.x and raises TypeError from pandas 2.0 onwards.
myy = mon_year_data.pivot(index='month', columns='year', values='cnt').fillna(0)

In [None]:
sns.heatmap(myy)
plt.title('Count of booking across years and months')
plt.xlabel('Year')
plt.ylabel('Month')
plt.show()

****As inferred earlier, the number of bookings is almost the same across all the months.****

In [None]:
# Pairwise correlation heatmap of the numeric columns.
# `data` also holds the datetime `timestamp` column; older pandas dropped
# non-numeric columns from corr() silently, but pandas >= 2.0 no longer does,
# so the numeric columns are selected explicitly (works on every version).
plt.figure(figsize=(15,10))
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True)
plt.show()

### From the above graph we can infer -
- There is a high correlation of actual temperature with feels like temperature.
- Similarly, temp and feels like temp, as well as month and season, have a good correlation.

# Hypothesis Testing

Defining a function that reports the test result for a given significance level of 0.05

In [None]:
def htResult(p_value, significance_level=0.05):
    """Print the hypothesis-test decision for a p-value.

    Parameters
    ----------
    p_value : float
        p-value returned by a statistical test.
    significance_level : float, optional
        Threshold (alpha) the p-value is compared against. Defaults to 0.05,
        the value that was previously hard-coded.

    Returns
    -------
    None
        The decision is printed, not returned.
    """
    # The null hypothesis is rejected when the p-value is at or below alpha.
    if p_value <= significance_level:
        print('Reject NULL HYPOTHESIS')
    else:
        print('Fail to Reject NULL HYPOTHESIS')

## Question - 
Does a holiday have an effect on the number of electric cycles rented?

****Null hypothesis, H0****        : There is no effect of holiday on cycles rented.

****Alternate hypothesis, Ha****   : There is some effect of holiday on cycles rented.

In [None]:
# Split on the holiday flag only: `working` holds the rows with is_holiday == 0
# (weekends are not excluded here) and `not_working` the rows with is_holiday == 1.
working = data[data['is_holiday']==0]
not_working = data[data['is_holiday']==1]

In [None]:
sns.histplot(x='cnt', data=working, kde = True)
plt.show()

In [None]:
sns.histplot(x='cnt', data=not_working, kde = True)
plt.show()

### From the above graphs we can say that distribution of data is almost normal.
## Normality check of the data (Q-Q plot)

In [None]:
# Normal Q-Q plot of cnt for the is_holiday == 0 group. The title names the
# group by the flag it was split on (it was labelled "working day", but
# weekends are part of this group too).
stats.probplot(working['cnt'], plot= plt, dist="norm")
plt.title('Q-Q plot for non holiday booking count')
plt.show()

In [None]:
# Normal Q-Q plot of cnt for the is_holiday == 1 group. The title names the
# group by the flag it was split on (it was labelled "non working day").
stats.probplot(not_working['cnt'], plot= plt, dist="norm")
plt.title('Q-Q plot for holiday booking count')
plt.show()

****From both the Q-Q Plot we can say that the values are near normal****

## Performing t-test
Basic assumptions met

In [None]:
# Independent two-sample t-test comparing cnt between the is_holiday == 0 and
# is_holiday == 1 groups; only the p-value is used afterwards.
# NOTE(review): ttest_ind defaults to equal_var=True (pooled variance). If the
# two groups differ a lot in size or variance, Welch's test (equal_var=False)
# may be more appropriate - confirm before relying on the conclusion.
t_test = stats.ttest_ind(working['cnt'], not_working['cnt'])
p_value = t_test.pvalue
print("P-value :",p_value)

In [None]:
htResult(p_value)

# Answer - 
****We reject Null Hypothesis, which means holiday has effect on bikes rented.****

## Question -
To check if Weather is dependent on the season.

****Null hypothesis, H0****        : Weather is not dependent on the season.

****Alternate hypothesis, Ha****   : Weather is dependent on the season.

In [None]:
cont = pd.crosstab(data['weather_code'],data['season'])

## Performing Chi Square Test

In [None]:
c, p, dof, expected = stats.chi2_contingency(cont)

In [None]:
print('P-value :',p)

In [None]:
htResult(p)

# Answer -
****We reject Null Hypothesis, which means weather is dependent on the season****

# Question - 
To check if No. of cycles rented is similar or different in different season.

## Checking the normality of the data

In [None]:
# Draw one equally sized random sample per season code (0-3).
# - The size is capped at the smallest season's row count: after the outlier
#   filters a season may hold fewer than 4000 rows, and sampling more rows than
#   exist (without replacement) raises ValueError.
# - random_state pins the draw so the plots and the ANOVA p-value below are
#   reproducible on a fresh kernel.
sample_size = min(4000, data['season'].value_counts().min())
season_1 = data[data['season']==0].sample(sample_size, random_state=42)
season_2 = data[data['season']==1].sample(sample_size, random_state=42)
season_3 = data[data['season']==2].sample(sample_size, random_state=42)
season_4 = data[data['season']==3].sample(sample_size, random_state=42)

In [None]:
sns.histplot(x='cnt', data=season_1, kde = True)
plt.show()

In [None]:
stats.probplot(season_1['cnt'], plot= plt, dist="norm")
plt.title('Q-Q plot for season 1 booking count')
plt.show()

In [None]:
sns.histplot(x='cnt', data=season_2, kde = True)
plt.show()

In [None]:
stats.probplot(season_2['cnt'], plot= plt, dist="norm")
plt.title('Q-Q plot for season 2 booking count')
plt.show()

In [None]:
sns.histplot(x='cnt', data=season_3, kde = True)
plt.show()

In [None]:
stats.probplot(season_3['cnt'], plot= plt, dist="norm")
plt.title('Q-Q plot for season 3 booking count')
plt.show()

In [None]:
sns.histplot(x='cnt', data=season_4, kde = True)
plt.show()

In [None]:
stats.probplot(season_4['cnt'], plot= plt, dist="norm")
plt.title('Q-Q plot for season 4 booking count')
plt.show()

****From the above graphs we can say the values are near to normal for all seasons.****

# Performing Anova 
## Season
****Null hypothesis, H0****        : No. of cycles rented is similar across various season.

****Alternate hypothesis, Ha****   : No. of cycles rented is different across various season.



In [None]:
# One-way ANOVA on cnt across the four season samples; keep only the p-value.
season_counts = [sample['cnt'] for sample in (season_1, season_2, season_3, season_4)]
f_stat, p_value = stats.f_oneway(*season_counts)
print("P-Value : ",p_value)

In [None]:
htResult(p_value)

## Answer -
****We reject Null Hypothesis, which means No. of cycles rented is different across various seasons.****

# Recommendations - 
- Months like Dec, Jan and Feb see fewer bookings; to increase them we can provide a discount on bookings to attract more customers.
- More users book a bike when humidity is low, so we can offer a complimentary refreshment during humid weather.
- Working days see more bookings, so we can provide an offer during weekends or holidays to bring in more bookings.