In [None]:
import pandas as pd
import numpy as np

In [None]:
# Lift pandas' truncation limits so every column and every row is displayed.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)
# Print floats with three decimals instead of scientific notation.
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [None]:
# Load the competition data.
# `test` has to be loaded here: every later cell that inspects or transforms
# it would otherwise fail with a NameError on a fresh kernel.
# NOTE(review): assumes test.csv sits next to train.csv in ../input -- adjust
# the path if the file lives elsewhere.
train = pd.read_csv(r'../input/train.csv')
test = pd.read_csv(r'../input/test.csv')

In [None]:
# (rows, columns) of the training frame followed by the test frame.
tuple(frame.shape for frame in (train, test))

In [None]:
# First five rows of each frame, training data first.
train.head(n=5), test.head(n=5)

In [None]:
# Per-column count of missing values, training frame first.
train.isna().sum(), test.isna().sum()

#### There are no missing values in training & test data

In [None]:
# Number of training rows per season code.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
train['season'].value_counts()

#### Season:: 1 = Spring, 2 = Summer, 3 = Fall, 4 = Winter. Also, the records are distributed equally across the seasons.

In [None]:
# Number of training rows per holiday flag.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
train['holiday'].value_counts()

#### Assuming 0: Non-holiday & 1: Holiday

In [None]:
# Number of training rows per weather code.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
train['weather'].value_counts()

#### Weather:: 1 = Clear, Few clouds, Partly cloudy    2 = Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist    3 = Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds    4 = Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog

#### People prefer bike sharing mostly during clear & partly cloudy days.

In [None]:
### Number of training rows per workingday flag
# Each row is one record of the training file, so these are row counts per
# flag value rather than counts of distinct calendar days.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
train['workingday'].value_counts()

In [None]:
### `casual` + `registered` add up to `count`, which is the target variable,
### so both component columns are removed from the training frame.
train = train.drop(columns=['casual', 'registered'])
train.shape

In [None]:
### Parse the timestamp once, then derive text columns for the date and the time of day
parsed_stamps = pd.to_datetime(train['datetime'])
train['datetime'] = parsed_stamps
train['date'] = parsed_stamps.dt.strftime('%m/%d/%Y')  ## Date as MM/DD/YYYY text
train['time'] = parsed_stamps.dt.strftime('%H:%M:%S')  ## Time as HH:MM:SS text
# `datetime` itself is kept for now; the hour is derived from it further down.
train.head(), train.shape

In [None]:
## Day of the week
# Turn the MM/DD/YYYY text back into real datetimes so the .dt accessor works.
train['date'] = pd.to_datetime(train['date'], format = '%m/%d/%Y')
# .dt.day_name() yields 'Monday' ... 'Sunday'; it replaces the .dt.weekday_name
# attribute, which no longer exists in current pandas.
train['day_of_week'] = train['date'].dt.day_name()
train.head()

In [None]:
## Correlation
import matplotlib.pyplot as plt  # matplotlib must be installed for background_gradient below

# Correlate numeric columns only. The frame also holds text columns (`time`,
# `day_of_week`) and datetime columns; recent pandas raises on those instead
# of silently skipping them.
corr = train.select_dtypes(include='number').corr()
corr.style.background_gradient(cmap='coolwarm')

#### temp and atemp are highly correlated, as expected, since both represent the temperature - therefore we will drop one of the columns to avoid multicollinearity

In [None]:
# Remove `atemp`, keeping `temp` as the single temperature column.
train = train.drop(columns='atemp')
train.shape

In [None]:
# Calendar features taken from the parsed date column.
train['year'] = train['date'].dt.year
train['month'] = train['date'].dt.month
train['day'] = train['date'].dt.day
# Hour of day, extracted with the vectorised .dt accessor instead of a
# Python-level loop over every timestamp. pd.to_datetime is a no-op when the
# column is already parsed and keeps the cell working when it is still text.
train['hour'] = pd.to_datetime(train['datetime']).dt.hour
train.shape

In [None]:
## Encode the day name as a flag -- 0:: Weekend & 1:: Weekday
## (note the column is called `weekend` although 1 marks a weekday)
weekday_flag = {day_name: 1 for day_name in ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")}
weekday_flag.update({"Saturday": 0, "Sunday": 0})
train['weekend'] = train['day_of_week'].map(weekday_flag)
# The day name has served its purpose once the flag exists.
train = train.drop(columns=['day_of_week'])
train.head()

In [None]:
## Build the modelling frame without the raw datetime, date and time columns
train_new = train.drop(columns=['datetime', 'date', 'time'])
train_new.shape

In [None]:
# First five rows of the modelling frame.
train_new.iloc[:5]

In [None]:
## Split the modelling frame into the feature matrix and the target vector
x_train = train_new[['season', 'holiday', 'workingday', 'weather', 'temp', 'humidity', 'windspeed', 'year', 'month',
                     'day', 'hour', 'weekend']]
# `count` is the value the model learns to predict.
y_train = train_new.loc[:, 'count']

In [None]:
## Implementing all the feature engineering on test dataset
# Produces the same derived columns as the training frame (year, month, day,
# hour, weekend) and drops atemp. Everything is derived straight from the
# parsed timestamp, so no throwaway date / time / day_of_week columns are
# created and dropped again.
test['datetime'] = pd.to_datetime(test['datetime'])
test = test.drop(['atemp'], axis = 1)

# Calendar parts via the vectorised .dt accessor.
test['year'] = test['datetime'].dt.year
test['month'] = test['datetime'].dt.month
test['day'] = test['datetime'].dt.day
test['hour'] = test['datetime'].dt.hour

# 0:: Weekend & 1:: Weekday. .dt.day_name() replaces the .dt.weekday_name
# attribute, which no longer exists in current pandas.
test['weekend'] = test['datetime'].dt.day_name().map({"Saturday" : 0, "Sunday": 0, "Monday": 1, "Tuesday": 1,
                                                      "Wednesday": 1, "Thursday": 1, "Friday": 1})
test.shape, test.head()

In [None]:
# Feature matrix for the test rows: the same twelve columns, in the same order, as x_train.
x_test = test[['season', 'holiday', 'workingday', 'weather', 'temp', 'humidity', 'windspeed', 'year', 'month',
               'day', 'hour', 'weekend']]
# Despite the name this is not a target: it holds the test timestamps, which
# are attached to the predictions later on.
y_test = test.loc[:, 'datetime']

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [None]:
# Gradient-boosted regressor for the rental count.
# - objective: 'reg:squarederror' is the current name of the squared-error
#   objective; the former alias 'reg:linear' is deprecated.
# - reg_alpha: L1 regularisation, written with the scikit-learn wrapper's own
#   parameter name rather than the native alias `alpha`.
xg_reg = xgb.XGBRegressor(objective = 'reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1, max_depth = 5,
                          reg_alpha = 10, n_estimators = 10)

In [None]:
# Train the regressor on the training features and their rental counts.
xg_reg.fit(X=x_train, y=y_train)

In [None]:
# Predict with the fitted regressor on the test feature matrix; `pred` is
# converted from an array to a DataFrame in the next step.
pred = xg_reg.predict(x_test)

In [None]:
# Converting prediction array to dataframe.
# Reuse x_test's index: the column-wise concat below aligns on index labels,
# so a fresh 0..n-1 index would misalign rows whenever x_test's index is not
# the default one.
pred = pd.DataFrame(pred, columns = ['pred'], index = x_test.index)

# Rental counts cannot be negative, but a squared-error regressor may predict
# values below zero: floor at 0 first, then round to the nearest integer.
pred['pred'] = pred['pred'].clip(lower = 0).round()

# Joining prediction dataframe
result = pd.concat([x_test, pred], axis = 1)

In [None]:
# Put the test timestamps (held in y_test) in front of the features and predictions.
result = pd.concat(objs=[y_test, result], axis='columns')
result.head()

In [None]:
## Submission table: keep only the timestamp and its predicted count
## (this cell selects the columns; it does not write anything to disk)
final_output = result[['datetime', 'pred']]