In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import (train_test_split,
                                     GridSearchCV)
from sklearn import metrics

from sklearn.metrics import (mean_squared_error,
                             mean_squared_log_error,
                             make_scorer)

import xgboost as xgb

In [2]:
# Read the training and test splits from the shared data directory.
train = pd.read_csv(filepath_or_buffer='../data/train.csv')
test = pd.read_csv(filepath_or_buffer='../data/test.csv')

In [3]:
def create_datetime(df):
    """Parse ``df['datetime']`` and add calendar feature columns in place.

    The ``datetime`` column is converted with ``pd.to_datetime`` and three
    columns derived from it are written onto ``df``: ``year``, ``hour`` and
    ``weekday``. Nothing is returned; the caller's frame is mutated.
    """
    stamps = pd.to_datetime(df['datetime'])
    df['datetime'] = stamps
    # Pull each calendar component off the parsed timestamps via the .dt accessor.
    for part in ('year', 'hour', 'weekday'):
        df[part] = getattr(stamps.dt, part)
    
# Add the calendar features to both splits (each frame is modified in place).
for split in (train, test):
    create_datetime(split)

# Keep the test timestamps so they can be attached to the final predictions
date_time = test['datetime']

In [4]:
# Input from https://www.kaggle.com/code/guosue/top-1-bike-sharing-detailed-eda-xgboost
def delete_outliers(data, df, n_std=3):
    """Drop, in place, the rows of ``df`` whose ``data`` value is an outlier.

    A value counts as an outlier when it lies more than ``n_std`` population
    standard deviations (``np.std``, i.e. ``ddof=0``) from the mean of ``data``.

    Parameters
    ----------
    data : pandas.Series
        Values to screen. Its index labels select the rows of ``df`` to drop,
        so it must share its index with ``df`` (e.g. ``df['some_column']``).
    df : pandas.DataFrame
        Frame to prune; modified in place.
    n_std : float, default 3
        Half-width of the acceptance band, in standard deviations. The default
        keeps the 3-sigma rule used when the argument is omitted.

    Returns
    -------
    None
    """
    center = np.mean(data)
    spread = np.std(data)
    is_outlier = np.abs(data - center) > (n_std * spread)
    # Drop by index label so the rows removed are exactly the flagged ones.
    df.drop(index=data[is_outlier].index, inplace=True)
    
# Remove the rows of train whose 'count' is an outlier (train is modified in place).
delete_outliers(data=train['count'], df=train)

In [5]:
# Sample standard deviation of the target (after outlier removal), kept for later reference
std_count = train['count'].std(ddof=1)

In [6]:
# Model the target in log space: y = log(1 + count).
# np.log1p is vectorised, so it is applied to the whole column at once
# instead of element by element through Series.apply.
y = np.log1p(train['count']).values

In [8]:
# Columns removed from each split. 'casual' and 'registered' are left in
# train at this stage; 'count' is only listed for train.
drop_cols = ['atemp', 'count', 'datetime']
test_cols = ['atemp', 'datetime']
train.drop(columns=drop_cols, inplace=True)
test.drop(columns=test_cols, inplace=True)

In [9]:
# Show which columns remain in each split after the drop
train.columns, test.columns

(Index(['season', 'holiday', 'workingday', 'weather', 'temp', 'humidity',
        'windspeed', 'casual', 'registered', 'year', 'hour', 'weekday'],
       dtype='object'),
 Index(['season', 'holiday', 'workingday', 'weather', 'temp', 'humidity',
        'windspeed', 'year', 'hour', 'weekday'],
       dtype='object'))

## Split Data

In [8]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    train,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Shapes of the feature splits followed by the shapes of the target splits
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((8591, 12), (2148, 12), (8591,), (2148,))

## Split y_train and y_val into casual and registered riders

In [10]:
# Idea posited by https://www.kaggle.com/code/guosue/top-1-bike-sharing-detailed-eda-xgboost
# Casual and registered riders each get their own log1p-transformed target.
# np.log1p is vectorised, so it is applied to each column directly instead of
# element by element through Series.apply.
y_train_cas = np.log1p(X_train['casual']).values
y_train_reg = np.log1p(X_train['registered']).values

y_val_cas = np.log1p(X_val['casual']).values
y_val_reg = np.log1p(X_val['registered']).values

y_train_cas.shape, y_val_cas.shape

((8591,), (2148,))

In [11]:
# 'casual' and 'registered' were only needed to build the targets and are
# absent from the test columns, so they are removed from the features.
# Rebind instead of dropping in place: X_train / X_val come out of
# train_test_split as selections of `train`, and mutating such a selection in
# place can trigger pandas' SettingWithCopyWarning.
X_train = X_train.drop(columns=['casual', 'registered'])
X_val = X_val.drop(columns=['casual', 'registered'])

In [12]:
#trans_X_DF = pd.DataFrame(data=X_train)
#data_dmatrix = xgb.DMatrix(data=trans_X_DF,label=trans_X_DF.iloc[:,-1])

In [102]:
# Hand-picked hyper-parameters (not taken from a search) for the casual-rider model
xgb_cas = xgb.XGBRegressor(
    n_estimators=500,
    max_depth=4,
    learning_rate=0.1,
    random_state=42,
)

In [103]:
# Registered-rider model, configured with the same hyper-parameter values as xgb_cas
xgb_reg = xgb.XGBRegressor(
    n_estimators=500,
    max_depth=4,
    learning_rate=0.1,
    random_state=42,
)

In [104]:
# Fit the casual-rider model on the log1p-transformed casual counts
xgb_cas.fit(X=X_train, y=y_train_cas)

In [105]:
# Fit the registered-rider model on the log1p-transformed registered counts
xgb_reg.fit(X=X_train, y=y_train_reg)

In [17]:
# Created by https://www.kaggle.com/code/guosue/top-1-bike-sharing-detailed-eda-xgboost
# Candidate values per hyper-parameter; used as the param_grid of the grid search.
xgb_parameters = {
    'random_state': [42],
    'n_estimators': [300, 400, 500],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1],
    'alpha': [0, 0.1, 0.5],
}




In [None]:
def _rmse(y_true, y_pred):
    """Return the root mean squared error of ``y_pred`` against ``y_true``."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# The targets in this notebook are already log1p-transformed, so plain RMSE in
# that space equals RMSLE on the raw counts. mean_squared_log_error is
# deliberately not used here: it is the un-rooted MSLE, it would take the log
# of already-logged values, and it raises on negative inputs, which a
# log-space prediction can be.
# greater_is_better=False makes the scorer report the negated error, so a
# search that maximises the score selects the lowest RMSLE.
rmsle_scorer = metrics.make_scorer(_rmse, greater_is_better=False)

In [None]:
# Use a fresh, unfitted regressor as the estimator; the grid search sets the
# hyper-parameters from xgb_parameters for each candidate. The name `model`
# cannot be used here: on a top-to-bottom run it is not defined until the
# prediction loop further down, so referencing it raises NameError.
gs = GridSearchCV(estimator=xgb.XGBRegressor(),
                  param_grid=xgb_parameters,
                  scoring=rmsle_scorer,
                  cv=5)

In [None]:
# Will throw a lot of warnings unless warnings are suppressed.
#gs.fit(X_train, y_train_reg)
#print('Best params for XGBoost model on Registered Riders are: ', gs.best_params_)

In [None]:
#gs.fit(X_train, y_train_cas)
#print('Best params for XGBoost model on Casual Riders are: ', gs.best_params_)

In [106]:
# Proposed by https://www.kaggle.com/code/guosue/top-1-bike-sharing-detailed-eda-xgboost
# The two fitted models whose outputs are combined
models = [xgb_cas, xgb_reg]

# Collects one back-transformed prediction array per model
prediction = []

# Each model was fitted on a log1p target, so expm1 maps its validation
# prediction back to a rider count before it is stored.
for model in models:
    pre = np.expm1(model.predict(X_val))
    prediction.append(pre)

# Element-wise total of the casual and registered predictions
preds = sum(prediction)

In [107]:
# Undo the log1p transform on the validation targets (expm1 is its inverse)
# so they are on the same scale as preds
y_val_unlog = np.expm1(y_val)

In [108]:
# Display the standard deviation of the training 'count' computed earlier
std_count

166.68142883624589

In [109]:
# Root mean squared error between the back-transformed validation targets and the summed predictions
np.sqrt(mean_squared_error(y_val_unlog, preds))

37.820599671108575

In [110]:
# Final score vs. Validation data
# NOTE: mean_squared_log_error returns the mean squared log error (MSLE);
# wrap it in np.sqrt to obtain the root form (RMSLE).
mean_squared_log_error(y_val_unlog, preds)

0.08292528259125564

In [25]:
#submit = pd.DataFrame({'datetime':date_time,'count':preds})
#submit.to_csv('final_answer.csv',index=False)