In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found anywhere under the Kaggle
# input directory, then print the paths one per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/seoul-bike-rental-ai-pro-iti/sample_submission.csv
/kaggle/input/seoul-bike-rental-ai-pro-iti/train.csv
/kaggle/input/seoul-bike-rental-ai-pro-iti/test.csv


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
%matplotlib inline 

## Converting the time-series problem to a regression problem
by adding previous (lag) and upcoming (lead) time points for some features

In [3]:
def get_lag_time(df, col, n_back):
    """Add lagged copies of one column to ``df``.

    For every step ``k`` from 1 to ``n_back`` a column named ``col + '-' + str(k)``
    is written, holding ``df[col]`` shifted ``k`` rows forward (so row ``r`` sees
    the value from row ``r - k``; the first ``k`` rows have no value).

    The frame is modified in place and the same object is returned.
    """
    for step in range(1, n_back + 1):
        lag_name = col + '-' + str(step)
        df[lag_name] = df[col].shift(periods=step)
    return df

In [4]:
def get_lead_time(df, col, n_back):
    """Add lead (future-row) copies of one column to ``df``.

    For every step ``k`` from 1 to ``n_back`` a column named ``col + '+' + str(k)``
    is written, holding ``df[col]`` shifted ``k`` rows backward (so row ``r`` sees
    the value from row ``r + k``; the last ``k`` rows have no value).

    The frame is modified in place and the same object is returned.
    """
    for step in range(1, n_back + 1):
        lead_name = col + '+' + str(step)
        df[lead_name] = df[col].shift(periods=-step)
    return df

## Reading and editing the train and test sets

In [5]:
def read_edit(path):
    """Load a bike-rental CSV and derive the columns used further down.

    Steps, in order:

    1. Read the CSV and rename the raw headers to short snake_case names.
    2. Parse ``date`` with the ``%d/%m/%Y`` format.
    3. Add ``label_day_night``, ``week_day``, ``month`` and ``week_day_encoding``.
    4. One-hot encode ``holiday``, ``functioning_day`` and ``seasons``
       (first level of each dropped).
    5. Replace ``date`` and ``week_day`` with the integer codes returned by
       ``pd.factorize``.
    6. Append lag columns for ``windspeed`` and ``rainfall_mm`` and lead columns
       for ``rainfall_mm`` and ``temp``.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The edited frame. Rows at the start/end have missing values in the
        lag/lead columns because ``shift`` has nothing to fill them with.
    """
    # Raw header -> short name.
    # NOTE(review): the two temperature keys contain a replacement character;
    # presumably that is how the header decodes when the file is read — confirm
    # against the CSV before touching these strings, since rename() silently
    # skips keys that do not match.
    names = {'Date':'date','Hour':'hour','Temperature(�C)':'temp',
             'Humidity(%)':'humidity','Visibility (10m)':'vis_10',
             'Dew point temperature(�C)':'dew_pt_temp','Solar Radiation (MJ/m2)':'solar_rad',
             'Rainfall(mm)':'rainfall_mm','Snowfall (cm)':'snowfall_cm','Seasons':'seasons','Holiday':'holiday',
             'Functioning Day':'functioning_day','Wind speed (m/s)':'windspeed'}
    df = pd.read_csv(path).rename(columns=names)

    # converting date to date time
    df['date'] = pd.to_datetime(df['date'], format="%d/%m/%Y")

    # Hours after 20 or before 5 are labelled 'Night', everything else 'Day'.
    is_night = (df['hour'] > 20) | (df['hour'] < 5)
    df['label_day_night'] = np.where(is_night, 'Night', 'Day')

    # getting day names and months
    df['week_day'] = df['date'].dt.day_name()
    df['month'] = df['date'].dt.month

    # Encoding days of the week (Monday=1 ... Sunday=7)
    mapping_dictDay = {'Monday':1,'Tuesday':2,'Wednesday':3,'Thursday':4,'Friday':5,'Saturday':6,'Sunday':7}
    df['week_day_encoding'] = df['week_day'].map(mapping_dictDay)

    # getting dummies for seasons,holiday,functioning_day
    df = pd.get_dummies(df, columns=['holiday','functioning_day','seasons'], drop_first=True)

    # Keep only the integer codes from factorize; the uniques are not needed.
    df['date'] = pd.factorize(df['date'])[0]
    df['week_day'] = pd.factorize(df['week_day'])[0]

    # applying lag_time & lead_time
    df = get_lag_time(df, 'windspeed', 2)
    df = get_lag_time(df, 'rainfall_mm', 2)
    df = get_lead_time(df, 'rainfall_mm', 4)
    df = get_lead_time(df, 'temp', 1)

    return df

In [6]:
# reading train and test data
dataset_path = '/kaggle/input/seoul-bike-rental-ai-pro-iti/'
train = read_edit(os.path.join(dataset_path, 'train.csv'))
test = read_edit(os.path.join(dataset_path, 'test.csv'))

# Columns produced by read_edit that are not fed to the model. A single shared
# list keeps the train and test feature sets from drifting apart.
non_feature_cols = ['date', 'dew_pt_temp', 'label_day_night', 'week_day', 'snowfall_cm']

test = test.drop(columns=non_feature_cols)  # 'ID' is kept for the submission file
X = train.drop(columns=['ID', 'y'] + non_feature_cols)
y = train['y']  # the target

# The model is later asked to predict on test minus 'ID'; fail here, with a
# clear message, if those columns do not line up with the training features.
assert list(X.columns) == list(test.columns.drop('ID')), "train/test feature columns differ"

X.head()

Unnamed: 0,hour,temp,humidity,windspeed,vis_10,solar_rad,rainfall_mm,month,week_day_encoding,holiday_No Holiday,...,seasons_Winter,windspeed-1,windspeed-2,rainfall_mm-1,rainfall_mm-2,rainfall_mm+1,rainfall_mm+2,rainfall_mm+3,rainfall_mm+4,temp+1
0,0,-5.2,37,2.2,2000,0.0,0.0,12,5,1,...,1,,,,,0.0,0.0,0.0,0.0,-5.5
1,1,-5.5,38,0.8,2000,0.0,0.0,12,5,1,...,1,2.2,,0.0,,0.0,0.0,0.0,0.0,-6.0
2,2,-6.0,39,1.0,2000,0.0,0.0,12,5,1,...,1,0.8,2.2,0.0,0.0,0.0,0.0,0.0,0.0,-6.2
3,3,-6.2,40,0.9,2000,0.0,0.0,12,5,1,...,1,1.0,0.8,0.0,0.0,0.0,0.0,0.0,0.0,-6.0
4,4,-6.0,36,2.3,2000,0.0,0.0,12,5,1,...,1,0.9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-6.4


## Train-test split and model

In [7]:
# Hold out 10% of the rows for validation; the seed is fixed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

# XGBoost regressor settings, gathered in one place and passed as keywords.
xgb_params = {
    'objective': 'count:poisson',
    'random_state': 42,
    'colsample_bytree': 0.6,
    'learning_rate': 0.1,
    'max_depth': 5,
    'alpha': 10,
    'n_estimators': 1000,
}
xg_reg = xgb.XGBRegressor(**xgb_params)

## fit and evaluate

In [8]:
from sklearn.metrics import mean_squared_log_error

# The metric reported throughout this cell is RMSLE: the square root of
# sklearn's mean_squared_log_error. The printed labels say so explicitly.

# fit the model
xg_reg = xg_reg.fit(X_train, y_train)

#==========================================================
#========================train=============================
y_pred = xg_reg.predict(X_train)
rmsle_train = np.sqrt(mean_squared_log_error(y_train, y_pred))
print("RMSLE-train: %f" % (rmsle_train))

#==========================================================
#======================validation==========================
y_pred1 = xg_reg.predict(X_val)
rmsle = np.sqrt(mean_squared_log_error(y_val, y_pred1))
print("RMSLE-val: %f" % (rmsle))

#==========================================================
#====================cross-val=============================
# The scorer returns the negated MSLE per fold; the folds are averaged first,
# then the sign is removed and the root is taken.
scores = cross_val_score(xg_reg, X_train, y_train, cv=5, scoring='neg_mean_squared_log_error')
print("RMSLE-Crossval %0.2f " % (np.sqrt(np.abs(scores.mean()))))




RMSE-train: 0.125224
RMSE-val: 0.242848
RMSE-Crossval 0.26 


## submission

In [9]:
# Predict on the test features. 'ID' is not a feature, and 'y' is removed too
# when it is already there: this cell stores the predictions in test['y'], so
# without that guard a second run would pass the old predictions to the model
# as an extra column.
test_features = test.drop(columns=['ID']).drop(columns=['y'], errors='ignore')
y_pred2 = xg_reg.predict(test_features)
test['y'] = y_pred2

# Write the submission file, then read it back to display what was saved.
test[['ID', 'y']].to_csv('sub_x.csv', index=False)
f = pd.read_csv('sub_x.csv')
f

Unnamed: 0,ID,y
0,5760,168.790660
1,5761,156.392240
2,5762,139.500120
3,5763,99.970400
4,5764,60.855278
...,...,...
2995,8755,927.762300
2996,8756,671.425000
2997,8757,577.484000
2998,8758,412.845600
