# Install packages

In [None]:
!pip install autoviz
!pip install xlrd

# Import the libraries

In [None]:
import os
import numpy as np
import pandas as pd
import scipy.stats as stats

import matplotlib.pyplot as plt
plt.style.use('ggplot')

import seaborn as sns 
sns.set_style("darkgrid")

from autoviz.AutoViz_Class import AutoViz_Class
AV = AutoViz_Class()

import warnings
warnings.filterwarnings("ignore")


from sklearn.model_selection import train_test_split
from sklearn.metrics import  mean_squared_log_error
from catboost import CatBoostRegressor



%matplotlib inline

# Data Analysis

In [None]:
# Location of the training CSV; the same path is later handed to AutoViz.
dataset_path = '../input/seoul-bike-rental-ai-pro-iti/train.csv'
# Read the raw training data into a DataFrame.
df = pd.read_csv(dataset_path)

In [None]:
# Preview the first rows of the freshly loaded training frame.
df.head()

In [None]:
# Summary statistics of the numeric columns, transposed so each column is one row.
df.describe().transpose()

In [None]:
# Remove exact duplicate rows, then display the remaining (rows, columns) shape.
df = df.drop_duplicates()
df.shape

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Print the column dtypes and non-null counts.
df.info()

In [None]:

# Parse the day-first date strings so calendar features can be derived from them.
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)

df['Month'] = df['Date'].dt.month
df['Day_of_week'] = df['Date'].dt.dayofweek  # Monday=0 ... Sunday=6
# `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0.
# `isocalendar().week` gives the same ISO week number; it comes back as
# UInt32, so cast it to a plain integer column.
df['Week_of_year'] = df['Date'].dt.isocalendar().week.astype(int)

# Keep only rows with a non-zero target: y is later passed through np.log,
# which is undefined at 0. `.copy()` detaches the filtered frame so the
# column assignments in later cells do not write into a slice of the old one.
df = df[df['y'] != 0].copy()

In [None]:
# Shape after the zero-target rows were filtered out.
df.shape

In [None]:
# Preview the frame with the new Month / Day_of_week / Week_of_year columns.
df.head()

# Data Visualisation:

In [None]:
# Run AutoViz on the raw training CSV (it is given the file path, not `df`,
# so none of the cleaning done above is reflected in these plots).
fig = AV.AutoViz(dataset_path);

In [None]:
# Distribution of the target. `sns.distplot` is deprecated and removed in
# newer seaborn releases; a density-scaled histogram with a KDE overlay is
# the equivalent `histplot` call.
fig = sns.histplot(df['y'], kde=True, stat='density');

In [None]:
# Bar plot of the target `y` grouped by `Hour` (seaborn's default aggregation).
fig = sns.barplot(x="Hour", y="y", data=df);

In [None]:
# List the column names before feature engineering.
df.columns

# Data Manipulation:

In [None]:
# Encoding

# One-hot encode the season; the dummy columns are appended after the existing ones.
df = pd.get_dummies(df, columns=["Seasons"])

# Map the two expected labels of each flag to 0/1. astype(int) still raises
# if any other label remains in the column.
df['Holiday'] = df['Holiday'].replace({'No Holiday': 0, 'Holiday': 1}).astype(int)
df['Functioning Day'] = df['Functioning Day'].replace({'No': 0, 'Yes': 1}).astype(int)

# 1 for hours strictly between 9 and 19, otherwise 0.
df['Rush_hour'] = ((df['Hour'] > 9) & (df['Hour'] < 19)).astype(int)
# NOTE(review): Day_of_week < 5 is Monday-Friday, so this flag is 1 on
# weekdays and 0 on Saturday/Sunday despite its name. The test-set cell uses
# the same definition; change both together or not at all.
df['Weekend'] = (df['Day_of_week'] < 5).astype(int)

In [None]:
# Lag Features

# Weather readings that get neighbouring-row and rolling-window features.
weather_columns = [
    'Humidity(%)',
    'Wind speed (m/s)',
    'Rainfall(mm)',
    'Solar Radiation (MJ/m2)',
    'Temperature(�C)',
]

# (suffix, offset) pairs passed to Series.shift(). Negative offsets read the
# rows *after* the current one, so "_lag_1"/"_lag_2" hold following rows and
# "_lag_3"/"_lag_4" hold preceding rows. Rows at the frame edges get NaN.
shift_offsets = [('lag_1', -1), ('lag_2', -2), ('lag_3', 1), ('lag_4', 2)]

for column in weather_columns:
    for suffix, offset in shift_offsets:
        df[column + '_' + suffix] = df[column].shift(offset)


# Rolling Window

# Mean over the current row and the four rows before it; the first four rows
# have no complete window and stay NaN.
for column in weather_columns:
    df[column + '_rolling_mean'] = df[column].rolling(window=5).mean()

In [None]:
# The identifier and the raw date are not used as features; the calendar
# columns derived from Date remain in the frame.
df = df.drop(columns=['ID', 'Date'])

In [None]:
# List the columns after encoding and feature generation.
df.columns

# Data Splitting

In [None]:
# Columns kept out of the feature matrix: the target itself plus three
# inputs the model is not trained on.
excluded_columns = ['y', 'Dew point temperature(�C)', 'Functioning Day', 'Snowfall (cm)']
X = df.drop(columns=excluded_columns)
Y = df['y']

# Hold out 10% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=0,
)


In [None]:
#  Transformations

# log(1 + x) on the rainfall feature, applied the same way to both splits.
# Any other frame later passed to the model needs the same treatment.
for split_features in (X_train, X_val):
    split_features["Rainfall(mm)"] = np.log1p(split_features["Rainfall(mm)"])

# The model is trained on log(y); np.exp undoes this when scoring and predicting.
y_train, y_val = np.log(y_train), np.log(y_val)

In [None]:
# Shapes of the four split pieces, one per line, in the order
# X_train, X_val, y_train, y_val.
for split_part in (X_train, X_val, y_train, y_val):
    print(split_part.shape)

# Model Training

In [None]:
# CatBoost settings; unpacked as keyword arguments when the regressor is built.
hyper_params = dict(
    random_state=0,
    bootstrap_type='Bayesian',
    depth=4,
    n_estimators=1500,
    learning_rate=0.13455,
    loss_function='RMSE',
)



In [None]:
# Build the regressor from `hyper_params` and fit it silently on the training
# split. fit() returns the estimator itself, so `model` and `cb` refer to the
# same fitted object.
cb = CatBoostRegressor(**hyper_params)
cb.fit(X_train, y_train, verbose=0)
model = cb

In [None]:
# validation
X_val_pred = model.predict(X_val)

# Targets and predictions are both on the log scale, so exponentiate them
# before scoring. RMSLE is the square root of the mean squared log error.
msle = mean_squared_log_error(np.exp(y_val), np.exp(X_val_pred))
rmsle = np.sqrt(msle)
print('RMSLE: {:.3f}'.format(rmsle))

In [None]:
# Prediction Plot

# Compare the first validation rows by position. y_val still carries the
# shuffled index from the split, so select with .iloc instead of a bare
# integer slice, and derive the number of points from the data so a
# validation set with fewer than 50 rows still plots.
n_shown = min(50, len(y_val))
actual_head = np.exp(y_val.iloc[:n_shown]).to_numpy()
predicted_head = np.exp(X_val_pred[:n_shown])

plt.figure(figsize=(20,10))
plt.plot(range(n_shown), actual_head, color = "blue")
plt.plot(range(n_shown), predicted_head, color = "red")
plt.legend(["Actual","prediction"])
plt.title("Predicted vs True Value")
plt.xlabel("Record number")
plt.ylabel('target')
plt.show()

In [None]:
# Predicted vs. actual target on the original scale. x and y are passed as
# complete vectors, so no `data=` source is given (a Series is not a valid
# long-form data source for seaborn); axis labels make the figure self-explanatory.
sns.scatterplot(x=np.exp(X_val_pred), y=np.exp(y_val).to_numpy()).set(
    xlabel='Predicted y',
    ylabel='Actual y',
);

# Model Testing

In [None]:
# Location of the test CSV that the submission is generated for.
dataset_path2 = '../input/seoul-bike-rental-ai-pro-iti/test.csv'
# Read the raw test data into its own DataFrame.
df2 = pd.read_csv(dataset_path2)

In [None]:
# Preview the first rows of the raw test frame.
df2.head()

In [None]:
# Derive the same calendar features for the test frame as for the training frame.
df2['Date'] = pd.to_datetime(df2['Date'], dayfirst=True)

df2['Month'] = df2['Date'].dt.month
df2['Day_of_week'] = df2['Date'].dt.dayofweek  # Monday=0 ... Sunday=6
# `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0.
# `isocalendar().week` gives the same ISO week number; it comes back as
# UInt32, so cast it to a plain integer column.
df2['Week_of_year'] = df2['Date'].dt.isocalendar().week.astype(int)



In [None]:
# Neighbouring-row and rolling-window features for the test frame, using the
# same column names and offsets as the training frame.
weather_columns = [
    'Humidity(%)',
    'Wind speed (m/s)',
    'Rainfall(mm)',
    'Solar Radiation (MJ/m2)',
    'Temperature(�C)',
]

# (suffix, offset) pairs passed to Series.shift(). Negative offsets read the
# rows *after* the current one, positive offsets the rows before it; rows at
# the frame edges get NaN.
shift_offsets = [('lag_1', -1), ('lag_2', -2), ('lag_3', 1), ('lag_4', 2)]

for column in weather_columns:
    for suffix, offset in shift_offsets:
        df2[column + '_' + suffix] = df2[column].shift(offset)

# Mean over the current row and the four rows before it; the first four rows
# have no complete window and stay NaN.
for column in weather_columns:
    df2[column + '_rolling_mean'] = df2[column].rolling(window=5).mean()

In [None]:
# Encoding

# One-hot encode the season; the dummy columns are appended after the existing ones.
df2 = pd.get_dummies(df2, columns=["Seasons"])

# Map the two expected labels of each flag to 0/1. astype(int) still raises
# if any other label remains in the column.
df2['Holiday'] = df2['Holiday'].replace({'No Holiday': 0, 'Holiday': 1}).astype(int)
df2['Functioning Day'] = df2['Functioning Day'].replace({'No': 0, 'Yes': 1}).astype(int)

# 1 for hours strictly between 9 and 19, otherwise 0.
df2['Rush_hour'] = ((df2['Hour'] > 9) & (df2['Hour'] < 19)).astype(int)
# NOTE(review): Day_of_week < 5 is Monday-Friday, so this flag is 1 on
# weekdays and 0 on Saturday/Sunday despite its name. The training cell uses
# the same definition; change both together or not at all.
df2['Weekend'] = (df2['Day_of_week'] < 5).astype(int)


In [None]:
# The raw date is not a model feature; ID stays because the submission needs it.
df2 = df2.drop(columns=['Date'])

In [None]:
# List the test-frame columns after feature generation.
df2.columns

In [None]:
# Build the test feature matrix from the same inputs the model was trained on.
X_test = df2.drop(['ID','Functioning Day','Snowfall (cm)','Dew point temperature(�C)'], axis=1)

# The model was fitted on log1p-transformed rainfall, so the test column must
# receive the same transform before prediction.
X_test["Rainfall(mm)"] = np.log1p(X_test["Rainfall(mm)"])

# Match the training column set and order exactly. The test frame creates its
# season dummies and flags after the lag features while the training frame
# creates them before, and a season missing from the test file would leave a
# dummy column absent (filled with 0 here). Columns the model never saw are dropped.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Shape of the test feature matrix; its column count should equal X_train's.
X_test.shape

In [None]:
# Predict on the test features; the model was fitted on log(y), so these
# values are on the log scale.
y_test = model.predict(X_test)

In [None]:
# Undo the log transform of the target and truncate to whole counts.
df2['y'] = np.exp(y_test).astype(int)

In [None]:
# Functioning Day is encoded 0/1, so multiplying by it forces the prediction
# to 0 on rows flagged as non-functioning and leaves the others unchanged.
func_val_list = df2['Functioning Day'].to_numpy().tolist()
df2['y'] = df2['y'].multiply(func_val_list)

# Submission File Generation

In [None]:
# Write only the ID and predicted y columns, without the row index.
df2.to_csv('/kaggle/working/submission.csv', columns=['ID', 'y'], index=False)