##  Regression using SKLearn's Neural Network (NN)
#### One-Hot Encode categorical features, Standardize numeric features

Objective:
1. Train a bike rental prediction model
2. NN requires one-hot encoding of categorical data
3. NN also requires features to be on similar scale
4. Perform one-hot encoding of all categorical features:
    ['season', 'holiday', 'workingday', 'weather', 'year', 'month', 'day', 'dayofweek', 'hour']
5. Verify model performance 

In [None]:
import sys
import numpy as np
# Set random seed
np.random.seed(0)

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error

# NN
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, KBinsDiscretizer
# Column Transformer
from sklearn.compose import ColumnTransformer

In [None]:
# Input files used throughout the notebook
column_list_file = 'bike_train_column_list.txt'  # comma-separated column names for the header-less CSVs
train_file = 'bike_train.csv'  # training split (read without a header row)
validation_file = 'bike_validation.csv'  # validation split (read without a header row)
test_file = 'bike_test.csv'  # test data, read later with its own header and a 'datetime' column

In [None]:
# One Hot Encode all Categorical Features
# Let's define all the categorical features
# ('day' is not listed here: it is one-hot encoded separately with an explicit category list)
categorical_features = ['season','holiday','workingday','weather','year','month','dayofweek','hour']

# Standardize Features
# Numeric columns passed to StandardScaler in the column transformer
standardize_features = ['temp', 'atemp', 'humidity', 'windspeed']

In [None]:
# Load the comma-separated column names used for the header-less CSV files.
# Each name is stripped so that a trailing newline (or stray spaces) in the
# file cannot end up inside a column name and break name-based selection.
with open(column_list_file, 'r') as f:
    columns = [name.strip() for name in f.read().split(',')]

In [None]:
# Show the column names that were read from the column list file
columns

In [None]:
# Specify the column names as the file does not have column header
# (the same name list is applied to both the training and validation files)
df_train = pd.read_csv(train_file,names=columns)
df_validation = pd.read_csv(validation_file,names=columns)

In [None]:
# Preview the first rows of the training data
df_train.head()

In [None]:
# Preview the first rows of the validation data
df_validation.head()

In [None]:
# Column 0 is the target; every remaining column is a feature.
# to_numpy() yields the 1-D target array directly (Series.ravel() is
# deprecated in recent pandas releases).
X_train = df_train.iloc[:, 1:]
y_train = df_train.iloc[:, 0].to_numpy()

X_validation = df_validation.iloc[:, 1:]
y_validation = df_validation.iloc[:, 0].to_numpy()

In [None]:
# Preview the training data again before encoding
df_train.head()

In [None]:
# Features to one-hot encode
# (the categorical list plus 'day', which gets its own encoder)
categorical_features+['day']

In [None]:
# Features to standardize
# (numeric columns handled by StandardScaler)
standardize_features

### Column Transformer
1. Chain all data transformations
2. Easy and straightforward

In [None]:
# Chain all preprocessing steps into a single transformer:
#   onehot      - one-hot encode the categorical features (categories learned during fit)
#   onehotday   - one-hot encode 'day' against a fixed list of categories 1..31
#   standardize - standardize the numeric features with StandardScaler
# The encoders keep their default (sparse) output; sparse_threshold=0 tells
# ColumnTransformer to always return a dense array. This gives the same dense
# result as before without the OneHotEncoder `sparse=` keyword, which newer
# scikit-learn releases no longer accept.
colTransformer = ColumnTransformer(
    [
        ('onehot',
         OneHotEncoder(categories='auto'),
         categorical_features),
        # NOTE(review): explicit categories presumably because not every day of
        # the month occurs in the training split - confirm against the data.
        ('onehotday',
         OneHotEncoder(categories=[list(range(1, 32))]),
         ['day']),
        ('standardize',
         StandardScaler(),
         standardize_features),
    ],
    remainder="passthrough",
    sparse_threshold=0,
)

In [None]:
# Fit the encoders and the scaler on the training features only
colTransformer.fit(X_train)

In [None]:
# Apply the transformer fitted on the training data to both splits
X_train_encoded = colTransformer.transform(X_train)
X_validation_encoded = colTransformer.transform(X_validation)

In [None]:
# Report the feature-matrix shape before and after the column transformer
for label, raw, encoded in (('Training Data', X_train, X_train_encoded),
                            ('Val Data', X_validation, X_validation_encoded)):
    print(label, raw.shape, 'OneHot Encoded', encoded.shape)

In [None]:
# Inspect the first encoded training row
X_train_encoded[:1]

In [None]:
# Inspect the first encoded validation row
X_validation_encoded[:1]

### Train a neural network regressor

In [None]:
# Multi-layer perceptron regressor with a single hidden layer of 100 units and
# ReLU activation. random_state seeds the estimator's random number generation
# and max_iter caps the number of training iterations.
nn_regressor = MLPRegressor(random_state=5, 
                            hidden_layer_sizes=[100],
                            activation='relu',
                            max_iter=100)

In [None]:
# Display the configured estimator (it has not been fitted yet at this point)
nn_regressor

In [None]:
%%time
# Train the network on the encoded training features and the training target
nn_regressor.fit(X_train_encoded,y_train)

In [None]:
# Compare actual vs predicted performance with dataset not seen by the model before
# (the validation file is read into a separate frame that will also hold the predictions)
df = pd.read_csv(validation_file,names=columns)

In [None]:
# Preview the validation frame before predictions are added
df.head()

In [None]:
# Predict the target for the encoded validation features
result = nn_regressor.predict(X_validation_encoded)

In [None]:
# Look at the first five predictions
result[:5]

In [None]:
# Store the predictions next to the actual values for side-by-side comparison
df['count_predicted'] = result

In [None]:
# Preview the validation frame with the new prediction column
df.head()

In [None]:
# Summary statistics of the raw predictions
df['count_predicted'].describe()

In [None]:
# Convert log(count) to count.
# np.expm1 is applied to each whole column at once (vectorized) instead of
# element by element through Series.map.
df['count'] = np.expm1(df['count'])
df['count_predicted'] = np.expm1(df['count_predicted'])

In [None]:
# Actual vs. predicted counts for validation samples 100 to 150
fig, ax = plt.subplots()
ax.plot(df['count'], label='Actual')
ax.plot(df['count_predicted'], label='Predicted')
ax.set_xlabel('Sample')
ax.set_ylabel('Count')
ax.set_xlim([100, 150])
ax.set_title('Validation Dataset - Predicted Vs. Actual')
ax.legend()
plt.show()

In [None]:
# Over-prediction and under-prediction should be balanced.
# Residuals of the validation data: positive means the model predicted too
# low, negative means it predicted too high.
residuals = df['count'] - df['count_predicted']

fig, ax = plt.subplots()
ax.hist(residuals)
ax.grid(True)
ax.set_xlabel('Actual - Predicted')
ax.set_ylabel('Count')
ax.set_title('Residuals Distribution')
ax.axvline(color='r')  # red reference line at x=0 (axvline's default position)
plt.show()

In [None]:
# Share of samples the model under-estimated (actual > predicted) versus
# over-estimated or matched. Taking the mean of the boolean mask avoids the
# KeyError that a value_counts lookup raises when every residual falls on
# the same side.
is_under = residuals > 0
print(' Under Estimation: {0:.2f}'.format(is_under.mean()))
print(' Over  Estimation: {0:.2f}'.format((~is_under).mean()))

In [None]:
import sklearn.metrics as metrics

# Root mean squared error between actual and predicted counts
validation_mse = metrics.mean_squared_error(df['count'], df['count_predicted'])
print("RMSE: {0:.2f}".format(validation_mse**.5))

In [None]:
# Metric used by Kaggle
def compute_rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error (RMSLE).

    Computes ``sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2))``.

    Args:
        y_true: Actual values; any array-like (list, tuple, ndarray, Series).
        y_pred: Predicted values; array-like compatible in shape with ``y_true``.

    Returns:
        The RMSLE as a NumPy float.
    """
    # np.asarray accepts any array-like, so no exact-type comparison is needed
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    log_error = np.log1p(y_pred) - np.log1p(y_true)
    return np.mean(log_error ** 2) ** .5

In [None]:
# RMSLE between actual and predicted counts
validation_rmsle = compute_rmsle(df['count'], df['count_predicted'])
print("RMSLE: {0:.2f}".format(validation_rmsle))

In [None]:
# Optional Test Data
# Prepare Data for Submission to Kaggle
# (this file is read with its own header row; 'datetime' is parsed as dates)
df_test = pd.read_csv(test_file,parse_dates=['datetime'])

In [None]:
# Column 0 holds the datetime, which is not a model feature
X_test =  df_test.iloc[:,1:] # Exclude datetime for prediction

In [None]:
# Preview the test features
X_test.head()

In [None]:
# Run the test features through the fitted column transformer, then predict
X_test_encoded = colTransformer.transform(X_test)
result = nn_regressor.predict(X_test_encoded)

In [None]:
# Convert result to actual count
# (expm1 is the inverse of log1p; the predictions are treated as log-scale counts)
df_test["count"] = np.expm1(result)

In [None]:
# Preview the test data with the predicted count column
df_test.head()

In [None]:
# List any rows where the predicted count is negative
df_test[df_test["count"] < 0]

In [None]:
# Write the submission file. A rental count cannot be negative, so any
# negative prediction is floored at zero in the exported copy; df_test itself
# is left unchanged so the negative-count check above stays informative.
submission = df_test[['datetime', 'count']].copy()
submission['count'] = submission['count'].clip(lower=0)
submission.to_csv('predicted_count.csv', index=False)