In [1]:
import pandas as pd
import numpy as np
import Builder

from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score

### Read in Data

This is a fairly common dataset, here sourced from [data.world](https://data.world/data-society/capital-bikeshare-2011-2012), but also available from the UCI database. It details hundreds of thousands of Capital Bikeshare rides in 2011 and 2012 sampled every hour, including features for temporal factors, weather conditions, and registered versus casual users. 

In [2]:
df = pd.read_csv('Data/bike_data.csv')

## Data Cleaning

Our date column was an object, so transform to a datetime with pandas and create a new column for Month.

In [3]:
df.Date = pd.to_datetime(arg=df.Date) #dtype('<M8[ns]')
df['Month'] = df['Date'].dt.month

### Remove Correlated Features

We'll be predicting Total Users, so let's drop the other user columns because they'll be heavily correlated with Total Users. And we'll remove Temperatuer F, in favor of the "feels like" temperature. 

In [4]:
df.drop(['Casual Users', 'Registered Users', 'Temperature F', 'Date'], axis=1, inplace=True)

## Modeling

### Encode Categorical Data

In [5]:
cate_feats = ['Season', 'Hour', 'Holiday', 'Day of the Week', 'Working Day', 'Weather Type', 'Month']

for col in cate_feats:
    df[col] = df[col].astype('category')

In [6]:
X = pd.get_dummies(df, drop_first=True)
X.drop('Total Users', axis=1, inplace=True)
y = df['Total Users']

### Normalization

In [7]:
for col in X.columns:
    # Subtract the minimum and divide by the range forcing a scale of 0 to 1 for each feature
    X[col] = (X[col] - min(X[col])) / (max(X[col]) - min(X[col])) 

### Train-Test-Split 

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)

### Dummy Regressor

In [9]:
dummy_regressor = DummyRegressor(strategy='mean')
dummy_regressor.fit(X, y)
dummy_regressor.predict(X)
print("Dummy Model Accuracy: ", dummy_regressor.score(X, y))

Dummy Model Accuracy:  0.0


### CART Tree Model

In [10]:
regressor = DecisionTreeRegressor(random_state=42, max_depth=3)
regressor.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=3, random_state=42)

In [11]:
y_pred = regressor.predict(X_test)
print('MSE score:', mse(y_test, y_pred))
print('R-sq score:', r2_score(y_test,y_pred))

MSE score: 21091.847432693474
R-sq score: 0.344608475006546
