# Machine Learning

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split

#### Step 1 predict registered users

In [2]:
#1. use train.csv

df = pd.read_csv('../data/train.csv', index_col = 0, parse_dates=True)
df

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,3,13,16
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,8,32,40
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,5,27,32
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,3,10,13
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
2012-12-19 19:00:00,4,0,1,1,15.58,19.695,50,26.0027,7,329,336
2012-12-19 20:00:00,4,0,1,1,14.76,17.425,57,15.0013,10,231,241
2012-12-19 21:00:00,4,0,1,1,13.94,15.910,61,15.0013,4,164,168
2012-12-19 22:00:00,4,0,1,1,13.94,17.425,61,6.0032,12,117,129


In [3]:
df.index.hour
df['hour_day'] = df.index.hour

In [4]:
#'night' = 1 ,'morning' = 2 , 'afternoon' = 3, 'evening' = 4
time_labels = [1,2,3,4]
bin_boundaries = [-0.1, 4.9, 12, 18, 23.9]
df['part_of_day']= pd.cut(df['hour_day'], bins=bin_boundaries, labels=time_labels)

In [5]:
#2. drop count and casual columns

df.drop(columns=['count','casual'])

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,registered,hour_day,part_of_day
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,13,0,1
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,32,1,1
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,27,2,1
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,10,3,1
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,1,4,1
...,...,...,...,...,...,...,...,...,...,...,...
2012-12-19 19:00:00,4,0,1,1,15.58,19.695,50,26.0027,329,19,4
2012-12-19 20:00:00,4,0,1,1,14.76,17.425,57,15.0013,231,20,4
2012-12-19 21:00:00,4,0,1,1,13.94,15.910,61,15.0013,164,21,4
2012-12-19 22:00:00,4,0,1,1,13.94,17.425,61,6.0032,117,22,4


In [6]:
#3. define X as the features and y as the registered

X = df[['temp']]
y = df['registered']

In [7]:
#4. split into train and test data ->
   # Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=42)

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=42)

In [8]:
#5. use X_train to train a model to predict registered user

m = LinearRegression()

In [9]:
m.fit(Xtrain, ytrain)

LinearRegression()

In [10]:
Xtrain.shape

(7620, 1)

In [11]:
Xtest.shape

(3266, 1)

In [12]:
df.shape

(10886, 13)

In [13]:
from sklearn import metrics

In [14]:
#6. evaluating the model (getting R2)

m.score(Xtrain, ytrain) #training error 

0.09789883284786016

In [15]:
m.score(Xtest, ytest) # test error R2

0.11001327381783121

In [16]:
#7. predict y_registered_train

y_registered_train= m.predict(Xtest)
y_registered_train

array([236.67611815,  57.5606004 ,  92.38861774, ..., 147.11835927,
       236.67611815,  87.41318669])

In [17]:
#8. rmse tells us the error in average on our predictions

metrics.mean_squared_error(m.predict(Xtest), ytest)**0.5

140.95902115076757

In [18]:
#or 

metrics.mean_squared_error(y_registered_train, ytest)**0.5

140.95902115076757

## Evaluate model and improve with more features if you want

In [19]:
#a. define X as the features and y as the registered

X = df[['temp','hour_day','part_of_day','season']]
y = df['registered']

In [20]:
#b. drop count and casual columns

df.drop(columns=['count','casual'])

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,registered,hour_day,part_of_day
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,13,0,1
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,32,1,1
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,27,2,1
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,10,3,1
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,1,4,1
...,...,...,...,...,...,...,...,...,...,...,...
2012-12-19 19:00:00,4,0,1,1,15.58,19.695,50,26.0027,329,19,4
2012-12-19 20:00:00,4,0,1,1,14.76,17.425,57,15.0013,231,20,4
2012-12-19 21:00:00,4,0,1,1,13.94,15.910,61,15.0013,164,21,4
2012-12-19 22:00:00,4,0,1,1,13.94,17.425,61,6.0032,117,22,4


In [21]:
#c. define X as the features and y as the registered

X = df[['temp','hour_day','part_of_day','season']]
y = df['registered']

In [22]:
#d. split into train and test data ->
   # Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=42)

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=42)

In [23]:
#e. use X_train to train a model to predict registered user

m = LinearRegression()

In [24]:
m.fit(Xtrain, ytrain)

LinearRegression()

In [25]:
#f. evaluating the model (getting R2)

m.score(Xtrain, ytrain) #training error 

0.23491210342810465

In [26]:
m.score(Xtest, ytest) # test error R2

0.22602434751882994

In [27]:
#g. predict y_registered_train

y_registered_train= m.predict(Xtest)
y_registered_train

array([235.43162126,   3.41076999, 200.71426007, ..., 165.79975298,
       192.81778838,  41.42473358])

In [28]:
#8. rmse tells us the error in average on our predictions

metrics.mean_squared_error(m.predict(Xtest), ytest)**0.5

131.45126204363265

### Use test.csv
### Do all the same feature engineering you did to X

In [38]:
Xtest = pd.read_csv('../data/test.csv', index_col = 0, parse_dates=True)
Xtest

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0000
2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0000
2011-01-20 03:00:00,1,0,1,1,10.66,12.880,56,11.0014
2011-01-20 04:00:00,1,0,1,1,10.66,12.880,56,11.0014
...,...,...,...,...,...,...,...,...
2012-12-31 19:00:00,1,0,1,2,10.66,12.880,60,11.0014
2012-12-31 20:00:00,1,0,1,2,10.66,12.880,60,11.0014
2012-12-31 21:00:00,1,0,1,1,10.66,12.880,60,11.0014
2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981


In [39]:
Xtest.index.hour
Xtest['hour_day'] = df_2.index.hour

In [40]:
#'night' = 1 ,'morning' = 2 , 'afternoon' = 3, 'evening' = 4
time_labels = [1,2,3,4]
bin_boundaries = [-0.1, 4.9, 12, 18, 23.9]
Xtest['part_of_day']= pd.cut(Xtest['hour_day'], bins=bin_boundaries, labels=time_labels)

In [46]:
Xtest.head()

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,hour_day,part_of_day
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,0,1
2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0,1,1
2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0,2,1
2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014,3,1
2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014,4,1


In [44]:
#make prediction using this model for register users -> y_registered_pred

y_pred=m.predict(Xtest)

Feature names unseen at fit time:
- atemp
- holiday
- humidity
- weather
- windspeed
- ...
Feature names must be in the same order as they were in fit.



ValueError: X has 10 features, but LinearRegression is expecting 4 features as input.

In [None]:
metrics.mean_squared_error(y_pred, ytest)**0.5 