# Machine Learning exercise

#### The objective is to predict the missing (null) values of the "registered" and "casual" user counts in the original dataset.

In [1]:
import pandas as pd
from sklearn import metrics
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split

## Step 0: Explore original dataset

In [2]:
# Read the complete dataset; the first CSV column becomes a parsed datetime index.
dc_bikes = pd.read_csv(
    'data_sets/dc_bikes.csv',
    index_col=0,
    parse_dates=True,
)
dc_bikes.shape

(17379, 11)

In [3]:
# Preview the first ten rows.
dc_bikes.head(n=10)

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3.0,13.0,16
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8.0,32.0,40
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5.0,27.0,32
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3.0,10.0,13
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0.0,1.0,1
2011-01-01 05:00:00,1,0,0,2,9.84,12.88,75,6.0032,0.0,1.0,1
2011-01-01 06:00:00,1,0,0,1,9.02,13.635,80,0.0,2.0,0.0,2
2011-01-01 07:00:00,1,0,0,1,8.2,12.88,86,0.0,1.0,2.0,3
2011-01-01 08:00:00,1,0,0,1,9.84,14.395,75,0.0,1.0,7.0,8
2011-01-01 09:00:00,1,0,0,1,13.12,17.425,76,0.0,8.0,6.0,14


In [4]:
# Derive hour-of-day and month columns from the datetime index.
dc_bikes = dc_bikes.assign(
    hour=dc_bikes.index.hour,
    month=dc_bikes.index.month,
)

In [5]:
# Number of missing values in each column.
dc_bikes.isna().sum()

season           0
holiday          0
workingday       0
weather          0
temp             0
atemp            0
humidity         0
windspeed        0
casual        6493
registered    6493
count            0
hour             0
month            0
dtype: int64

#### These null values for "registered" and "casual" are the ones that will be predicted.

## Step 1: Predict registered users

### a. Train data

In [6]:
#1. load train.csv, using its first column as a parsed datetime index

df_train = pd.read_csv(
    'data_sets/train.csv',
    parse_dates=True,
    index_col=0,
)
df_train.shape

(10886, 11)

In [7]:
# Add the hour of day (taken from the datetime index) as a feature.
# The former bare `df_train.index.hour` statement was removed: it was not the
# last line of the cell, so its value was computed and silently discarded.
df_train['hour_day'] = df_train.index.hour

In [8]:
#2. remove the "count" and "casual" columns so only "registered" remains as a target

df_train = df_train.drop(columns=['count', 'casual'])

In [9]:
# Check the frame's dimensions after dropping "count" and "casual".
df_train.shape

(10886, 10)

In [10]:
#3. target y is "registered"; features X are every remaining column

y = df_train['registered']
X = df_train.drop(columns='registered')

In [11]:
#4. hold out 30% of the labelled rows so the model can be evaluated and the
#   features improved; the fixed random_state keeps the split reproducible

Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [12]:
# Size of the training portion of the split (rows, features).
Xtrain.shape

(7620, 9)

In [13]:
# Size of the held-out portion of the split (rows, features).
Xtest.shape

(3266, 9)

In [14]:
#5. create the linear-regression estimator that will be trained on Xtrain to predict registered users

m = LinearRegression()

In [15]:
# Train the estimator on the training portion of the split.
m.fit(X=Xtrain, y=ytrain)

LinearRegression()

In [16]:
# Predict registered users for the held-out split.
# NOTE: despite the "_train" suffix, this holds predictions for Xtest (the held-out rows).
y_registered_train=m.predict(Xtest)

In [17]:
# One prediction per held-out row.
y_registered_train.shape

(3266,)

In [18]:
# Root mean squared error (square root of the MSE) of the registered-user model
# on the held-out split. scikit-learn's signature is
# mean_squared_error(y_true, y_pred), so the ground truth is passed first.
metrics.mean_squared_error(ytest, y_registered_train)**0.5

127.97141229318626

### Evaluate the model (here on the held-out test split); at this stage we only predict "registered"

### b. Test data

In [19]:
# Load test.csv, using its first column as a parsed datetime index.
df_test = pd.read_csv(
    'data_sets/test.csv',
    parse_dates=True,
    index_col=0,
)
df_test.shape

(6493, 8)

In [20]:
# Add the same hour-of-day feature that was added to the training data.
# The former bare `df_test.index.hour` statement was removed: its value was
# computed and discarded without being displayed.
df_test['hour_day'] = df_test.index.hour

In [21]:
# Preview the first five rows of the test features.
df_test.head(n=5)

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,hour_day
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,0
2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0,1
2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0,2
2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014,3
2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014,4


In [22]:
# Dimensions of the test frame after adding "hour_day".
df_test.shape

(6493, 9)

In [23]:
# Select exactly the feature columns the model was trained on (same names,
# same order as X) instead of every column currently present in df_test.
# This keeps the model input valid even if df_test has gained extra columns
# by the time this cell is re-run.
Xtest2 = df_test[X.columns]
Xtest2.shape

(6493, 9)

In [24]:
# Predict registered users for every row of the test set with the fitted model.
y_registered_pred=m.predict(Xtest2)
y_registered_pred

array([ 28.44813254,  34.02469824,  40.62870195, ..., 161.49619864,
       175.9966243 , 169.52803232])

### Registered users (prediction)

In [25]:
# Show the registered-user predictions as a table (positional index, single column 0).
pd.DataFrame(y_registered_pred)

Unnamed: 0,0
0,28.448133
1,34.024698
2,40.628702
3,48.434174
4,55.038178
...,...
6488,144.465601
6489,151.069605
6490,161.496199
6491,175.996624


## Step 2: Predict casual users

### a. Train data

In [26]:
#1. load train.csv again into a separate frame for the casual-user model

df_train2 = pd.read_csv(
    'data_sets/train.csv',
    parse_dates=True,
    index_col=0,
)
df_train2.shape

(10886, 11)

In [27]:
# Add the hour of day (taken from the datetime index) as a feature.
# The former bare `df_train2.index.hour` statement was removed: its value was
# computed and discarded without being displayed.
df_train2['hour_day'] = df_train2.index.hour

In [28]:
#2. remove the "registered" and "count" columns so only "casual" remains as a target

df_train2 = df_train2.drop(columns=['registered', 'count'])

In [29]:
#3. target y2 is "casual"; features X2 are every remaining column

y2 = df_train2['casual']
X2 = df_train2.drop(columns='casual')

In [30]:
#4. hold out 30% of the labelled rows so the model can be evaluated and the
#   features improved; the fixed random_state keeps the split reproducible

X2train, X2test, y2train, y2test = train_test_split(
    X2,
    y2,
    test_size=0.3,
    random_state=42,
)

In [31]:
# Size of the training portion of the casual-user split (rows, features).
X2train.shape

(7620, 9)

In [32]:
# Size of the held-out portion of the casual-user split (rows, features).
X2test.shape

(3266, 9)

### Find the best-fit model

In [33]:
# Fit the estimator `m` on the casual-user training split.
# NOTE(review): this is the same LinearRegression object that was fitted on the
# registered-user data earlier, so that fit is overwritten here; re-running the
# registered-user prediction cell after this point would use the casual model.
m.fit(X2train, y2train)

LinearRegression()

In [34]:
# Predict casual users for the held-out split.
# NOTE: despite the "_train" suffix, this holds predictions for X2test (the held-out rows).
y_casual_train=m.predict(X2test)

In [35]:
# Root mean squared error (square root of the MSE) of the casual-user model
# on the held-out split. scikit-learn's signature is
# mean_squared_error(y_true, y_pred), so the ground truth is passed first.
metrics.mean_squared_error(y2test, y_casual_train)**0.5

37.712563358600654

In [36]:
# Load test.csv into a separate frame for the casual-user predictions.
# NOTE(review): the next cell performs this identical read again, so one of the two is redundant.
df_test2 = pd.read_csv('data_sets/test.csv', index_col = 0, parse_dates=True)
df_test2.shape

(6493, 8)

### b. Test data

In [37]:
# Load test.csv, using its first column as a parsed datetime index.
df_test2 = pd.read_csv(
    'data_sets/test.csv',
    parse_dates=True,
    index_col=0,
)
df_test2.shape

(6493, 8)

In [38]:
# Add the same hour-of-day feature that was added to the training data.
# The former bare `df_test2.index.hour` statement was removed: its value was
# computed and discarded without being displayed.
df_test2['hour_day'] = df_test2.index.hour
df_test2.shape

(6493, 9)

In [39]:
# Select exactly the feature columns the casual-user model was trained on
# (same names, same order as X2) instead of every column currently present in
# df_test2, so the model input cannot silently drift from the training layout.
X2test2 = df_test2[X2.columns]
X2test2.shape

(6493, 9)

In [40]:
# Predict casual users for every row of the test set with the fitted model.
# NOTE(review): the recorded output contains negative values, which are not
# valid user counts; consider clipping the predictions at 0.
y_casual_pred=m.predict(X2test2)
y_casual_pred

array([-17.85663778, -12.62867251, -11.45787825, ...,   6.09026052,
        11.59344044,   5.44676273])

### Casual users (prediction)

In [41]:
# Show the casual-user predictions as a table (positional index, single column 0).
pd.DataFrame(y_casual_pred)

Unnamed: 0,0
0,-17.856638
1,-12.628673
2,-11.457878
3,-11.731826
4,-10.561032
...,...
6488,6.709532
6489,7.880326
6490,6.090261
6491,11.593440


### Total users (prediction)

In [42]:
# Predicted total = predicted registered users + predicted casual users (element-wise).
y_total_pred = y_registered_pred + y_casual_pred
len(y_total_pred)

6493

In [43]:
# Show the total predictions as a table (positional index, single column 0).
pd.DataFrame(y_total_pred)

Unnamed: 0,0
0,10.591495
1,21.396026
2,29.170824
3,36.702348
4,44.477146
...,...
6488,151.175133
6489,158.949931
6490,167.586459
6491,187.590065


# New Dataframe including predicted values

In [44]:
# Work on an independent copy so that adding prediction columns below does not
# also modify df_test (plain assignment would only bind a second name to the
# same DataFrame object).
updated_df = df_test.copy()

In [45]:
# Attach the three prediction arrays as new columns (values are assigned row by row, in order).
updated_df['y_registered_pred'] = y_registered_pred
updated_df['y_casual_pred'] = y_casual_pred
updated_df['y_total_pred'] = y_total_pred

In [46]:
# Dimensions after adding the three prediction columns.
updated_df.shape

(6493, 12)

In [47]:
# Preview the test features together with their predictions.
updated_df.head()

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,hour_day,y_registered_pred,y_casual_pred,y_total_pred
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,0,28.448133,-17.856638,10.591495
2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0,1,34.024698,-12.628673,21.396026
2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0,2,40.628702,-11.457878,29.170824
2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014,3,48.434174,-11.731826,36.702348
2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014,4,55.038178,-10.561032,44.477146


# Now, let's compare the predicted values to the ones in the original dataset (dc_bikes)

In [48]:
# Preview the original dataset, which includes the "count" column used for comparison below.
dc_bikes.head()

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,hour,month
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3.0,13.0,16,0,1
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8.0,32.0,40,1,1
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5.0,27.0,32,2,1
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3.0,10.0,13,3,1
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0.0,1.0,1,4,1


In [49]:
# Dimensions of the original dataset, including the added "hour" and "month" columns.
dc_bikes.shape

(17379, 13)

In [50]:
# Keep only the rows of the original dataset whose "casual" value is missing.
dc_bikes_empty = dc_bikes.loc[dc_bikes['casual'].isna()]

In [51]:
# Display the rows with missing "casual"/"registered" values (pandas truncates the long table).
dc_bikes_empty

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,hour,month
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,,,33,0,1
2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0000,,,15,1,1
2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0000,,,9,2,1
2011-01-20 03:00:00,1,0,1,1,10.66,12.880,56,11.0014,,,8,3,1
2011-01-20 04:00:00,1,0,1,1,10.66,12.880,56,11.0014,,,7,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-12-31 19:00:00,1,0,1,2,10.66,12.880,60,11.0014,,,156,19,12
2012-12-31 20:00:00,1,0,1,2,10.66,12.880,60,11.0014,,,104,20,12
2012-12-31 21:00:00,1,0,1,1,10.66,12.880,60,11.0014,,,67,21,12
2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981,,,43,22,12


In [52]:
# True total counts for the rows to be predicted, kept as a one-column DataFrame.
y_true = dc_bikes_empty['count'].to_frame()
y_true

Unnamed: 0_level_0,count
datetime,Unnamed: 1_level_1
2011-01-20 00:00:00,33
2011-01-20 01:00:00,15
2011-01-20 02:00:00,9
2011-01-20 03:00:00,8
2011-01-20 04:00:00,7
...,...
2012-12-31 19:00:00,156
2012-12-31 20:00:00,104
2012-12-31 21:00:00,67
2012-12-31 22:00:00,43


In [53]:
# Root mean squared error (square root of the MSE) between the true totals and
# the predicted totals.
# NOTE(review): y_total_pred is a positional array built from test.csv; this
# comparison assumes its rows are in the same order as dc_bikes_empty — TODO confirm.
metrics.mean_squared_error(y_true, y_total_pred)**0.5

133.15455440533563

In [54]:
# Show the total predictions again as a table (same display as earlier in the notebook).
pd.DataFrame(y_total_pred)

Unnamed: 0,0
0,10.591495
1,21.396026
2,29.170824
3,36.702348
4,44.477146
...,...
6488,151.175133
6489,158.949931
6490,167.586459
6491,187.590065


In [55]:
# Side-by-side table of true totals and predicted totals, indexed by timestamp.
# The predictions are cast to int64, which drops the fractional part
# (truncation, not rounding).
bikes_pred = pd.DataFrame(
    {
        'count_true': y_true['count'],
        'count_total_prediction': y_total_pred.astype('int64'),
    }
)

In [56]:
# Display the true-versus-predicted comparison table.
bikes_pred

Unnamed: 0_level_0,count_true,count_total_prediction
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2011-01-20 00:00:00,33,10
2011-01-20 01:00:00,15,21
2011-01-20 02:00:00,9,29
2011-01-20 03:00:00,8,36
2011-01-20 04:00:00,7,44
...,...,...
2012-12-31 19:00:00,156,151
2012-12-31 20:00:00,104,158
2012-12-31 21:00:00,67,167
2012-12-31 22:00:00,43,187


### Conclusion: The root mean squared error (RMSE, i.e. the square root of `mean_squared_error`) between the predicted and true total counts is about 133, in the same units as `count`.