# Workshop 3

Starter code for workshop 3. You should have seen most of it before, but make sure you understand what it is doing!

In [4]:
# Common imports
import numpy as np
import os

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

In [6]:
# Read data
import pandas as pd

housing = pd.read_csv("workshop3.csv")
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200


Split the available data 80/20 for training and testing. Don't use the test data until the very end!

In [7]:
# Split our data in train and test
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=45)

median_house_value is the value we want to predict, so separate it from the other features.

In [8]:
# Split train data in training_features (X) and training_labels 
training_features = train_set.drop(["median_house_value"], axis=1) # X
training_labels = train_set["median_house_value"].copy() # Y

In [9]:
# Create a linear regression model
from sklearn.linear_model import LinearRegression

# Train and predict in train data
lr = LinearRegression()
lr.fit(training_features,training_labels)
prediction_lr = lr.predict(training_features)

In [10]:
# Get the root mean squared error
from sklearn.metrics import mean_squared_error

# Get mean squared error
mse = mean_squared_error(training_labels,prediction_lr) #MSE
# Get root mean squared error
rmse = mean_squared_error(training_labels,prediction_lr,squared = False) #RMSE
print(mse)
print(rmse)

4826810823.833597
69475.2533196792


In [11]:
# Create a baseline using the mean of median_house variable
baseline = np.full(training_labels.shape,training_labels.mean())

In [12]:
# Get the rmse for our baseline
rmse_bl = mean_squared_error(training_labels,baseline,squared = False) #RMSE
print(rmse_bl)

115598.25388852337


In [13]:
# Try a new model - KNN
from sklearn.neighbors import KNeighborsRegressor

# We need to define the n_neighbors parameter
knn = KNeighborsRegressor(n_neighbors= 5)

# Later we can train and predict in our training data
knn.fit(training_features,training_labels)
pred_knn = knn.predict(training_features)

In [14]:
# Get the rmse of our KNN model
rmse_knn = mean_squared_error(training_labels,pred_knn,squared = False) #RMSE
print(rmse_knn)

80796.20511653168


In [15]:
# Now, we go to see the use of cross validation
from sklearn.model_selection import cross_val_score

# We need to define a model
knn = KNeighborsRegressor(n_neighbors= 5)
# Implement cross_val_score in our knn model. We need to define the numbers of folds (cv).

cv_scores = cross_val_score(knn, training_features, training_labels, cv=5,scoring= 'neg_root_mean_squared_error')

In [16]:
# Using cross_validation (from sklearn) returns a negative rmse. Then, we need to add a negative to get the positive value.
print('this is my rmse for cv: ', -cv_scores.mean())

this is my rmse for cv:  100092.32412862862


In [17]:
# Also, we can implement other similar function, cross_validate
from sklearn.model_selection import cross_validate

# Set a model
knn = KNeighborsRegressor(n_neighbors= 5)
# Get results
cv_results = cross_validate(knn, training_features, training_labels, cv=5,scoring= 'neg_root_mean_squared_error')

In [18]:
# This function gives us more information
cv_results

{'fit_time': array([0.03231955, 0.02801943, 0.027637  , 0.02799892, 0.02797055]),
 'score_time': array([0.05159807, 0.04855013, 0.04000282, 0.04001403, 0.04004073]),
 'test_score': array([-100494.53383849,  -99983.48219245, -100080.96776509,
         -99679.57219173, -100223.06465538])}

In [19]:
# In this way we can obtain the rmse for our cross validation process
-cv_results['test_score'].mean()

100092.32412862862

In [None]:
from sklearn.model_selection import KFold

knn = KNeighborsRegressor(n_neighbors=5)



In [21]:
# Also, we can implement other similar function, cross_validate
from sklearn.model_selection import cross_validate

# Set a model
knn = KNeighborsRegressor(n_neighbors= 5)
# Get results
cv_results = cross_validate(knn, training_features, training_labels, cv=5,scoring= 'neg_root_mean_squared_error', return_train_score=True)

In [22]:
cv_results

{'fit_time': array([0.03203511, 0.02811623, 0.02796555, 0.0280571 , 0.02798104]),
 'score_time': array([0.04409695, 0.04038692, 0.05603027, 0.04809284, 0.04803324]),
 'test_score': array([-100494.53383849,  -99983.48219245, -100080.96776509,
         -99679.57219173, -100223.06465538]),
 'train_score': array([-81067.70666977, -81430.20898091, -81420.29198242, -81204.35926971,
        -81607.64248322])}

In [23]:
# In this way we can obtain the rmse of training fold for our cross validation process
-cv_results['train_score'].mean()

81346.04187720611

In [20]:
# Now, we go to use GridSearchCV to determine which are the best parameters for our model
from sklearn.model_selection import GridSearchCV

# We go to use knn
knn = KNeighborsRegressor()
# Using the method "get_params()" we can see the different parameters which can change in our model
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [24]:
# We need to create a dictionary with the different values that we go to use
parameters = {'n_neighbors':[3,5,7,9,11], 'weights':['uniform', 'distance']}

# Create our GridSearchCV model
clf = GridSearchCV(knn, parameters,scoring= 'neg_root_mean_squared_error')

# Train our GridSearch. This can take long time, if we have many possible values for each parameter
clf.fit(training_features, training_labels)

In [25]:
# Finally, we can obtain the atribute "best_params_" from our GridSearchCV object and know which are the best setting of parameters
clf.best_params_

{'n_neighbors': 11, 'weights': 'distance'}

In [26]:
# Also, we can get the score of the best scenario (using the best parameters). Remember that GridSearchCV uses cross validation internally
-clf.best_score_

96190.50350085564

In [32]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

pipeline = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
    KNeighborsRegressor()
)

In [34]:
# We need to create a dictionary with the different values that we go to use
parameters = {'kneighborsregressor__n_neighbors':[3,5,7,9,11], 'kneighborsregressor__weights':['uniform', 'distance']}

# Create our GridSearchCV model
clf = GridSearchCV(pipeline, parameters,scoring= 'neg_root_mean_squared_error')

# Train our GridSearch. This can take long time, if we have many possible values for each parameter
clf.fit(training_features, training_labels)

In [35]:
# Finally, we can obtain the atribute "best_params_" from our GridSearchCV object and know which are the best setting of parameters
clf.best_params_

{'kneighborsregressor__n_neighbors': 11,
 'kneighborsregressor__weights': 'distance'}

In [36]:
# Also, we can get the score of the best scenario (using the best parameters). Remember that GridSearchCV uses cross validation internally
-clf.best_score_

61625.349556488996

In [37]:
# Split test data 
test_features = test_set.drop(["median_house_value"], axis=1) # X
test_labels = test_set["median_house_value"].copy() # Y

In [40]:
pipeline_best = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
    KNeighborsRegressor(n_neighbors=clf.best_params_['kneighborsregressor__n_neighbors'],
                        weights=clf.best_params_['kneighborsregressor__weights'])
)

pipeline_best.fit(training_features, training_labels)
predictions = pipeline_best.predict(test_features)

In [42]:
rmse = mean_squared_error(test_labels, predictions, squared=False)
print(rmse)

59654.59027904501
