In [1]:
# imports

import os

import category_encoders as ce
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline

In [2]:
# Read in the dataset.
# The location defaults to the original author's local path, but it can be
# overridden with the HOUSING_CSV environment variable so the notebook can run
# on another machine without editing this cell.
HOUSING_CSV = os.environ.get(
    "HOUSING_CSV",
    "/Users/imac/DAT07-28-AG/Homework/Unit3/data/housing.csv",
)
df = pd.read_csv(HOUSING_CSV)

In [3]:
# Preview the first five rows to get a feel for the columns and their values
df.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [4]:
# Check the dimensions of the dataset as (rows, columns).
# The output below shows 506 rows and 14 columns (13 features plus PRICE).

df.shape

(506, 14)

In [5]:
# Inspect column dtypes and non-null counts.
# Per the output below, every column is numeric (11 float64, 3 int64) and all
# 506 rows are non-null in every column, so no imputation or encoding is needed.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    int64  
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  PRICE    506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


In [6]:
# Separate the target from the features: PRICE is what we predict (y),
# and every remaining column is used as a feature (X)
y = df["PRICE"]
X = df.drop(columns="PRICE")

In [25]:
# Initialise the GradientBoostingRegressor with 1000 boosting stages.
# random_state is fixed (same seed as the train/test split) so that fitted
# models, feature importances and cross-validation scores are repeatable
# from one run of the notebook to the next.
gbm = GradientBoostingRegressor(n_estimators=1000, random_state=1990)

In [26]:
# Fit the model on the full dataset (all rows of X and y).
# NOTE(review): no rows are held out at this point, so anything computed from
# this fit on X (predictions, scores) is in-sample.
gbm.fit(X, y)

GradientBoostingRegressor(n_estimators=1000)

In [None]:
# score the model on the same data it was trained on (disabled: an in-sample score says little about generalisation)
# gbm.score(X, y)

In [9]:
# Show the model's predictions next to the actual prices.
# A new frame is built with .assign() instead of writing into df: adding a
# 'Prediction' column to df in place would leak it into X if the X/y cell
# were ever re-run, and would make this cell non-idempotent.
# These are in-sample predictions (the model was fit on these same rows),
# so they match PRICE almost exactly and do not indicate real accuracy.
df_with_preds = df.assign(Prediction=gbm.predict(X))
df_with_preds.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE,Prediction
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0,23.999998
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6,21.600006
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7,34.699992
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4,33.400005
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2,36.199985


In [10]:
# Look at the fitted model's feature importances: one value per column of X
# (13 values in the output below, in the same order as X.columns), indicating
# how much each feature contributes to the model's PRICE predictions.
gbm.feature_importances_

array([0.02532018, 0.00065132, 0.00299976, 0.00090312, 0.03691756,
       0.40707377, 0.01150256, 0.08676356, 0.00151973, 0.01213831,
       0.03442857, 0.01235964, 0.36742193])

In [11]:
# Pair each feature name with its importance so the raw array is readable
feats = pd.DataFrame(
    {
        'Columns': X.columns,
        'Importance': gbm.feature_importances_,
    }
)

# Rank from most to least important.
# RM and LSTAT together account for about 0.77 of the total importance,
# i.e. the model leans on these two features far more than on any others.
feats.sort_values('Importance', ascending=False)

Unnamed: 0,Columns,Importance
5,RM,0.407074
12,LSTAT,0.367422
7,DIS,0.086764
4,NOX,0.036918
10,PTRATIO,0.034429
0,CRIM,0.02532
11,B,0.01236
9,TAX,0.012138
6,AGE,0.011503
2,INDUS,0.003


In [27]:
# Hold-out split: keep 20% of the rows aside as a test set.
# The dataset has no date/time ordering, so rows can be shuffled at random;
# random_state pins the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1990,
)

In [28]:
# Fit the model on the training portion only (X_train, y_train).
# This reuses the same gbm object that was fitted on the full data earlier.
gbm.fit(X_train, y_train)

GradientBoostingRegressor(n_estimators=1000)

In [29]:
# Sanity-check the split: training features (404 rows x 13 columns per the output)
X_train.shape

(404, 13)

In [30]:
# Test features (102 rows x 13 columns per the output; 404 + 102 = 506 rows in total)
X_test.shape

(102, 13)

In [31]:
# Training target: one PRICE value per training row
y_train.shape

(404,)

In [32]:
# Test target: one PRICE value per test row
y_test.shape

(102,)

In [33]:
# Optionally split the training set again to carve out a separate validation set (disabled; k-fold cross-validation is used further down instead)
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train , test_size=0.2, random_state=1990)

In [34]:
# Fit on the training set.
# NOTE(review): with the validation split in the previous cell commented out,
# X_train / y_train are unchanged, so this repeats the earlier
# gbm.fit(X_train, y_train) call.
gbm.fit(X_train, y_train)

GradientBoostingRegressor(n_estimators=1000)

In [35]:
# gbm.score(X_val, y_val)

In [36]:
# 10-fold cross-validation on the training set: one score per fold
cv_scores = cross_val_score(gbm, X_train, y_train, cv=10)

# Per-fold scores in the output below range from about 0.691 to 0.929
cv_scores

array([0.92889165, 0.69133319, 0.92006708, 0.85865759, 0.89472099,
       0.82995848, 0.90695711, 0.72940915, 0.91876402, 0.91387198])

In [37]:
# Repeat the cross-validation with 20 folds.
# Each validation fold now holds only about 20 of the 404 training rows, so
# individual fold scores are noisier rather than steadier (about 0.461 to
# 0.964 in the output below); the mean across folds is the figure to rely on.
cv_scores = cross_val_score(gbm, X_train, y_train, cv=20)
cv_scores

array([0.83954002, 0.95934388, 0.90006726, 0.46056393, 0.94218522,
       0.93870287, 0.92274439, 0.76417142, 0.89591343, 0.89507523,
       0.78170127, 0.87707867, 0.86255386, 0.95362014, 0.67067304,
       0.8354423 , 0.96430697, 0.84758105, 0.89576956, 0.91911929])

In [38]:
# Average the 20 fold scores into a single cross-validated estimate
# (about 0.856 in the output below).
# NOTE(review): X_test / y_test are never scored in the cells visible here;
# confirm a final hold-out evaluation exists elsewhere or add one.
cv_scores.mean()

0.8563076897393358