In [138]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.datasets import make_regression, make_swiss_roll, fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.svm import SVR

In [169]:
# Fetch the California housing bundle. as_frame=False is the library default;
# it is spelled out because it means no DataFrame is attached to the bundle.
housing = fetch_california_housing(as_frame=False)

In [170]:
# The bundle carries no DataFrame in this load, so this prints None.
print(housing["frame"])

None


In [171]:
# Peek at the raw feature matrix (a NumPy array, one row per block group).
housing["data"]

array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]])

In [172]:
# Short names of the eight predictive attributes.
housing["feature_names"]

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [173]:
# Dataset description text (attribute meanings, target units, provenance).
housing["DESCR"]

'.. _california_housing_dataset:\n\nCalifornia Housing dataset\n--------------------------\n\n**Data Set Characteristics:**\n\n:Number of Instances: 20640\n\n:Number of Attributes: 8 numeric, predictive attributes and the target\n\n:Attribute Information:\n    - MedInc        median income in block group\n    - HouseAge      median house age in block group\n    - AveRooms      average number of rooms per household\n    - AveBedrms     average number of bedrooms per household\n    - Population    block group population\n    - AveOccup      average number of household members\n    - Latitude      block group latitude\n    - Longitude     block group longitude\n\n:Missing Attribute Values: None\n\nThis dataset was obtained from the StatLib repository.\nhttps://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html\n\nThe target variable is the median house value for California districts,\nexpressed in hundreds of thousands of dollars ($100,000).\n\nThis dataset was derived from the 1990 U.S

In [174]:
# Wrap the raw feature array in a DataFrame; the columns get the default
# integer labels 0..7 until they are renamed.
housing_df = pd.DataFrame(data=housing["data"])

In [175]:
# First five rows, to sanity-check the freshly built frame.
housing_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [176]:
# Map the default integer column labels (0..7) to human-readable names.
# The list order matches the column order of the feature matrix.
column_name_conversion = dict(enumerate([
    'Median Income',
    'Median House Age',
    'Avg # of Rooms',
    'Avg # of Bedrooms',
    'Population Block Group',
    'Avg # of Household Members',
    'Block Group Latitude',
    'Block Group Longitude',
]))

In [177]:
# Swap the integer column labels for the descriptive names defined above.
housing_df = housing_df.rename(column_name_conversion, axis="columns")

In [178]:
# Confirm the new column names took effect.
housing_df.head(n=5)

Unnamed: 0,Median Income,Median House Age,Avg # of Rooms,Avg # of Bedrooms,Population Block Group,Avg # of Household Members,Block Group Latitude,Block Group Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [179]:
# Target values as a single-column DataFrame (column label defaults to 0).
y = pd.DataFrame(data=housing["target"])

In [180]:
# Give the lone target column a descriptive label.
y = y.rename({0: 'Median House Value'}, axis="columns")

In [181]:
# Last 100 target values, still in units of $100,000.
y.tail(n=100)

Unnamed: 0,Median House Value
20540,1.943
20541,2.265
20542,2.362
20543,1.826
20544,2.194
...,...
20635,0.781
20636,0.771
20637,0.923
20638,0.847


In [182]:
# Convert the target from units of $100,000 to dollars. The column is rebuilt
# from the untouched `housing.target` array rather than from itself, so
# re-running this cell cannot multiply by 100,000 a second time.
y['Median House Value'] = housing.target * 100_000

In [183]:
# Last 100 target values after the conversion to dollars.
y.tail(n=100)

Unnamed: 0,Median House Value
20540,194300.0
20541,226500.0
20542,236200.0
20543,182600.0
20544,219400.0
...,...
20635,78100.0
20636,77100.0
20637,92300.0
20638,84700.0


In [184]:
# Round every feature to two decimals. DataFrame.round is vectorised; it
# replaces a row-by-row apply(axis=1) and yields the same values.
housing_df_rounded = housing_df.round(2)

In [185]:
# Last 100 rows of the rounded feature frame.
housing_df_rounded.tail(n=100)

Unnamed: 0,Median Income,Median House Age,Avg # of Rooms,Avg # of Bedrooms,Population Block Group,Avg # of Household Members,Block Group Latitude,Block Group Longitude
20540,3.19,16.0,4.39,0.98,1386.0,2.18,38.54,-121.72
20541,4.62,13.0,6.12,1.04,2828.0,2.54,38.54,-121.70
20542,4.73,33.0,6.58,1.01,979.0,2.60,38.55,-121.75
20543,1.73,33.0,3.88,1.03,3717.0,2.10,38.55,-121.74
20544,2.06,23.0,4.80,1.01,6330.0,3.46,38.55,-121.76
...,...,...,...,...,...,...,...,...
20635,1.56,25.0,5.05,1.13,845.0,2.56,39.48,-121.09
20636,2.56,18.0,6.11,1.32,356.0,3.12,39.49,-121.21
20637,1.70,17.0,5.21,1.12,1007.0,2.33,39.43,-121.22
20638,1.87,18.0,5.33,1.17,741.0,2.12,39.43,-121.32


In [190]:
# Feature matrix for modelling. This is an alias of the rounded frame, not a copy.
X = housing_df_rounded

In [191]:
# Split features and target together in one call so the X and y rows are
# guaranteed to stay aligned (a length mismatch would raise here).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=13)

In [192]:
# Uses the same random_state as the feature split; that shared seed is what
# keeps the y rows matched to the X rows, so do not change one without the other.
y_train, y_test = train_test_split(y, random_state=13)

In [201]:
# Learn the scaling statistics from the training rows only, then apply that
# same fitted scaler to both partitions.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Bundle the four pieces in the order test_model() unpacks them.
data = [X_train_scaled, X_test_scaled, y_train, y_test]

In [202]:
def test_model(model, data):
    X_train_scaled, X_test_scaled, y_train, y_test = data
    reg = model.fit(X_train_scaled, y_train)
    print(f'Model: {type(reg).__name__}')
    print(f'Train score: {reg.score(X_train_scaled, y_train)}')
    print(f'Test Score: {reg.score(X_test_scaled, y_test)}\n')
    plt.show()    

In [203]:
# Seed the randomised ensembles so their scores are reproducible on a fresh
# Restart & Run All; LinearRegression and KNeighborsRegressor are deterministic.
MODEL_SEED = 13

for candidate in (
    LinearRegression(),
    KNeighborsRegressor(),
    RandomForestRegressor(random_state=MODEL_SEED),
    ExtraTreesRegressor(random_state=MODEL_SEED),
    AdaBoostRegressor(random_state=MODEL_SEED),
):
    test_model(candidate, data)

Model: LinearRegression
Train score: 0.6071736033532957
Test Score: 0.6032270007474417

Model: KNeighborsRegressor
Train score: 0.7902114420654279
Test Score: 0.6767929429866262



  return fit_method(estimator, *args, **kwargs)


Model: RandomForestRegressor
Train score: 0.9736392176897759
Test Score: 0.8016565329827507



  return fit_method(estimator, *args, **kwargs)


Model: ExtraTreesRegressor
Train score: 0.9999999975621863
Test Score: 0.8062825495988335



  y = column_or_1d(y, warn=True)


Model: AdaBoostRegressor
Train score: 0.3206384297654561
Test Score: 0.3014934764797367

