<a href="https://colab.research.google.com/github/AngelCBC/house-price-regression/blob/main/house_price_regressor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Set up

import numpy.random as rnd
import numpy as np
rnd.seed(42)

# Directory
%cd "/content"

/content


In [2]:
# Get the Data.

import os, tarfile
from six.moves import urllib

DATASETS_URL = "https://github.com/ageron/handson-ml/raw/master/datasets"
HOUSING_PATH = os.getcwd()+ os.path.join("/datasets","housing")
HOUSING_URL = DATASETS_URL + "/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.exists(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url,tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    print("Done!")
    housing_tgz.close()

fetch_housing_data()

Done!


In [3]:
# Load the Data.

import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

housing = load_housing_data()
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
housing.info() # Some nan in "total_bedrooms" attrb

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [5]:
housing["ocean_proximity"].value_counts() # non-float64 attrb

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [6]:
from sklearn.model_selection import StratifiedShuffleSplit

# Train test split fn.

def custom_train_test(data, test_size=0.2):
    # Categories to shuffle the data. 
    categories = pd.cut(data["median_income"],bins=[0., 1.5, 3.0, 4.5, 6.,np.inf],
                        labels= [1, 2, 3, 4, 5])
    
    # Train-test Stratified Shuffle split.
    split = StratifiedShuffleSplit(n_splits=1, test_size=test_size, 
                                   random_state=42)
    for train_index, test_index in split.split(data, categories):
        train_data = data.loc[train_index]
        train_labels = data["median_house_value"].loc[train_index]
        test_data = data.loc[test_index]
        test_labels = data["median_house_value"].loc[test_index]

    # Drop the label column in the training and testing data.
    for set_ in (train_data, test_data):
        set_.drop("median_house_value", axis=1, inplace=True)

    return train_data, train_labels, test_data, test_labels

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin

# Class to create some extra attributes in the data.

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    fit = lambda self, X, y=None : self # returns itself.
    def transform(self, X):
        # Class indices to operate in the pandas dataframe.
        col_names = "total_rooms", "total_bedrooms", "population", "households"
        rooms_ix, bedrooms_ix, population_ix, households_ix = [
                            housing.columns.get_loc(c) for c in col_names]

        # Attributes formation.
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
      
        return np.c_[X, rooms_per_household, population_per_household,
                bedrooms_per_room]

In [8]:
from sklearn.ensemble import RandomForestRegressor

# Determine the most important k-features with a Random Forest Regressor.

def best_attrbs(data, labels, k):
    # Random Forest Regression model.
    rf_reg = RandomForestRegressor()
    rf_reg.fit(data, labels)
    importance = rf_reg.feature_importances_
    return np.sort(np.argpartition(importance, -k)[-k:])

class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, k):
        self.k = k
    def fit(self, X, y):
        self.selection = best_attrbs(X, y, self.k)
        return self 
    def transform(self, X):
        return X[:, self.selection]

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVR

# Numeric and non-numeric attrbs.
non_num_attrb = ["ocean_proximity"]
num_attrbs = list(housing.drop(["ocean_proximity", "median_house_value"], axis=1))

# Processing the dataset.
def processing_housing_data(data, n_iter, num_attrbs=num_attrbs,
                            non_numeric_attrbs=non_num_attrb):
    train_data, train_labels, test_data, test_labels = custom_train_test(data)
    
    # Pipelines for the data transformation.
    pipeline = ColumnTransformer([
        ("numeric", Pipeline([ 
                    ("imputer", SimpleImputer()), 
                    ('attribs_adder', CombinedAttributesAdder()),
                    ('std_scaler', StandardScaler()),]), num_attrbs),
        ("categorical", OneHotEncoder(handle_unknown = 'ignore'), non_numeric_attrbs)
    ])

    full_pipeline = Pipeline([
        ("prep", pipeline),
        ("attrb_selection", FeatureSelector(1)),
        ("SVR", SVR()),
    ])
    
    # Random Search with cross-validation.
    param_distributions = {
        "prep__numeric__imputer__strategy": ["mean", "median", "most_frequent"],
        "SVR__C": randint(low=1, high=1e3),
        "SVR__gamma": randint(low=1e-3, high=1e3),
        "SVR__kernel": ["linear", "rbf"],
        "attrb_selection__k": randint(low=1, high=train_data.shape[1] + 4),
    }
    search = RandomizedSearchCV(full_pipeline, param_distributions,
                                n_iter=n_iter, scoring="neg_mean_squared_error", 
                                cv=5, random_state=42, verbose=3)
    search.fit(train_data, train_labels)

    # Predictions in the test set.
    y_pred = search.predict(test_data)

    return mean_squared_error(test_labels, y_pred, squared=False)

In [13]:
rmse = processing_housing_data(data=housing, n_iter=2)
print("The RMSE of the test set is:", rmse)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5] END SVR__C=103, SVR__gamma=435, SVR__kernel=linear, attrb_selection__k=11, prep__numeric__imputer__strategy=mean;, score=-5195329208.537 total time=  22.2s
[CV 2/5] END SVR__C=103, SVR__gamma=435, SVR__kernel=linear, attrb_selection__k=11, prep__numeric__imputer__strategy=mean;, score=-4908236764.291 total time=  22.0s
[CV 3/5] END SVR__C=103, SVR__gamma=435, SVR__kernel=linear, attrb_selection__k=11, prep__numeric__imputer__strategy=mean;, score=-5407937378.006 total time=  21.9s
[CV 4/5] END SVR__C=103, SVR__gamma=435, SVR__kernel=linear, attrb_selection__k=11, prep__numeric__imputer__strategy=mean;, score=-5910841519.632 total time=  22.0s
[CV 5/5] END SVR__C=103, SVR__gamma=435, SVR__kernel=linear, attrb_selection__k=11, prep__numeric__imputer__strategy=mean;, score=-5119749337.330 total time=  21.9s
[CV 1/5] END SVR__C=21, SVR__gamma=614, SVR__kernel=rbf, attrb_selection__k=3, prep__numeric__imputer__strategy=mos