In [114]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

In [115]:
# Load the housing dataset from the notebook-relative data folder.
df = pd.read_csv(filepath_or_buffer='Datas/housing.csv')

In [116]:
# Distinct values found in the "ocean_proximity" column.
ocean_prox = {category for category in df["ocean_proximity"]}

In [117]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [118]:
# Add derived features: rooms per household, population per household
# and the ratio of bedrooms to rooms.

# Positions of the source columns inside the 2-D array given to transform().
# NOTE(review): these must match the column order produced upstream — confirm
# whenever the selected attribute list changes.
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends three ratio columns to its input.

    The ratios are computed from the columns at ``rooms_ix``,
    ``bedrooms_ix``, ``population_ix`` and ``household_ix``.
    """

    def __init__(self):  # no *args or **kargs
        self.name = 'test'

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the three ratio columns appended on the right.

        ``X`` must support ``X[:, i]`` column indexing. The appended columns
        are, in order: rooms / households, population / households and
        bedrooms / rooms.
        """
        rooms = X[:, rooms_ix]
        households = X[:, household_ix]

        rooms_ratio = rooms / households
        population_ratio = X[:, population_ix] / households
        bedrooms_ratio = X[:, bedrooms_ix] / rooms

        return np.c_[X, rooms_ratio, population_ratio, bedrooms_ratio]

In [119]:
# Transformer that keeps only a chosen subset of columns.

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pick the columns named in ``attribute_names`` and return their values.

    Parameters
    ----------
    attribute_names : column label(s) used to index the input in transform().
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[self.attribute_names].values``."""
        selected = X[self.attribute_names]
        return selected.values

In [120]:
# Same as LabelBinarizer, except for the number of parameters (so that it works inside a Pipeline).

from sklearn.preprocessing import LabelBinarizer

class CustomBinarizer(BaseEstimator, TransformerMixin):
    """Wrapper around ``LabelBinarizer`` with an ``(X, y=None)`` fit signature.

    The encoder is learned once in :meth:`fit` and reused in
    :meth:`transform`, so every batch is encoded with the same columns
    regardless of which categories that batch happens to contain.
    """

    def fit(self, X, y=None, **fit_params):
        """Learn the label set from ``X`` and return ``self``.

        ``y`` and ``fit_params`` are accepted for pipeline compatibility and
        are not used.
        """
        self.encoder_ = LabelBinarizer().fit(X)
        return self

    def transform(self, X):
        """Binarize ``X`` with the encoder learned in :meth:`fit`.

        If :meth:`fit` has not been called, a fresh encoder is fitted on
        ``X`` itself; this keeps the previous behaviour for callers that use
        ``transform`` directly.
        """
        encoder = getattr(self, "encoder_", None)
        if encoder is None:
            return LabelBinarizer().fit(X).transform(X)
        return encoder.transform(X)

In [121]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler

# Recent scikit-learn releases provide the imputer as
# sklearn.impute.SimpleImputer; the old sklearn.preprocessing.Imputer name is
# kept as a fallback for older installations. Both accept strategy="median".
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

# Two kinds of columns: numeric attributes and the categorical label.
# NOTE(review): num_attribs is every column except "ocean_proximity", so any
# target column present in df is imputed and standardised as well.
num_attribs = list(df.drop("ocean_proximity", axis=1))
cat_attribs = ["ocean_proximity"]

# Numeric branch: select -> fill missing values with the median ->
# append ratio features -> standardise.
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: select -> binarize the labels.
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('label_binarizer', CustomBinarizer()),
])

# Run both branches and concatenate their outputs column-wise.
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

np_prepared = full_pipeline.fit_transform(df)


In [122]:
np_prepared.shape

(20640, 17)

In [123]:
# Rebuild the list of column names for np_prepared.

# Numeric part: the original columns without the categorical one ...
c = df.columns.tolist()
c.remove("ocean_proximity")
# ... followed by the three ratio columns added in the numeric pipeline.
c.extend([
    'rooms_per_household',
    'population_per_household',
    'bedrooms_per_room',
])

# Categorical part: one column per category, in sorted order.
# NOTE(review): presumably this matches the column order emitted by the
# label binarizer — confirm.
ocean_list = sorted(ocean_prox)

# Final column list: numeric names first, then the category names.
col_list = c + ocean_list

In [124]:
# Wrap the prepared array in a labelled DataFrame.
# The pipeline output is stored in `np_prepared`; the previous reference to
# `df_prepared` raised NameError because no visible cell defines that name.
df = pd.DataFrame(np_prepared, columns=col_list)

In [125]:
# Split df into the feature matrix X and the target column Y.
temp = df
X = temp.drop(["median_house_value"], axis=1).values

# Double brackets keep Y two-dimensional (one column).
Y = df.loc[:, ['median_house_value']].values

In [126]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [127]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score

# Fit a linear model on the training split, then predict on the test split.
lin_reg = LinearRegression().fit(X_train, Y_train)
Y_test_pred = lin_reg.predict(X_test)



In [128]:
from sklearn.metrics import mean_squared_error

# RMSE of the linear model on the test split.
# Arguments follow the documented (y_true, y_pred) order; MSE is symmetric,
# so the value is the same as before.
lin_mse = mean_squared_error(Y_test, Y_test_pred)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

0.6300004505108533

In [146]:
from sklearn.tree import DecisionTreeRegressor
# Seed the tree so that re-running the notebook reproduces the same model
# (same seed as the train/test split).
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X_train, Y_train)

# Predictions of the decision tree on the held-out split.
Y_test_pred = tree_reg.predict(X_test)

In [133]:
# RMSE of the decision tree on the test split.
# Stored under tree_* names so the linear-regression results (lin_mse /
# lin_rmse) are no longer overwritten by this cell.
tree_mse = mean_squared_error(Y_test, Y_test_pred)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.633582804557723

In [144]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the decision tree on the full data set.
scores = cross_val_score(
    tree_reg,
    X,
    Y,
    scoring="neg_mean_squared_error",
    cv=10,
)

# The scorer yields negated MSE values: flip the sign, then take the root.
rmse_scores = np.sqrt(-scores)


def display_scores(scores):
    """Print ``scores`` followed by its mean and its standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods (for example a
    NumPy array). Nothing is returned.
    """
    report = (
        ("Scores:", lambda values: values),
        ("Mean:", lambda values: values.mean()),
        ("Standard deviation:", lambda values: values.std()),
    )
    # Each statistic is computed right before it is printed, one per line.
    for label, compute in report:
        print(label, compute(scores))

# Print the cross-validation RMSE values with their mean and standard deviation.
display_scores(rmse_scores)

Scores: [1.02419138 0.63958039 0.71916735 0.66148581 0.77949994 0.68680895
 0.58225142 0.87235793 0.82576643 0.64531502]
Mean: 0.7436424615133449
Standard deviation: 0.12666038742444596


In [165]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest so that the reported RMSE is reproducible
# (same seed as the train/test split).
forest_reg = RandomForestRegressor(random_state=42)
# Y_train is a single-column 2-D array; ravel() passes the 1-D target array
# the estimator expects (instead of the `.flat` iterator).
forest_reg.fit(X_train, Y_train.ravel())

# Predictions of the random forest on the held-out split.
Y_test_pred = forest_reg.predict(X_test)

# RMSE on the test split, with arguments in (y_true, y_pred) order.
forest_mse = mean_squared_error(Y_test, Y_test_pred)
forest_rmse = np.sqrt(forest_mse)
forest_rmse



0.45469256125842383