In [2]:
import pandas as pd
import numpy as np
import sklearn

In [3]:
# Location of the housing CSV inside the handson-ml2 GitHub repository.
URL = "https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true"

# Download and parse the file into a DataFrame (needs network access).
df = pd.read_csv(filepath_or_buffer=URL)

In [4]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

# Split the training portion into target and predictors.
housing_labels = train_set['median_house_value'].copy()
housing = train_set.drop(columns=['median_house_value'])

# Predictors without the text column 'ocean_proximity'.
housing_num = housing.drop(columns=['ocean_proximity'])

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin
# Positional indices of the columns used by CombinedAttributesAdder, counted
# in the column order of housing_num (longitude, latitude, housing_median_age,
# total_rooms, total_bedrooms, population, households, median_income).
rooms_ix = 3        # total_rooms
bedrooms_ix = 4     # total_bedrooms
population_ix = 5   # population
households_ix = 6   # households

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features computed from existing columns.

    Columns are addressed by position through the module-level indices
    ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``households_ix``.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        When true, a bedrooms/rooms ratio column is appended after the two
        per-household ratio columns.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer has no fitted state."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right.

        Parameters
        ----------
        X : array-like, 2-D
            Input whose columns at the module-level indices hold rooms,
            bedrooms, population and households.

        Returns
        -------
        numpy.ndarray
            ``X`` followed by rooms-per-household, population-per-household
            and, if ``add_bedrooms_per_room`` is true, bedrooms-per-room.
        """
        # Positional slicing (X[:, i]) only works on ndarrays; coercing here
        # lets DataFrames and nested lists through as well. ndarray input is
        # returned by np.asarray unchanged, so existing callers are unaffected.
        X = np.asarray(X)

        households = X[:, households_ix]
        rooms = X[:, rooms_ix]

        rooms_per_household = rooms / households
        population_per_household = X[:, population_ix] / households

        extra_columns = [rooms_per_household, population_per_household]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)

        # np.c_ stacks the 1-D ratio vectors as new columns after X.
        return np.c_[(X, *extra_columns)]

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric preprocessing, applied in this order:
#   1. fill missing values with the column median,
#   2. append the ratio features,
#   3. standardise every column.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('attribs_adder', CombinedAttributesAdder(add_bedrooms_per_room=True)),
        ('std_scaler', StandardScaler()),
    ]
)

# Fit on the numeric training columns and display the transformed array.
num_pipeline.fit_transform(housing_num)

array([[ 1.27258656, -1.3728112 ,  0.34849025, ..., -0.17491646,
         0.05137609, -0.2117846 ],
       [ 0.70916212, -0.87669601,  1.61811813, ..., -0.40283542,
        -0.11736222,  0.34218528],
       [-0.44760309, -0.46014647, -1.95271028, ...,  0.08821601,
        -0.03227969, -0.66165785],
       ...,
       [ 0.59946887, -0.75500738,  0.58654547, ..., -0.60675918,
         0.02030568,  0.99951387],
       [-1.18553953,  0.90651045, -1.07984112, ...,  0.40217517,
         0.00707608, -0.79086209],
       [-1.41489815,  0.99543676,  1.85617335, ..., -0.85144571,
        -0.08535429,  1.69520292]])

In [30]:
from sklearn.compose import ColumnTransformer

# Column groups routed to each branch of the preprocessor.
num_attribs = list(housing_num)      # every numeric column name
cat_attribs = ['ocean_proximity']    # the single categorical column

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    # handle_unknown='ignore': a category that never appeared while fitting
    # is encoded as an all-zero row instead of raising at transform time.
    # This matters because the fitted pipeline is later applied to other
    # data (the held-out split) and saved to disk for reuse.
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attribs),
])

# Fit both branches on the training predictors and build the model matrix.
housing_prepared = full_pipeline.fit_transform(housing)

In [31]:
# Peek at the first five prepared rows (all columns).
housing_prepared[:5]

array([[ 1.27258656, -1.3728112 ,  0.34849025,  0.22256942,  0.21122752,
         0.76827628,  0.32290591, -0.326196  , -0.17491646,  0.05137609,
        -0.2117846 ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 0.70916212, -0.87669601,  1.61811813,  0.34029326,  0.59309419,
        -0.09890135,  0.6720272 , -0.03584338, -0.40283542, -0.11736222,
         0.34218528,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [-0.44760309, -0.46014647, -1.95271028, -0.34259695, -0.49522582,
        -0.44981806, -0.43046109,  0.14470145,  0.08821601, -0.03227969,
        -0.66165785,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 1.23269811, -1.38217186,  0.58654547, -0.56148971, -0.40930582,
        -0.00743434, -0.38058662, -1.01786438, -0.60001532,  0.07750687,
         0.78303162,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [-0.10855122,  0.5320839 ,  1

In [32]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares baseline, trained on the prepared training matrix.
LR_model = LinearRegression()
LR_model.fit(X=housing_prepared, y=housing_labels)


LinearRegression()

In [33]:
# Draw ten rows for a quick look at the model's predictions.
# NOTE: `housing` is the training predictors, so despite the name these rows
# were seen during fitting; this is a sanity check, not an evaluation.
# The seed makes the draw identical on every Restart & Run All.
test_data = housing.sample(n=10, random_state=42)

In [34]:
# Look up the true prices for exactly the rows drawn into test_data.
sampled_rows = test_data.index
test_label = housing_labels.loc[sampled_rows]
test_label

10716    296200.0
2718      93800.0
11363    207200.0
19347    201700.0
1352      95300.0
13579     79500.0
1361     156900.0
10605    296600.0
9156     253700.0
7889     313800.0
Name: median_house_value, dtype: float64

In [35]:
# transform (not fit_transform): reuse what full_pipeline learned from
# `housing` rather than refitting on the ten sampled rows.
test_data_prepared = full_pipeline.transform(X=test_data)
test_data_prepared

array([[ 0.873702  , -0.92817967, -1.95271028, -0.75003178, -0.81265249,
        -0.80513222, -0.79270724,  2.12208213, -0.1378855 , -0.04457899,
        -0.29334731,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 2.00055088, -1.33068821, -1.00048937, -0.16831049,  0.13246752,
        -0.21323511,  0.03415898, -1.05331166, -0.41829128, -0.06814149,
         0.82985328,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.81885538, -0.89073701, -0.36567544,  0.73853091,  0.71958753,
         1.48769951,  0.78227603,  0.1793085 , -0.04688419,  0.0699844 ,
        -0.26068139,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [-1.61434043,  1.38858464,  0.98330419,  0.03586678, -0.08949249,
        -0.38649474, -0.35696187,  0.07806056,  0.85338678, -0.0332881 ,
        -0.49430938,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [-1.18553953,  1.11244506,  0

In [36]:
# Linear-regression price estimates for the ten sampled rows.
predicted_labels = LR_model.predict(X=test_data_prepared)
predicted_labels

array([352376.27023983,  70694.50659669, 197846.80148814, 252220.19764857,
       156285.44711791,  72311.77409825, 256400.01779375, 337471.54206272,
       291888.71088796, 254852.13504273])

In [37]:
# Side-by-side table of predicted and actual prices; the row index comes
# from test_label.
pd.DataFrame(
    data={
        'Prognoz': predicted_labels,
        'Real_price': test_label,
    }
)

Unnamed: 0,Prognoz,Real_price
10716,352376.27024,296200.0
2718,70694.506597,93800.0
11363,197846.801488,207200.0
19347,252220.197649,201700.0
1352,156285.447118,95300.0
13579,72311.774098,79500.0
1361,256400.017794,156900.0
10605,337471.542063,296600.0
9156,291888.710888,253700.0
7889,254852.135043,313800.0


In [38]:
# Display the held-out 20% split (predictors plus the median_house_value target).
test_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,47700.0,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,45800.0,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,500001.0,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,218600.0,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,278000.0,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,263300.0,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,266800.0,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,500001.0,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,72300.0,INLAND


In [39]:
# Predictors of the held-out split: everything except the target column.
X_test = test_set.drop(columns=['median_house_value'])
X_test

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,INLAND


In [40]:
# True target values of the held-out split.
y_test = test_set.loc[:, 'median_house_value']
y_test

20046     47700.0
3024      45800.0
15663    500001.0
20484    218600.0
9814     278000.0
           ...   
15362    263300.0
16623    266800.0
18086    500001.0
2144      72300.0
3665     151500.0
Name: median_house_value, Length: 4128, dtype: float64

In [41]:
# Apply the already-fitted preprocessing to the held-out predictors
# (transform only, so nothing is learned from the test split).
X_test_prepared = full_pipeline.transform(X=X_test)

In [42]:
# Linear-regression predictions for the held-out split.
y_predicted = LR_model.predict(X=X_test_prepared)

In [43]:
from sklearn.metrics import mean_squared_error
# Test-set error of the linear model: MSE first, then its square root
# so the figure is in the same units as the target.
lin_mse = mean_squared_error(y_true=y_test, y_pred=y_predicted)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

72701.32600762135


In [44]:
from sklearn.tree import DecisionTreeRegressor
# Decision-tree regressor trained on the same prepared matrix.
# random_state pins the estimator's internal randomness so a re-run
# reproduces the same tree and therefore the same score.
Tree_model = DecisionTreeRegressor(random_state=42)
Tree_model.fit(housing_prepared, housing_labels)

# Predictions for the held-out split.
predicted_y = Tree_model.predict(X_test_prepared)

In [45]:
# Test-set error of the decision tree. Stored under tree-specific names so
# the linear-regression metrics (lin_mse / lin_rmse) computed earlier are
# not overwritten.
tree_mse = mean_squared_error(y_test, predicted_y)
tree_rmse = np.sqrt(tree_mse)
print(tree_rmse)

72052.44291475488


In [46]:
from sklearn.ensemble import RandomForestRegressor
# Random-forest regressor trained on the prepared training matrix.
# random_state pins bootstrap sampling and feature selection so the
# fitted forest, its score and the saved model are reproducible.
RF_model = RandomForestRegressor(random_state=42)
RF_model.fit(housing_prepared, housing_labels)

# NOTE: this rebinds y_predicted, which previously held the
# linear-regression predictions for the same held-out split.
y_predicted = RF_model.predict(X_test_prepared)
y_predicted

array([ 48381.  , 104396.  , 461875.38, ..., 499103.99,  71252.  ,
       173877.  ])

In [47]:
# Test-set error of the random forest. forest_mse keeps the mean squared
# error; its square root goes to forest_rmse so each name matches the
# quantity it holds.
forest_mse = mean_squared_error(y_test, y_predicted)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

50204.77091097301

In [53]:
from sklearn.model_selection import cross_val_score

In [60]:
# 10-fold cross-validation of the linear model on the training data.
# The scorer returns negated MSE values, hence the sign flip before the
# square root below.
scores = cross_val_score(
    estimator=LR_model,
    X=housing_prepared,
    y=housing_labels,
    scoring='neg_mean_squared_error',
    cv=10,
)
LR_rmse_score = np.sqrt(-scores)


In [64]:
import pickle
# Persist the fitted random forest with pickle; the file lands in the
# current working directory.
filename = 'RF_model.pkl'
with open(filename, 'wb') as file:
    pickle.dump(RF_model, file)


In [66]:
import joblib
# Persist the fitted linear model with joblib; dump returns the list of
# written file names, which the cell displays.
filename = 'LR_model.jbl'
joblib.dump(value=LR_model, filename=filename)

['LR_model.jbl']

In [67]:
# Persist the fitted preprocessing pipeline next to the models.
# NOTE(review): the pipeline contains the notebook-defined
# CombinedAttributesAdder, so loading this file elsewhere presumably needs
# that class to be importable at load time; confirm in the consumer.
filename = 'pipeline.jbl'
joblib.dump(value=full_pipeline, filename=filename)

['pipeline.jbl']