In [1]:
import pandas as pd
import numpy as np
import sklearn

In [2]:
URL = "https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true"
df = pd.read_csv(URL)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df, test_size = 0.2, random_state = 42)

X_train = train_set.drop('median_house_value', axis = 1)
y = train_set['median_house_value'].copy()

X_num = X_train.drop('ocean_proximity', axis = 1)

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
  def __init__(self, add_bedrooms_per_room = True):
    self.add_bedrooms_per_room = add_bedrooms_per_room

  def fit(self, X, y = None):
    return self

  def transform(self, X):
    rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
    population_per_household = X[:, population_ix] / X[:, households_ix]
    if self.add_bedrooms_per_room:
      bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
      return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
    else:
      return np.c_[X, rooms_per_household, population_per_household]

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy = 'median')),
    ('attribs_adder', CombinedAttributesAdder(add_bedrooms_per_room = True)),
    ('std_scaler',StandardScaler())
])

In [6]:
from sklearn.compose import ColumnTransformer

num_attribs = list(X_num)
cat_attribs = ['ocean_proximity']

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs)
])

In [7]:
X_prepared = full_pipeline.fit_transform(X_train)

In [8]:
X_prepared[0:5, :]

array([[ 1.27258656, -1.3728112 ,  0.34849025,  0.22256942,  0.21122752,
         0.76827628,  0.32290591, -0.326196  , -0.17491646,  0.05137609,
        -0.2117846 ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 0.70916212, -0.87669601,  1.61811813,  0.34029326,  0.59309419,
        -0.09890135,  0.6720272 , -0.03584338, -0.40283542, -0.11736222,
         0.34218528,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [-0.44760309, -0.46014647, -1.95271028, -0.34259695, -0.49522582,
        -0.44981806, -0.43046109,  0.14470145,  0.08821601, -0.03227969,
        -0.66165785,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 1.23269811, -1.38217186,  0.58654547, -0.56148971, -0.40930582,
        -0.00743434, -0.38058662, -1.01786438, -0.60001532,  0.07750687,
         0.78303162,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [-0.10855122,  0.5320839 ,  1

In [9]:
# Machine Learning
# Linear Regression - Chiziqli regressiya

from sklearn.linear_model import LinearRegression

LR_model = LinearRegression()
LR_model.fit(X_prepared, y)

In [10]:
test_data = X_train.sample(5)
test_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
674,-122.16,37.68,16.0,1687.0,348.0,568.0,352.0,2.3869,NEAR BAY
5631,-118.28,33.77,47.0,307.0,69.0,374.0,65.0,2.9063,<1H OCEAN
15947,-122.43,37.72,52.0,3351.0,719.0,2101.0,706.0,3.0107,NEAR BAY
3597,-118.5,34.25,32.0,2333.0,389.0,969.0,331.0,4.8164,<1H OCEAN
7803,-118.1,33.9,40.0,1880.0,377.0,1229.0,378.0,4.4167,<1H OCEAN


In [11]:
test_label = y.loc[test_data.index]
test_label

674       83300.0
5631     146900.0
15947    242000.0
3597     241100.0
7803     174600.0
Name: median_house_value, dtype: float64

In [12]:
test_data_prepared = full_pipeline.transform(test_data)
test_data_prepared

array([[-1.28526067,  0.95331377, -1.00048937, -0.43916728, -0.45465249,
        -0.75500126, -0.38846153, -0.78448998, -0.26918305, -0.12811145,
        -0.11336815,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ],
       [ 0.64932944, -0.87669601,  1.45941464, -1.07377233, -1.1205325 ,
        -0.92562242, -1.14182853, -0.5117297 , -0.29831078,  0.22946922,
         0.20515572,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [-1.4198842 ,  0.9720351 ,  1.85617335,  0.32603764,  0.43080086,
         0.59325767,  0.54077859, -0.45690457, -0.28851646, -0.010454  ,
         0.02939707,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ],
       [ 0.53963619, -0.65204008,  0.2691385 , -0.14209855, -0.35679916,
        -0.40232557, -0.44358595,  0.49134961,  0.67570125, -0.01463664,
        -0.79525742,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.73907847, -0.8158517 ,  0

In [13]:
predicted_data = LR_model.predict(test_data_prepared)
predicted_data

array([165595.41461151, 197801.20767226, 224775.93455419, 257474.85396258,
       244051.05113286])

In [14]:
pd.DataFrame({'Prognoz': predicted_data, 'Real baxosi': test_label})

Unnamed: 0,Prognoz,Real baxosi
674,165595.414612,83300.0
5631,197801.207672,146900.0
15947,224775.934554,242000.0
3597,257474.853963,241100.0
7803,244051.051133,174600.0


In [15]:
test_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,47700.0,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,45800.0,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,500001.0,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,218600.0,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,278000.0,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,263300.0,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,266800.0,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,500001.0,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,72300.0,INLAND


In [16]:
X_test = test_set.drop("median_house_value", axis = 1)
X_test

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,INLAND


In [17]:
y_test = test_set['median_house_value'].copy()
y_test

20046     47700.0
3024      45800.0
15663    500001.0
20484    218600.0
9814     278000.0
           ...   
15362    263300.0
16623    266800.0
18086    500001.0
2144      72300.0
3665     151500.0
Name: median_house_value, Length: 4128, dtype: float64

In [18]:
X_test_prepared = full_pipeline.transform(X_test)

In [19]:
y_predicted = LR_model.predict(X_test_prepared)

In [20]:
from sklearn.metrics import mean_squared_error
lin_mse = mean_squared_error(y_test, y_predicted)
lin_rmne = np.sqrt(lin_mse)
print(lin_rmne)

72701.32600762138


In [21]:
# Decision Tree
from sklearn.tree import DecisionTreeRegressor
Tree_model = DecisionTreeRegressor()
Tree_model.fit(X_prepared, y)

In [22]:
y_predicted = Tree_model.predict(X_test_prepared)

In [23]:
lin_mse = mean_squared_error(y_test, y_predicted)
#RMSE hisoblayman

lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

72755.78190044758


In [24]:
#Random Forest
from sklearn.ensemble import RandomForestRegressor
RF_model = RandomForestRegressor()
RF_model.fit(X_prepared, y)

In [25]:
y_predicted = RF_model.predict(X_test_prepared)
lin_mse = mean_squared_error(y_test, y_predicted)
#RMSE hisoblaymiz
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

50281.255046130274


In [26]:
# Cross - Validation usulida baholash

X = df.drop("median_house_value", axis = 1)
y = df['median_house_value'].copy()

X_prepared = full_pipeline.transform(X)

In [27]:
def display_scores(scoress):
  print("Scoress:", scoress)
  print("Mean:", scoress.mean())
  print("Std.dev", scoress.std())

In [28]:
#Cross-validation
from sklearn.model_selection import cross_val_score

In [29]:
scores = cross_val_score(LR_model, X_prepared, y, scoring="neg_mean_squared_error", cv=10)
LR_rmse_scores = np.sqrt(-scores)

In [30]:
display_scores(LR_rmse_scores)

Scoress: [84188.51219065 61197.24357613 86752.24346334 62289.14292385
 80540.40041898 68919.39949642 52503.82940087 90910.07884989
 77674.67507925 53941.60539478]
Mean: 71891.71307941683
Std.dev 13249.525989444988


In [31]:
#Decision Tree
scores = cross_val_score(Tree_model, X_prepared, y, scoring="neg_mean_squared_error", cv=10)
LR_rmse_scores = np.sqrt(-scores)
display_scores(LR_rmse_scores)

Scoress: [116914.41075588  71592.99237559  83787.99755612  75662.14299752
  91341.52322521  77952.16087163  69109.53619817 100461.28992466
  94472.68498414  76911.80504807]
Mean: 85820.65439369899
Std.dev 14201.419618741162


In [33]:
# Random Forest
scores = cross_val_score(RF_model, X_prepared, y, scoring="neg_mean_squared_error", cv=10)
LR_rmse_scores = np.sqrt(-scores)
display_scores(LR_rmse_scores)

Scoress: [96998.51358692 47240.28673245 65576.47962207 56300.07867255
 61422.1297468  60267.99439231 46349.87230348 78595.19711934
 74781.58868326 49710.56170971]
Mean: 63724.27025688913
Std.dev 15153.727145599614


In [34]:
# Modelni saqlash PICKLE yordamida

import pickle

filename = "RF_model.pkl"
with open(filename, 'wb') as file:
  pickle.dump(RF_model, file)

In [35]:
# Modelni qayta o`qish

with open(filename, 'rb') as file:
  model = pickle.load(file)

In [36]:
# Modelni sinab ko`ramiz

scores = cross_val_score(model, X_prepared, y, scoring = 'neg_mean_squared_error', cv = 5)
LR_rmse_scores = np.sqrt(-scores)
display_scores(LR_rmse_scores)

Scoress: [78027.96099267 64016.79804061 61697.89117158 82614.06785366
 62189.02305769]
Mean: 69709.14822324186
Std.dev 8819.013201209711


In [37]:
# Modelni JOBLIB orqali saqlash

import joblib

filename = 'RF_model.jbl' # faylga istalgan nom beramiz
joblib.dump(RF_model, filename)

['RF_model.jbl']

In [38]:
# Modelni yuklash
model = joblib.load(filename)

In [39]:
#Modelni sinab ko`rish
scores = cross_val_score(model, X_prepared, y, scoring="neg_mean_squared_error", cv=5)
LR_rmse_scores = np.sqrt(-scores)
display_scores(LR_rmse_scores)

Scoress: [77652.06862174 63879.64089045 61348.55537725 80248.48777281
 62357.21185194]
Mean: 69097.1929028361
Std.dev 8126.854901057167


In [40]:
# pipelineni saqlab olamiz
filename = 'pipeline.jbl'
joblib.dump(full_pipeline, filename)

['pipeline.jbl']