In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

In [2]:
# Location of the online dataset; read it straight from the URL into a DataFrame.
URL = "https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true"
df = pd.read_csv(URL)

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

# Separate the target column from the predictors of the training split.
y = train_set["median_house_value"].copy()
X_train = train_set.drop(columns="median_house_value")

# Numeric-only view of the predictors: everything except `ocean_proximity`.
X_num = X_train.drop(columns="ocean_proximity")

In [5]:
# We can also build our own transformers

from sklearn.base import BaseEstimator, TransformerMixin

# Positional indices of the columns we need
# (used by CombinedAttributesAdder.transform to pick columns out of the input array).
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features built from the room, bedroom, population and household columns.

    The source columns are located through the module-level positional indices
    ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``households_ix``.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        When true, a third ratio column (bedrooms / rooms) is appended as well.
    """

    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Learn nothing from the data and return ``self``.

        This class is only a transformer, not an estimator, so ``fit`` is a no-op.
        """
        return self

    def transform(self, X):
        """Return ``X`` with the derived ratio columns appended on the right.

        Appended columns, in order: rooms per household, population per
        household and, only when ``add_bedrooms_per_room`` is true, bedrooms
        per room.

        Parameters
        ----------
        X : array-like with two dimensions
            Anything ``np.asarray`` can turn into a 2-D array (ndarray,
            DataFrame, nested list).

        Returns
        -------
        numpy.ndarray
            The input columns followed by the two or three ratio columns.
        """
        # Positional slicing such as X[:, i] only works on ndarrays; coercing here
        # lets the transformer also accept DataFrames and nested lists.
        X = np.asarray(X)
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:  # the bedrooms_per_room column is optional
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        return np.c_[X, rooms_per_household, population_per_household]

In [6]:
from sklearn.pipeline  import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Preprocessing chain for the numeric columns; the steps run in the listed order.
num_pipeline = Pipeline(steps=[
    # 1) fill missing values using the median strategy
    ('imputer', SimpleImputer(strategy='median')),
    # 2) append the derived ratio columns
    ('attribs_adder', CombinedAttributesAdder(add_bedrooms_per_room=True)),
    # 3) standardize the resulting columns
    ('std_scaler', StandardScaler()),
])


In [8]:
# Text (categorical) columns need their own preprocessing branch.
from sklearn.compose import ColumnTransformer

num_attribs = list(X_num.columns)  # names of the numeric columns
cat_attribs = ['ocean_proximity']

# Final combined pipeline: numeric branch plus one-hot encoding of the text column.
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    # handle_unknown='ignore': a category that was absent when fitting is encoded
    # as all zeros at transform time instead of raising an error.
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attribs)
])

# Fit the preprocessing on the training predictors and transform them in one go.
X_prepared = full_pipeline.fit_transform(X_train)

In [9]:
# Show the prepared feature matrix.
X_prepared

array([[ 1.27258656, -1.3728112 ,  0.34849025, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.70916212, -0.87669601,  1.61811813, ...,  0.        ,
         0.        ,  1.        ],
       [-0.44760309, -0.46014647, -1.95271028, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.59946887, -0.75500738,  0.58654547, ...,  0.        ,
         0.        ,  0.        ],
       [-1.18553953,  0.90651045, -1.07984112, ...,  0.        ,
         0.        ,  0.        ],
       [-1.41489815,  0.99543676,  1.85617335, ...,  0.        ,
         1.        ,  0.        ]])

In [10]:
# Linear regression

In [44]:
from sklearn.linear_model import LinearRegression

# Create the (not yet fitted) linear regression model.
LR_model = LinearRegression()

In [12]:
# Fit the linear model on the prepared features and the target `y`.
LR_model.fit(X_prepared, y)

LinearRegression()

In [13]:
# Draw 10 rows for a quick sanity check of the model's predictions.
# NOTE: despite the name, these rows are sampled from the *training* predictors (X_train).
# random_state pins the sample so the cell shows the same rows on every run.
test_data = X_train.sample(10, random_state=42)
test_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
4988,-118.31,34.0,52.0,2709.0,642.0,1751.0,613.0,2.1116,<1H OCEAN
7768,-118.09,33.91,14.0,2369.0,604.0,1546.0,464.0,3.7969,<1H OCEAN
13705,-117.22,34.12,30.0,2512.0,597.0,1390.0,523.0,2.3725,INLAND
11004,-117.81,33.75,25.0,2365.0,471.0,1197.0,458.0,3.7031,<1H OCEAN
19505,-121.03,37.64,22.0,2001.0,387.0,1520.0,387.0,3.148,INLAND
9225,-120.06,36.94,19.0,901.0,183.0,700.0,190.0,2.2375,INLAND
5454,-118.46,34.0,36.0,1392.0,260.0,679.0,247.0,4.7344,<1H OCEAN
11558,-117.99,33.77,29.0,1312.0,267.0,922.0,255.0,3.1902,<1H OCEAN
10643,-117.79,33.56,36.0,2057.0,329.0,658.0,309.0,7.866,<1H OCEAN
17313,-120.18,34.75,17.0,2074.0,382.0,1035.0,359.0,3.7958,<1H OCEAN


In [14]:
# Look up the true target values of the sampled rows by their index labels.
test_label = y.loc[test_data.index]
test_label

4988     122500.0
7768     159400.0
13705     77200.0
11004    227800.0
19505    102300.0
9225      64300.0
5454     346900.0
11558    202400.0
10643    500001.0
17313    400000.0
Name: median_house_value, dtype: float64

In [16]:
# Apply the already-fitted preprocessing (transform only), then predict with the linear model.
test_data_prepared = full_pipeline.transform(test_data)
predicted_labels = LR_model.predict(test_data_prepared)

In [17]:
# Show the predicted values for the sampled rows.
predicted_labels

array([183131.23749499, 205260.24117514, 106088.94178991, 205939.59027785,
       107593.22304161,  66297.63174002, 264069.66490074, 179287.79441658,
       392245.76805923, 232694.90078557])

In [19]:
# Side-by-side comparison: 'Bashorat' holds the predictions, 'Real Narxi' the actual prices.
pd.DataFrame({
    'Bashorat':predicted_labels,
    'Real Narxi': test_label
})

Unnamed: 0,Bashorat,Real Narxi
4988,183131.237495,122500.0
7768,205260.241175,159400.0
13705,106088.94179,77200.0
11004,205939.590278,227800.0
19505,107593.223042,102300.0
9225,66297.63174,64300.0
5454,264069.664901,346900.0
11558,179287.794417,202400.0
10643,392245.768059,500001.0
17313,232694.900786,400000.0


In [20]:
# Show the held-out test split produced by train_test_split.
test_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,47700.0,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,45800.0,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,500001.0,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,218600.0,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,278000.0,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,263300.0,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,266800.0,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,500001.0,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,72300.0,INLAND


In [22]:
# Predictors of the test split: every column except the target.
X_test = test_set.drop(columns='median_house_value')
X_test

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,INLAND


In [24]:
# Target values of the test split, taken as an independent copy of the column.
y_test = test_set['median_house_value'].copy()
y_test

20046     47700.0
3024      45800.0
15663    500001.0
20484    218600.0
9814     278000.0
           ...   
15362    263300.0
16623    266800.0
18086    500001.0
2144      72300.0
3665     151500.0
Name: median_house_value, Length: 4128, dtype: float64

In [26]:
# Transform the test predictors with the fitted preprocessing pipeline (no re-fitting here).
X_test_prepared = full_pipeline.transform(X_test)
X_test_prepared

array([[ 0.28534728,  0.1951    , -0.28632369, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.06097472, -0.23549054,  0.11043502, ...,  0.        ,
         0.        ,  0.        ],
       [-1.42487026,  1.00947776,  1.85617335, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-1.23041404,  0.78014149, -0.28632369, ...,  0.        ,
         0.        ,  0.        ],
       [-0.08860699,  0.52740357,  0.58654547, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.60445493, -0.66608108, -0.92113763, ...,  0.        ,
         0.        ,  0.        ]])

In [27]:
# Predict the test-split targets with the linear model.
y_predicted = LR_model.predict(X_test_prepared)

In [28]:
# Show the predictions for the test split.
y_predicted

array([ 61874.25460143, 121853.52511139, 267770.94368091, ...,
       447837.04647878, 117275.9214608 , 185597.46125194])

In [29]:
# Side-by-side comparison: 'Bashorat' holds the predictions, 'Asl narxi' the actual prices.
pd.DataFrame({
    'Bashorat':y_predicted,
    'Asl narxi':y_test
})

Unnamed: 0,Bashorat,Asl narxi
20046,61874.254601,47700.0
3024,121853.525111,45800.0
15663,267770.943681,500001.0
20484,264468.298380,218600.0
9814,258485.695855,278000.0
...,...,...
15362,214774.565911,263300.0
16623,236868.742582,266800.0
18086,447837.046479,500001.0
2144,117275.921461,72300.0


In [34]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the predictions against the true test targets.
mae = mean_absolute_error(y_test, y_predicted)
mae     # it is off by about 50 thousand dollars on average

50898.73953494079

In [33]:
from sklearn.metrics import mean_squared_error
mse =  mean_squared_error(y_test, y_predicted)

# RMSE is reported as the square root of the mean squared error.
print("RMSE=", np.sqrt(mse)) # it is off by about 72 thousand dollars on average

RMSE= 72701.32600762135


In [36]:
from sklearn.ensemble import RandomForestRegressor

# A random forest is stochastic; seeding it makes the fitted model, and every
# metric computed from it below, repeatable across runs.
RF_model = RandomForestRegressor(random_state=42)
RF_model.fit(X_prepared, y)

RandomForestRegressor()

In [37]:
# Predict the test split with the random forest.
# Note: this rebinds `y_predicted`, replacing the linear model's predictions.
y_predicted = RF_model.predict(X_test_prepared)

In [38]:
from sklearn.metrics import mean_squared_error
mse =  mean_squared_error(y_test, y_predicted)

# RMSE is reported as the square root of the mean squared error.
print("RMSE=", np.sqrt(mse)) # this model's error is about 50 thousand dollars

RMSE= 50157.85992581288


In [39]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the current predictions against the true test targets.
mae = mean_absolute_error(y_test, y_predicted)
mae # this is considerably better, i.e. it is off by about 32 thousand dollars

32325.317056686043

In [40]:
# Cross-validation ----- evaluating the model

In [48]:
# Split predictors and target from the full frame (no train/test split here).
# NOTE(review): this rebinds `y` and `X_prepared`, which earlier cells bound to the
# training split; re-running those cells after this one uses the full data instead.
y = df['median_house_value'].copy()
X = df.drop(columns='median_house_value')

# Transform only: full_pipeline is not re-fitted in this cell.
X_prepared = full_pipeline.transform(X)

In [52]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the linear model. The scorer yields *negated* MSE values,
# which is why they are negated again before the square root is taken later on.
mse_scores = cross_val_score(LR_model, X_prepared, y, scoring="neg_mean_squared_error", cv=5)

In [53]:
def display_scores(scores):
    """Print the scores followed by their mean and standard deviation.

    ``scores`` must provide ``.mean()`` and ``.std()`` (for example a NumPy
    array). Nothing is returned; the three lines are written to stdout.
    """
    report = (
        ("Scores:", lambda values: values),
        ("Mean:", lambda values: values.mean()),
        ("Std.dev:", lambda values: values.std()),
    )
    # Each statistic is computed right before its line is printed, one after another.
    for label, compute in report:
        print(label, compute(scores))

In [54]:
# Turn the negated MSE scores into RMSE values and summarize them.
display_scores(np.sqrt(-mse_scores))

Scores: [73394.92502922 74814.24096819 75431.93119241 76608.78768825
 66196.48128669]
Mean: 73289.27323295095
Std.dev: 3694.713678722368


In [55]:
# 10-fold cross-validation of the random forest. The scorer yields negated MSE values,
# so they are negated again before taking the square root to obtain RMSE.
scores = cross_val_score(RF_model, X_prepared, y, scoring="neg_mean_squared_error", cv=10)
# These scores belong to RF_model, so the name says RF (it previously said LR).
RF_rmse_scores = np.sqrt(-scores)
display_scores(RF_rmse_scores)

Scores: [98572.13179809 47458.13287116 65155.97477854 56277.43720742
 61360.65258141 60526.72751224 47241.81725228 78459.11718172
 73844.02210611 49821.65071537]
Mean: 63871.76640043252
Std.dev: 15285.739678861943


In [56]:
# If we have very little data, i.e. we cannot afford to set aside 80% for
# training and 20% for testing, it is better to use cross-validation
# instead.

In [57]:
# Saving data the PICKLE way
import pickle

filename = 'RF_model.pkl' # name the output file
with open(filename, 'wb') as file:
    pickle.dump(RF_model, file)

In [58]:
# Reading the model back is done as follows.
# NOTE: pickle.load can execute arbitrary code, so only load files you trust.
# `filename` is expected to still be the file written by the previous cell.
with open(filename, 'rb') as file:
    model = pickle.load(file)

In [59]:
# joblib ---- a method widely used for saving large data.
# If we need to save not only the model but also a pipeline or other
# objects, this is what we use.
import joblib

filename = 'RF_model.jbl' # name the output file
joblib.dump(RF_model, filename)

['RF_model.jbl']

In [60]:
# Save the linear regression model with joblib as well (note: `filename` is rebound).
filename = 'LR_model' # file name
joblib.dump(LR_model, filename)

['LR_model']

In [61]:
# Read the model back.
# `filename` is whatever the most recently run cell assigned ('LR_model' when run in order),
# and `model` is rebound to the loaded object.
model = joblib.load(filename)