In [1]:
import pandas as pd
import numpy as np
import sklearn

In [2]:
# Location of the housing CSV (raw file in the handson-ml2 GitHub repository).
URL = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.csv"
# read_csv is given the URL directly, so running this cell needs network access.
df = pd.read_csv(URL)
# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


**Ma'lumotlarni train va test set'larga ajratib olamiz**

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
train_set, test_set = train_test_split(df, test_size = 0.2, random_state = 42)

# Features (everything except the target) and the target column of the training split.
X_train = train_set.drop("median_house_value", axis = 1)
y = train_set["median_house_value"].copy()

X_numeric = X_train.drop("ocean_proximity", axis = 1) # drop the text column so only the numeric feature columns remain

**PIPELINE quramiz**

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

# Column positions of the attributes used below, counted within the numeric
# feature matrix (the feature columns with "ocean_proximity" removed).
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features computed from existing numeric columns.

    Always adds ``rooms_per_household`` and ``population_per_household``;
    adds ``bedrooms_per_room`` as well when ``add_bedrooms_per_room`` is True.
    Columns are addressed by position through the module-level ``*_ix``
    constants, so the input column order must match them.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        Whether to append the optional ``bedrooms_per_room`` column.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Do nothing and return ``self``: this transformer has no state to learn."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric matrix whose columns follow the ``*_ix`` positions.

        Returns
        -------
        numpy.ndarray
            ``X`` followed by two (or three) additional columns.
        """
        # Positional slicing such as X[:, i] only works on arrays, so convert
        # array-likes (e.g. a DataFrame) first; ndarrays pass through as-is.
        X = np.asarray(X)
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        return np.c_[X, rooms_per_household, population_per_household]

**Sonli ustunlar uchun Pipeline**

In [5]:
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Steps applied to the numeric columns, in order:
#   1. fill missing values with the column median,
#   2. append the combined ratio attributes,
#   3. standardise every column.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("attribute_adder", CombinedAttributesAdder(add_bedrooms_per_room=True)),
    ("std_scaler", StandardScaler()),
]
numeric_pipeline = Pipeline(steps=numeric_steps)

**Matnli ustunlar uchun OneHotEncoder va to'liq Pipeline (ColumnTransformer)**

In [8]:
from sklearn.compose import ColumnTransformer

# Column names routed to each branch of the ColumnTransformer.
num_attribs = X_numeric.columns.tolist()
cat_attribs = ["ocean_proximity"]

# Numeric columns go through numeric_pipeline; the text column is one-hot encoded.
column_transformers = [
    ("num", numeric_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
]
full_pipeline = ColumnTransformer(transformers=column_transformers)

# Fit on the training features only and transform them in the same call.
X_prepared = full_pipeline.fit_transform(X_train)

In [9]:
# Display the prepared training matrix returned by full_pipeline.fit_transform.
X_prepared

array([[ 1.27258656, -1.3728112 ,  0.34849025, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.70916212, -0.87669601,  1.61811813, ...,  0.        ,
         0.        ,  1.        ],
       [-0.44760309, -0.46014647, -1.95271028, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.59946887, -0.75500738,  0.58654547, ...,  0.        ,
         0.        ,  0.        ],
       [-1.18553953,  0.90651045, -1.07984112, ...,  0.        ,
         0.        ,  0.        ],
       [-1.41489815,  0.99543676,  1.85617335, ...,  0.        ,
         1.        ,  0.        ]])

In [14]:
# Build a machine-learning model
# using linear regression.
# Training is supervised: the model is given both the inputs (features) and the answers (labels) and works out the relationship itself.

from sklearn.linear_model import LinearRegression

LR_model = LinearRegression()

LR_model.fit(X_prepared, y) # this is the trained model: it has now learned to make predictions

# the model is stored under the name LR_model

In [17]:
# Goal: use the model trained above to predict values for the small sample below.
# NOTE: the rows are drawn from the training features (X_train), so this is a
# quick sanity check rather than an evaluation on unseen data.
# random_state pins the sample so the same 10 rows come back on every run.
test_data = X_train.sample(10, random_state=42)
test_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,NEAR OCEAN
10526,-117.7,33.57,4.0,3283.0,911.0,1512.0,782.0,3.3125,<1H OCEAN
15271,-117.27,33.08,7.0,2949.0,447.0,1335.0,426.0,6.0922,NEAR OCEAN
18008,-121.97,37.29,25.0,4096.0,743.0,2027.0,741.0,5.3294,<1H OCEAN
17360,-120.4,34.87,10.0,2197.0,329.0,1064.0,319.0,4.9766,<1H OCEAN
17947,-121.96,37.33,35.0,2294.0,411.0,1054.0,449.0,4.0667,<1H OCEAN
19755,-122.21,40.18,30.0,744.0,156.0,410.0,165.0,2.1898,INLAND
3593,-118.48,34.24,32.0,2621.0,412.0,1285.0,414.0,6.6537,<1H OCEAN
15173,-117.06,33.03,23.0,2023.0,309.0,678.0,340.0,7.0913,<1H OCEAN
6448,-118.04,34.13,39.0,2485.0,382.0,1072.0,342.0,6.0878,INLAND


In [22]:
# Look up the true target values of the sampled rows by their index labels.
test_label = y.loc[test_data.index]
test_label

Unnamed: 0,median_house_value
17445,172600.0
10526,138500.0
15271,342400.0
18008,300300.0
17360,199600.0
17947,276900.0
19755,63200.0
3593,267600.0
15173,265400.0
6448,430200.0


In [25]:
# Run the sampled rows through the already-fitted pipeline (transform only, no refit).
test_data_prepared = full_pipeline.transform(test_data)
test_data_prepared

array([[-4.47603094e-01, -4.60146471e-01, -1.95271028e+00,
        -3.42596951e-01, -4.95225824e-01, -4.49818057e-01,
        -4.30461089e-01,  1.44701447e-01,  8.82160119e-02,
        -3.22796862e-02, -6.61657847e-01,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         1.00000000e+00],
       [ 9.38520745e-01, -9.70302653e-01, -1.95271028e+00,
         2.94767244e-01,  8.89040862e-01,  7.52377703e-02,
         7.40276475e-01, -2.98415837e-01, -5.18168628e-01,
        -1.00485215e-01,  1.11446930e+00,  1.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 1.15292120e+00, -1.19963892e+00, -1.71465506e+00,
         1.41174429e-01, -2.18372487e-01, -8.04320480e-02,
        -1.94213598e-01,  1.16132950e+00,  6.23004475e-01,
         3.18192867e-03, -1.05668783e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         1.00000000e+00],
       [-1.19052558e+00,  7.70780828e

In [27]:
# use the .predict method to make predictions

# Same transform as in the previous cell; repeated here so this cell runs on its own.
test_data_prepared = full_pipeline.transform(test_data)
predicted_labels = LR_model.predict(test_data_prepared)
predicted_labels

array([244957.50017771, 216895.72244916, 274561.89107679, 296452.00243446,
       264654.17245586, 251693.8203103 ,  60570.45550396, 325203.10952032,
       336881.70051099, 267554.83219558])

In [28]:
# Put the model's predictions next to the true values for comparison.
# Column labels are Uzbek: "Bashorat_Natija" = predicted, "Asl_Natija" = actual.
comparison_columns = {
    "Bashorat_Natija": predicted_labels,
    "Asl_Natija": test_label,
}
pd.DataFrame(data=comparison_columns)

Unnamed: 0,Bashorat_Natija,Asl_Natija
17445,244957.500178,172600.0
10526,216895.722449,138500.0
15271,274561.891077,342400.0
18008,296452.002434,300300.0
17360,264654.172456,199600.0
17947,251693.82031,276900.0
19755,60570.455504,63200.0
3593,325203.10952,267600.0
15173,336881.700511,265400.0
6448,267554.832196,430200.0


**5-QADAM MODELNI BAHOLASH**


In [29]:
# the model is evaluated on the held-out test_set split
test_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,47700.0,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,45800.0,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,500001.0,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,218600.0,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,278000.0,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,263300.0,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,266800.0,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,500001.0,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,72300.0,INLAND


In [30]:
# Remove the target column from the test split so only the features remain
X_test = test_set.drop("median_house_value", axis = 1)
X_test

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,INLAND


In [31]:
# keep the labels separately - these are the true values to compare against

y_label = test_set["median_house_value"].copy()
y_label

Unnamed: 0,median_house_value
20046,47700.0
3024,45800.0
15663,500001.0
20484,218600.0
9814,278000.0
...,...
15362,263300.0
16623,266800.0
18086,500001.0
2144,72300.0


In [32]:
# now pass X_test through the pipeline (transform only: it was fitted on the training features)

X_test_prepared = full_pipeline.transform(X_test)

In [34]:
# Predict with the linear-regression model trained earlier
y_predicted = LR_model.predict(X_test_prepared)
y_predicted

array([ 61874.25460143, 121853.52511139, 267770.94368091, ...,
       447837.04647878, 117275.9214608 , 185597.46125194])

In [39]:
# Measure how far off the predictions are,
# using the two formulas introduced at the start of this module: MAE and RMSE.

from sklearn.metrics import mean_absolute_error

# mean_absolute_error returns the MAE, so name and label the value MAE (not MSE).
mae = mean_absolute_error(y_label, y_predicted)
print("MAE =", mae)

MSE = 50898.73953494079


In [40]:
from sklearn.metrics import mean_squared_error

# mean_squared_error returns the MSE; its square root is the RMSE.
mse = mean_squared_error(y_label, y_predicted)
rmse = np.sqrt(mse)
print("RMSE =", rmse)

RMSE = 72701.32600762135


***Biz chiqargan model orqali bashoratimiz unchalik ham yaxshi emasdir, xatolik kattadek tuyulishi mumkin, ya'ni MAE va RMSE ga qarab. Lekin avvalda gaplashilgan ekspertlar natijasidan ham bizning model yaxshiroqdir, shunga ham e'tibor berishimiz zarur.***

### RandomForest

In [42]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest so the fitted model and its scores are the same on every run.
RF_model = RandomForestRegressor(random_state=42)
RF_model.fit(X_prepared, y)

# Check the model on the prepared test features.
# Separate names keep the linear-regression predictions in y_predicted intact.
rf_predictions = RF_model.predict(X_test_prepared)
rf_mse = mean_squared_error(y_label, rf_predictions)

# Compute the RMSE
rf_rmse = np.sqrt(rf_mse)
print(rf_rmse)

50384.46438208761


Cross-Validation usuli bilan baholash
Yuqorida biz modelni baholash uchun ma'lumotlarni test va train setlarga ajratdik.
Bu usulning kamchiligi biz test va train uchun doim bir xil ma'lumotlardan foydalanayapmiz.

Cross-validation yordamida biz ma'lumotlarni bir necha qismga ajratib, modelni turli qismlar yordamida bir necha bor train va test qilishimiz mumkin.

Misol uchun, quyidagi rasmda ma'lumotlarni 5 ga ajratib train va test qilish ko'rsatilgan.

In [43]:
# Cross-validation below runs on the whole dataset, so split the full frame
# into features and target.
# NOTE: this rebinds `y` and `X_prepared`, which until now held the
# training-split target and the prepared training matrix.
target_column = "median_house_value"
X = df.drop(columns=target_column)
y = df[target_column].copy()

# The pipeline is reused as already fitted (transform only, no refit).
X_prepared = full_pipeline.transform(X)
# Small helper for showing validation results
def display_scores(scores):
    """Print a set of scores together with their mean and standard deviation.

    Parameters
    ----------
    scores : array-like of numbers
        Scores to summarise. Lists and tuples are accepted as well as NumPy
        arrays; the input is converted with ``np.asarray`` before use.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    # Convert once so plain Python sequences get .mean() / .std() too.
    scores = np.asarray(scores)
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Std.dev:", scores.std())

# Cross-validation
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

# No earlier cell visible in this notebook creates Tree_model, so build it here;
# the fixed random_state keeps its scores reproducible.
Tree_model = DecisionTreeRegressor(random_state=42)

# Models to compare, keyed by the label printed above their scores.
cv_models = {
    "Linear Regression": LR_model,
    "Decision Tree": Tree_model,
    "Random Forest": RF_model,
}

# Per-model RMSE arrays, kept under separate keys so no result overwrites another.
cv_rmse_scores = {}
for model_name, estimator in cv_models.items():
    scores = cross_val_score(estimator, X_prepared, y, scoring="neg_mean_squared_error", cv=10)
    # The scorer reports negated MSE, so flip the sign before taking the root.
    cv_rmse_scores[model_name] = np.sqrt(-scores)
    print(model_name)
    display_scores(cv_rmse_scores[model_name])

# Saving the model
# The trained model should be saved so it can be reused later. In general it is worth saving not only the model but also any other objects that will be needed, for example the pipeline.

# Python's pickle or joblib modules can be used for this.

# saving with pickle

import pickle

filename = 'RF_model.pkl' # any file name can be used
with open(filename, 'wb') as file:
    pickle.dump(RF_model, file)

# Read the model back.
# SECURITY: pickle.load can run arbitrary code, so only load files you created or trust.
with open(filename, 'rb') as file:
    model = pickle.load(file)

# Round-trip check: the reloaded model must predict exactly like the one in memory.
assert np.array_equal(model.predict(X_prepared[:5]), RF_model.predict(X_prepared[:5]))

# Try the model out.
# Note: cross_val_score refits copies of the estimator on each fold, so these
# scores describe the model's configuration rather than its saved fitted state.
scores = cross_val_score(model, X_prepared, y, scoring="neg_mean_squared_error", cv=5)
rf_rmse_scores = np.sqrt(-scores)
display_scores(rf_rmse_scores)

# saving with joblib
# joblib is the preferred choice for objects that carry large NumPy arrays.

# if joblib is not installed, install it with: pip install joblib

import joblib

filename = 'RF_model.jbl' # any file name can be used
joblib.dump(RF_model, filename)

# Read the model back.
# SECURITY: like pickle, only load joblib files you created or trust.
model = joblib.load(filename)
scores = cross_val_score(model, X_prepared, y, scoring="neg_mean_squared_error", cv=5)
rf_rmse_scores = np.sqrt(-scores)
display_scores(rf_rmse_scores)

# save the pipeline as well
# NOTE(review): full_pipeline contains the custom CombinedAttributesAdder class,
# so that class presumably has to be importable wherever this file is loaded - confirm.

filename = 'pipeline.jbl'
joblib.dump(full_pipeline, filename)
# The next line looks like pasted cell output (the value joblib.dump returned);
# as a statement it has no effect.
['pipeline.jbl']
