In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn # scikit-learn.org library

In [5]:
# Location of the housing CSV; the file is read straight from this URL,
# so running this cell needs network access.
URL = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.csv"
df = pd.read_csv(URL)
# Preview the first five rows
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [6]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; random_state pins the shuffle so the split is repeatable
train_set, test_set = train_test_split(df, test_size = 0.2, random_state = 42)

In [7]:
# Target values for the training rows (copied so they are independent of train_set)
housing_labels = train_set["median_house_value"].copy()
# Training predictors: every column of train_set except the target
housing = train_set.drop(columns="median_house_value")

In [8]:
# Column dtypes and non-null counts of the full dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [9]:
# To fill the null entries, compute the median of total_bedrooms and use it as the fill value
# NOTE(review): the median is taken over the full df rather than train_set — presumably for demonstration only; confirm
median = df["total_bedrooms"].median()
median

435.0

In [10]:
# Replace the missing total_bedrooms values in df with the `median` value
df["total_bedrooms"] = df["total_bedrooms"].fillna(median)
# Re-check the non-null counts after filling
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20640 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [11]:
# Above, the fillna method was used to fill the null values.
# To automate this further, the following tool is used instead:
# SimpleImputer, which ships with scikit-learn.

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy = "median") # strategy "median": empty entries are filled with the column median

In [12]:
# Frame the imputer will be applied to. A median cannot be computed on
# strings, so the text column ocean_proximity is left out.
housing_number = housing.drop(columns="ocean_proximity")
housing_number.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425
2271,-119.8,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542


In [13]:
# Fit the imputer on the numeric training columns
imputer.fit(housing_number)

In [14]:
# Inspect the values the imputer holds, i.e. the median of each column:
imputer.statistics_

array([-118.51  ,   34.26  ,   29.    , 2129.    ,  437.    , 1167.    ,
        410.    ,    3.5458])

In [15]:
# transform fills in the missing entries; the returned values are an array
X = imputer.transform(housing_number)
X

array([[-1.1703e+02,  3.2710e+01,  3.3000e+01, ...,  2.3000e+03,
         6.2300e+02,  3.2596e+00],
       [-1.1816e+02,  3.3770e+01,  4.9000e+01, ...,  1.3140e+03,
         7.5600e+02,  3.8125e+00],
       [-1.2048e+02,  3.4660e+01,  4.0000e+00, ...,  9.1500e+02,
         3.3600e+02,  4.1563e+00],
       ...,
       [-1.1838e+02,  3.4030e+01,  3.6000e+01, ...,  1.7560e+03,
         5.2700e+02,  2.9344e+00],
       [-1.2196e+02,  3.7580e+01,  1.5000e+01, ...,  1.7770e+03,
         5.5900e+02,  5.7192e+00],
       [-1.2242e+02,  3.7770e+01,  5.2000e+01, ...,  2.6190e+03,
         1.2420e+03,  2.5755e+00]])

In [16]:
# Check the container type returned by the imputer
type(X)

numpy.ndarray

In [17]:
# More generally, fitting and filling can be done in one step like this
X = imputer.fit_transform(housing_number)
X

array([[-1.1703e+02,  3.2710e+01,  3.3000e+01, ...,  2.3000e+03,
         6.2300e+02,  3.2596e+00],
       [-1.1816e+02,  3.3770e+01,  4.9000e+01, ...,  1.3140e+03,
         7.5600e+02,  3.8125e+00],
       [-1.2048e+02,  3.4660e+01,  4.0000e+00, ...,  9.1500e+02,
         3.3600e+02,  4.1563e+00],
       ...,
       [-1.1838e+02,  3.4030e+01,  3.6000e+01, ...,  1.7560e+03,
         5.2700e+02,  2.9344e+00],
       [-1.2196e+02,  3.7580e+01,  1.5000e+01, ...,  1.7770e+03,
         5.5900e+02,  5.7192e+00],
       [-1.2242e+02,  3.7770e+01,  5.2000e+01, ...,  2.6190e+03,
         1.2420e+03,  2.5755e+00]])

In [18]:
# Wrap the array X back into a DataFrame, reusing the row index and
# column labels of housing_number
housing_transform_df = pd.DataFrame(
    data=X,
    index=housing_number.index,
    columns=housing_number.columns,
)
housing_transform_df.head(10)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425
2271,-119.8,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542
17848,-121.86,37.42,20.0,5032.0,808.0,2695.0,801.0,6.6227
6252,-117.97,34.04,28.0,1686.0,417.0,1355.0,388.0,2.5192
9389,-122.53,37.91,37.0,2524.0,398.0,999.0,417.0,7.9892
6113,-117.9,34.13,5.0,1126.0,316.0,819.0,311.0,1.5
6061,-117.79,34.02,5.0,18690.0,2862.0,9427.0,2777.0,6.4266


In [19]:
# So far only the numeric columns were handled; now we work with the string (text) data.
# The machine-learning algorithms in our model do not read text, they only work with numbers.
# Text columns are therefore usually split into categories and converted to numeric values.
housing_category = housing[['ocean_proximity']]
housing_category

Unnamed: 0,ocean_proximity
14196,NEAR OCEAN
8267,NEAR OCEAN
17445,NEAR OCEAN
14265,NEAR OCEAN
2271,INLAND
...,...
11284,<1H OCEAN
11964,INLAND
5390,<1H OCEAN
860,<1H OCEAN


In [20]:
# Split the string column into categories and turn each category into a number
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()

housing_category_encoder = ordinal_encoder.fit_transform(housing_category)

In [21]:
# How many training rows fall into each ocean_proximity category
housing['ocean_proximity'].value_counts()

Unnamed: 0_level_0,count
ocean_proximity,Unnamed: 1_level_1
<1H OCEAN,7341
INLAND,5227
NEAR OCEAN,2086
NEAR BAY,1854
ISLAND,4


In [22]:
# This result is an array as well.
# The ocean_proximity column, which has 5 distinct values, has been replaced by numeric codes,
# but this way of encoding has a drawback
housing_category_encoder[ : 10]

array([[4.],
       [4.],
       [4.],
       [4.],
       [1.],
       [0.],
       [0.],
       [3.],
       [0.],
       [0.]])

In [23]:
# Downside of this method compared with OrdinalEncoder: if there are already many columns, the column count grows further, by the number of categories.
# OneHotEncoder returns a matrix in which the category each row belongs to is marked with a 1

from sklearn.preprocessing import OneHotEncoder

category_encoder = OneHotEncoder()

housing_category_one_hot = category_encoder.fit_transform(housing_category)
# toarray() is called on the result so it is displayed as a regular array
housing_category_one_hot.toarray()

array([[0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       ...,
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])

In [24]:
# The same kind of indicator matrix can also be obtained through the pandas library.
# BUT for machine learning we use the scikit-learn library
housing_one_hot = pd.get_dummies(housing["ocean_proximity"])
housing_one_hot.head(10)

Unnamed: 0,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
14196,False,False,False,False,True
8267,False,False,False,False,True
17445,False,False,False,False,True
14265,False,False,False,False,True
2271,False,True,False,False,False
17848,True,False,False,False,False
6252,True,False,False,False,False
9389,False,False,False,True,False
6113,True,False,False,False,False
6061,True,False,False,False,False


In [2]:
# With sklearn we can create our own transformers and automate the processing.
# The transformer below computes the per-household ratios calculated earlier (e.g. rooms and people per household).
# This transformer and the ones above can be reused as templates in other projects

from sklearn.base import BaseEstimator, TransformerMixin

# Positions of the columns we need, stored in named variables
# (they correspond to total_rooms, total_bedrooms, population, households in housing_number's column order)
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio columns computed from existing columns of a 2-D array.

    Columns are located by position through the module-level indices
    ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``households_ix``.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        When True, ``bedrooms / rooms`` is appended as a third extra column.
    """

    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y = None):
        """Do nothing and return ``self``; this class only transforms."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right.

        Parameters
        ----------
        X : 2-D array-like
            Input data; a pandas DataFrame is accepted as well as an ndarray.

        Returns
        -------
        numpy.ndarray
            ``X`` followed by rooms-per-household and population-per-household,
            plus bedrooms-per-room when ``add_bedrooms_per_room`` is True.
        """
        # ndarrays pass through unchanged; other array-likes (e.g. a DataFrame)
        # are converted so the positional slicing below works on them too.
        X = np.asarray(X)
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:  # the bedrooms_per_room column is optional
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        return np.c_[X, rooms_per_household, population_per_household]

In [25]:
# Build the transformer without the optional bedrooms_per_room column and apply it to the training values.
# housing.values still contains the ocean_proximity strings, so the arrays involved have dtype=object.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room = False)
housing_extra_attribs = attr_adder.transform(housing.values)

In [26]:
# First row of the input, before the extra columns are added
housing.values[0, :]

array([-117.03, 32.71, 33.0, 3126.0, 627.0, 2300.0, 623.0, 3.2596,
       'NEAR OCEAN'], dtype=object)

In [27]:
# First row of the transformed array: two ratio values are appended at the end
housing_extra_attribs[0, :]

array([-117.03, 32.71, 33.0, 3126.0, 627.0, 2300.0, 623.0, 3.2596,
       'NEAR OCEAN', 5.017656500802568, 3.691813804173355], dtype=object)

In [None]:
# Standardization and Normalization

# For machine learning to work well, the value ranges of the features should not differ too much from one another
# MinMaxScaler from sklearn takes care of this
# but the same rescaling can also be done by hand

$$x_{new} = \frac{x - x_{min}}{x_{max} - x_{min}}$$

In [32]:
# Use the frame from which the ocean_proximity string column was dropped
# .describe() lets us see, among other statistics, the min and max values of each column
# NOTE(review): the saved output reports 16512 non-null total_bedrooms values and execution
# counts 28-31 are absent — possibly hidden state; confirm with Restart & Run All
housing_number.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
count,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0,16512.0
mean,-119.58229,35.643149,28.608285,2642.004784,538.496851,1426.453004,499.986919,3.880754
std,2.005654,2.136665,12.602499,2174.646744,419.007096,1137.05638,380.967964,1.904294
min,-124.35,32.55,1.0,2.0,1.0,3.0,1.0,0.4999
25%,-121.81,33.93,18.0,1454.0,296.75,789.0,280.0,2.5667
50%,-118.51,34.26,29.0,2129.0,437.0,1167.0,410.0,3.5458
75%,-118.01,37.72,37.0,3160.0,647.0,1726.0,606.0,4.773175
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001


In [35]:
# Show the first rows of the numeric frame
housing_number.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425
2271,-119.8,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542


In [33]:
# Smallest median_income value (x_min for min-max scaling)
xmin = housing_number["median_income"].min()
xmin

0.4999

In [34]:
# Largest median_income value (x_max for min-max scaling)
xmax = housing_number["median_income"].max()
xmax

15.0001

In [36]:
# Min-max scale a single value by hand; 3.2596 is the median_income shown in the first row of housing_number
xnew = (3.2596 - xmin) / (xmax - xmin)
xnew

0.19032151280671988

In [37]:
# With MinMaxScaler the rescaled values lie in the range 0 <= xnew <= 1

from sklearn.preprocessing import MinMaxScaler

min_max_scaller = MinMaxScaler()

# Fit on housing_number and return the rescaled values as an array
min_max_scaller.fit_transform(housing_number)

array([[0.72908367, 0.01702128, 0.62745098, ..., 0.06437961, 0.10228581,
        0.19032151],
       [0.61653386, 0.12978723, 0.94117647, ..., 0.0367443 , 0.12415721,
        0.22845202],
       [0.38545817, 0.22446809, 0.05882353, ..., 0.02556125, 0.05508962,
        0.25216204],
       ...,
       [0.59462151, 0.15744681, 0.68627451, ..., 0.04913254, 0.08649893,
        0.16789424],
       [0.23804781, 0.53510638, 0.2745098 , ..., 0.04972112, 0.09176122,
        0.35994676],
       [0.19223108, 0.55531915, 1.        , ..., 0.07332044, 0.20407828,
        0.14314285]])

In [39]:
# The second approach is called Standardization (the original note also calls it Normalization)

from sklearn.preprocessing import StandardScaler

standard_scaler = StandardScaler()

# Fit on housing_number and return the standardized values as an array
standard_scaler.fit_transform(housing_number)

array([[ 1.27258656, -1.3728112 ,  0.34849025, ...,  0.76827628,
         0.32290591, -0.326196  ],
       [ 0.70916212, -0.87669601,  1.61811813, ..., -0.09890135,
         0.6720272 , -0.03584338],
       [-0.44760309, -0.46014647, -1.95271028, ..., -0.44981806,
        -0.43046109,  0.14470145],
       ...,
       [ 0.59946887, -0.75500738,  0.58654547, ...,  0.28983345,
         0.07090859, -0.49697313],
       [-1.18553953,  0.90651045, -1.07984112, ...,  0.30830275,
         0.15490769,  0.96545045],
       [-1.41489815,  0.99543676,  1.85617335, ...,  1.04883375,
         1.94776365, -0.68544764]])

$$x_{new} = \frac{x - x_{mean}}{x_{std}}$$

### 3.3 Transformer

Yuqorida ma'lumotlarga ishlov berishda biz `sklearn` tarkibidagi tayyor obyektlar va ulardagi `.fit()` yoki `.fit_transform()` funksiyalariga murojaat qildik.
Bu funksiyalar (`imputer`, `OneHotEncoder` va hokazo) **transformer**lar deb ataladi.

Umuman olganda `sklearn` tarkibidagi obyektlar uch turli bo'ladi:
1. **Estimator** - tahmin qiluvchi (baholovchi) funksiyalar. Bunday obyektlar `.fit()` metodiga ega bo'ladi, bu metod berilgan ma'lumotlardan qandaydir qiymatni hisoblaydi (masalan, avvalroq biz `imputer.fit()` yordamida median qiymatni hisobladik)
2. **Transformer** - ham tahmin qilib ham ma'lumotga ishlov beruvchi obyekt. Bunday obyektlar `.fit_transform()` metodiga ega bo'ladi, ya'ni avval qandaydir qiymatni hisoblaydi, keyin esa hisoblashni ma'lumotlarga qo'llaydi. Yuqoridagi `OrdinalEncoder` va `OneHotEncoder` shunday obyektlardan.
3. **Predictor** - bashorat qiluvchi obyektlar. Bunday obyektlar `.predict()` metodiga ega bo'ladi. Ular biz bergan ma'lumotlar asosida bashoratlar qaytaradi. Dars davomida predictor obyektlar bilan ko'p ishlaymiz.

Obyektlarning afzalligi, ular hisob-kitob natijalarini ichki atributlarda saqlab qoladi. Masalan, yuqorida ko'rganimiz `imputer.statistics_`.

Keling endi o'zimiz ham transformer yozishni o'rganamiz.

Esingizda bo'lsa, avvalgi darsimizda biz ma'lumotlarga yangi 2 ustun qo'shdik: `rooms_per_household` va `bedrooms_per_room`.

Keling berilgan ma'lumotlarga shu ikki ustunni avtomat ravishda qo'shadigan transformer yasaymiz.

Buning uchun `sklearn` tarkibidagi `BaseEstimator` va `TransformerMixin` klasslaridan meros olib yangi klass yasaymiz va bu klassimizga `fit()` va `transform()` metodlarini qo'shamiz: