In [1]:
import pandas as pd
import numpy as np
import sklearn

In [8]:
# Download the housing CSV from the handson-ml2 GitHub repository and preview the first rows.
housing_csv_url = "https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true"
df = pd.read_csv(housing_csv_url)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [9]:
# Inspect column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [12]:
# Drop every row that contains at least one missing value. The name is rebound
# instead of mutating the frame with inplace=True, so the step reads as an explicit
# assignment; the surviving rows keep their original index labels.
df = df.dropna()

In [13]:
# Re-check the frame after the rows with missing values were dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20433 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20433 non-null  float64
 1   latitude            20433 non-null  float64
 2   housing_median_age  20433 non-null  float64
 3   total_rooms         20433 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20433 non-null  float64
 6   households          20433 non-null  float64
 7   median_income       20433 non-null  float64
 8   median_house_value  20433 non-null  float64
 9   ocean_proximity     20433 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.7+ MB


In [14]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
train_set, test_set = train_test_split(df, test_size=0.3, random_state=34)

# Split the training rows into the target and the predictors.
y = train_set["median_house_value"].copy()
X_train = train_set.drop(columns="median_house_value")

# Numeric-only view of the predictors (the categorical column is removed).
X_num = X_train.drop(columns="ocean_proximity")

In [15]:
from sklearn.base import BaseEstimator, TransformerMixin
# Positional indices of the columns we need to build the ratio features
# (total_rooms, total_bedrooms, population, households in the numeric column order).
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features computed from existing numeric columns.

    The columns are located by the module-level positional indices
    ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``households_ix``.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        When true, a third derived column (bedrooms / rooms) is appended
        after rooms-per-household and population-per-household.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Return ``self`` unchanged: this is a pure transformer with nothing to learn."""
        return self

    def transform(self, X):
        """Return ``X`` with the derived ratio columns appended on the right.

        ``X`` may be a 2-D NumPy array or any array-like that ``np.asarray``
        can convert (for example a numeric DataFrame).
        """
        # Positional slicing such as X[:, i] only works on NumPy arrays, so coerce
        # first; an ndarray input passes through unchanged.
        X = np.asarray(X)
        households = X[:, households_ix]
        extra_columns = [
            X[:, rooms_ix] / households,       # rooms per household
            X[:, population_ix] / households,  # population per household
        ]
        if self.add_bedrooms_per_room:  # the bedrooms-per-room column is optional
            extra_columns.append(X[:, bedrooms_ix] / X[:, rooms_ix])
        return np.c_[(X, *extra_columns)]

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric preprocessing: fill gaps with the column median, append the ratio
# features, then standardise every column.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("attribs_adder", CombinedAttributesAdder(add_bedrooms_per_room=True)),
        ("std_scaler", StandardScaler()),
    ]
)

In [17]:
from sklearn.compose import ColumnTransformer

# Column names routed to each branch of the preprocessing.
cat_attribs = ["ocean_proximity"]
num_attribs = X_num.columns.tolist()

# Numeric columns go through num_pipeline; the categorical column is one-hot encoded.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ]
)

In [18]:
# Show the numeric column names that are routed to num_pipeline.
num_attribs

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income']

In [19]:
# Fit the preprocessing on the training predictors and transform them in one step,
# then peek at the first five prepared rows.
X_prepared = full_pipeline.fit_transform(X_train)
X_prepared[:5]

array([[-0.61775764,  0.8890882 , -0.6844821 , -0.87151138, -1.00080694,
        -0.98483524, -1.07228316,  1.17569632,  1.16795121,  0.03493972,
        -0.87820324,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ],
       [-1.3917177 ,  2.70038839, -0.049212  , -0.40739517, -0.27505967,
        -0.42404962, -0.33064347, -1.3671812 , -0.31810482, -0.07646333,
         0.49823135,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ],
       [-1.00723432,  1.92812862, -0.52566457, -0.0813331 ,  0.05093173,
        -0.22643119,  0.15507831, -1.07719569, -0.43198711, -0.13879836,
         0.24159473,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ],
       [-0.93233496,  1.39456732,  0.66546685, -0.49594963, -0.52252759,
        -0.23509867, -0.45599102, -0.69030427, -0.27998871,  0.07598284,
        -0.1296468 ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.68549122, -0.78648149,  1

In [20]:
from sklearn.linear_model import LinearRegression

# Create an unfitted linear regression model with default settings.
LR_model = LinearRegression()

In [21]:
# Train the model on the prepared training features and the training targets.
LR_model.fit(X_prepared, y)

In [22]:
# Display the held-out test split (the rendering of a long frame is truncated).
test_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
20338,-119.03,34.23,16.0,5323.0,795.0,2493.0,779.0,5.6762,271300.0,<1H OCEAN
7678,-118.09,33.92,31.0,1983.0,419.0,1157.0,390.0,3.5455,168300.0,<1H OCEAN
14176,-117.07,32.72,18.0,1758.0,286.0,987.0,277.0,4.6875,141800.0,NEAR OCEAN
16061,-122.49,37.76,49.0,1637.0,304.0,729.0,281.0,4.3281,323100.0,NEAR BAY
6111,-117.90,34.13,25.0,3076.0,856.0,2868.0,752.0,2.6619,117600.0,<1H OCEAN
...,...,...,...,...,...,...,...,...,...,...
10736,-117.90,33.61,41.0,1521.0,328.0,527.0,275.0,4.0764,500001.0,<1H OCEAN
484,-122.27,37.86,52.0,2307.0,583.0,1127.0,548.0,1.8447,198200.0,NEAR BAY
4498,-118.21,34.05,26.0,745.0,258.0,694.0,236.0,1.3846,129200.0,<1H OCEAN
6085,-117.87,34.09,36.0,1267.0,191.0,640.0,200.0,5.2405,220000.0,<1H OCEAN


In [24]:
# Predictors of the test split: every column except the target.
X_test = test_set.drop(columns='median_house_value')
X_test.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
20338,-119.03,34.23,16.0,5323.0,795.0,2493.0,779.0,5.6762,<1H OCEAN
7678,-118.09,33.92,31.0,1983.0,419.0,1157.0,390.0,3.5455,<1H OCEAN
14176,-117.07,32.72,18.0,1758.0,286.0,987.0,277.0,4.6875,NEAR OCEAN
16061,-122.49,37.76,49.0,1637.0,304.0,729.0,281.0,4.3281,NEAR BAY
6111,-117.9,34.13,25.0,3076.0,856.0,2868.0,752.0,2.6619,<1H OCEAN


In [25]:
# Target values of the test split, copied so they are independent of test_set.
y_test = test_set.loc[:, 'median_house_value'].copy()
y_test

20338    271300.0
7678     168300.0
14176    141800.0
16061    323100.0
6111     117600.0
           ...   
10736    500001.0
484      198200.0
4498     129200.0
6085     220000.0
16131    457800.0
Name: median_house_value, Length: 6130, dtype: float64

In [26]:
# Apply the preprocessing that was fitted on X_train; only transform() is called,
# so nothing is re-fitted on the test data.
X_test_prepared = full_pipeline.transform(X_test)

In [27]:
# Predict the target for the prepared test features.
y_predicted = LR_model.predict(X_test_prepared)

In [28]:
from sklearn.metrics import mean_squared_error
# Root-mean-squared error of the test-set predictions (same units as the target).
lin_mse = mean_squared_error(y_test, y_predicted)
lin_rmse = np.sqrt(lin_mse)
# `!s` forces str() formatting: the `=` specifier defaults to repr(), which renders
# NumPy scalars as "np.float64(...)" on NumPy >= 2 instead of the plain number.
print(f'{lin_rmse=!s}')

lin_rmse=67409.42864828889


# Cross-validation

In [31]:
def display_scores(scores):
    """Print the raw scores followed by their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods (e.g. a NumPy array).
    """
    print("Scores:", scores)
    average = scores.mean()
    print("Mean:", average)
    spread = scores.std()
    print("Std.dev:", spread)

In [29]:
# Prepare the whole cleaned dataset (train and test rows together) for cross-validation.
# NOTE(review): this rebinds `y` and `X_prepared`, which until now held the training
# split, and it reuses `full_pipeline` as fitted on X_train only - confirm that
# evaluating on all rows with pre-fitted preprocessing is intended.
y = df['median_house_value'].copy()
X = df.drop(columns='median_house_value')
X_prepared = full_pipeline.transform(X)

In [33]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation. The scorer returns negated MSE values, so flip the sign
# before taking the square root to obtain one RMSE per fold.
scores = cross_val_score(
    estimator=LR_model,
    X=X_prepared,
    y=y,
    scoring="neg_mean_squared_error",
    cv=5,
)
rmse_scores = np.sqrt(np.negative(scores))
display_scores(rmse_scores)

Scores: [72081.03946226 74317.13788012 75204.59956579 75613.47670597
 66200.18454445]
Mean: 72683.28763171735
Std.dev: 3464.2695337321175
