<a href="https://colab.research.google.com/github/Azizbek0712/california.pro/blob/main/california_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import sklearn # scikit-learn kutubxonasi

In [None]:
# Address of the dataset hosted online (the `?raw=true` suffix asks GitHub for the raw CSV)
URL = "https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true"
# Download the CSV and parse it into a DataFrame
df = pd.read_csv(filepath_or_buffer=URL)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
train_set, test_set = train_test_split(df, random_state=42, test_size=0.2)

# Split the training rows into the target column and the predictors.
y = train_set["median_house_value"].copy()
X_train = train_set.drop(columns="median_house_value")

# Predictors without the text-valued `ocean_proximity` column.
X_num = X_train.drop(columns="ocean_proximity")

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
# Positional indices of the columns used to build the combined attributes
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
households_ix = 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append per-household / per-room ratio columns to a feature matrix.

    The input columns are located by the module-level positional indices
    ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``households_ix``.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        When true, a third ratio column (bedrooms / rooms) is appended in
        addition to the two per-household ratios.
    """

    def __init__(self, add_bedrooms_per_room = True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Learn nothing and return ``self``; this class is a pure transformer."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right.

        Parameters
        ----------
        X : array-like, 2-D
            Feature matrix. It is converted with ``np.asarray`` first so that
            DataFrames (which reject ``X[:, i]`` indexing) work as well as
            NumPy arrays.

        Returns
        -------
        numpy.ndarray
            ``X`` followed by rooms/household, population/household and,
            if ``add_bedrooms_per_room`` is set, bedrooms/room.
        """
        # Positional slicing below needs an ndarray; this is a no-op for arrays.
        X = np.asarray(X)
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room: # the bedrooms_per_room column is optional
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric preprocessing, applied in order:
#   1. replace missing values with the column median,
#   2. append the combined ratio attributes,
#   3. standardise every column.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder(add_bedrooms_per_room=True)),
    ('std_scaler', StandardScaler()),
])

In [None]:
from sklearn.compose import ColumnTransformer

# Column names routed to each branch of the preprocessing
num_attribs = X_num.columns.tolist()
cat_attribs = ['ocean_proximity']

# Numeric columns go through `num_pipeline`; the categorical column is one-hot encoded.
full_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs),
])

In [None]:
# Fit the preprocessing on the training predictors and transform them in one step
X_prepared = full_pipeline.fit_transform(X=X_train)

In [None]:
# Preview the first five prepared rows
X_prepared[:5]

array([[ 1.27258656, -1.3728112 ,  0.34849025,  0.22256942,  0.21122752,
         0.76827628,  0.32290591, -0.326196  , -0.17491646,  0.05137609,
        -0.2117846 ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 0.70916212, -0.87669601,  1.61811813,  0.34029326,  0.59309419,
        -0.09890135,  0.6720272 , -0.03584338, -0.40283542, -0.11736222,
         0.34218528,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [-0.44760309, -0.46014647, -1.95271028, -0.34259695, -0.49522582,
        -0.44981806, -0.43046109,  0.14470145,  0.08821601, -0.03227969,
        -0.66165785,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 1.23269811, -1.38217186,  0.58654547, -0.56148971, -0.40930582,
        -0.00743434, -0.38058662, -1.01786438, -0.60001532,  0.07750687,
         0.78303162,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [-0.10855122,  0.5320839 ,  1

In [None]:
from sklearn.linear_model import LinearRegression

# Linear regression baseline, created with default settings (not fitted yet)
LR_model = LinearRegression()

In [None]:
# Train the linear model on the prepared training features and their targets
LR_model.fit(X=X_prepared, y=y)

In [None]:
# Pick 5 random rows for a quick sanity check of the model.
# The fixed seed makes the same rows come back on every run; without it the
# cell produced different rows (and different downstream outputs) each time.
# Note: these rows are drawn from X_train, i.e. the model has already seen them.
test_data = X_train.sample(n=5, random_state=42)
test_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
3697,-118.37,34.2,34.0,2199.0,609.0,2488.0,597.0,2.9861,<1H OCEAN
5647,-118.29,33.74,29.0,1503.0,411.0,1368.0,390.0,2.1473,NEAR OCEAN
5778,-118.26,34.15,14.0,2981.0,894.0,1941.0,863.0,3.0,<1H OCEAN
16970,-122.3,37.55,35.0,3675.0,735.0,1930.0,715.0,3.9833,NEAR OCEAN
5680,-118.29,33.71,40.0,1933.0,475.0,902.0,412.0,4.25,NEAR OCEAN


In [None]:
# Look up the prices that belong to the rows sampled above
# (these are exactly the values the model is supposed to predict)
test_label = y.loc[test_data.index]
test_label

3697     171800.0
5647     195300.0
5778     178600.0
16970    342800.0
5680     332800.0
Name: median_house_value, dtype: float64

In [None]:
# Run the sampled rows through the already-fitted preprocessing (transform only, no refit)
test_data_prepared = full_pipeline.transform(X=test_data)
test_data_prepared

array([[ 0.60445493, -0.67544174,  0.42784199, -0.20371962,  0.16826752,
         0.9336205 ,  0.25465663, -0.46982314, -0.73380641,  0.09246041,
         1.10505387,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.64434338, -0.89073701,  0.03108328, -0.52378129, -0.30429249,
        -0.05140886, -0.28871259, -0.9103147 , -0.6624166 ,  0.03547393,
         1.0448581 ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 0.65930155, -0.6988434 , -1.15919286,  0.15588991,  0.84846753,
         0.45253919,  0.95289922, -0.46252363, -0.82980918, -0.07322521,
         1.50087829,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [-1.35506546,  0.89246946,  0.50719373,  0.47503186,  0.46898752,
         0.4428648 ,  0.56440334,  0.05385138, -0.12372746, -0.03434505,
        -0.22171354,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 0.64434338, -0.90477801,  0

In [None]:
# Linear-model predictions for the sampled rows
predicted_data = LR_model.predict(X=test_data_prepared)
predicted_data

array([171359.43679875, 162818.0978147 , 209653.54531071, 261352.90205808,
       280931.61239805])

In [None]:
# Side-by-side table of predicted vs. actual prices; the row labels come from `test_label`
pd.DataFrame(
    data={
        'Prognoz': predicted_data,
        'Real baxosi': test_label,
    }
)

Unnamed: 0,Prognoz,Real baxosi
3697,171359.436799,171800.0
5647,162818.097815,195300.0
5778,209653.545311,178600.0
16970,261352.902058,342800.0
5680,280931.612398,332800.0


In [None]:
# Inspect the held-out test split (long frames are shown truncated)
test_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,47700.0,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,45800.0,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,500001.0,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,218600.0,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,278000.0,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,263300.0,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,266800.0,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,500001.0,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,72300.0,INLAND


In [None]:
# Test-set predictors: every column except the target
X_test = test_set.drop(columns='median_house_value')
X_test

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,INLAND


In [None]:
# Test-set target values, copied so later edits cannot touch `test_set`
y_test = test_set.loc[:, 'median_house_value'].copy()
y_test

20046     47700.0
3024      45800.0
15663    500001.0
20484    218600.0
9814     278000.0
           ...   
15362    263300.0
16623    266800.0
18086    500001.0
2144      72300.0
3665     151500.0
Name: median_house_value, Length: 4128, dtype: float64

In [None]:
# Apply the preprocessing fitted on the training data to the test predictors
X_test_prepared = full_pipeline.transform(X=X_test)

In [None]:
# Linear-model predictions for the whole test set
y_predicted = LR_model.predict(X=X_test_prepared)

In [None]:
from sklearn.metrics import mean_squared_error
# Mean squared error of the linear model on the test set
lin_mse = mean_squared_error(y_true=y_test, y_pred=y_predicted)
# Take the square root to get the RMSE, which is in the same units as the target
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

72701.32600762138


In [None]:
from sklearn.tree import DecisionTreeRegressor
# Decision tree regressor. The seed (same value as the train/test split) makes
# the fitted tree, and therefore the reported error, repeatable across runs.
Tree_model = DecisionTreeRegressor(random_state=42)
Tree_model.fit(X_prepared, y)

In [None]:
# Decision-tree predictions for the whole test set
y_predicted = Tree_model.predict(X=X_test_prepared)

In [None]:
# Test-set error of the decision tree. Stored under its own names so the
# linear model's `lin_mse` / `lin_rmse` are not silently overwritten.
tree_mse = mean_squared_error(y_test, y_predicted)
# Compute the RMSE
tree_rmse = np.sqrt(tree_mse)
print(tree_rmse)

72281.63167709751


In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest regressor. Seeded (same value as the train/test split) so that
# the forest and every score derived from it are repeatable across runs.
RF_model = RandomForestRegressor(random_state=42)
RF_model.fit(X_prepared, y)

In [None]:
# Random-forest predictions and error on the test set. The metrics get their
# own names instead of reusing the linear model's `lin_mse` / `lin_rmse`.
y_predicted = RF_model.predict(X_test_prepared)
rf_mse = mean_squared_error(y_test, y_predicted)
# Compute the RMSE
rf_rmse = np.sqrt(rf_mse)
print(rf_rmse)

50237.9328433118


In [None]:
# Prepare the complete dataset (training and test rows together) for cross-validation.
# NOTE(review): this rebinds `y` and `X_prepared`, which until now held the
# training split only; re-running earlier cells after this one mixes the two.
y = df.loc[:, "median_house_value"].copy()
X = df.drop(columns="median_house_value")

# Transform only: the preprocessing keeps the statistics fitted on the training split.
X_prepared = full_pipeline.transform(X=X)

In [None]:
def display_scores(scores):
    """Print a set of scores followed by their mean and standard deviation.

    Parameters
    ----------
    scores : array-like of numbers
        Typically the per-fold scores of a cross-validation run. Any sequence
        is accepted; it is converted with ``np.asarray`` so that plain lists
        and tuples work as well as NumPy arrays.

    Returns
    -------
    None
        The three lines are written to standard output.
    """
    # Lists/tuples have no .mean()/.std(); this is a no-op for ndarrays.
    scores = np.asarray(scores)
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Std.dev:", scores.std())

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 10-fold cross-validation of the linear model. The scorer returns negated MSE
# values, so the sign is flipped before taking the square root.
scores = cross_val_score(
    estimator=LR_model,
    X=X_prepared,
    y=y,
    scoring="neg_mean_squared_error",
    cv=10,
)
LR_rmse_scores = np.sqrt(-scores)

In [None]:
# Report the per-fold RMSE of the linear model with its mean and spread
display_scores(scores=LR_rmse_scores)

Scores: [84188.51219065 61197.24357613 86752.24346334 62289.14292385
 80540.40041898 68919.39949642 52503.82940087 90910.07884989
 77674.67507925 53941.60539478]
Mean: 71891.71307941683
Std.dev: 13249.525989444988


In [None]:
# 10-fold cross-validation of the decision tree. The result gets its own name
# so the linear model's `LR_rmse_scores` is not overwritten with tree scores.
scores = cross_val_score(Tree_model, X_prepared, y, scoring="neg_mean_squared_error", cv=10)
# The scorer yields negated MSE values: flip the sign, then take the root.
tree_rmse_scores = np.sqrt(-scores)
display_scores(tree_rmse_scores)

Scores: [118243.36255836  72152.87606499  83614.31634612  76738.32935684
  89684.51836351  78777.57740811  68061.30128262 100989.62639553
  95384.68741059  74403.66974838]
Mean: 85805.02649350377
Std.dev: 14687.543229467552


In [None]:
# 10-fold cross-validation of the random forest. The result gets its own name
# so the linear model's `LR_rmse_scores` is not overwritten with forest scores.
scores = cross_val_score(RF_model, X_prepared, y, scoring="neg_mean_squared_error", cv=10)
# The scorer yields negated MSE values: flip the sign, then take the root.
rf_rmse_scores = np.sqrt(-scores)
display_scores(rf_rmse_scores)

Scores: [97477.95030597 47621.10049062 65291.00808168 57038.65003642
 60561.34133655 60119.04595086 47142.05689682 80294.68720801
 74307.65095195 49033.47461531]
Mean: 63888.69658741873
Std.dev: 15316.274094449502


In [None]:
import pickle

filename = 'RF_model.pkl' # any file name will do
# Serialise the fitted random forest to disk in binary mode
with open(filename, mode='wb') as model_file:
    pickle.dump(RF_model, model_file)

In [None]:
# Read the model back from the pickle written above.
# Only unpickle files you created yourself: loading a pickle can execute arbitrary code.
with open(filename, mode='rb') as model_file:
    model = pickle.load(model_file)


In [None]:
# 5-fold cross-validation of the model restored from the pickle file.
# cross_val_score refits clones of the estimator on every fold, so this checks
# that the restored object is a usable estimator, not the trees stored in it.
scores = cross_val_score(model, X_prepared, y, scoring="neg_mean_squared_error", cv=5)
# Own result name: these are not linear-regression scores.
pickle_model_rmse_scores = np.sqrt(-scores)
display_scores(pickle_model_rmse_scores)

Scores: [76302.95817103 64211.15460324 61572.03991524 80954.86510056
 62278.91886056]
Mean: 69063.98733012643
Std.dev: 7993.8987817707975


In [None]:
import joblib

filename = 'RF_model.jbl' # any file name will do
# Persist the fitted random forest with joblib; the call returns the list of files written
joblib.dump(value=RF_model, filename=filename)

['RF_model.jbl']

In [None]:
# Restore the model from the joblib file written above
model = joblib.load(filename=filename)

In [None]:
# 5-fold cross-validation of the model restored from the joblib file.
# cross_val_score refits clones of the estimator on every fold, so this checks
# that the restored object is a usable estimator, not the trees stored in it.
scores = cross_val_score(model, X_prepared, y, scoring="neg_mean_squared_error", cv=5)
# Own result name: these are not linear-regression scores.
joblib_model_rmse_scores = np.sqrt(-scores)
display_scores(joblib_model_rmse_scores)

Scores: [77047.9858354  64084.30608828 61117.34688266 80225.24685148
 62206.86946016]
Mean: 68936.35102359412
Std.dev: 8039.9362715849775


In [None]:
# Persist the fitted preprocessing pipeline next to the model.
# NOTE(review): the pipeline contains the notebook-defined CombinedAttributesAdder;
# presumably that class must be defined/importable wherever this file is loaded — confirm.
filename = 'pipeline.jbl'
joblib.dump(value=full_pipeline, filename=filename)

['pipeline.jbl']