In [6]:
import pandas as pd
import numpy as np
import sklearn # scikit-learn kutubxonasi

# Onlayn dataset joylashgan manzilini ko'rsatamiaz
URL = "https://raw.githubusercontent.com/anvarnarz/praktikum_datasets/main/housing_data_08-02-2021.csv"
df = pd.read_csv(URL)

df.drop('location', axis=1, inplace = True)

index_to_drop1 = df[(df['size'] == 'Площадьземли:1сот')].index
df = df.drop(index_to_drop1)

df['size'] = df['size'].astype(float)

indexes_to_drop2 = df[df['size'] > 999].index
df = df.drop(indexes_to_drop2)

index_to_drop3 = df[(df['price'] == 'Договорная')].index
df = df.drop(index_to_drop3)

df['price'] = df['price'].astype(int)

indexes_to_drop4 = df[(df['price'] > 500000) & (df['price'] < 5000)].index
df = df.drop(indexes_to_drop4)

from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

housing = train_set.drop("price", axis=1)
housing_labels = train_set["price"].copy()

housing_num = housing.drop("district", axis=1)

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

num_pipeline = Pipeline([
          ('std_scaler', StandardScaler())             
])

In [8]:
num_pipeline.fit_transform(housing_num)

array([[ 0.34804979,  0.24393793,  1.91011311,  1.12631154],
       [-0.57246858, -0.35361856,  0.13629325, -0.78191016],
       [-1.49298695, -1.16846832, -1.19407164, -1.54519884],
       ...,
       [-0.57246858, -0.08200198,  0.13629325, -0.40026582],
       [-0.57246858, -0.70672012,  0.13629325, -0.40026582],
       [ 0.34804979, -0.08200198, -1.19407164, -0.78191016]])

In [9]:
from sklearn.compose import ColumnTransformer

num_attribs = list(housing_num)
cat_attribs = ['district']

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OrdinalEncoder(), cat_attribs)
])

In [10]:
housing_prepared = full_pipeline.fit_transform(housing)

In [12]:
from sklearn.linear_model import LinearRegression

LR_model = LinearRegression()

In [15]:
LR_model.fit(housing_prepared, housing_labels)

In [16]:
# tasodifiy 5 ta qatorni ajratib olamiz
test_data = housing.sample(5)
test_data

Unnamed: 0,district,rooms,size,level,max_levels
3932,Юнусабадский,3,65.0,4,4
5700,Мирабадский,2,65.0,3,9
5529,Мирабадский,4,175.0,3,12
741,Мирабадский,5,300.0,3,8
7554,Мирзо-Улугбекский,2,50.0,3,4


In [17]:
# yuqoridagi qatorlarga mos keluvchi narxlarni ajratib olamiz (biz aynan shu qiymatlarni bashorat qilishimiz kerak)
test_label = housing_labels.loc[test_data.index]
test_label

3932     53500
5700     78000
5529    145000
741     425000
7554     28000
Name: price, dtype: int64

In [18]:
test_data_prepared = full_pipeline.transform(test_data)
test_data_prepared

array([[ 0.34804979, -0.21781027,  0.13629325, -0.78191016,  8.        ],
       [-0.57246858, -0.21781027, -0.30716171,  1.12631154,  1.        ],
       [ 1.26856815,  2.76997217, -0.30716171,  2.27124457,  1.        ],
       [ 2.18908652,  6.16517949, -0.30716171,  0.7446672 ,  1.        ],
       [-0.57246858, -0.62523515, -0.30716171, -0.78191016,  2.        ]])

In [19]:
predicted_data = LR_model.predict(test_data_prepared)
predicted_data

array([ 65841.93381704,  66419.02029208, 167364.13371428, 244756.42412178,
        50284.40825703])

In [20]:
pd.DataFrame({'Prognoz':predicted_data, 'Real baxosi': test_label})

Unnamed: 0,Prognoz,Real baxosi
3932,65841.933817,53500
5700,66419.020292,78000
5529,167364.133714,145000
741,244756.424122,425000
7554,50284.408257,28000


In [25]:
test_set.head(2)

Unnamed: 0,district,rooms,size,level,max_levels,price
5751,Чиланзарский,1,30.0,2,4,25200
2203,Яккасарайский,2,42.0,1,2,37000


In [26]:
X_test = test_set.drop('price', axis=1)
X_test.head(2)

Unnamed: 0,district,rooms,size,level,max_levels
5751,Чиланзарский,1,30.0,2,4
2203,Яккасарайский,2,42.0,1,2


In [28]:
y_test = test_set['price'].copy()
y_test.head(2)

5751    25200
2203    37000
Name: price, dtype: int64

In [29]:
X_test_prepared = full_pipeline.transform(X_test)

In [30]:
y_predicted = LR_model.predict(X_test_prepared)

In [32]:
pd.DataFrame({'Prognoz':y_predicted, 'Real baxosi': y_test}).head(2)

Unnamed: 0,Prognoz,Real baxosi
5751,21557.493053,25200
2203,48093.140519,37000


In [33]:
from sklearn.metrics import mean_squared_error
lin_mse = mean_squared_error(y_test, y_predicted)
# RMSE hisoblaymiz
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

439121.94210003706


In [34]:
# Decision Tree
from sklearn.tree import DecisionTreeRegressor
Tree_model = DecisionTreeRegressor()
Tree_model.fit(housing_prepared, housing_labels)

In [35]:
y_predicted = Tree_model.predict(X_test_prepared)

In [36]:
lin_mse = mean_squared_error(y_test, y_predicted)
# RMSE hisoblaymiz
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

443013.75237327145


In [37]:
from sklearn.ensemble import RandomForestRegressor
RF_model = RandomForestRegressor()
RF_model.fit(housing_prepared, housing_labels)

In [38]:
y_predicted = RF_model.predict(X_test_prepared)
lin_mse = mean_squared_error(y_test, y_predicted)
# RMSE hisoblaymiz
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

473796.12708788464


In [None]:
# joblib yordamida saqlash

# joblib katta NumPy martrisalarni siqib saqlash uchun afzal.

# joblib o'rnatilmagan bo'lsa pip install joblib yordamida o'rnatib oling.

# import joblib

# filename = 'RF_model.jbl' # faylga istalgan nom beramiz
# joblib.dump(RF_model, filename)

# Modelni o'qiymiz

# model = joblib.load(filename)

# qayta o'qiganimizda df ning nomi modelga o'zgaradi

# scores = cross_val_score(model, X_prepared, y, scoring="neg_mean_squared_error", cv=5)
# LR_rmse_scores = np.sqrt(-scores)
# display_scores(LR_rmse_scores)

# pipeline saqlab olamiz

# filename = 'pipeline.jbl'
# joblib.dump(full_pipeline, filename)