In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Source CSV with the housing listings used throughout this notebook.
DATA_URL = 'https://raw.githubusercontent.com/anvarnarz/praktikum_datasets/main/housing_data_08-02-2021.csv'

df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,location,district,rooms,size,level,max_levels,price
0,"город Ташкент, Юнусабадский район, Юнусабад 8-...",Юнусабадский,3,57,4,4,52000
1,"город Ташкент, Яккасарайский район, 1-й тупик ...",Яккасарайский,2,52,4,5,56000
2,"город Ташкент, Чиланзарский район, Чиланзар 2-...",Чиланзарский,2,42,4,4,37000
3,"город Ташкент, Чиланзарский район, Чиланзар 9-...",Чиланзарский,3,65,1,4,49500
4,"город Ташкент, Чиланзарский район, площадь Актепа",Чиланзарский,3,70,3,5,55000


# Ustunlar ta'rifi
- `location` - sotilayotgan uy manzili
- `district` - uy joylashgan tuman
- `rooms` - xonalar soni
- `size` - uy maydoni (kv.m)
- `level` - uy joylashgan qavat
- `max_levels` - jami qavatlar soni
- `price` - uy narxi

## Vazifani CRISP-DM metodologiyasi yordamida bajaring.
<img src="https://i.imgur.com/dzZnnYi.png" alt="CRISP-DM" width="800"/>


In [None]:
# Print the column dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7565 entries, 0 to 7564
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   location    7565 non-null   object
 1   district    7565 non-null   object
 2   rooms       7565 non-null   int64 
 3   size        7565 non-null   object
 4   level       7565 non-null   int64 
 5   max_levels  7565 non-null   int64 
 6   price       7565 non-null   object
dtypes: int64(3), object(4)
memory usage: 413.8+ KB


In [None]:
def str_cleaner(dataframe):
    """Return a cleaned copy of ``dataframe`` with numeric ``price`` and ``size``.

    Commas are stripped from the ``price`` and ``size`` columns, both columns
    are converted with ``pd.to_numeric(errors='coerce')`` (values that cannot
    be parsed become NaN), the ``location`` column is dropped when present,
    and every row that still contains a NaN is removed.

    The input frame is left untouched, so the calling cell can be re-run and
    the function can be applied to an already cleaned frame without raising.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing at least the ``price`` and ``size`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``location`` and without rows containing NaN.
    """
    # Work on a copy: assigning columns on the argument would silently change
    # the caller's frame.
    cleaned = dataframe.copy()

    for column in ('price', 'size'):
        # astype(str) keeps this working when the column is already numeric;
        # regex=False makes the comma a literal instead of a pattern.
        as_text = cleaned[column].astype(str).str.replace(',', '', regex=False)
        cleaned[column] = pd.to_numeric(as_text, errors='coerce')

    # errors='ignore' lets the function run on a frame that was cleaned before.
    return cleaned.drop(columns='location', errors='ignore').dropna()

# Build the cleaned frame that the rest of the notebook models on.
dff = str_cleaner(df)

In [None]:
# Display the cleaned frame returned by str_cleaner.
dff

Unnamed: 0,district,rooms,size,level,max_levels,price
0,Юнусабадский,3,57.0,4,4,52000.0
1,Яккасарайский,2,52.0,4,5,56000.0
2,Чиланзарский,2,42.0,4,4,37000.0
3,Чиланзарский,3,65.0,1,4,49500.0
4,Чиланзарский,3,70.0,3,5,55000.0
...,...,...,...,...,...,...
7560,Яшнободский,1,38.0,5,5,24500.0
7561,Яшнободский,2,49.0,1,4,32000.0
7562,Шайхантахурский,2,64.0,3,9,40000.0
7563,Мирзо-Улугбекский,1,18.0,1,4,11000.0


Datasetni train va test qismlariga ajratib olamiz

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the cleaned rows; the fixed seed keeps the split repeatable.
train_set, test_set = train_test_split(dff, random_state=42, test_size=0.2)

# Target first, then the feature frame and its numeric-only view
# (every feature except the categorical `district` column).
y = train_set['price'].copy()
X_train = train_set.drop(columns='price')
X_num = X_train.drop(columns='district')

Sonli ustunlarni o'z ichiga oluvchi pipeline yasaymiz

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder,FunctionTransformer

# The numeric branch consists of a single standardisation step.
num_pipeline = Pipeline(steps=[('scaler', StandardScaler())])

# Fit on the numeric training columns and show the scaled values.
num_pipeline.fit_transform(X_num)

array([[-0.57458889, -0.03523441,  0.12524471, -0.78114458],
       [-1.4952878 , -0.05655365, -1.20092799, -1.54042117],
       [ 1.26680894,  0.00882535, -0.31681286, -0.40150628],
       ...,
       [ 0.34611002, -0.01107261, -0.75887042, -0.40150628],
       [-0.57458889, -0.04447275,  0.12524471, -0.40150628],
       [ 1.26680894, -0.01107261, -0.75887042,  1.11704692]])

Barcha ustunlarni qamrab oluvchi Pipeline yasaymiz

In [None]:
from sklearn.compose import ColumnTransformer

# Column groups: every numeric feature is scaled, `district` is one-hot encoded.
num_attribs = list(X_num.columns)
cat_attribs = ['district']

full_pipeline = ColumnTransformer([
    ('num_pipeline', num_pipeline, num_attribs),
    # handle_unknown='ignore': a district that was not present while fitting is
    # encoded as all zeros instead of raising in transform(). This matters
    # because the fitted transformer is saved and reused on other data.
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attribs)
])


In [None]:
# Fit the preprocessing on the training features and keep the transformed
# matrix; all later transform() calls reuse the statistics learned here.
X_prepared = full_pipeline.fit_transform(X_train)
X_prepared

array([[-0.57458889, -0.03523441,  0.12524471, ...,  0.        ,
         0.        ,  0.        ],
       [-1.4952878 , -0.05655365, -1.20092799, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.26680894,  0.00882535, -0.31681286, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.34611002, -0.01107261, -0.75887042, ...,  0.        ,
         0.        ,  0.        ],
       [-0.57458889, -0.04447275,  0.12524471, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.26680894, -0.01107261, -0.75887042, ...,  0.        ,
         0.        ,  1.        ]])

Linear Regression model

In [None]:
from sklearn.linear_model import LinearRegression
# Linear regression estimator with default settings (not fitted yet).
LR_model = LinearRegression()

In [None]:
# Train the linear model on the preprocessed training features and prices.
LR_model.fit(X_prepared,y)

Modelni Baholaymiz

In [None]:
# Spot-check on five rows drawn from the TRAINING features (not the held-out
# test set), so this is a sanity check rather than an evaluation.
# random_state makes the drawn rows the same on every run.
test_data = X_train.sample(5, random_state=42)
test_labels = y.loc[test_data.index]

In [None]:
# transform() only: reuse the already fitted preprocessing, do not refit it.
test_data_prepared = full_pipeline.transform(test_data)
test_data_prepared

array([[-5.74588890e-01, -4.09195446e-02, -1.20092799e+00,
        -7.81144577e-01,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 1.26680894e+00, -1.83427096e-03,  2.33553253e+00,
         1.11704692e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [-5.74588890e-01, -3.66556965e-02,  3.21964766e+00,
         1.87632351e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         1.00000000e+00],
       [-5.74588890e-01, -3.31024898e

In [None]:
# Predicted prices for the five sampled rows.
predicted_data = LR_model.predict(test_data_prepared)
predicted_data

array([ 22293.93690903,  70035.09244006,  36172.25992784,  46695.18175752,
       106292.97062373])

In [None]:
# Actual price, predicted price and their difference for the sampled rows.
# `xatolik` is stored signed in `dfg`; abs() is applied only to the frame that
# is displayed.
dfg = pd.DataFrame(
    {'asl_qiymat': test_labels, 'bashorat': predicted_data}
).assign(xatolik=lambda frame: frame['asl_qiymat'] - frame['bashorat'])
dfg.abs()

Unnamed: 0,asl_qiymat,bashorat,xatolik
4812,43000.0,22293.936909,20706.063091
5294,75000.0,70035.09244,4964.90756
1418,24921.0,36172.259928,11251.259928
3568,37197.0,46695.181758,9498.181758
6443,168000.0,106292.970624,61707.029376


In [None]:
# Split the held-out set into target and features.
y_test = test_set['price'].copy()
X_test = test_set.drop(columns='price')

# Apply the preprocessing fitted on the training data (no refit).
X_test_prepared = full_pipeline.transform(X_test)

In [None]:
# Linear-regression predictions for the held-out test set.
y_predicted = LR_model.predict(X_test_prepared)

mean absolute error

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the current `y_predicted` against the test prices.
mae = mean_absolute_error(y_true=y_test, y_pred=y_predicted)
print("MAE: ", mae)

MAE:  67771.32084531563


root mean squared error (RMSE)

In [None]:
from sklearn.metrics import mean_squared_error

# MSE is kept in `mse`; its square root (RMSE) is what gets reported.
mse = mean_squared_error(y_true=y_test, y_pred=y_predicted)
print('RMSE: ', np.sqrt(mse))

RMSE:  1366742.755611274


Random Forest model

In [None]:
from sklearn.ensemble import RandomForestRegressor
# A random forest is stochastic; fixing random_state (same seed as the
# train/test split) makes the reported metrics repeatable across runs.
RF_model = RandomForestRegressor(random_state=42)

In [None]:
# Train the random forest on the same preprocessed training data.
RF_model.fit(X_prepared,y)

In [None]:
# NOTE: this reuses the name `y_predicted`, replacing the linear-regression
# predictions; the metric cells below therefore score the random forest.
y_predicted = RF_model.predict(X_test_prepared)

In [None]:
from sklearn.metrics import mean_squared_error

# RMSE of the current `y_predicted` against the test prices.
mse = mean_squared_error(y_true=y_test, y_pred=y_predicted)
print('RMSE: ', np.sqrt(mse))

RMSE:  1370893.324685595


In [None]:
from sklearn.metrics import mean_absolute_error

# MAE of the current `y_predicted` against the test prices.
mae = mean_absolute_error(y_true=y_test, y_pred=y_predicted)
print("MAE: ", mae)

MAE:  61248.664802110216


cross validation

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the linear model on the training data.
# The scorer returns NEGATED mean squared errors, one value per fold.
crossval = cross_val_score(
    estimator=LR_model,
    X=X_prepared,
    y=y,
    cv=5,
    scoring='neg_mean_squared_error',
)

In [None]:
def displayscore(crosval):
    """Print the given scores together with their mean and standard deviation.

    Parameters
    ----------
    crosval : numpy.ndarray
        Array of scores to report (for example per-fold RMSE values).

    The values printed are those of the argument itself; the function does not
    read any notebook-level variable.
    """
    # Use the argument, not the global `crossval`: otherwise whatever the
    # caller passes in (e.g. np.sqrt(-crossval)) is silently ignored.
    print("score = ", crosval)
    print('score mean =', crosval.mean())
    print('score std = ', np.std(crosval))

In [None]:
# `crossval` holds negated MSE values, so negate them before taking the square
# root to obtain one RMSE per fold.
displayscore(np.sqrt(-crossval))

score =  [-1.24941569e+09 -1.30840323e+10 -3.27754938e+09 -5.44638917e+09
 -2.01067020e+11]
score mean = -44824881332.422165
score std =  78223724123.09923


In [None]:
import joblib

# Save the fitted linear-regression model to a file in the working directory.
joblib.dump(LR_model,"LR_model.jbl")

['LR_model.jbl']

In [None]:
# Read the saved model back from disk.
# NOTE(review): joblib files are pickle-based; only load files you trust.
model = joblib.load("LR_model.jbl")

In [None]:
# Save the fitted preprocessing transformer as well, so the same
# transformation can be applied to data before calling the saved model.
joblib.dump(full_pipeline,'full_pipeline.jbl')

['full_pipeline.jbl']