<a href="https://colab.research.google.com/github/Arslonbekjon/ML-House-price-prediction/blob/main/02_ml_pipeline_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ⬅️ Previous step: Data Exploration

# Click [here](https://github.com/Arslonbekjon/ML-House-price-prediction/blob/main/01_data_exploration.ipynb)

In [1]:
import pandas as pd
import numpy as np
import sklearn


In [2]:
# Location of the California housing CSV (served raw from GitHub).
URL = "https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true"

# Read the whole dataset into a DataFrame.
df = pd.read_csv(URL)

In [3]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

# Training features (everything except the target) and training labels.
X_train = train_set.drop(columns="median_house_value")
y = train_set["median_house_value"].copy()

# Numeric-only view of the training features (categorical column removed).
X_num = X_train.drop(columns="ocean_proximity")

# Pipeline quramiz

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin
# Positional indices of the columns we need, counted in the numeric feature
# matrix (i.e. the feature columns with 'ocean_proximity' removed).
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
  """Append ratio features computed from the raw count columns.

  The following columns are added to the right of the input matrix:
    * rooms per household
    * population per household
    * bedrooms per room (only when ``add_bedrooms_per_room`` is True)

  Columns are addressed by the module-level positional indices
  ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``households_ix``.

  Args:
    add_bedrooms_per_room: Whether to append the optional bedrooms-per-room
      column.
  """

  def __init__(self, add_bedrooms_per_room=True):
    self.add_bedrooms_per_room = add_bedrooms_per_room

  def fit(self, X, y=None):
    """Nothing to learn: this class is a pure transformer, so return self."""
    return self

  def transform(self, X):
    """Return ``X`` with the engineered ratio columns appended.

    Args:
      X: 2-D array-like of numeric features. It is converted with
        ``np.asarray`` so that DataFrames work as well as NumPy arrays.

    Returns:
      A 2-D NumPy array: the original columns followed by the new ones.
    """
    # Positional slicing (X[:, i]) is not valid on a DataFrame, so coerce first.
    X = np.asarray(X)
    rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
    population_per_household = X[:, population_ix] / X[:, households_ix]
    new_columns = [rooms_per_household, population_per_household]
    if self.add_bedrooms_per_room:  # the bedrooms-per-room column is optional
      new_columns.append(X[:, bedrooms_ix] / X[:, rooms_ix])
    # column_stack turns each 1-D ratio into a column and appends it to X.
    return np.column_stack([X, *new_columns])


In [5]:
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric preprocessing, applied in this order:
#   1. fill missing values with the column median,
#   2. append the engineered ratio columns,
#   3. standardise the columns.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder(add_bedrooms_per_room=True)),
    ("std_scaler", StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

In [6]:
from sklearn.compose import ColumnTransformer

# Column names routed to each branch of the preprocessing.
num_attribs = X_num.columns.tolist()
cat_attribs = ["ocean_proximity"]

# Numeric columns go through num_pipeline; the categorical one is one-hot encoded.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

In [7]:
# Fit the preprocessing pipeline on the training features and transform them in one step.
X_prepared = full_pipeline.fit_transform(X_train)

In [8]:
# Display the prepared (transformed) training matrix.
X_prepared

array([[ 1.27258656, -1.3728112 ,  0.34849025, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.70916212, -0.87669601,  1.61811813, ...,  0.        ,
         0.        ,  1.        ],
       [-0.44760309, -0.46014647, -1.95271028, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.59946887, -0.75500738,  0.58654547, ...,  0.        ,
         0.        ,  0.        ],
       [-1.18553953,  0.90651045, -1.07984112, ...,  0.        ,
         0.        ,  0.        ],
       [-1.41489815,  0.99543676,  1.85617335, ...,  0.        ,
         1.        ,  0.        ]])

In [9]:
#Linear Regression

from sklearn.linear_model import LinearRegression

# Create an (unfitted) linear regression model with default settings.
LR_model = LinearRegression()

In [10]:
# Train the linear model on the prepared training features and their labels.
# NOTE(review): `X_prepared` and `y` are reassigned later in the cross-validation
# section, so re-running this cell after that point would fit on different data.
LR_model.fit(X_prepared,y)

In [11]:
# Draw 10 rows from the TRAINING features for a quick sanity check of the model
# (despite the name, this is not held-out test data).
# A fixed seed keeps the sampled rows, and every output derived from them, reproducible.
test_data = X_train.sample(10, random_state=42)

In [12]:
# Display the training features.
X_train

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,NEAR OCEAN
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,NEAR OCEAN
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,NEAR OCEAN
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,NEAR OCEAN
2271,-119.80,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,INLAND
...,...,...,...,...,...,...,...,...,...
11284,-117.96,33.78,35.0,1330.0,201.0,658.0,217.0,6.3700,<1H OCEAN
11964,-117.43,34.02,33.0,3084.0,570.0,1753.0,449.0,3.0500,INLAND
5390,-118.38,34.03,36.0,2101.0,569.0,1756.0,527.0,2.9344,<1H OCEAN
860,-121.96,37.58,15.0,3575.0,597.0,1777.0,559.0,5.7192,<1H OCEAN


In [13]:
# Look up the true labels of the sampled rows via their index labels.
test_label = y.loc[test_data.index]
test_label


Unnamed: 0,median_house_value
13990,73800.0
16007,335100.0
17986,227300.0
1436,194000.0
16682,195200.0
6791,205600.0
2186,70800.0
13803,108000.0
14245,86300.0
11650,315900.0


In [14]:
# Apply the already-fitted preprocessing (transform only, no re-fitting) to the sample.
test_data_prepared = full_pipeline.transform(test_data)

# Predict house values for the sampled rows with the linear model.
predicted_labels = LR_model.predict(test_data_prepared)

In [15]:
# Display the predictions for the sampled rows.
predicted_labels

array([ 64514.22430646, 332561.91826626, 279333.7399967 , 231525.55473648,
       224438.76140291, 194876.12029599, 108297.19097311, 181497.32163875,
       142399.5403882 , 269516.57696937])

In [16]:
# Side-by-side comparison: 'Bashorat' holds the predictions, 'Asl qiymat' the true values.
pd.DataFrame({'Bashorat':predicted_labels, 'Asl qiymat':test_label})

Unnamed: 0,Bashorat,Asl qiymat
13990,64514.224306,73800.0
16007,332561.918266,335100.0
17986,279333.739997,227300.0
1436,231525.554736,194000.0
16682,224438.761403,195200.0
6791,194876.120296,205600.0
2186,108297.190973,70800.0
13803,181497.321639,108000.0
14245,142399.540388,86300.0
11650,269516.576969,315900.0


# Next step: testing the model

In [17]:
# Display the held-out test split (features and target together).
test_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,47700.0,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,45800.0,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,500001.0,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,218600.0,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,278000.0,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,263300.0,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,266800.0,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,500001.0,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,72300.0,INLAND


In [18]:
# Test features: every column of the test split except the target.
X_test = test_set.drop(columns="median_house_value")
X_test

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,INLAND


In [19]:
# Test labels: an independent copy of the target column of the test split.
y_test=test_set['median_house_value'].copy()
y_test


Unnamed: 0,median_house_value
20046,47700.0
3024,45800.0
15663,500001.0
20484,218600.0
9814,278000.0
...,...
15362,263300.0
16623,266800.0
18086,500001.0
2144,72300.0


In [20]:
# Transform the test features with the already-fitted pipeline (no re-fitting).
X_test_prepared = full_pipeline.transform(X_test)

In [21]:
# Linear-model predictions for the test set.
y_predicted = LR_model.predict(X_test_prepared)

In [22]:
# Display the test-set predictions.
y_predicted

array([ 61874.25460143, 121853.52511139, 267770.94368091, ...,
       447837.04647878, 117275.9214608 , 185597.46125194])

In [23]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error between the true test labels and the current predictions.
MAE=mean_absolute_error(y_test,y_predicted)

print("MAE=", MAE)

MAE= 50898.73953494079


In [24]:
from sklearn.metrics import mean_squared_error
# Mean squared error on the test set; its square root (RMSE) is what gets printed.
MSE = mean_squared_error(y_test,y_predicted)
print("RMSE=",np.sqrt(MSE))

RMSE= 72701.32600762135


# Random Forest

In [25]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor. A fixed random_state makes the fitted forest, and
# every score computed from it, reproducible across runs.
RF_model = RandomForestRegressor(random_state=42)
RF_model.fit(X_prepared, y)

In [26]:
# Random-forest predictions for the test set.
# Note: this overwrites `y_predicted`, which previously held the linear model's predictions.
y_predicted = RF_model.predict(X_test_prepared)

In [27]:
from sklearn.metrics import mean_squared_error
# Same metric as for the linear model: MSE on the test set, printed as RMSE.
MSE = mean_squared_error(y_test,y_predicted)
print("RMSE=",np.sqrt(MSE))

RMSE= 49990.4446302849


# Cross-Validation

In [28]:
# Features and labels of the FULL dataset, used for cross-validation below.
# NOTE(review): this reassigns `y` and `X_prepared`, which until now held the
# training split only. Cells above that read them (e.g. the model .fit calls)
# will see different data if they are re-run after this cell.
X = df.drop(columns="median_house_value")
y = df["median_house_value"].copy()

# The pipeline is only applied here, not re-fitted: it was fitted on X_train.
X_prepared = full_pipeline.transform(X)

In [29]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the linear model. The scorer is *negative* MSE,
# so the values are negated before the square root is taken when they are displayed.
mse_scores=cross_val_score(LR_model, X_prepared,y,scoring="neg_mean_squared_error",cv=5)

In [30]:
def display_scores(scores):
  """Print the individual scores together with their mean and standard deviation.

  Args:
    scores: Array-like of numeric scores (for example per-fold RMSE values).
      It is converted with ``np.asarray`` so that plain lists and tuples are
      accepted as well as NumPy arrays.
  """
  # Lists/tuples have no .mean()/.std(); coercing first makes them work too.
  scores = np.asarray(scores)
  print("Scores:", scores)
  print("Mean:", scores.mean())
  print("Std.dev:", scores.std())

In [31]:
# Negate the negative-MSE scores and take the square root to show per-fold RMSE.
display_scores(np.sqrt(-mse_scores))

Scores: [73394.92502922 74814.24096819 75431.93119241 76608.78768825
 66196.48128669]
Mean: 73289.27323295095
Std.dev: 3694.71367872237


In [32]:
# 10-fold cross-validation of the random forest. The scorer is negative MSE,
# so negate before taking the square root to obtain per-fold RMSE.
scores = cross_val_score(RF_model, X_prepared, y, scoring="neg_mean_squared_error", cv=10)
# These are the random forest's scores, so name them accordingly (not "LR").
RF_rmse_scores = np.sqrt(-scores)
display_scores(RF_rmse_scores)

Scores: [98894.00148014 47453.3852431  65498.33193362 56628.86571651
 61157.74335405 60037.09679215 47733.88265448 78817.51575373
 74513.7854507  49774.86964236]
Mean: 64050.947802084636
Std.dev: 15390.766228858487


# Pickle

In [33]:
import pickle
# Serialize the random forest model to disk with pickle.
filename = 'RF_model_pkl'  # any file name will do
with open(filename,'wb') as file:
  pickle.dump(RF_model,file)

In [34]:
# Load the pickled model back from disk.
# NOTE(review): `filename` is reassigned in the joblib section below, so re-running
# this cell after that point would open the joblib file instead.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open(filename,'rb') as file:
  model = pickle.load(file)

# Joblib

In [35]:
import joblib

# Serialize the linear regression model to disk with joblib.
# Note: this reuses (overwrites) the `filename` variable from the pickle section.
filename = 'LR_model.jbl' # any file name will do
joblib.dump(LR_model, filename)

['LR_model.jbl']

In [36]:
# Load the linear model back from the joblib file (replaces the `model` loaded via pickle).
model = joblib.load(filename)

In [37]:
# Cross-validate the reloaded linear model as a check that it survived the round trip:
# negative-MSE scores are negated and square-rooted into per-fold RMSE.
scores = cross_val_score(model, X_prepared,y, scoring='neg_mean_squared_error',cv=5)
LR_rmse_scores = np.sqrt(-scores)
display_scores(LR_rmse_scores)

Scores: [73394.92502922 74814.24096819 75431.93119241 76608.78768825
 66196.48128669]
Mean: 73289.27323295095
Std.dev: 3694.71367872237
