<a href="https://colab.research.google.com/github/Diyorbek-MY/House_Price_prediction/blob/main/Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Modeling

In [None]:
import pandas as pd
import numpy as np
import sklearn

In [None]:
# Housing dataset CSV hosted in the handson-ml2 GitHub repository;
# the `?raw=true` query asks GitHub for the raw file instead of the HTML page.
URL  = "https://github.com/ageron/handson-ml2/blob/master/datasets/housing/housing.csv?raw=true"
# Load the whole CSV into one DataFrame; later cells split it into train/test sets.
df = pd.read_csv(URL)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split identical across re-runs.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

# Separate the regression target from the predictors of the training split.
y = train_set["median_house_value"].copy()
X_train = train_set.drop(columns="median_house_value")

# Numeric-only view of the predictors (the categorical column is removed).
X_num = X_train.drop(columns="ocean_proximity")

# Building the pipeline

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
# Column positions inside the numeric feature array, in the order
# total_rooms, total_bedrooms, population, households.
# CombinedAttributesAdder.transform reads these to build its ratio features.
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
households_ix = 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends ratio features as trailing columns.

    Always adds rooms-per-household and population-per-household; adds
    bedrooms-per-room as a third column when ``add_bedrooms_per_room`` is true.
    Source columns are located through the module-level ``rooms_ix``,
    ``bedrooms_ix``, ``population_ix`` and ``households_ix`` positions.
    """

    def __init__(self, add_bedrooms_per_room=True):
        # Stored under the same name as the argument so scikit-learn's
        # get_params/set_params can discover it.
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the transformer can be chained."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right.

        ``X`` must support 2-D positional indexing of the form ``X[:, i]``.
        """
        rooms = X[:, rooms_ix]
        households = X[:, households_ix]

        extra_columns = [
            rooms / households,                 # rooms per household
            X[:, population_ix] / households,   # population per household
        ]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)  # bedrooms per room

        # np.c_ stacks the original matrix and each 1-D ratio as columns.
        return np.c_[tuple([X] + extra_columns)]

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric preprocessing, applied in this order: fill missing values with the
# column median, append the engineered ratio columns, then standardize.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

In [None]:
from sklearn.compose import ColumnTransformer

# Column routing: every numeric column goes through num_pipeline, while the
# categorical column is one-hot encoded.
cat_attribs = ["ocean_proximity"]
num_attribs = X_num.columns.tolist()

full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

In [None]:
# Fit the preprocessing on the training features and transform them in one step.
X_prepared = full_pipeline.fit_transform(X_train)

In [None]:
# Inspect the prepared training matrix.
X_prepared

array([[ 1.27258656, -1.3728112 ,  0.34849025, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.70916212, -0.87669601,  1.61811813, ...,  0.        ,
         0.        ,  1.        ],
       [-0.44760309, -0.46014647, -1.95271028, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.59946887, -0.75500738,  0.58654547, ...,  0.        ,
         0.        ,  0.        ],
       [-1.18553953,  0.90651045, -1.07984112, ...,  0.        ,
         0.        ,  0.        ],
       [-1.41489815,  0.99543676,  1.85617335, ...,  0.        ,
         1.        ,  0.        ]])

# Linear Regression

In [None]:

from sklearn.linear_model import LinearRegression

# Unfitted linear regression estimator with default settings.
LR_model = LinearRegression()

In [None]:
# Train on the prepared training features and the training labels.
LR_model.fit(X_prepared, y)

In [None]:
# Draw 10 training rows for a quick sanity check of the fitted model.
# NOTE: these rows were part of the training data, so the comparison below is
# not a held-out evaluation.
# The fixed seed makes the sampled rows (and every output derived from them)
# identical on each re-run.
test_data = X_train.sample(10, random_state=42)
test_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
18883,-122.25,38.1,52.0,2315.0,556.0,1113.0,486.0,2.5042,NEAR BAY
6851,-118.15,34.07,44.0,1626.0,383.0,1063.0,334.0,2.4348,<1H OCEAN
15762,-122.43,37.77,52.0,1567.0,482.0,654.0,425.0,2.6914,NEAR BAY
17404,-120.44,34.94,29.0,1877.0,516.0,1634.0,492.0,1.6875,<1H OCEAN
7919,-118.07,33.86,31.0,2943.0,518.0,1703.0,472.0,3.7091,<1H OCEAN
13709,-117.21,34.11,26.0,1757.0,304.0,905.0,281.0,3.4103,INLAND
17078,-122.2,37.48,30.0,1170.0,258.0,610.0,243.0,3.4427,NEAR BAY
17167,-122.25,37.45,34.0,2999.0,365.0,927.0,369.0,10.2811,NEAR OCEAN
4685,-118.35,34.07,48.0,890.0,255.0,434.0,232.0,3.6111,<1H OCEAN
8391,-118.36,33.98,29.0,2861.0,816.0,1715.0,775.0,2.7712,<1H OCEAN


In [None]:
# Display the training features.
X_train

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,NEAR OCEAN
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,NEAR OCEAN
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,NEAR OCEAN
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,NEAR OCEAN
2271,-119.80,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,INLAND
...,...,...,...,...,...,...,...,...,...
11284,-117.96,33.78,35.0,1330.0,201.0,658.0,217.0,6.3700,<1H OCEAN
11964,-117.43,34.02,33.0,3084.0,570.0,1753.0,449.0,3.0500,INLAND
5390,-118.38,34.03,36.0,2101.0,569.0,1756.0,527.0,2.9344,<1H OCEAN
860,-121.96,37.58,15.0,3575.0,597.0,1777.0,559.0,5.7192,<1H OCEAN


In [None]:
# Look up the true target values of the sampled rows through their index labels.
test_label = y.loc[test_data.index]
test_label

Unnamed: 0,median_house_value
18883,147900.0
6851,220700.0
15762,366700.0
17404,122700.0
7919,225900.0
13709,90900.0
17078,263500.0
17167,500001.0
4685,450000.0
8391,160900.0


In [None]:
# Reuse the already-fitted preprocessing (transform only, no refit), then
# predict with the trained linear model.
test_data_prepared = full_pipeline.transform(test_data)
predicted_labels = LR_model.predict(test_data_prepared)

In [None]:
# Show the raw predictions for the sampled rows.
predicted_labels

array([208245.49402908, 174617.23927964, 255118.74179176, 171845.4991032 ,
       197127.06190316, 119016.88407686, 219949.71436065, 507171.20230568,
       256052.71516727, 219402.19287868])

In [None]:
# Side-by-side comparison of predictions and true values; the frame takes its
# row index from test_label, the only indexed input.
pd.DataFrame({'Predicted': predicted_labels, 'Actual': test_label})

Unnamed: 0,Predicted,Actual
18883,208245.494029,147900.0
6851,174617.23928,220700.0
15762,255118.741792,366700.0
17404,171845.499103,122700.0
7919,197127.061903,225900.0
13709,119016.884077,90900.0
17078,219949.714361,263500.0
17167,507171.202306,500001.0
4685,256052.715167,450000.0
8391,219402.192879,160900.0


# Model evaluation

In [None]:
# Display the held-out split (it still contains the target column).
test_set

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
20046,-119.01,36.06,25.0,1505.0,,1392.0,359.0,1.6812,47700.0,INLAND
3024,-119.46,35.14,30.0,2943.0,,1565.0,584.0,2.5313,45800.0,INLAND
15663,-122.44,37.80,52.0,3830.0,,1310.0,963.0,3.4801,500001.0,NEAR BAY
20484,-118.72,34.28,17.0,3051.0,,1705.0,495.0,5.7376,218600.0,<1H OCEAN
9814,-121.93,36.62,34.0,2351.0,,1063.0,428.0,3.7250,278000.0,NEAR OCEAN
...,...,...,...,...,...,...,...,...,...,...
15362,-117.22,33.36,16.0,3165.0,482.0,1351.0,452.0,4.6050,263300.0,<1H OCEAN
16623,-120.83,35.36,28.0,4323.0,886.0,1650.0,705.0,2.7266,266800.0,NEAR OCEAN
18086,-122.05,37.31,25.0,4111.0,538.0,1585.0,568.0,9.2298,500001.0,<1H OCEAN
2144,-119.76,36.77,36.0,2507.0,466.0,1227.0,474.0,2.7850,72300.0,INLAND


In [None]:
# Split the held-out set into target and predictors, then show the target.
y_test = test_set["median_house_value"].copy()
x_test = test_set.drop(columns="median_house_value")
y_test

Unnamed: 0,median_house_value
20046,47700.0
3024,45800.0
15663,500001.0
20484,218600.0
9814,278000.0
...,...
15362,263300.0
16623,266800.0
18086,500001.0
2144,72300.0


In [None]:
# Apply the preprocessing to the held-out features with transform only, so the
# fitted parameters are reused rather than re-estimated.
x_test_prepared = full_pipeline.transform(x_test)

In [None]:
# Linear-regression predictions for the held-out set.
y_predicted = LR_model.predict(x_test_prepared)

In [None]:
# MAE and RMSE

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the linear model on the held-out set.
mae = mean_absolute_error(y_test, y_predicted)
mae

50898.73953494079

In [None]:
from sklearn.metrics import mean_squared_error

# Root mean squared error: take the square root of the MSE so the metric is in
# the same units as the target.
mse = mean_squared_error(y_test, y_predicted)
rmse = np.sqrt(mse)
rmse

np.float64(72701.32600762135)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
# A random forest is built with randomized sampling, so without a fixed seed
# every run trains a different model and reports a different RMSE.
# The seed matches the one used for the train/test split.
RF_model = RandomForestRegressor(random_state=42)
RF_model.fit(X_prepared, y)

In [None]:
# Random-forest predictions for the held-out set.
# NOTE: this rebinds y_predicted, replacing the linear-regression predictions.
y_predicted = RF_model.predict(x_test_prepared)

In [None]:
from sklearn.metrics import mean_squared_error

# RMSE of the random forest on the held-out set (mse is rebound here).
mse = mean_squared_error(y_test, y_predicted)
print("RMSE=", np.sqrt(mse))

RMSE= 50007.28345125393


## Cross-validation

In [None]:
# Cross-validation uses every row, so rebuild features and labels from the
# full frame.
# NOTE: this rebinds `y` from the training labels to the full-dataset labels.
x = df.drop("median_house_value", axis=1)
y = df["median_house_value"].copy()

# Fit an unfitted copy of the preprocessing instead of `full_pipeline` itself,
# so the transformer fitted on the training split is not overwritten in place
# with statistics that include the held-out rows.
# NOTE: the preprocessing statistics here are still computed on all rows before
# the folds are formed, so each fold's validation rows influence them.
from sklearn.base import clone

x_prepared = clone(full_pipeline).fit_transform(x)

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the linear model; the scorer returns negated MSE
# (one value per fold) because scikit-learn scorers follow "higher is better".
mse_scores = cross_val_score(LR_model, x_prepared, y, scoring="neg_mean_squared_error", cv=5)

In [None]:
def display_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    Args:
        scores: Object exposing ``mean()`` and ``std()`` methods, such as a
            NumPy array of per-fold scores.
    """
    summary = (
        ("Scores:", scores),
        ("Mean:", scores.mean()),
        ("Standard deviation:", scores.std()),
    )
    for label, value in summary:
        print(label, value)

In [None]:
# The scores are negated MSE: flip the sign and take the square root to report
# per-fold RMSE.
display_scores(np.sqrt(-mse_scores))

Scores: [73391.42036892 74809.28332317 75429.91837496 76604.35506436
 66196.72436926]
Mean: 73286.34030013601
Standard deviation: 3693.161254481324


In [None]:
# 10-fold cross-validation of the random forest; each score is a negated MSE.
scores = cross_val_score(RF_model, x_prepared, y, scoring="neg_mean_squared_error", cv=10)
# Named after the model being evaluated: these are the random forest's RMSE
# values, not the linear regression's.
RF_rmse_scores = np.sqrt(-scores)
display_scores(RF_rmse_scores)

Scores: [98243.89954773 47200.55305811 65199.45297681 56636.27133012
 60855.8451041  59961.56633956 46371.49527272 78503.67251899
 74163.85505481 49470.34096367]
Mean: 63660.69521666237
Standard deviation: 15398.403413925189


## Saving and loading the models

pickle

In [None]:
import pickle

# Serialize the random forest to disk; binary mode is required by pickle.
filename = 'RF_model.pkl'
with open(filename, 'wb') as file:
    pickle.dump(RF_model, file)

In [None]:
# Reload the random forest from the file written above and rebind RF_model to it.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open(filename, 'rb') as file:
    RF_model = pickle.load(file)

joblib

In [None]:
import joblib

# Persist the linear model with joblib; `filename` is rebound to the new path.
filename = 'LR_model.jbl'
joblib.dump(LR_model, filename)

['LR_model.jbl']

In [None]:
# Load the linear model back from the joblib file written above.
model = joblib.load(filename)

In [None]:
# 5-fold cross-validation using the estimator reloaded via joblib.
# NOTE: cross_val_score fits fresh clones of the estimator on each fold, so this
# exercises the reloaded estimator's configuration, not its saved coefficients.
scores = cross_val_score(model, x_prepared, y, scoring="neg_mean_squared_error", cv=5)
LR_rmse_scores = np.sqrt(-scores)
display_scores(LR_rmse_scores)

Scores: [73391.42036892 74809.28332317 75429.91837496 76604.35506436
 66196.72436926]
Mean: 73286.34030013601
Standard deviation: 3693.161254481324
