# Real Estate House - Price Predictor

In [1]:
import pandas as pd

In [2]:
housing = pd.read_csv("housingdata.csv")

In [3]:
housing.head() #display top 5 rows

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [4]:
housing.info() # shows how many entries are in the data and the non-null count per column, so missing values can be identified

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       501 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    int64  
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


In [5]:
housing['CHAS'].value_counts() #indicate the count of the value in columns

0    471
1     35
Name: CHAS, dtype: int64

In [6]:
housing.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,501.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.287124,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.703626,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.887,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.209,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.625,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [7]:
%matplotlib inline 

In [8]:
# %matplotlib inline (previous cell) makes plots render directly inside the notebook

In [9]:
# For plotting histograms
# import matplotlib.pyplot as plt
# housing.hist(bins=50, figsize=(20,15)) #Create the histogram

## Train-Test Splitting

In [10]:
# For learning purpose
import numpy as np
def split_train_test(data, test_ratio, seed=42):
    """Randomly split ``data`` into a train part and a test part.

    Kept for learning purposes: it does by hand what a plain (non-stratified)
    shuffle split does.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split; anything supporting ``len()`` and ``.iloc`` works.
    test_ratio : float
        Fraction of rows, within [0, 1], that goes to the test part. The test
        size is ``int(len(data) * test_ratio)``, i.e. rounded down.
    seed : int, default 42
        Seed of the shuffle, so that repeated calls return the same split.

    Returns
    -------
    tuple
        ``(train, test)`` selections of ``data`` taken with ``.iloc``.

    Raises
    ------
    ValueError
        If ``test_ratio`` lies outside [0, 1].
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")
    # A private RandomState gives the same permutation as np.random.seed(seed)
    # followed by np.random.permutation, but leaves the global RNG untouched,
    # so calling this function does not change later random draws elsewhere.
    rng = np.random.RandomState(seed)
    shuffled = rng.permutation(len(data))  # shuffled row positions
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled[:test_set_size]
    train_indices = shuffled[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [11]:
# train_set, test_set = split_train_test(housing, 0.2)

In [12]:
#print(f"Rows in train set:{len(train_set)}\nRows in test set:{len(test_set)}\n")

In [13]:
from sklearn.model_selection import train_test_split
# Plain random split with 20% of the rows held out; random_state fixes the shuffle so the split is repeatable
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
print(f"Rows in train set:{len(train_set)}\nRows in test set:{len(test_set)}\n")

Rows in train set:404
Rows in test set:102



In [14]:
from sklearn.model_selection import StratifiedShuffleSplit
# Stratify on CHAS so that the train and test sets keep the same share of
# CHAS=0 / CHAS=1 rows as the full dataset.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing['CHAS']):
    # split() yields positional row numbers, so select with .iloc. Using .loc
    # would look them up as index labels, which only happens to work while
    # housing still has its default 0..n-1 index.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

In [15]:
strat_test_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 102 entries, 342 to 218
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     102 non-null    float64
 1   ZN       102 non-null    float64
 2   INDUS    102 non-null    float64
 3   CHAS     102 non-null    int64  
 4   NOX      102 non-null    float64
 5   RM       101 non-null    float64
 6   AGE      102 non-null    float64
 7   DIS      102 non-null    float64
 8   RAD      102 non-null    int64  
 9   TAX      102 non-null    int64  
 10  PTRATIO  102 non-null    float64
 11  B        102 non-null    float64
 12  LSTAT    102 non-null    float64
 13  MEDV     102 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 12.0 KB


In [16]:
strat_test_set

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
342,0.02498,0.0,1.89,0,0.518,6.540,59.7,6.2669,1,422,15.9,389.96,8.65,16.5
379,17.86670,0.0,18.10,0,0.671,6.223,100.0,1.3861,24,666,20.2,393.74,21.78,10.2
223,0.61470,0.0,6.20,0,0.507,6.618,80.8,3.2721,8,307,17.4,396.90,7.60,30.1
219,0.11425,0.0,13.89,1,0.550,6.373,92.4,3.3633,5,276,16.4,393.74,10.50,23.0
48,0.25387,0.0,6.91,0,0.448,5.399,95.3,5.8700,3,233,17.9,396.90,30.81,14.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,0.05660,0.0,3.41,0,0.489,7.007,86.3,3.4217,2,270,17.8,396.90,5.50,23.6
466,3.77498,0.0,18.10,0,0.655,5.952,84.7,2.8715,24,666,20.2,22.01,17.15,19.0
52,0.05360,21.0,5.64,0,0.439,6.511,21.1,6.8147,4,243,16.8,396.90,5.28,25.0
121,0.07165,0.0,25.65,0,0.581,6.004,84.1,2.1974,2,188,19.1,377.67,14.27,20.3


In [17]:
strat_test_set.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,102.0,102.0,102.0,102.0,102.0,101.0,102.0,102.0,102.0,102.0,102.0,102.0,102.0,102.0
mean,3.655942,13.45098,10.312255,0.068627,0.541353,6.306861,66.733333,3.98846,8.813725,391.980392,18.385294,369.670196,12.104314,22.62549
std,10.400966,27.503241,6.761154,0.254068,0.111397,0.665351,27.772183,2.131247,8.614667,167.837379,2.310604,68.075774,6.759257,8.452344
min,0.00906,0.0,0.46,0.0,0.385,4.138,6.5,1.137,1.0,188.0,12.6,3.65,2.47,5.0
25%,0.057827,0.0,4.95,0.0,0.448,5.905,45.85,2.22365,4.0,270.0,16.8,377.685,7.48,18.925
50%,0.17615,0.0,7.76,0.0,0.515,6.185,71.1,3.42295,5.0,307.0,19.15,393.74,10.565,21.5
75%,2.061955,0.0,18.1,0.0,0.61275,6.54,93.5,5.609225,8.0,461.0,20.2,396.9,16.2675,25.0
max,88.9762,90.0,27.74,1.0,0.871,8.725,100.0,10.5857,24.0,711.0,22.0,396.9,37.97,50.0


In [18]:
strat_test_set['CHAS'].value_counts()

0    95
1     7
Name: CHAS, dtype: int64

In [19]:
strat_train_set['CHAS'].value_counts()

0    376
1     28
Name: CHAS, dtype: int64

In [20]:
#95/7

In [21]:
# 376/28

In [22]:
# Explore on a copy of the training set so that strat_train_set itself stays unmodified
housing = strat_train_set.copy()

## Looking for Correlations

In [23]:
 # Pairwise correlation between all columns of the training copy
 # NOTE(review): this cell starts with a stray space; the notebook accepted it, but a plain .py export would presumably fail with an IndentationError - confirm and remove
 corr_matrix = housing.corr()

In [24]:
corr_matrix['MEDV'].sort_values(ascending=False) 
# a value close to 1 means a strong positive correlation with MEDV
# a value close to -1 means a strong negative correlation with MEDV

MEDV       1.000000
RM         0.679221
B          0.361761
ZN         0.339741
DIS        0.240451
CHAS       0.205066
AGE       -0.364596
RAD       -0.374693
CRIM      -0.393715
NOX       -0.422873
TAX       -0.456657
INDUS     -0.473516
PTRATIO   -0.493534
LSTAT     -0.740494
Name: MEDV, dtype: float64

## making the plot using pandas

In [25]:
# from pandas.plotting import scatter_matrix
# attributes = ["MEDV","RM","ZN", "LSTAT"]
# scatter_matrix(housing[attributes],figsize=(12,8))

In [26]:
#housing.plot(kind="scatter", x="RM", y="MEDV", alpha=0.8) # Use for examine the right point for predicion

## Trying Out Attribute Combination

In [27]:
 # Combined attribute: ratio of TAX to RM (rows with a missing RM get NaN here)
 # NOTE(review): this cell starts with a stray space; the notebook accepted it, but a plain .py export would presumably fail with an IndentationError - confirm and remove
 housing["TAXRM"] = housing["TAX"]/housing["RM"]

In [28]:
housing.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV,TAXRM
254,0.04819,80.0,3.64,0,0.392,6.108,32.0,9.2203,1,315,16.4,392.89,6.57,21.9,51.571709
348,0.01501,80.0,2.01,0,0.435,6.635,29.7,8.344,4,280,17.0,390.94,5.99,24.5,42.200452
476,4.87141,0.0,18.1,0,0.614,6.484,93.6,2.3053,24,666,20.2,396.21,18.68,16.7,102.714374
321,0.18159,0.0,7.38,0,0.493,6.376,54.3,4.5404,5,287,19.6,396.9,6.87,23.1,45.012547
326,0.30347,0.0,7.38,0,0.493,6.312,28.9,5.4159,5,287,19.6,396.9,6.15,23.0,45.468948


In [29]:
corr_matrix = housing.corr()
corr_matrix['MEDV'].sort_values(ascending=False) 

MEDV       1.000000
RM         0.679221
B          0.361761
ZN         0.339741
DIS        0.240451
CHAS       0.205066
AGE       -0.364596
RAD       -0.374693
CRIM      -0.393715
NOX       -0.422873
TAX       -0.456657
INDUS     -0.473516
PTRATIO   -0.493534
TAXRM     -0.526629
LSTAT     -0.740494
Name: MEDV, dtype: float64

In [30]:
#housing.plot(kind="scatter", x="TAXRM", y="MEDV", alpha=0.8)

In [31]:
# Separate the features from the target (MEDV). Starting again from strat_train_set
# also leaves out the exploratory TAXRM column, which was only added to the copy.
housing = strat_train_set.drop("MEDV", axis=1)
housing_labels = strat_train_set["MEDV"].copy()

## Missing Attributes

In [32]:
# To take care of missing attributes, you have three option:
#     1) Get rid of the missing data points
#     2) Get rid of the whole attribute
#     3) Set the value to some value(0, mean or median)

In [33]:
a = housing.dropna(subset=["RM"]) #option 1: drop the rows where RM is missing (returns a new dataframe)
a.shape

(400, 13)

In [34]:
housing.drop("RM", axis=1).shape #option 2
# the result has no RM column, and the original housing dataframe remains unchanged

(404, 12)

In [35]:
median = housing["RM"].median() # Compute median for option 3

In [36]:
housing["RM"].fillna(median) # option 3: returns a new Series; housing itself is not modified because the result is not assigned

254    6.108
348    6.635
476    6.484
321    6.376
326    6.312
       ...  
155    6.152
423    6.103
98     7.820
455    6.525
216    5.888
Name: RM, Length: 404, dtype: float64

In [37]:
housing.shape

(404, 13)

In [38]:
housing.describe() # before we started filling the missing attributes

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,404.0,404.0,404.0,404.0,404.0,400.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0
mean,3.602814,10.836634,11.34495,0.069307,0.558064,6.28214,69.039851,3.74621,9.735149,412.341584,18.473267,353.392822,12.791609
std,8.099383,22.150636,6.877817,0.25429,0.116875,0.71368,28.258248,2.099057,8.731259,168.672623,2.129243,96.069235,7.23574
min,0.00632,0.0,0.74,0.0,0.389,3.561,2.9,1.1296,1.0,187.0,13.0,0.32,1.73
25%,0.086962,0.0,5.19,0.0,0.453,5.87975,44.85,2.035975,4.0,284.0,17.4,374.6175,6.8475
50%,0.286735,0.0,9.9,0.0,0.538,6.2135,78.2,3.1222,5.0,337.0,19.0,390.955,11.57
75%,3.731923,12.5,18.1,0.0,0.631,6.63025,94.1,5.1004,24.0,666.0,20.2,395.63,17.1025
max,73.5341,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,36.98


In [39]:
from sklearn.impute import SimpleImputer
# Median imputer fitted on the training features; fit() is called here, transform() is applied separately
imputer = SimpleImputer(strategy="median")
imputer.fit(housing)

In [40]:
imputer.statistics_

array([2.86735e-01, 0.00000e+00, 9.90000e+00, 0.00000e+00, 5.38000e-01,
       6.21350e+00, 7.82000e+01, 3.12220e+00, 5.00000e+00, 3.37000e+02,
       1.90000e+01, 3.90955e+02, 1.15700e+01])

In [41]:
x = imputer.transform(housing)

In [42]:
# imputer.transform returned a bare array, so restore both the column names and
# the original row index. Without index= the frame would be renumbered 0..n-1
# and would no longer align with housing / housing_labels.
housing_tr = pd.DataFrame(x, columns=housing.columns, index=housing.index)

In [43]:
housing_tr.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0,404.0
mean,3.602814,10.836634,11.34495,0.069307,0.558064,6.28146,69.039851,3.74621,9.735149,412.341584,18.473267,353.392822,12.791609
std,8.099383,22.150636,6.877817,0.25429,0.116875,0.710162,28.258248,2.099057,8.731259,168.672623,2.129243,96.069235,7.23574
min,0.00632,0.0,0.74,0.0,0.389,3.561,2.9,1.1296,1.0,187.0,13.0,0.32,1.73
25%,0.086962,0.0,5.19,0.0,0.453,5.883,44.85,2.035975,4.0,284.0,17.4,374.6175,6.8475
50%,0.286735,0.0,9.9,0.0,0.538,6.2135,78.2,3.1222,5.0,337.0,19.0,390.955,11.57
75%,3.731923,12.5,18.1,0.0,0.631,6.63,94.1,5.1004,24.0,666.0,20.2,395.63,17.1025
max,73.5341,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,36.98


## Scikit-learn Design

Primarily, there are three types of objects:
    
    1) Estimators - An estimator estimates some parameters based on a dataset. Eg. imputer.
    It has a fit() method and, when it is also a transformer, a transform() method.
    fit() - fits the dataset and calculates the internal parameters.
    
    2) Transformers - The transform() method takes input and returns output based on 
    the learnings from fit(). A transformer also has a convenience function called fit_transform()
    which fits and then transforms.
    
    3) Predictors - The LinearRegression model is an example of a predictor. fit() and predict() 
    are two common functions. It also provides a score() function which will 
    evaluate the predictions.

## Feature Scaling

Primarily, Two types of feature scaling methods:
    
    1) Min-max scaling (Normalization)
        (value-min)/(max-min)
        Sklearn provides a class called MinMaxScaler for this
        
    2) Standardization
        (value-mean)/std
        Sklearn provides a class called StandardScaler for this

## Creating a Pipeline

In [44]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Preprocessing pipeline: fill missing values with the column median, then standardize each column
my_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")), 
    # .....add as many steps as you want in your pipeline
    ('std_scaler', StandardScaler())
])

In [45]:
housing_num_tr = my_pipeline.fit_transform(housing)

In [46]:
housing_num_tr.shape

(404, 13)

## Selecting a desired model for Real Estates

In [47]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
# Alternatives that were tried; swap one in to compare:
#model = LinearRegression() # this model does not predict well in this case
#model = DecisionTreeRegressor()
# A random forest is stochastic: fix random_state so that re-running the
# notebook reproduces the same predictions and scores.
model = RandomForestRegressor(random_state=42)
model.fit(housing_num_tr, housing_labels)
    

In [48]:
some_data = housing.iloc[:5]

In [49]:
some_labels = housing_labels.iloc[:5]

In [50]:
prepared_data = my_pipeline.transform(some_data)

In [51]:
model.predict(prepared_data)

array([22.393, 25.601, 16.61 , 23.208, 23.46 ])

In [52]:
list(some_labels) # the actual MEDV values of the same five rows, for comparison with the predictions

[21.9, 24.5, 16.7, 23.1, 23.0]

## Evaluating the model

In [53]:
from sklearn.metrics import mean_squared_error
# Error measured on the very data the model was fitted on, so it is an optimistic estimate
housing_predictions = model.predict(housing_num_tr)
mse = mean_squared_error(housing_labels, housing_predictions)
rmse = np.sqrt(mse)

In [54]:
mse

1.5177145148514826

## Using better evaluation technique - Cross Validation

In [55]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation; the "neg_mean_squared_error" scorer returns negated MSE values,
# hence the sign flip before taking the square root
scores = cross_val_score(model, housing_num_tr, housing_labels, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)

In [56]:
rmse_scores

array([2.93322155, 2.78936004, 4.52803689, 2.59191408, 3.24785319,
       2.86105633, 4.83724281, 3.39384866, 2.90537636, 3.08503715])

In [57]:
def print_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods (e.g. a NumPy
    array). Nothing is returned; the three lines go to standard output.
    """
    # Each entry pairs a label with a deferred computation, so every value is
    # evaluated only when its own line is printed.
    report = (
        ("scores : ", lambda: scores),
        ("Mean: ", lambda: scores.mean()),
        ("Standard deviation : ", lambda: scores.std()),
    )
    for label, compute in report:
        print(label, compute())

In [58]:
print_scores(rmse_scores)

scores :  [2.93322155 2.78936004 4.52803689 2.59191408 3.24785319 2.86105633
 4.83724281 3.39384866 2.90537636 3.08503715]
Mean:  3.317294704549673
Standard deviation :  0.7190966217403666


##### Quiz: Convert this notebook into a Python file and run the pipeline using VS Code.

#### launch the project

### Saving The Model

In [62]:
from joblib import dump, load
# Only the fitted model is written to disk; my_pipeline is not saved, so any input
# must already have been transformed by it before calling predict on the reloaded model.
dump(model, "Dragon.joblib")

['Dragon.joblib']

### Testing the model on test data

In [68]:
# Separate the features and the target of the held-out test set
X_test = strat_test_set.drop("MEDV", axis=1)
Y_test = strat_test_set["MEDV"].copy()
# transform only (no fit): the pipeline fitted on the training data is reused as-is
X_test_prepared = my_pipeline.transform(X_test)
final_predictions =  model.predict(X_test_prepared)
final_mse = mean_squared_error(Y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
# Prints every prediction followed by every actual value
print(final_predictions, list(Y_test))

[24.53  11.733 25.712 22.098 18.428 14.865 19.8   14.347 32.279 41.728
 19.274 12.121 24.679 28.633 19.516 11.353 31.371 14.543 23.391 18.581
 19.501 18.281 16.719 22.307 19.243 32.088 16.392 33.736  9.313 33.321
 24.177 21.471 22.831 11.15  21.135 11.207 42.643 24.568 24.309 42.463
 23.946 29.417 20.291 20.586 19.128 33.344 44.051 20.318 19.958 21.661
 21.344 14.365 21.306 14.923 24.914 33.92  41.803 29.432 19.804 22.239
 46.63   9.689 19.18  26.68  14.776 33.803 19.943 18.181 19.177 35.033
 25.941 22.911 21.478 22.459 34.743 12.943 15.8   20.042 20.853 21.389
 22.994 21.076 14.411 23.288 20.952 21.152 14.153 21.095 21.798 23.026
 18.433 27.039  7.383 26.676 17.83  29.358 19.232 31.106 14.74  26.836
 20.868 20.639] [16.5, 10.2, 30.1, 23.0, 14.4, 15.6, 19.4, 14.1, 30.3, 35.2, 23.1, 13.8, 25.0, 27.9, 19.5, 12.3, 32.2, 13.5, 23.8, 21.7, 19.2, 19.5, 10.4, 23.2, 18.6, 28.5, 15.2, 32.0, 7.2, 34.6, 20.1, 20.6, 23.6, 13.1, 23.8, 12.7, 43.1, 24.7, 22.2, 44.0, 28.1, 31.0, 21.7, 23.4, 19.5, 33.1

In [70]:
final_rmse

2.9089728040168756

In [72]:
prepared_data[0]

array([-0.43942006,  3.12628155, -1.12165014, -0.27288841, -1.42262747,
       -0.24455747, -1.31238772,  2.61111401, -1.0016859 , -0.5778192 ,
       -0.97491834,  0.41164221, -0.86091034])

### Using the model

In [73]:
from joblib import dump, load
import numpy as np
# Reload the model saved earlier as "Dragon.joblib"
model = load("Dragon.joblib")
# One already-preprocessed row (the same values as prepared_data[0]); raw feature
# values would first have to go through my_pipeline.transform.
features = np.array([[-0.43942006,  3.12628155, -1.12165014, -0.27288841, -1.42262747,
       -0.24455747, -1.31238772,  2.61111401, -1.0016859 , -0.5778192 ,
       -0.97491834,  0.41164221, -0.86091034]])
model.predict(features)

array([22.393])