In [1]:
import pandas as pd

In [2]:
housing = pd.read_csv("data.csv")

In [3]:
housing.head()

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,1,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.917,19.5,306.5947,9,24.98034,121.53951,42.2
2,3,2013.583,13.3,561.9845,5,24.98746,121.54391,47.3
3,4,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,5,2012.833,5.0,390.5684,5,24.97937,121.54245,43.1


In [4]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414 entries, 0 to 413
Data columns (total 8 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   No                                      414 non-null    int64  
 1   X1 transaction date                     414 non-null    float64
 2   X2 house age                            414 non-null    float64
 3   X3 distance to the nearest MRT station  414 non-null    float64
 4   X4 number of convenience stores         414 non-null    int64  
 5   X5 latitude                             414 non-null    float64
 6   X6 longitude                            414 non-null    float64
 7   Y house price of unit area              414 non-null    float64
dtypes: float64(6), int64(2)
memory usage: 26.0 KB


In [5]:
housing.describe()

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
count,414.0,414.0,414.0,414.0,414.0,414.0,414.0,414.0
mean,207.5,2013.148971,17.71256,1083.885689,4.094203,24.96903,121.533361,37.980193
std,119.655756,0.281967,11.392485,1262.109595,2.945562,0.01241,0.015347,13.606488
min,1.0,2012.667,0.0,23.38284,0.0,24.93207,121.47353,7.6
25%,104.25,2012.917,9.025,289.3248,1.0,24.963,121.528085,27.7
50%,207.5,2013.167,16.1,492.2313,4.0,24.9711,121.53863,38.45
75%,310.75,2013.417,28.15,1454.279,6.0,24.977455,121.543305,46.6
max,414.0,2013.583,43.8,6488.021,10.0,25.01459,121.56627,117.5


In [6]:
%matplotlib inline

In [7]:
# for plotting histograms
# import matplotlib.pyplot as plt
# housing.hist(bins=50, figsize=(20,15))

Train-Test Ratio

In [8]:
# for learning purpose
import numpy as np
def split_train_test(data, test_ratio, seed=42):
    """Randomly split ``data`` into a train part and a test part.

    Parameters
    ----------
    data : pandas.DataFrame (or any object supporting ``len`` and ``.iloc``)
        The rows to split.
    test_ratio : float
        Fraction of rows, between 0 and 1 inclusive, placed in the test part.
        The test size is ``int(len(data) * test_ratio)`` (rounded down).
    seed : int, default 42
        Seed for the shuffle. The default reproduces the permutation that
        ``np.random.seed(42); np.random.permutation(len(data))`` would give.

    Returns
    -------
    (train, test) : tuple
        ``data.iloc[...]`` selections for the train rows and the test rows.

    Raises
    ------
    ValueError
        If ``test_ratio`` is outside the range [0, 1].
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")

    # A local generator keeps the split reproducible without reseeding
    # NumPy's global random state as a side effect of calling this function.
    rng = np.random.RandomState(seed)
    shuffled = rng.permutation(len(data))

    test_size = int(len(data) * test_ratio)
    test_indices = shuffled[:test_size]
    train_indices = shuffled[test_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [9]:
# train_set, test_set = split_train_test(housing, 0.2)

In [10]:
# print(f"Rows in train set : {len(train_set)}\nRows in test set : {len(test_set)}")

In [11]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
print(f"Rows in train set : {len(train_set)}\nRows in test set : {len(test_set)}")

Rows in train set : 331
Rows in test set : 83


In [12]:
from sklearn.model_selection import StratifiedShuffleSplit
# One stratified 80/20 split, stratified on the number of convenience stores so
# that both parts keep the same distribution of that column.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing['X4 number of convenience stores']):
    # split() yields positional row indices, so select by position (.iloc).
    # .loc would only be correct while the frame has a default 0..n-1 index.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

In [13]:
# strat_train_set
# strat_train_set.describe()
# strat_train_set.info()
strat_train_set['X4 number of convenience stores'].value_counts()

X4 number of convenience stores
0     53
5     53
3     37
1     37
6     30
7     25
4     25
8     24
9     20
2     19
10     8
Name: count, dtype: int64

In [14]:
strat_test_set['X4 number of convenience stores'].value_counts()

X4 number of convenience stores
5     14
0     14
3      9
1      9
6      7
4      6
7      6
8      6
9      5
2      5
10     2
Name: count, dtype: int64

Looking for correlations

In [15]:
corr_matrix = housing.corr()

In [16]:
corr_matrix['X4 number of convenience stores'].sort_values(ascending=False)

X4 number of convenience stores           1.000000
Y house price of unit area                0.571005
X6 longitude                              0.449099
X5 latitude                               0.444143
X2 house age                              0.049593
X1 transaction date                       0.009635
No                                       -0.012699
X3 distance to the nearest MRT station   -0.602519
Name: X4 number of convenience stores, dtype: float64

In [17]:
# from pandas.plotting import scatter_matrix
# attributes = ["X4 number of convenience stores", "X2 house age", "X3 distance to the nearest MRT station", "Y house price of unit area"]
# scatter_matrix(housing[attributes], figsize = (12,8))

In [18]:
# housing.plot(kind="scatter", x="X3 distance to the nearest MRT station", y="Y house price of unit area", alpha=0.8)

In [19]:
# Separate the training features from the training target.
# NOTE(review): the target taken here is "X5 latitude", while
# "Y house price of unit area" stays among the features; the model is later saved
# as 'Price_Predicted.joblib' — confirm which column is the intended label.
# NOTE: this rebinds `housing` from the full dataset to the training features only.
housing = strat_train_set.drop("X5 latitude", axis=1)
housing_labels = strat_train_set["X5 latitude"].copy()

Missing Attributes

In [20]:
# to take care of missing attributes, we have 3 options:
#     1. Get rid of the missing data points
#     2. Get rid of the whole attribute
#     3. Set the value to some value(0 or mean or median)

In [21]:
a = housing.dropna(subset=["Y house price of unit area"]) # option1
a.shape
# note that dropna removes the rows whose "Y house price of unit area" value is missing (the column itself is kept),
# and that it returns a new dataframe: the original housing dataframe will remain unchanged

(331, 7)

In [22]:
housing.drop("Y house price of unit area", axis=1).shape # option 2
# note that there is no Y house price of unit area column and also note that the original housing dataframe will remain unchanged

(331, 6)

In [23]:
median = housing["Y house price of unit area"].median() # option 3

In [24]:
housing["Y house price of unit area"].fillna(median)

369    22.8
360    47.1
180    15.5
188    44.3
404    41.2
       ... 
10     41.4
381    47.3
159    37.4
141    28.9
292    24.5
Name: Y house price of unit area, Length: 331, dtype: float64

In [25]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy = "median")
imputer.fit(housing)

In [26]:
imputer.statistics_

array([ 209.     , 2013.167  ,   16.1    ,  492.2313 ,    4.     ,
        121.53844,   38.3    ])

In [27]:
X = imputer.transform(housing)

In [28]:
# Rebuild a DataFrame from the imputed array. Passing index=housing.index keeps the
# original row labels, so housing_tr stays aligned with housing and housing_labels
# (without it the rows would be relabelled 0..n-1).
housing_tr = pd.DataFrame(X, columns=housing.columns, index=housing.index)

In [29]:
housing_tr.describe()

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X6 longitude,Y house price of unit area
count,331.0,331.0,331.0,331.0,331.0,331.0,331.0
mean,209.36858,2013.142767,18.019033,1080.612966,4.102719,121.533541,37.623565
std,120.383667,0.280728,11.497031,1250.250085,2.944182,0.015267,12.960756
min,1.0,2012.667,0.0,23.38284,0.0,121.47516,7.6
25%,103.5,2012.917,9.1,289.3248,1.0,121.529675,27.3
50%,209.0,2013.167,16.1,492.2313,4.0,121.53844,38.3
75%,311.5,2013.417,29.6,1448.504,6.0,121.543395,46.15
max,414.0,2013.583,43.8,6396.283,10.0,121.56627,78.3


SCIKIT-LEARN DESIGN

Scikit-learn primarily has three types of objects:
1. Estimators - an estimator estimates some parameters based on a dataset, e.g. the imputer.
It has a fit() method, which fits the dataset and calculates internal parameters.
2. Transformers - the transform() method takes input and returns output based on what was learned in fit(). A transformer also has a convenience function called fit_transform(), which fits and then transforms.
3. Predictors - the Linear Regression model is an example of a predictor. fit() and predict() are its two common functions. It also provides a score() function, which evaluates the predictions.

FEATURE SCALING

There are primarily two types of feature scaling methods:
1. Min-max scaling (Normalization)
    (value - min) / (max - min)
   sklearn provides a class called MinMaxScaler for this
2. Standardization
   (value - mean) / standard deviation
   sklearn provides a class called StandardScaler for this

CREATING A PIPELINE

In [30]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
my_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    # add as many as your pipeline
    ('std_scaler', StandardScaler()),
])

In [31]:
# Fit the pipeline on the raw training features: the pipeline already contains the
# imputer, so feeding it the pre-imputed housing_tr would impute twice. Later cells
# also apply my_pipeline.transform() to raw (un-imputed) frames.
housing_num_tr = my_pipeline.fit_transform(housing)

In [32]:
housing_num_tr.shape

(331, 7)

SELECTING A DESIRED MODEL FOR OUR PROJECT

In [33]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
# model = LinearRegression()
# model = DecisionTreeRegressor()
# Fix the forest's random_state so that re-running the notebook reproduces the same
# model, predictions and scores (same seed as the splits above).
model = RandomForestRegressor(random_state=42)
model.fit(housing_num_tr, housing_labels)

In [34]:
some_data = housing.iloc[:5]

In [35]:
some_labels = housing_labels.iloc[:5]

In [36]:
prepared_data = my_pipeline.transform(some_data)

In [37]:
model.predict(prepared_data)

array([24.9630087, 24.9824542, 24.9488344, 24.9760994, 24.9802227])

In [38]:
list(some_labels)

[24.96322, 24.983, 24.94898, 24.97707, 24.98203]

EVALUATING THE MODEL

In [39]:
from sklearn.metrics import mean_squared_error
# RMSE computed on the same prepared rows the model was fitted on, i.e. the
# training error rather than an estimate of performance on unseen data.
housing_predictions = model.predict(housing_num_tr)
mse = mean_squared_error(housing_labels, housing_predictions)
rmse = np.sqrt(mse)

In [40]:
rmse

0.002754574406542031

USING BETTER EVALUATION TECHNIQUE

In [41]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, housing_num_tr, housing_labels, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)

In [42]:
rmse_scores

array([0.00489882, 0.01239258, 0.00446498, 0.01305018, 0.0054038 ,
       0.00647763, 0.00271927, 0.00521897, 0.00653512, 0.00491439])

In [43]:
def print_scores(scores):
    """Print the scores followed by their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods (for example a
    NumPy array). Nothing is returned; the three lines go to standard output.
    """
    # Each entry is (label, function computing the value), evaluated lazily in
    # order so that every line is printed before the next value is computed.
    report = (
        ("Scores: ", lambda values: values),
        ("Mean: ", lambda values: values.mean()),
        ("Standard Deviation: ", lambda values: values.std()),
    )
    for label, compute in report:
        print(label, compute(scores))

In [45]:
print_scores(rmse_scores)

Scores:  [0.00489882 0.01239258 0.00446498 0.01305018 0.0054038  0.00647763
 0.00271927 0.00521897 0.00653512 0.00491439]
Mean:  0.0066075745532597745
Standard Deviation:  0.003222876588011706


SAVING THE MODEL

In [46]:
from joblib import dump, load
# Persists only the fitted regressor to disk; my_pipeline (imputer + scaler), which
# prepares the inputs before model.predict(), is not written by this call.
dump(model, 'Price_Predicted.joblib')

['Price_Predicted.joblib']

TESTING THE MODEL ON TEST DATA

In [51]:
# Separate features and target from the held-out stratified test set,
# using the same target column as for the training set.
X_test = strat_test_set.drop("X5 latitude", axis=1)
Y_test = strat_test_set["X5 latitude"].copy()
# transform() only (no fit): reuses the imputer/scaler statistics already fitted earlier
X_test_prepared = my_pipeline.transform(X_test)
final_predictions = model.predict(X_test_prepared)
final_mse = mean_squared_error(Y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
# NOTE(review): this prints every prediction and every label; consider showing only a few rows
print(final_predictions, list(Y_test))

[24.9786317 24.9744607 24.9502897 24.9547184 24.9746589 24.9885505
 24.9663478 24.968711  24.9728078 24.9739617 24.9727807 24.9744389
 24.9686429 24.963366  24.9755009 24.9711929 24.9503664 24.9816848
 24.9791643 24.9668491 24.9776424 24.9738854 24.9723972 24.9643501
 24.9736444 24.9724085 24.969515  24.971706  24.9725168 24.9735892
 24.9557172 24.9732498 24.972255  24.9650229 24.9486053 24.9646625
 24.9663984 24.9617012 24.974594  24.9425464 24.9751853 24.97078
 24.9703485 24.9785477 24.9526578 24.9744749 24.9738143 24.9781699
 24.9630805 24.9797959 24.9817441 24.9767266 24.9415102 24.9810591
 24.9761001 24.9666678 24.9736503 24.965569  24.9768443 24.9733361
 24.9741576 24.9430274 24.9737277 24.9750187 24.9625295 24.9638027
 24.9736131 24.9769497 24.9502621 24.9664755 24.976111  24.9634899
 24.970973  24.9762356 24.9724754 24.9712476 24.9745154 24.9559608
 24.9802225 24.9416636 24.9707134 24.9600667 24.976588 ] [24.98203, 24.97445, 24.94783, 24.95505, 24.97445, 24.93293, 24.96735, 24.

In [48]:
final_rmse

0.008832709976286065

In [53]:
prepared_data[0]

array([ 1.33634919, -1.69732575,  0.18998546,  0.88477281, -0.37510886,
       -1.38876478, -1.14545847])