In [1]:
import pandas as pd
import numpy as np
import quick_preprocess

#### House Data

In [2]:
df = pd.read_csv('housing.csv')

In [3]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,NEAR BAY
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,NEAR BAY
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,NEAR BAY
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,NEAR BAY


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  int64  
 3   total_rooms         20640 non-null  int64  
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  int64  
 6   households          20640 non-null  int64  
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  int64  
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(4), int64(5), object(1)
memory usage: 1.6+ MB


#### Handle Missing Data

In [5]:
df1 = df.copy()
df1 = quick_preprocess.handle_missing_data(df=df1,
                                           column=['total_bedrooms'],
                                           imputation=['most_frequent'])
df1

Replaced missing values in 'total_bedrooms' with 'most_frequent' value.


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,NEAR BAY
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,NEAR BAY
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,NEAR BAY
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25,1665,374.0,845,330,1.5603,78100,INLAND
20636,-121.21,39.49,18,697,150.0,356,114,2.5568,77100,INLAND
20637,-121.22,39.43,17,2254,485.0,1007,433,1.7000,92300,INLAND
20638,-121.32,39.43,18,1860,409.0,741,349,1.8672,84700,INLAND


In [6]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  int64  
 3   total_rooms         20640 non-null  int64  
 4   total_bedrooms      20640 non-null  float64
 5   population          20640 non-null  int64  
 6   households          20640 non-null  int64  
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  int64  
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(4), int64(5), object(1)
memory usage: 1.6+ MB


#### Encoding Categorical Feature(s)

In [7]:
df2 = df1.copy()
df2 = quick_preprocess.encode_categorical_data(df=df2,
                                               col_for_one_hot_encoding=['ocean_proximity'],
                                               col_for_ordinal_encoding=[],
                                               drop=True)
# exception
p=[x for x in df2.columns if isinstance(x,str) ]
q=[x[0] for x in df2.columns if isinstance(x,tuple)]
df2.columns=p+q
df2

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,0.0,0.0,0.0,1.0,0.0
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,0.0,0.0,0.0,1.0,0.0
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,0.0,0.0,0.0,1.0,0.0
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,0.0,0.0,0.0,1.0,0.0
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25,1665,374.0,845,330,1.5603,78100,0.0,1.0,0.0,0.0,0.0
20636,-121.21,39.49,18,697,150.0,356,114,2.5568,77100,0.0,1.0,0.0,0.0,0.0
20637,-121.22,39.43,17,2254,485.0,1007,433,1.7000,92300,0.0,1.0,0.0,0.0,0.0
20638,-121.32,39.43,18,1860,409.0,741,349,1.8672,84700,0.0,1.0,0.0,0.0,0.0


In [8]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  int64  
 3   total_rooms         20640 non-null  int64  
 4   total_bedrooms      20640 non-null  float64
 5   population          20640 non-null  int64  
 6   households          20640 non-null  int64  
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  int64  
 9   <1H OCEAN           20640 non-null  float64
 10  INLAND              20640 non-null  float64
 11  ISLAND              20640 non-null  float64
 12  NEAR BAY            20640 non-null  float64
 13  NEAR OCEAN          20640 non-null  float64
dtypes: float64(9), int64(5)
memory usage: 2.2 MB


#### Pipeline - using Linear Regression : Predict Median House Value

In [9]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,NEAR BAY
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,NEAR BAY
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,NEAR BAY
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,NEAR BAY


In [10]:
# Pipeline
from sklearn.pipeline import Pipeline

# preprocessor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# estimator
from sklearn.linear_model import LinearRegression


In [11]:
# Numerical features
num_col= ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
          'total_bedrooms', 'population', 'households', 'median_income']


num_steps = [
             ('impute',SimpleImputer(strategy='mean')),
             ('scaler', StandardScaler()),
            ]

num_pipeline = Pipeline(num_steps)

In [12]:
# Categorical features
cat_col = ['ocean_proximity']

cat_steps = [
             ('impute',SimpleImputer(strategy='constant')),
             ('encoder',OneHotEncoder())
            ]
cat_pipeline = Pipeline(cat_steps)

In [13]:
# Preprocessing transformer
preprocessor = ColumnTransformer( 
                                    transformers=[('numeric', num_pipeline, num_col),
                                                  ('categorical', cat_pipeline, cat_col)
                                                 ]
                                )

In [14]:
# Final transformer
pipeline = Pipeline(
                    steps = [
                             ('preprocessor', preprocessor),
                             ('regressor',LinearRegression())
                            ]
                   )

#### Apply on data

In [15]:
X=df.drop(columns='median_house_value').copy()
y=np.array(df['median_house_value'].copy()).reshape(-1,1)
X.shape,y.shape

((20640, 9), (20640, 1))

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=30)
for j in [X_train, X_test, y_train, y_test]:
    print(j.shape)

(16512, 9)
(4128, 9)
(16512, 1)
(4128, 1)


In [17]:
pipeline

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['longitude', 'latitude',
                                                   'housing_median_age',
                                                   'total_rooms',
                                                   'total_bedrooms',
                                                   'population', 'households',
                                                   'median_income']),
                                                 ('categorical',
                                                  Pipeline(steps=[('impute',
    

In [18]:
lr_model = pipeline.fit(X_train, y_train)
print (lr_model)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['longitude', 'latitude',
                                                   'housing_median_age',
                                                   'total_rooms',
                                                   'total_bedrooms',
                                                   'population', 'households',
                                                   'median_income']),
                                                 ('categorical',
                                                  Pipeline(steps=[('impute',
    

In [19]:
from sklearn.metrics import r2_score
predictions = lr_model.predict(X_test)
print ("Score : ",r2_score(y_test, predictions))

Score :  0.6223661413577597


#### Find best estimator

In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.linear_model import Ridge


for reg in [RandomForestRegressor(),LinearRegression(),Ridge(),Lasso()]:
    pipeline = Pipeline(
                    steps = [
                             ('preprocessor', preprocessor),
                             ('regressor',reg)
                            ]
                   )
    lr_model = pipeline.fit(X_train, y_train)
    predictions = lr_model.predict(X_test)
    print (r2_score(y_test, predictions))
    


  self._final_estimator.fit(Xt, y, **fit_params_last_step)


0.8143359202760672
0.6223661413577597
0.6222751160511161
0.6222730413309762


  positive)


#### Best Estimator - Random Forest Regressor

In [21]:
pipeline = Pipeline(
                steps = [
                         ('preprocessor', preprocessor),
                         ('regressor',RandomForestRegressor())
                        ]
               )
lr_model = pipeline.fit(X_train, y_train)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


#### Test

In [22]:
row_no = int(input("Row Number : "))
test = X.iloc[row_no-1:row_no,:]
p=lr_model.predict(test)
print("Actual House Value : {}\nPredicted House Value : {} ".format(df['median_house_value'][row_no-1],p[0]))
print("% error : ",round((p[0]/df['median_house_value'][row_no-1]-1)*100,2),"%")

Row Number : 1002
Actual House Value : 158400
Predicted House Value : 160677.0 
% error :  1.44 %
