In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_selection import SelectKBest,f_regression
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

import joblib

In [3]:
data = pd.read_csv('../LinearRegression/Dataset/housing_prices_dataset.csv')

In [5]:
data

Unnamed: 0,Size,Bedrooms,Bathrooms,Neighborhood,YearBuilt,HasGarage,LuxuryRating,ProximityToCityCenter,Condition,Price
0,10000.000000,,,Commercial,1973,,Medium,30.000000,5.488515,1.488980e+06
1,1930.867849,4.0,3.0,Industrial,1928,0.0,Medium,1.802602,7.003753,6.383428e+05
2,2323.844269,5.0,1.0,Commercial,2009,0.0,Medium,7.573310,3.090808,5.858642e+05
3,2761.514928,4.0,2.0,Industrial,1962,1.0,Low,2.761676,5.355583,7.148188e+05
4,1882.923313,3.0,2.0,Suburb,2001,0.0,Medium,5.537443,3.721960,5.637058e+05
...,...,...,...,...,...,...,...,...,...,...
4995,1975.517485,2.0,,Industrial,1948,0.0,Medium,4.502521,4.562272,5.572435e+05
4996,2355.705290,4.0,2.0,Commercial,1936,0.0,Low,3.156012,4.543997,6.891917e+05
4997,3556.455101,2.0,1.0,Industrial,1930,0.0,Low,5.276355,6.338340,7.605303e+05
4998,2404.018095,2.0,1.0,Suburb,1930,1.0,Low,6.526272,6.879909,5.751748e+05


In [6]:
def preprocess_data(data):
    data['HouseAge'] = 2024 - data['YearBuilt']
    data.drop(columns=['YearBuilt'], inplace=True)
    mapping = {'Low':1, 'Medium':2, 'High':3}
    data['LuxuryRating'] = data['LuxuryRating'].map(mapping)
    data = pd.get_dummies(data,dtype='int')
    return data

In [7]:
data

Unnamed: 0,Size,Bedrooms,Bathrooms,Neighborhood,YearBuilt,HasGarage,LuxuryRating,ProximityToCityCenter,Condition,Price
0,10000.000000,,,Commercial,1973,,Medium,30.000000,5.488515,1.488980e+06
1,1930.867849,4.0,3.0,Industrial,1928,0.0,Medium,1.802602,7.003753,6.383428e+05
2,2323.844269,5.0,1.0,Commercial,2009,0.0,Medium,7.573310,3.090808,5.858642e+05
3,2761.514928,4.0,2.0,Industrial,1962,1.0,Low,2.761676,5.355583,7.148188e+05
4,1882.923313,3.0,2.0,Suburb,2001,0.0,Medium,5.537443,3.721960,5.637058e+05
...,...,...,...,...,...,...,...,...,...,...
4995,1975.517485,2.0,,Industrial,1948,0.0,Medium,4.502521,4.562272,5.572435e+05
4996,2355.705290,4.0,2.0,Commercial,1936,0.0,Low,3.156012,4.543997,6.891917e+05
4997,3556.455101,2.0,1.0,Industrial,1930,0.0,Low,5.276355,6.338340,7.605303e+05
4998,2404.018095,2.0,1.0,Suburb,1930,1.0,Low,6.526272,6.879909,5.751748e+05


In [8]:
data_pre = preprocess_data(data)

In [9]:
data_pre

Unnamed: 0,Size,Bedrooms,Bathrooms,HasGarage,LuxuryRating,ProximityToCityCenter,Condition,Price,HouseAge,Neighborhood_Commercial,Neighborhood_Downtown,Neighborhood_Industrial,Neighborhood_Rural,Neighborhood_Suburb
0,10000.000000,,,,2,30.000000,5.488515,1.488980e+06,51,1,0,0,0,0
1,1930.867849,4.0,3.0,0.0,2,1.802602,7.003753,6.383428e+05,96,0,0,1,0,0
2,2323.844269,5.0,1.0,0.0,2,7.573310,3.090808,5.858642e+05,15,1,0,0,0,0
3,2761.514928,4.0,2.0,1.0,1,2.761676,5.355583,7.148188e+05,62,0,0,1,0,0
4,1882.923313,3.0,2.0,0.0,2,5.537443,3.721960,5.637058e+05,23,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,1975.517485,2.0,,0.0,2,4.502521,4.562272,5.572435e+05,76,0,0,1,0,0
4996,2355.705290,4.0,2.0,0.0,1,3.156012,4.543997,6.891917e+05,88,1,0,0,0,0
4997,3556.455101,2.0,1.0,0.0,1,5.276355,6.338340,7.605303e+05,94,0,0,1,0,0
4998,2404.018095,2.0,1.0,1.0,1,6.526272,6.879909,5.751748e+05,94,0,0,0,0,1


In [12]:
data_pre.drop(columns='Price')

Unnamed: 0,Size,Bedrooms,Bathrooms,HasGarage,LuxuryRating,ProximityToCityCenter,Condition,HouseAge,Neighborhood_Commercial,Neighborhood_Downtown,Neighborhood_Industrial,Neighborhood_Rural,Neighborhood_Suburb
0,10000.000000,,,,2,30.000000,5.488515,51,1,0,0,0,0
1,1930.867849,4.0,3.0,0.0,2,1.802602,7.003753,96,0,0,1,0,0
2,2323.844269,5.0,1.0,0.0,2,7.573310,3.090808,15,1,0,0,0,0
3,2761.514928,4.0,2.0,1.0,1,2.761676,5.355583,62,0,0,1,0,0
4,1882.923313,3.0,2.0,0.0,2,5.537443,3.721960,23,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,1975.517485,2.0,,0.0,2,4.502521,4.562272,76,0,0,1,0,0
4996,2355.705290,4.0,2.0,0.0,1,3.156012,4.543997,88,1,0,0,0,0
4997,3556.455101,2.0,1.0,0.0,1,5.276355,6.338340,94,0,0,1,0,0
4998,2404.018095,2.0,1.0,1.0,1,6.526272,6.879909,94,0,0,0,0,1


In [13]:
data_pre.drop(columns='Price').columns

Index(['Size', 'Bedrooms', 'Bathrooms', 'HasGarage', 'LuxuryRating',
       'ProximityToCityCenter', 'Condition', 'HouseAge',
       'Neighborhood_Commercial', 'Neighborhood_Downtown',
       'Neighborhood_Industrial', 'Neighborhood_Rural', 'Neighborhood_Suburb'],
      dtype='object')

In [14]:
features = data_pre.drop(columns='Price').columns

In [15]:
features

Index(['Size', 'Bedrooms', 'Bathrooms', 'HasGarage', 'LuxuryRating',
       'ProximityToCityCenter', 'Condition', 'HouseAge',
       'Neighborhood_Commercial', 'Neighborhood_Downtown',
       'Neighborhood_Industrial', 'Neighborhood_Rural', 'Neighborhood_Suburb'],
      dtype='object')

In [16]:
transformer = Pipeline(steps=[('imputer',SimpleImputer(strategy='median')),
                              ('scaling',StandardScaler())])

In [18]:
preprocessor = ColumnTransformer(transformers=[('num',transformer,features)])

In [31]:
pipeline = Pipeline(steps=[('preprocessor',preprocessor),
                           ('feature_selection',SelectKBest(f_regression,k=6)),
                           ('model',LinearRegression())])

In [32]:
pipeline

In [21]:
X=data_pre[features]
y=data_pre['Price']

In [22]:
X

Unnamed: 0,Size,Bedrooms,Bathrooms,HasGarage,LuxuryRating,ProximityToCityCenter,Condition,HouseAge,Neighborhood_Commercial,Neighborhood_Downtown,Neighborhood_Industrial,Neighborhood_Rural,Neighborhood_Suburb
0,10000.000000,,,,2,30.000000,5.488515,51,1,0,0,0,0
1,1930.867849,4.0,3.0,0.0,2,1.802602,7.003753,96,0,0,1,0,0
2,2323.844269,5.0,1.0,0.0,2,7.573310,3.090808,15,1,0,0,0,0
3,2761.514928,4.0,2.0,1.0,1,2.761676,5.355583,62,0,0,1,0,0
4,1882.923313,3.0,2.0,0.0,2,5.537443,3.721960,23,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,1975.517485,2.0,,0.0,2,4.502521,4.562272,76,0,0,1,0,0
4996,2355.705290,4.0,2.0,0.0,1,3.156012,4.543997,88,1,0,0,0,0
4997,3556.455101,2.0,1.0,0.0,1,5.276355,6.338340,94,0,0,1,0,0
4998,2404.018095,2.0,1.0,1.0,1,6.526272,6.879909,94,0,0,0,0,1


In [23]:
y

0       1.488980e+06
1       6.383428e+05
2       5.858642e+05
3       7.148188e+05
4       5.637058e+05
            ...     
4995    5.572435e+05
4996    6.891917e+05
4997    7.605303e+05
4998    5.751748e+05
4999    4.319291e+05
Name: Price, Length: 5000, dtype: float64

In [24]:
pipeline.fit(X,y)

In [25]:
pipeline.get_feature_names_out

<bound method Pipeline.get_feature_names_out of Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaling',
                                                                   StandardScaler())]),
                                                  Index(['Size', 'Bedrooms', 'Bathrooms', 'HasGarage', 'LuxuryRating',
       'ProximityToCityCenter', 'Condition', 'HouseAge',
       'Neighborhood_Commercial', 'Neighborhood_Downtown',
       'Neighborhood_Industrial', 'Neighborhood_Rural', 'Neighborhood_Suburb'],
      dtype='object'))])),
                ('model', LinearRegression())])>

In [26]:
X

Unnamed: 0,Size,Bedrooms,Bathrooms,HasGarage,LuxuryRating,ProximityToCityCenter,Condition,HouseAge,Neighborhood_Commercial,Neighborhood_Downtown,Neighborhood_Industrial,Neighborhood_Rural,Neighborhood_Suburb
0,10000.000000,,,,2,30.000000,5.488515,51,1,0,0,0,0
1,1930.867849,4.0,3.0,0.0,2,1.802602,7.003753,96,0,0,1,0,0
2,2323.844269,5.0,1.0,0.0,2,7.573310,3.090808,15,1,0,0,0,0
3,2761.514928,4.0,2.0,1.0,1,2.761676,5.355583,62,0,0,1,0,0
4,1882.923313,3.0,2.0,0.0,2,5.537443,3.721960,23,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,1975.517485,2.0,,0.0,2,4.502521,4.562272,76,0,0,1,0,0
4996,2355.705290,4.0,2.0,0.0,1,3.156012,4.543997,88,1,0,0,0,0
4997,3556.455101,2.0,1.0,0.0,1,5.276355,6.338340,94,0,0,1,0,0
4998,2404.018095,2.0,1.0,1.0,1,6.526272,6.879909,94,0,0,0,0,1


In [28]:
data_pre.corr()

Unnamed: 0,Size,Bedrooms,Bathrooms,HasGarage,LuxuryRating,ProximityToCityCenter,Condition,Price,HouseAge,Neighborhood_Commercial,Neighborhood_Downtown,Neighborhood_Industrial,Neighborhood_Rural,Neighborhood_Suburb
Size,1.0,-0.002868,0.004128,-0.000767,-0.018547,0.498816,-1.1e-05,0.952369,0.009397,0.010432,-0.018544,-0.007797,0.029842,-0.01333
Bedrooms,-0.002868,1.0,-0.007199,-0.009011,-0.004641,-0.00171,0.004088,0.392132,-0.003371,0.004408,0.007342,-0.011365,0.010723,-0.010978
Bathrooms,0.004128,-0.007199,1.0,0.00678,-0.023857,0.003935,0.000893,0.156912,0.007046,0.000305,-0.005398,-0.000869,0.002293,0.003698
HasGarage,-0.000767,-0.009011,0.00678,1.0,0.010279,0.019281,-0.013893,-0.005184,-0.016489,-0.009345,-0.002033,0.01386,0.026156,-0.027524
LuxuryRating,-0.018547,-0.004641,-0.023857,0.010279,1.0,-0.017004,0.010778,-0.021039,0.019314,-0.023001,-0.006002,0.007545,0.007958,0.01339
ProximityToCityCenter,0.498816,-0.00171,0.003935,0.019281,-0.017004,1.0,-0.014041,0.338437,-0.006787,0.009468,-0.008498,-0.015072,0.030733,-0.016241
Condition,-1.1e-05,0.004088,0.000893,-0.013893,0.010778,-0.014041,1.0,0.005978,-0.024348,0.005811,-0.000371,-0.008713,-0.017631,0.020238
Price,0.952369,0.392132,0.156912,-0.005184,-0.021039,0.338437,0.005978,1.0,-0.005592,0.008127,-0.01819,-0.006762,0.030066,-0.012646
HouseAge,0.009397,-0.003371,0.007046,-0.016489,0.019314,-0.006787,-0.024348,-0.005592,1.0,0.006058,0.012342,-0.01714,0.001274,-0.002932
Neighborhood_Commercial,0.010432,0.004408,0.000305,-0.009345,-0.023001,0.009468,0.005811,0.008127,0.006058,1.0,-0.252752,-0.239421,-0.24471,-0.255987


In [35]:
pipeline.fit(X,y)

In [36]:
pipeline.named_steps['feature_selection'].get_support()

array([ True,  True,  True,  True, False,  True, False, False, False,
       False, False,  True, False])

In [39]:
X.columns[pipeline.named_steps['feature_selection'].get_support()]

Index(['Size', 'Bedrooms', 'Bathrooms', 'HasGarage', 'ProximityToCityCenter',
       'Neighborhood_Rural'],
      dtype='object')

In [40]:
X

Unnamed: 0,Size,Bedrooms,Bathrooms,HasGarage,LuxuryRating,ProximityToCityCenter,Condition,HouseAge,Neighborhood_Commercial,Neighborhood_Downtown,Neighborhood_Industrial,Neighborhood_Rural,Neighborhood_Suburb
0,10000.000000,,,,2,30.000000,5.488515,51,1,0,0,0,0
1,1930.867849,4.0,3.0,0.0,2,1.802602,7.003753,96,0,0,1,0,0
2,2323.844269,5.0,1.0,0.0,2,7.573310,3.090808,15,1,0,0,0,0
3,2761.514928,4.0,2.0,1.0,1,2.761676,5.355583,62,0,0,1,0,0
4,1882.923313,3.0,2.0,0.0,2,5.537443,3.721960,23,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,1975.517485,2.0,,0.0,2,4.502521,4.562272,76,0,0,1,0,0
4996,2355.705290,4.0,2.0,0.0,1,3.156012,4.543997,88,1,0,0,0,0
4997,3556.455101,2.0,1.0,0.0,1,5.276355,6.338340,94,0,0,1,0,0
4998,2404.018095,2.0,1.0,1.0,1,6.526272,6.879909,94,0,0,0,0,1


In [41]:
features

Index(['Size', 'Bedrooms', 'Bathrooms', 'HasGarage', 'LuxuryRating',
       'ProximityToCityCenter', 'Condition', 'HouseAge',
       'Neighborhood_Commercial', 'Neighborhood_Downtown',
       'Neighborhood_Industrial', 'Neighborhood_Rural', 'Neighborhood_Suburb'],
      dtype='object')

In [42]:
X.head(1)

Unnamed: 0,Size,Bedrooms,Bathrooms,HasGarage,LuxuryRating,ProximityToCityCenter,Condition,HouseAge,Neighborhood_Commercial,Neighborhood_Downtown,Neighborhood_Industrial,Neighborhood_Rural,Neighborhood_Suburb
0,10000.0,,,,2,30.0,5.488515,51,1,0,0,0,0


In [43]:
predict_data_X = X.head(1)

In [44]:
predict_data_X

Unnamed: 0,Size,Bedrooms,Bathrooms,HasGarage,LuxuryRating,ProximityToCityCenter,Condition,HouseAge,Neighborhood_Commercial,Neighborhood_Downtown,Neighborhood_Industrial,Neighborhood_Rural,Neighborhood_Suburb
0,10000.0,,,,2,30.0,5.488515,51,1,0,0,0,0


In [45]:
pipeline.predict(predict_data_X)

array([1530390.86108854])

In [47]:
y[0]

np.float64(1488980.0686547682)

In [48]:
joblib.dump(pipeline,'houseprice_pieline_lr2.pkl')

['houseprice_pieline_lr2.pkl']

In [49]:
X[0:2]

Unnamed: 0,Size,Bedrooms,Bathrooms,HasGarage,LuxuryRating,ProximityToCityCenter,Condition,HouseAge,Neighborhood_Commercial,Neighborhood_Downtown,Neighborhood_Industrial,Neighborhood_Rural,Neighborhood_Suburb
0,10000.0,,,,2,30.0,5.488515,51,1,0,0,0,0
1,1930.867849,4.0,3.0,0.0,2,1.802602,7.003753,96,0,0,1,0,0


In [51]:
X[0:2].to_csv('test.csv')

In [52]:
data1= pd.read_csv('../LinearRegression/Dataset/housing_prices_dataset.csv')

In [53]:
data1

Unnamed: 0,Size,Bedrooms,Bathrooms,Neighborhood,YearBuilt,HasGarage,LuxuryRating,ProximityToCityCenter,Condition,Price
0,10000.000000,,,Commercial,1973,,Medium,30.000000,5.488515,1.488980e+06
1,1930.867849,4.0,3.0,Industrial,1928,0.0,Medium,1.802602,7.003753,6.383428e+05
2,2323.844269,5.0,1.0,Commercial,2009,0.0,Medium,7.573310,3.090808,5.858642e+05
3,2761.514928,4.0,2.0,Industrial,1962,1.0,Low,2.761676,5.355583,7.148188e+05
4,1882.923313,3.0,2.0,Suburb,2001,0.0,Medium,5.537443,3.721960,5.637058e+05
...,...,...,...,...,...,...,...,...,...,...
4995,1975.517485,2.0,,Industrial,1948,0.0,Medium,4.502521,4.562272,5.572435e+05
4996,2355.705290,4.0,2.0,Commercial,1936,0.0,Low,3.156012,4.543997,6.891917e+05
4997,3556.455101,2.0,1.0,Industrial,1930,0.0,Low,5.276355,6.338340,7.605303e+05
4998,2404.018095,2.0,1.0,Suburb,1930,1.0,Low,6.526272,6.879909,5.751748e+05


# House age computation
# Drop of YearBuilt column
# Lable encoding of LuxuryRating
# Imputing nan values with median
# Scaling of features

In [None]:
#ceating function for House age computation,Drop YearBuilt column, Lable encoding of LuxuryRating and 
def prep_data(data):
    data1['HouseAge'] = 2024 - data1['YearBuilt']
    data1.drop(columns='YearBuilt',inplace=True)
    map1 = {'Low':1, 'Medium':2, 'High':3}
    data1['LuxuryRating'] = data1['LuxuryRating'].map(map1)


In [None]:
transformer1 = Pipeline (steps=[([('maping'),mapping = {'Low':1, 'Medium':2, 'High':3}]),('imputer',SimpleImputer(strategy='median')),()])