In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score,accuracy_score
import joblib

In [2]:
# Load the raw housing table from the CSV file next to the notebook.
df = pd.read_csv(filepath_or_buffer='housing.csv')


In [3]:
# Remove the 'id' and 'date' columns so they are not fed to the model.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after the columns are already gone no
# longer raises a KeyError.
df = df.drop(columns=['id', 'date'], errors='ignore')
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,221900.0,3,1.0,1180,5650,1.0,N,0,Average,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,538000.0,3,2.25,2570,7242,2.0,N,0,Average,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,180000.0,2,1.0,770,10000,1.0,N,0,Average,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,604000.0,4,3.0,1960,5000,1.0,N,0,Very Good,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,510000.0,3,2.0,1680,8080,1.0,N,0,Average,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [4]:
# Peek at ten randomly chosen 'waterfront' values to see how the flag is encoded.
# No random_state is passed, so the sampled rows differ on every run.
df['waterfront'].sample(n=10)

7107     N
2657     N
15314    N
3006     N
14316    N
14963    N
2092     N
16318    N
3256     N
14449    N
Name: waterfront, dtype: object

**Feature Engineering**

In [5]:
# Count missing values per column.
df.isna().sum()

price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

In [6]:
# Count rows that are exact copies of an earlier row (the first occurrence is not counted).
# NOTE(review): the duplicates are only counted here; no visible cell removes them - confirm that is intended.
df.duplicated(keep='first').sum()

np.int64(5)

In [7]:
# Columns routed to the numeric (scaled) branch of the preprocessor.
# Kept as a list because later cells extend it.
numeric_features = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'view',
    'grade',
    'sqft_above',
    'sqft_basement',
    'yr_built',
    'yr_renovated',
    'lat',
    'long',
    'sqft_living15',
    'sqft_lot15',
]

In [8]:
# Columns routed to the categorical (one-hot encoded) branch of the preprocessor.
# Kept as a list because later cells extend it.
categ_features = [
    'zipcode',
    'condition',
    'waterfront',
]

In [9]:
# Show the first five rows again before deriving new columns.
df.head(n=5)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,221900.0,3,1.0,1180,5650,1.0,N,0,Average,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,538000.0,3,2.25,2570,7242,2.0,N,0,Average,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,180000.0,2,1.0,770,10000,1.0,N,0,Average,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,604000.0,4,3.0,1960,5000,1.0,N,0,Very Good,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,510000.0,3,2.0,1680,8080,1.0,N,0,Average,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [10]:
# Derived features. Every new column is computed from raw columns only, so the
# cell can be re-run without compounding its own results.
REFERENCE_YEAR = 2025  # year against which the age of the house is measured

df['Houseage'] = REFERENCE_YEAR - df['yr_built']
# NOTE(review): in the previewed rows sqft_above + sqft_basement equals
# sqft_living, so TotalSqft may simply duplicate that column - confirm.
df['TotalSqft'] = df['sqft_above'] + df['sqft_basement']
df['Renovated'] = df['yr_renovated'] > 0  # boolean flag: True when a renovation year is recorded

# Ratio features. A zero denominator would produce inf (x/0) or NaN (0/0),
# which the downstream scaler / linear model cannot fit, so the denominator's
# zeros are masked first and the affected rows get a ratio of 0 instead.
# NOTE(review): the previewed rows show no zero 'bedrooms' or 'floors', but
# nothing in the notebook guarantees that for the whole file.
df['BathByBed'] = (df['bathrooms'] / df['bedrooms'].replace(0, np.nan)).fillna(0)
# Despite its name, this column holds living area per floor (sqft_living / floors);
# the name is kept because later cells refer to it.
df['FloorPerSqft'] = (df['sqft_living'] / df['floors'].replace(0, np.nan)).fillna(0)


In [11]:
# Register the engineered columns with the feature lists.
# Membership is checked before appending so that re-running this cell does not
# list the same column twice (a plain `+=` grows the lists on every execution).
# The lists are still mutated in place, exactly as `+=` did.
for _new_col in ['Houseage', 'TotalSqft', 'BathByBed', 'FloorPerSqft']:
    if _new_col not in numeric_features:
        numeric_features.append(_new_col)

if 'Renovated' not in categ_features:
    categ_features.append('Renovated')

In [12]:
# Separate the target ('price') from the predictor columns.
y = df['price']
x = df.drop(columns='price')

In [13]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Display the (rows, columns) shape of the train and test predictor frames.
tuple(part.shape for part in (x_train, x_test))

((17290, 23), (4323, 23))

In [15]:
# No imputation step is included: the null check on the raw frame reported zero
# missing values in every column.
scaling_branch = Pipeline(steps=[('scaler', StandardScaler())])

# NOTE(review): with drop='first', a category that was not seen during fit is
# encoded as all zeros, i.e. identically to the dropped first category -
# confirm this is acceptable for unseen zipcodes.
encoding_branch = Pipeline(
    steps=[('ohe', OneHotEncoder(drop='first', handle_unknown='ignore'))]
)

preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', scaling_branch, numeric_features),
        ('categor', encoding_branch, categ_features),
    ],
    # Columns named in neither list are forwarded to the model unchanged.
    remainder='passthrough',
)

In [16]:
# Chain the preprocessing and the regressor into a single estimator so both are
# fitted and applied together.
model_steps = [
    ('preprocessor', preprocessor),
    ('model', LinearRegression()),
]
pipeline = Pipeline(steps=model_steps)
print(pipeline.steps)

[('preprocessor', ColumnTransformer(remainder='passthrough',
                  transformers=[('numeric',
                                 Pipeline(steps=[('scaler', StandardScaler())]),
                                 ['bedrooms', 'bathrooms', 'sqft_living',
                                  'sqft_lot', 'floors', 'view', 'grade',
                                  'sqft_above', 'sqft_basement', 'yr_built',
                                  'yr_renovated', 'lat', 'long',
                                  'sqft_living15', 'sqft_lot15', 'Houseage',
                                  'TotalSqft', 'BathByBed', 'FloorPerSqft']),
                                ('categor',
                                 Pipeline(steps=[('ohe',
                                                  OneHotEncoder(drop='first',
                                                                handle_unknown='ignore'))]),
                                 ['zipcode', 'condition', 'waterfront',
                          