## Importing dependencies and dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the housing dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('Housing.csv')

In [3]:
# Preview the first rows to check the columns and value types that were parsed.
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [4]:
# Separate the regression target (price) from the predictor columns.
y = df['price']
X = df.drop(columns=['price'])

In [5]:
# Confirm that 'price' is no longer among the predictor columns.
X.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [6]:
# Preview the target values that the model will be trained to predict.
y.head()

0    13300000
1    12250000
2    12250000
3    12215000
4    11410000
Name: price, dtype: int64

## Data preprocessing

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [9]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [10]:
# NOTE: this cell replaces X_train / X_test with their transformed versions, so it
# is not idempotent. Re-run the train/test split cell before re-running this one.

categorical_cols = ['mainroad', 'guestroom','basement','hotwaterheating','airconditioning','prefarea','furnishingstatus']
numeric_cols = ['area','bedrooms','bathrooms','stories','parking']

# Numeric features: expand to degree-2 polynomial/interaction terms, then standardise.
numeric_pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler()),
])

preprocessor = ColumnTransformer(
    [
        ('num', numeric_pipeline, numeric_cols),
        ('encoder', OneHotEncoder(sparse_output=False), categorical_cols),
    ],
    remainder='passthrough',
    # Keep plain output names ('area^2', 'mainroad_yes') instead of
    # 'num__area^2' / 'encoder__mainroad_yes'.
    verbose_feature_names_out=False,
)

# Fit on the training split only; the test split reuses the fitted statistics
# and categories so no information leaks from test to train.
transformed_train_data = preprocessor.fit_transform(X_train)
transformed_test_data = preprocessor.transform(X_test)

# Ask the fitted ColumnTransformer for its output names rather than concatenating
# the sub-transformers' names by hand: this always matches the real column order
# and also covers any column that reaches the output through remainder='passthrough'.
all_cols = list(preprocessor.get_feature_names_out())

# Keep the original row labels so the feature frames stay index-aligned with
# y_train / y_test, which still carry the shuffled index from the split.
X_train = pd.DataFrame(transformed_train_data, columns=all_cols, index=X_train.index)
X_test = pd.DataFrame(transformed_test_data, columns=all_cols, index=X_test.index)


In [11]:
# Inspect the transformed test features: scaled polynomial terms followed by one-hot columns.
X_test.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,parking,area^2,area bedrooms,area bathrooms,area stories,area parking,...,basement_yes,hotwaterheating_no,hotwaterheating_yes,airconditioning_no,airconditioning_yes,prefarea_no,prefarea_yes,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,0.354846,1.386046,1.554599,0.252805,0.395546,0.123538,0.980086,1.110117,0.38523,0.323951,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.630777,0.055861,1.554599,1.413269,-0.800511,0.366613,0.489051,1.369872,1.557253,-0.691966,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
2,-0.500539,-1.274325,-0.553238,-0.90766,-0.800511,-0.480503,-0.878662,-0.569631,-0.795925,-0.691966,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
3,-0.05905,0.055861,-0.553238,0.252805,-0.800511,-0.196968,-0.04989,-0.361827,0.111251,-0.691966,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
4,-0.53733,0.055861,-0.553238,-0.90766,-0.800511,-0.501413,-0.423556,-0.586948,-0.808102,-0.691966,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


In [12]:
# Inspect the transformed training features; the columns should match X_test exactly.
X_train.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,parking,area^2,area bedrooms,area bathrooms,area stories,area parking,...,basement_yes,hotwaterheating_no,hotwaterheating_yes,airconditioning_no,airconditioning_yes,prefarea_no,prefarea_yes,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,0.934301,0.055861,-0.553238,-0.90766,1.591603,0.661164,0.726185,0.105732,-0.321028,1.773785,...,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,-0.710246,-1.274325,-0.553238,-0.90766,-0.800511,-0.594086,-0.987887,-0.668337,-0.865333,-0.691966,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,-0.390167,-1.274325,-0.553238,-0.90766,1.591603,-0.415265,-0.821175,-0.51768,-0.759395,0.781975,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
3,0.860719,0.055861,-0.553238,2.573733,1.591603,0.587144,0.668698,0.071098,2.851044,1.718685,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
4,2.065617,0.055861,-0.553238,-0.90766,1.591603,2.009798,1.610048,0.638229,0.053411,2.620957,...,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0


## Train the model

In [13]:
from sklearn.linear_model import LinearRegression

In [14]:
# Ordinary least squares on the transformed training features; fit() returns the
# estimator itself, so construction and fitting can be chained.
lin_reg = LinearRegression().fit(X_train, y_train)
lin_reg

In [15]:
# Predict on both splits so training fit and held-out fit can be compared.
y_train_pred, y_test_pred = (
    lin_reg.predict(features) for features in (X_train, X_test)
)

## Evaluate the model

In [16]:
from sklearn.metrics import r2_score

In [17]:
# R^2 on the held-out split. Keywords make the argument order explicit,
# since r2_score is not symmetric in its two inputs.
r2_score(y_true=y_test, y_pred=y_test_pred)

0.6285803438020046

In [18]:
# R^2 on the training split, for comparison with the held-out score.
r2_score(y_true=y_train, y_pred=y_train_pred)

0.7294796729019217