In [108]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [109]:
# Load the housing dataset from a CSV file resolved relative to the working
# directory, then preview the first rows to sanity-check the parsed columns.
data = pd.read_csv("Housing.csv")
data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [110]:
# Print per-column dtypes and non-null counts: shows whether anything is
# missing and which columns are still text and need encoding.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [111]:
# One-hot encode the multi-level `furnishingstatus` column into integer
# indicator columns named `furnishingstatus_<level>`.
# NOTE(review): every level keeps its own indicator, so the indicators sum to 1
# on each row and are collinear with a regression intercept. Predictions are
# unaffected, but consider `drop='first'` if the coefficients will be interpreted.
encoder = OneHotEncoder(sparse_output=False, dtype=int)
encoded = encoder.fit_transform(data[['furnishingstatus']])
encoded_df = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(['furnishingstatus']),
    index=data.index,  # share the frame's index so concat cannot misalign rows
)

# Remove indicator columns left behind by an earlier run of this cell before
# appending, so re-running the cell does not create duplicate columns.
data = pd.concat(
    [data.drop(columns=encoded_df.columns, errors='ignore'), encoded_df],
    axis=1,
)

data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished,1,0,0
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished,1,0,0
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished,0,1,0
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished,1,0,0
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished,1,0,0


In [112]:
# Convert each yes/no text column to integer codes. A fresh LabelEncoder is
# fitted per column, so codes follow the sorted order of that column's labels.
binary_columns = (
    'mainroad',
    'guestroom',
    'basement',
    'hotwaterheating',
    'airconditioning',
    'prefarea',
)
for column_name in binary_columns:
    column_encoder = LabelEncoder()
    data[column_name] = column_encoder.fit_transform(data[column_name])


In [113]:
# Preview the frame again to confirm the yes/no columns now hold integer codes.
data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,furnished,1,0,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,furnished,1,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,semi-furnished,0,1,0
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,furnished,1,0,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,furnished,1,0,0


In [114]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; useful for spotting scale differences and extreme values.
data.describe()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
count,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0
mean,4766729.0,5150.541284,2.965138,1.286239,1.805505,0.858716,0.177982,0.350459,0.045872,0.315596,0.693578,0.234862,0.256881,0.416514,0.326606
std,1870440.0,2170.141023,0.738064,0.50247,0.867492,0.348635,0.382849,0.477552,0.209399,0.46518,0.861586,0.424302,0.437314,0.493434,0.469402
min,1750000.0,1650.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3430000.0,3600.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4340000.0,4600.0,3.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,5740000.0,6360.0,3.0,2.0,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
max,13300000.0,16200.0,6.0,4.0,4.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0


In [115]:
# Side-by-side box plots of price and area to eyeball outliers.
fig_box = make_subplots(rows=1, cols=2)

box_panels = [('price', 'Price'), ('area', 'Area')]
for panel_col, (column, label) in enumerate(box_panels, start=1):
    fig_box.add_trace(go.Box(y=data[column], name=label), row=1, col=panel_col)

fig_box.show()

In [116]:
# Side-by-side histograms of price and area before any capping or scaling.
fig_his = make_subplots(rows=1, cols=2)

hist_panels = [('price', 'Price'), ('area', 'Area')]
for panel_col, (column, label) in enumerate(hist_panels, start=1):
    fig_his.add_trace(go.Histogram(x=data[column], name=label), row=1, col=panel_col)

fig_his.show()


In [117]:
def cap_outliers_iqr(values, whisker=1.5):
    """Clamp a numeric Series to its IQR fences.

    Values below ``Q1 - whisker * IQR`` are raised to that lower fence and
    values above ``Q3 + whisker * IQR`` are lowered to that upper fence;
    everything in between is returned unchanged.

    Parameters
    ----------
    values : pandas.Series
        Numeric column to cap.
    whisker : float, default 1.5
        Multiple of the inter-quartile range used to place the fences.

    Returns
    -------
    numpy.ndarray
        Float array of the capped values, in the same order as ``values``.
    """
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    return np.clip(values.to_numpy(dtype=float), q1 - whisker * iqr, q3 + whisker * iqr)


# Cap extreme prices and areas with the same rule instead of duplicating the
# quantile arithmetic once per column.
# NOTE(review): the fences are computed on the full dataset (train and test rows
# together) and the target `price` itself is capped — confirm this is intended.
for column in ('price', 'area'):
    data[column] = cap_outliers_iqr(data[column])

# The text column has already been expanded into indicator columns; drop it.
# `errors='ignore'` keeps the cell re-runnable after the column is gone.
data = data.drop(columns='furnishingstatus', errors='ignore')


In [118]:
# Standardise price and area (zero mean, unit variance) in place.
# NOTE(review): the scaler is fitted on every row before the train/test split,
# so held-out rows contribute to the mean/std. The target `price` is also put
# in standardised units, so any error metric computed on it is in those units.
columns_to_scale = ['price', 'area']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data[columns_to_scale])
data[columns_to_scale] = scaled_values


In [119]:
# Side-by-side histograms of price and area as they currently stand in `data`,
# for comparison with the earlier pair of histograms.
figs_his = make_subplots(rows=1, cols=2)

scaled_panels = [('price', 'Price'), ('area', 'Area')]
for panel_col, (column, label) in enumerate(scaled_panels, start=1):
    figs_his.add_trace(go.Histogram(x=data[column], name=label), row=1, col=panel_col)

figs_his.show()

In [120]:
# Target is the price column; the feature matrix is every remaining column.
Y = data['price']
X = data.drop(columns=['price'])

In [121]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

# Standardise the features with a dedicated scaler fitted on the training rows
# only, then apply the same transform to the test rows. A separate object is
# used so the earlier `scaler` (fitted on price/area) is not overwritten and its
# parameters remain available for mapping standardised values back.
feature_scaler = StandardScaler()
x_train_scaled = feature_scaler.fit_transform(x_train)
x_test_scaled = feature_scaler.transform(x_test)

# Ordinary least-squares regression on the standardised features.
model = LinearRegression()
model.fit(x_train_scaled, y_train)

In [122]:

# Score the fitted model on the held-out rows. Both metrics are computed on
# `y_test` exactly as stored, i.e. in whatever units the target column holds.
y_pred = model.predict(x_test_scaled)
r2, mse = r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred)

print(f"R² score: {r2:.4f}", f"Mean Squared Error: {mse:.4f}", sep="\n")


R² score: 0.6902
Mean Squared Error: 0.3846
