In [1]:
## Import necessary libraries
import  pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('Housing_data.csv') # Load dataset

In [3]:
df # Display dataset

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,yes,no,yes,no,no,2,no,unfurnished
541,1767150,2400,3,1,1,no,no,no,no,no,0,no,semi-furnished
542,1750000,3620,2,1,1,yes,no,no,no,no,0,no,unfurnished
543,1750000,2910,3,1,1,no,no,no,no,no,0,no,furnished


In [4]:
df.describe() # Get summary statistics of numerical features

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking
count,545.0,545.0,545.0,545.0,545.0,545.0
mean,4766729.0,5150.541284,2.965138,1.286239,1.805505,0.693578
std,1870440.0,2170.141023,0.738064,0.50247,0.867492,0.861586
min,1750000.0,1650.0,1.0,1.0,1.0,0.0
25%,3430000.0,3600.0,2.0,1.0,1.0,0.0
50%,4340000.0,4600.0,3.0,1.0,2.0,0.0
75%,5740000.0,6360.0,3.0,2.0,2.0,1.0
max,13300000.0,16200.0,6.0,4.0,4.0,3.0


In [5]:
df.isnull().sum()  # Check for missing values in the dataset

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [6]:
##df[""].corr() # Missing column name - needs to be specified

In [7]:
df["mainroad"].unique()  # Get unique values in 'mainroad' column

array(['yes', 'no'], dtype=object)

In [8]:
## Hardcoded encoding of categorical variable
# Convert 'mainroad' feature from categorical to numerical (1 for 'yes', 0 for 'no')
df["mainroad"] = df["mainroad"].apply(lambda x: 1 if x == "yes" else 0) 

In [9]:
df["mainroad"].unique()  # Verify encoding

array([1, 0], dtype=int64)

In [10]:
# Print unique values for categorical columns
print(df["guestroom"].unique())
print(df["basement"].unique())
print(df["hotwaterheating"].unique())
print(df["airconditioning"].unique())
print(df["prefarea"].unique())

['no' 'yes']
['no' 'yes']
['no' 'yes']
['yes' 'no']
['yes' 'no']


In [11]:
# Function to encode categorical variables

def encoding_cat_variable(col_names):
    for col_name in col_names:
        df[col_name] = df[col_name].apply(lambda x: 1 if x == "yes" else 0)
    return df

In [12]:
# Encode multiple categorical variables

col_name = ['guestroom', 'basement','hotwaterheating','airconditioning','prefarea']
encoding_cat_variable(col_name)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,furnished
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,furnished
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,semi-furnished
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,furnished
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,furnished
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,1,0,1,0,0,2,0,unfurnished
541,1767150,2400,3,1,1,0,0,0,0,0,0,0,semi-furnished
542,1750000,3620,2,1,1,1,0,0,0,0,0,0,unfurnished
543,1750000,2910,3,1,1,0,0,0,0,0,0,0,furnished


In [13]:
df["furnishingstatus"].unique() # Check unique values in 'furnishingstatus'

array(['furnished', 'semi-furnished', 'unfurnished'], dtype=object)

In [14]:
# One-hot encoding for 'furnishingstatus' column

df = pd.get_dummies(data = df, columns = ["furnishingstatus"],dtype=int)  ### []

In [15]:
df.head(1) # Display first row of updated dataset

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,1,0,0


In [16]:
df.corr()  # Compute correlation matrix

# price and Area corr value is : -0.9 -- Price and Area Are opposite correlated 
# price and stories corr value is : 0.9 


# area and stories are highly correlated : 0.8 ## we need to drop one of these features to remove correlated features
# area is having : 0.9 corr with price
# stories is having : 0.3 with Price : So in this case we will drop stories as its having very less corelated with Target Variable(Price)


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
price,1.0,0.535997,0.366494,0.517545,0.420712,0.296898,0.255517,0.187057,0.093073,0.452954,0.384394,0.329777,0.22935,0.063656,-0.280587
area,0.535997,1.0,0.151858,0.19382,0.083996,0.288874,0.140297,0.047417,-0.009229,0.222393,0.35298,0.234779,0.145772,0.006156,-0.142278
bedrooms,0.366494,0.151858,1.0,0.37393,0.408564,-0.012033,0.080549,0.097312,0.046049,0.160603,0.13927,0.079023,0.079054,0.05004,-0.126252
bathrooms,0.517545,0.19382,0.37393,1.0,0.326165,0.042398,0.126469,0.102106,0.067159,0.186915,0.177496,0.063472,0.108139,0.029834,-0.132107
stories,0.420712,0.083996,0.408564,0.326165,1.0,0.121706,0.043538,-0.172394,0.018847,0.293602,0.045547,0.044425,0.093176,-0.003648,-0.082972
mainroad,0.296898,0.288874,-0.012033,0.042398,0.121706,1.0,0.092337,0.044002,-0.011781,0.105423,0.204433,0.199876,0.129971,0.01145,-0.133123
guestroom,0.255517,0.140297,0.080549,0.126469,0.043538,0.092337,1.0,0.372066,-0.010308,0.138179,0.037466,0.160897,0.099721,0.005821,-0.099023
basement,0.187057,0.047417,0.097312,0.102106,-0.172394,0.044002,0.372066,1.0,0.004385,0.047341,0.051497,0.228083,0.069852,0.050284,-0.117935
hotwaterheating,0.093073,-0.009229,0.046049,0.067159,0.018847,-0.011781,-0.010308,0.004385,1.0,-0.130023,0.067864,-0.059411,-0.008472,0.063819,-0.059194
airconditioning,0.452954,0.222393,0.160603,0.186915,0.293602,0.105423,0.138179,0.047341,-0.130023,1.0,0.159173,0.117382,0.160994,-0.053179,-0.094086


In [17]:
# Drop highly correlated and redundant features

df.drop(columns=['basement','hotwaterheating', 'furnishingstatus_semi-furnished'], inplace = True)

In [18]:
# List remaining columns

df.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'airconditioning', 'parking', 'prefarea',
       'furnishingstatus_furnished', 'furnishingstatus_unfurnished'],
      dtype='object')

In [19]:
# Split dataset into features (X) and target variable (Y)

X = df.drop(columns = ["price"])
Y = df["price"]

In [20]:
# Standardize features

sc = StandardScaler()
# df_scaled = sc.fir_transform(df)  ## Wrong as we never do scaling on target Varible


In [21]:
from sklearn.model_selection import train_test_split

In [22]:
# Split data into training and testing sets

X_train, x_test, Y_train, y_test = train_test_split(X,Y, test_size=0.3, random_state=10)

In [23]:
X_train.head(1)  # Display first row of training data

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,airconditioning,parking,prefarea,furnishingstatus_furnished,furnishingstatus_unfurnished
175,9800,4,2,2,1,1,0,2,0,0,0


In [24]:
Y_train.head(1)  # Display first row of training target

175    5250000
Name: price, dtype: int64

In [25]:
# Apply standard scaling to training and test data

X_train_scaled = sc.fit_transform(X_train)  

In [26]:
type(X_train_scaled)  # Verify type of transformed data

numpy.ndarray

In [27]:
# Scale the test data using the same scaler (no fit, just transform)

X_test_scaled = sc.transform(x_test)  

In [28]:
X_train_scaled  # Check scaled training data

array([[ 2.31910097,  1.35561851,  1.38877361, ..., -0.52775325,
        -0.5925064 , -0.70293502],
       [ 0.72263472,  1.35561851,  1.38877361, ...,  1.89482491,
         1.68774549, -0.70293502],
       [-0.75630028, -1.31358383, -0.60040964, ...,  1.89482491,
         1.68774549, -0.70293502],
       ...,
       [-1.58097794, -1.31358383, -0.60040964, ..., -0.52775325,
        -0.5925064 , -0.70293502],
       [ 5.1594397 ,  0.02101734, -0.60040964, ..., -0.52775325,
        -0.5925064 , -0.70293502],
       [-1.06971696,  0.02101734, -0.60040964, ...,  1.89482491,
        -0.5925064 , -0.70293502]])

In [29]:
# Create an instance of Linear Regression model

lm = LinearRegression()

In [30]:
## training using lm for X_TRAIN,Y_TRAIN
# Train the model using training data
lm.fit(X_train_scaled,Y_train)


In [31]:
## Predict data using above trained model
# Predict target values using trained model
y_pred = lm.predict(X_test_scaled)


In [32]:
# Compare predicted vs actual values for a sample

y_pred[1]

4004730.9081541253

In [33]:
y_test[1]

12250000

In [34]:
# Calculate prediction error for sample

y_test[1] - y_pred[1]   # Compute the error for the first prediction

8245269.091845875

In [35]:
# Evaluate model performance using Mean Squared Error
# Import mean squared error metric

from sklearn.metrics import mean_squared_error

In [36]:
# Compute mean squared error between actual and predicted values
mean_squared_error(y_test, y_pred)

769626499981.7697