In [160]:
#importing all the required libraries
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score 

In [161]:
# Loading the dataset
data = pd.read_csv('Delhi_house_data.csv')

In [162]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Area         1259 non-null   float64
 1   BHK          1259 non-null   int64  
 2   Bathroom     1257 non-null   float64
 3   Furnishing   1254 non-null   object 
 4   Locality     1259 non-null   object 
 5   Parking      1226 non-null   float64
 6   Price        1259 non-null   int64  
 7   Status       1259 non-null   object 
 8   Transaction  1259 non-null   object 
 9   Type         1254 non-null   object 
 10  Per_Sqft     1018 non-null   float64
dtypes: float64(4), int64(2), object(5)
memory usage: 108.3+ KB


In [163]:
data.head()

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Parking,Price,Status,Transaction,Type,Per_Sqft
0,800.0,3,2.0,Semi-Furnished,Rohini Sector 25,1.0,6500000,Ready_to_move,New_Property,Builder_Floor,
1,750.0,2,2.0,Semi-Furnished,"J R Designers Floors, Rohini Sector 24",1.0,5000000,Ready_to_move,New_Property,Apartment,6667.0
2,950.0,2,2.0,Furnished,"Citizen Apartment, Rohini Sector 13",1.0,15500000,Ready_to_move,Resale,Apartment,6667.0
3,600.0,2,2.0,Semi-Furnished,Rohini Sector 24,1.0,4200000,Ready_to_move,Resale,Builder_Floor,6667.0
4,650.0,2,2.0,Semi-Furnished,Rohini Sector 24 carpet area 650 sqft status R...,1.0,6200000,Ready_to_move,New_Property,Builder_Floor,6667.0


In [164]:
data.corr()

Unnamed: 0,Area,BHK,Bathroom,Parking,Price,Per_Sqft
Area,1.0,0.449438,0.535104,-0.009297,0.580836,0.162832
BHK,0.449438,1.0,0.773267,-0.070707,0.571523,0.18154
Bathroom,0.535104,0.773267,1.0,-0.032796,0.728108,0.219169
Parking,-0.009297,-0.070707,-0.032796,1.0,-0.000448,0.001607
Price,0.580836,0.571523,0.728108,-0.000448,1.0,0.322859
Per_Sqft,0.162832,0.18154,0.219169,0.001607,0.322859,1.0


In [165]:
data.columns

Index(['Area', 'BHK', 'Bathroom', 'Furnishing', 'Locality', 'Parking', 'Price',
       'Status', 'Transaction', 'Type', 'Per_Sqft'],
      dtype='object')

In [166]:
# Droping rows with missing values
data.dropna(axis=0, inplace=True)  

# Clip outliers
for col in ['Area', 'Bathroom', 'Parking', 'Per_Sqft']:
    data[col] = data[col].clip(lower=0, upper=data[col].quantile(0.95)) 


In [167]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1005 entries, 1 to 1258
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Area         1005 non-null   float64
 1   BHK          1005 non-null   int64  
 2   Bathroom     1005 non-null   float64
 3   Furnishing   1005 non-null   object 
 4   Locality     1005 non-null   object 
 5   Parking      1005 non-null   float64
 6   Price        1005 non-null   int64  
 7   Status       1005 non-null   object 
 8   Transaction  1005 non-null   object 
 9   Type         1005 non-null   object 
 10  Per_Sqft     1005 non-null   float64
dtypes: float64(4), int64(2), object(5)
memory usage: 94.2+ KB


In [168]:
# Selecting relevant features and target variable
features = ['Area', 'BHK','Bathroom', 'Furnishing', 'Locality', 'Parking', 'Type', 'Per_Sqft']
target = 'Price'

In [169]:
# One-hot encode categorical features
categorical_features = ['Furnishing', 'Locality']
numerical_features = ['Area', 'BHK', 'Bathroom']

preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), categorical_features),
        ('passthrough', 'passthrough', numerical_features)
    ]
)

data_encoded = preprocessor.fit_transform(data[features])

In [170]:
# Spliting data into training and testing sets into ratio of 80:20
X_train, X_test, y_train, y_test = train_test_split(data_encoded, data[target], test_size=0.2)

In [171]:
# the LinearRegression model
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [172]:
# Evaluate model's performance
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
print("MSE:", mse)
print("R-squared:", r2)

MSE: 128052079116430.25
R-squared: 0.8215807560983535
