In [1]:
# prompt: import libraries for model bilding

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# prompt: code to get the data

# Assuming your data is in a CSV file named 'your_data.csv'
data = pd.read_csv('/content/Housing_data.csv')

# Display the first few rows of the data to verify it's loaded correctly
print(data.head())

      price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2        3      yes        no       no   
1  12250000  8960         4          4        4      yes        no       no   
2  12250000  9960         3          2        2      yes        no      yes   
3  12215000  7500         4          2        2      yes        no      yes   
4  11410000  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes        2      yes        furnished  
1              no             yes        3       no        furnished  
2              no              no        2      yes   semi-furnished  
3              no             yes        3      yes        furnished  
4              no             yes        2       no        furnished  


In [3]:
data.describe()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking
count,545.0,545.0,545.0,545.0,545.0,545.0
mean,4766729.0,5150.541284,2.965138,1.286239,1.805505,0.693578
std,1870440.0,2170.141023,0.738064,0.50247,0.867492,0.861586
min,1750000.0,1650.0,1.0,1.0,1.0,0.0
25%,3430000.0,3600.0,2.0,1.0,1.0,0.0
50%,4340000.0,4600.0,3.0,1.0,2.0,0.0
75%,5740000.0,6360.0,3.0,2.0,2.0,1.0
max,13300000.0,16200.0,6.0,4.0,4.0,3.0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [5]:
data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [6]:
data.isnull().sum()

Unnamed: 0,0
price,0
area,0
bedrooms,0
bathrooms,0
stories,0
mainroad,0
guestroom,0
basement,0
hotwaterheating,0
airconditioning,0


In [7]:
# prompt: give the code for outlier treatment

# Identify numerical features for outlier treatment
numerical_features = data.select_dtypes(include=np.number).columns

# Loop through numerical features and treat outliers using IQR method
for feature in numerical_features:
  Q1 = data[feature].quantile(0.25)
  Q3 = data[feature].quantile(0.75)
  IQR = Q3 - Q1
  lower_bound = Q1 - 1.5 * IQR
  upper_bound = Q3 + 1.5 * IQR

  # Replace outliers with the nearest bound
  data[feature] = np.where(data[feature] < lower_bound, lower_bound, data[feature])
  data[feature] = np.where(data[feature] > upper_bound, upper_bound, data[feature])

# Alternatively, you can remove outliers completely
# for feature in numerical_features:
#   Q1 = data[feature].quantile(0.25)
#   Q3 = data[feature].quantile(0.75)
#   IQR = Q3 - Q1
#   lower_bound = Q1 - 1.5 * IQR
#   upper_bound = Q3 + 1.5 * IQR
#   data = data[(data[feature] >= lower_bound) & (data[feature] <= upper_bound)]


# Now, data contains the treated dataset without outliers
print(data.head())

       price    area  bedrooms  bathrooms  stories mainroad guestroom  \
0  9205000.0  7420.0       4.0        2.0      3.0      yes        no   
1  9205000.0  8960.0       4.0        3.5      3.5      yes        no   
2  9205000.0  9960.0       3.0        2.0      2.0      yes        no   
3  9205000.0  7500.0       4.0        2.0      2.0      yes        no   
4  9205000.0  7420.0       4.0        1.0      2.0      yes       yes   

  basement hotwaterheating airconditioning  parking prefarea furnishingstatus  
0       no              no             yes      2.0      yes        furnished  
1       no              no             yes      2.5       no        furnished  
2      yes              no              no      2.0      yes   semi-furnished  
3      yes              no             yes      2.5      yes        furnished  
4      yes              no             yes      2.0       no        furnished  


In [8]:
from sklearn.preprocessing import PolynomialFeatures

# Total rooms (bedrooms + bathrooms)
data['total_rooms'] = data['bedrooms'] + data['bathrooms']
data['price_perarea'] = data['price']/data['area']
# Area per room (bedrooms + bathrooms)
data['area_per_room'] = data['area'] / (data['bedrooms'] + data['bathrooms'])


In [9]:
data.head().T

Unnamed: 0,0,1,2,3,4
price,9205000.0,9205000.0,9205000.0,9205000.0,9205000.0
area,7420.0,8960.0,9960.0,7500.0,7420.0
bedrooms,4.0,4.0,3.0,4.0,4.0
bathrooms,2.0,3.5,2.0,2.0,1.0
stories,3.0,3.5,2.0,2.0,2.0
mainroad,yes,yes,yes,yes,yes
guestroom,no,no,no,no,yes
basement,no,no,yes,yes,yes
hotwaterheating,no,no,no,no,no
airconditioning,yes,yes,no,yes,yes


In [10]:
# prompt: give the code for binary encoding for categorical variablesand one hot lable encoding for furnishing status

# Binary Encoding for Categorical Variables
# Assuming 'mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea' are your categorical variables
for feature in ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']:
    data[feature] = data[feature].map({'yes': 1, 'no': 0})

# One-Hot Encoding for 'furnishingstatus'
data = pd.get_dummies(data, columns=['furnishingstatus'], prefix=['furnishing'])

In [11]:
data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,total_rooms,price_perarea,area_per_room,furnishing_furnished,furnishing_semi-furnished,furnishing_unfurnished
0,9205000.0,7420.0,4.0,2.0,3.0,1,0,0,0,1,2.0,1,6.0,1240.566038,1236.666667,True,False,False
1,9205000.0,8960.0,4.0,3.5,3.5,1,0,0,0,1,2.5,0,7.5,1027.34375,1194.666667,True,False,False
2,9205000.0,9960.0,3.0,2.0,2.0,1,0,1,0,0,2.0,1,5.0,924.196787,1992.0,False,True,False
3,9205000.0,7500.0,4.0,2.0,2.0,1,0,1,0,1,2.5,1,6.0,1227.333333,1250.0,True,False,False
4,9205000.0,7420.0,4.0,1.0,2.0,1,1,1,0,1,2.0,0,5.0,1240.566038,1484.0,True,False,False


In [12]:
# prompt: give the code for normalization

# Normalize numerical features using Min-Max scaling
from sklearn.preprocessing import MinMaxScaler

numerical_features = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking', 'total_rooms', 'price_perarea', 'area_per_room']
scaler = MinMaxScaler()
data[numerical_features] = scaler.fit_transform(data[numerical_features])



In [13]:
data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,total_rooms,price_perarea,area_per_room,furnishing_furnished,furnishing_semi-furnished,furnishing_unfurnished
0,9205000.0,0.651977,0.857143,0.4,0.8,1,0,0,0,1,0.8,1,0.727273,0.398986,0.28231,True,False,False
1,9205000.0,0.825989,0.857143,1.0,1.0,1,0,0,0,1,1.0,0,1.0,0.306755,0.268992,True,False,False
2,9205000.0,0.938983,0.571429,0.4,0.4,1,0,1,0,0,0.8,1,0.545455,0.262138,0.521822,False,True,False
3,9205000.0,0.661017,0.857143,0.4,0.4,1,0,1,0,1,1.0,1,0.727273,0.393263,0.286538,True,False,False
4,9205000.0,0.651977,0.857143,0.0,0.4,1,1,1,0,1,0.8,0,0.545455,0.398986,0.360738,True,False,False


In [14]:
X = data.drop('price', axis=1)  # Features (all columns except 'price')
y = data['price']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
X_train

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,total_rooms,price_perarea,area_per_room,furnishing_furnished,furnishing_semi-furnished,furnishing_unfurnished
46,0.491525,0.571429,0.4,1.0,1,0,0,0,1,0.4,0,0.545455,0.404870,0.270683,True,False,False
93,0.627119,0.571429,0.4,0.0,1,0,1,0,1,1.0,0,0.545455,0.240857,0.346786,False,True,False
335,0.244746,0.285714,0.0,0.0,1,0,1,0,1,0.8,0,0.181818,0.306716,0.293514,True,False,False
412,0.108475,0.571429,0.0,0.4,1,0,1,0,0,0.0,1,0.363636,0.430827,0.097074,False,False,True
471,0.237288,0.571429,0.0,0.4,1,0,0,0,0,0.0,0,0.363636,0.209569,0.187446,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,0.491525,0.857143,0.4,1.0,1,0,0,0,1,0.0,0,0.727273,0.349358,0.207264,False,False,True
106,0.429379,0.857143,0.4,0.0,1,0,1,0,1,0.0,1,0.727273,0.351279,0.178197,False,True,False
270,0.322034,0.571429,0.4,0.8,1,0,0,1,0,0.4,0,0.545455,0.279547,0.175555,True,False,False
435,0.270056,0.285714,0.0,0.0,1,0,0,0,0,0.0,0,0.181818,0.214625,0.317190,False,False,True


In [16]:
X_test

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,total_rooms,price_perarea,area_per_room,furnishing_furnished,furnishing_semi-furnished,furnishing_unfurnished
316,0.480226,0.857143,0.4,0.4,0,0,1,0,0,0.4,0,0.727273,0.160027,0.201979,False,False,True
77,0.548023,0.571429,0.4,0.8,1,0,0,0,1,0.0,1,0.545455,0.304909,0.302393,True,False,False
360,0.270056,0.285714,0.0,0.0,1,0,0,0,0,0.0,0,0.181818,0.259594,0.317190,False,True,False
90,0.378531,0.571429,0.0,0.4,1,0,0,0,1,0.0,0,0.363636,0.419505,0.286538,False,True,False
493,0.261017,0.571429,0.0,0.0,1,0,0,0,0,0.0,0,0.363636,0.168218,0.204093,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15,0.491525,0.857143,0.0,0.4,1,0,1,0,0,0.8,0,0.545455,0.518417,0.270683,False,True,False
357,0.596610,0.857143,0.0,0.4,0,0,0,0,0,0.4,0,0.545455,0.097872,0.329663,True,False,False
39,0.491525,0.857143,0.4,1.0,1,0,0,0,1,0.4,0,0.727273,0.432626,0.207264,False,True,False
54,0.491525,0.571429,0.4,0.4,1,1,0,0,1,0.4,0,0.545455,0.392253,0.270683,False,True,False


In [17]:
y_train

Unnamed: 0,price
46,7525000.0
93,6300000.0
335,3920000.0
412,3430000.0
471,3010000.0
...,...
71,6755000.0
106,6160000.0
270,4340000.0
435,3290000.0


In [18]:
# prompt: give the code for the linear regression model

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


# Create a Linear Regression model
model = LinearRegression()

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

# You can also print the coefficients and intercept of the model
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)

Mean Squared Error: 344211868706.789
R-squared: 0.9147967620937997
Coefficients: [ 8.24177988e+06  1.50121635e+18  1.07229739e+18  2.03714847e+05
  2.28584671e+05  2.01958774e+05 -2.98693166e+04  2.00388500e+05
  4.69949239e+05  1.59229426e+05  4.06088043e+04 -2.35905426e+18
  7.83765321e+06 -3.47943181e+06  1.38155388e+20  1.38155388e+20
  1.38155388e+20]
Intercept: -1.3815538764119543e+20


In [19]:
# prompt: give the rmse value

rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse}")

Root Mean Squared Error (RMSE): 586695.7207162747


In [20]:

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score


# Create a Decision Tree Regression model
model = DecisionTreeRegressor(random_state=42)

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
print(f"Root Mean Squared Error (RMSE): {rmse}")

Mean Squared Error: 212888158565.1376
R-squared: 0.947303500922888
Root Mean Squared Error (RMSE): 461398.047855794


In [21]:
# prompt: give the code for the Knn model

from sklearn.neighbors import KNeighborsRegressor

# Create a KNN Regression model
knn_model = KNeighborsRegressor(n_neighbors=5)  # You can adjust the number of neighbors

# Train the model on the training data
knn_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred_knn = knn_model.predict(X_test)

# Evaluate the model
mse_knn = mean_squared_error(y_test, y_pred_knn)
r2_knn = r2_score(y_test, y_pred_knn)
rmse_knn = np.sqrt(mse_knn)

print(f"KNN - Mean Squared Error: {mse_knn}")
print(f"KNN - R-squared: {r2_knn}")
print(f"KNN - Root Mean Squared Error (RMSE): {rmse_knn}")

KNN - Mean Squared Error: 1585713459541.2844
KNN - R-squared: 0.6074861635307254
KNN - Root Mean Squared Error (RMSE): 1259251.1503037368


In [22]:
# prompt: give the code for xg boost

!pip install xgboost

import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score


# Create an XGBoost regressor
xgb_model = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

# Train the model on the training data
xgb_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate the model
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)

print(f"XGBoost - Mean Squared Error: {mse_xgb}")
print(f"XGBoost - R-squared: {r2_xgb}")
print(f"XGBoost - Root Mean Squared Error (RMSE): {rmse_xgb}")

XGBoost - Mean Squared Error: 1839768821705.8596
XGBoost - R-squared: 0.5445994898515756
XGBoost - Root Mean Squared Error (RMSE): 1356380.7804985514


In [23]:
# prompt: give the code for cat boost

!pip install catboost

from catboost import CatBoostRegressor

# Create a CatBoost regressor
catboost_model = CatBoostRegressor(iterations=100, learning_rate=0.1, depth=6, loss_function='RMSE')

# Train the model on the training data
catboost_model.fit(X_train, y_train, verbose=False)

# Make predictions on the test data
y_pred_catboost = catboost_model.predict(X_test)

# Evaluate the model
mse_catboost = mean_squared_error(y_test, y_pred_catboost)
r2_catboost = r2_score(y_test, y_pred_catboost)
rmse_catboost = np.sqrt(mse_catboost)

print(f"CatBoost - Mean Squared Error: {mse_catboost}")
print(f"CatBoost - R-squared: {r2_catboost}")
print(f"CatBoost - Root Mean Squared Error (RMSE): {rmse_catboost}")

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7
CatBoost - Mean Squared Error: 172256694582.45206
CatBoost - R-squared: 0.9573610631597758
CatBoost - Root Mean Squared Error (RMSE): 415038.1844872253
