## Download the Dataset

In [1]:
# Install the Kaggle package
!pip install kaggle

# Download the dataset
!kaggle datasets download -d yasserh/housing-prices-dataset

# Unzip the dataset
!unzip housing-prices-dataset.zip -d housing-prices-dataset

Dataset URL: https://www.kaggle.com/datasets/yasserh/housing-prices-dataset
License(s): CC0-1.0
Downloading housing-prices-dataset.zip to /content
  0% 0.00/4.63k [00:00<?, ?B/s]
100% 4.63k/4.63k [00:00<00:00, 8.80MB/s]
Archive:  housing-prices-dataset.zip
  inflating: housing-prices-dataset/Housing.csv  


## Load and Preprocess the Dataset

### Load the dataset

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

dataset = pd.read_csv('/content/housing-prices-dataset/Housing.csv')
dataset.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [3]:
X = dataset.iloc[:, 1:].values
y = dataset.iloc[:, 0].values

In [4]:
print(X[0])

[7420 4 2 3 'yes' 'no' 'no' 'no' 'yes' 2 'yes' 'furnished']


In [5]:
print(X[1])

[8960 4 4 4 'yes' 'no' 'no' 'no' 'yes' 3 'no' 'furnished']


### Splitting dataset

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [7]:
print(X_train[0])

[3620 2 1 1 'yes' 'no' 'no' 'no' 'no' 0 'no' 'unfurnished']


In [8]:
print(y_train[0])

1750000


### Encoding Categorical variables

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [4,5,6,7,8,10,11])], remainder='passthrough')
X_train = np.array(ct.fit_transform(X_train))

In [10]:
X_test = np.array(ct.transform(X_test))

In [11]:
print(X_train[0])

[0.0 1.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 3620 2 1 1 0]


In [12]:
print(X_test[0])

[0.0 1.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 4000 3 1 2 1]


### Checking for missing data

In [13]:
nan_rows = np.any(np.isnan(X_train.astype(np.float64)), axis=1)  # Convert to numeric type
print(X_train[nan_rows])
# no missing data

[]


## Training Models

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [None]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[ 3950288.62  4585000.  ]
 [ 6173868.82  6083000.  ]
 [ 4483635.99  4007500.  ]
 [ 7258732.75  6930000.  ]
 [ 2836727.58  2940000.  ]
 [ 7032947.1   6195000.  ]
 [ 3203851.47  3535000.  ]
 [ 3270994.01  2940000.  ]
 [ 3472554.04  3500000.  ]
 [ 8289978.33  7980000.  ]
 [ 6605321.63  6755000.  ]
 [ 3723366.24  3990000.  ]
 [ 3812376.96  3150000.  ]
 [ 4548966.85  3290000.  ]
 [ 4020476.35  4130000.  ]
 [ 1969836.22  2660000.  ]
 [ 4057262.98  4410000.  ]
 [ 3704586.87  3710000.  ]
 [ 3282767.93  3360000.  ]
 [ 4609423.65  4270000.  ]
 [ 5968243.74  5005000.  ]
 [ 6363698.62  5383000.  ]
 [ 4751300.32  6440000.  ]
 [ 2659595.28  1890000.  ]
 [ 5305573.25  6125000.  ]
 [ 5680819.59  5460000.  ]
 [ 5404106.9   5803000.  ]
 [ 5543050.52  4620000.  ]
 [ 5768360.48  5530000.  ]
 [ 5801753.71  5950000.  ]
 [ 3389277.96  4305000.  ]
 [ 6399092.03  3640000.  ]
 [ 7081030.31  5250000.  ]
 [ 2913042.4   3325000.  ]
 [ 4498664.01  3703000.  ]
 [ 5210561.68  4753000.  ]
 [ 5013457.84  9100000.  ]
 

In [None]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.6611214250980382

In [None]:
import pickle
filename = 'linear_regression_model.sav'
pickle.dump(regressor, open(filename, 'wb'))

### Support Vector regression

In [None]:
y_train_svr = y_train.reshape(len(y_train),1)
y_test_svr = y_test.reshape(len(y_test),1)

In [None]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X_train_svr = sc_X.fit_transform(X_train)
y_train_svr = sc_y.fit_transform(y_train_svr)

In [None]:
from sklearn.svm import SVR
regressor = SVR(kernel = 'rbf')
regressor.fit(X_train_svr, y_train_svr)

  y = column_or_1d(y, warn=True)


In [None]:
y_pred = sc_y.inverse_transform(regressor.predict(sc_X.transform(X_test)).reshape(-1,1))
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test_svr),1)),1))

[[ 3912335.96  4585000.  ]
 [ 6259503.8   6083000.  ]
 [ 4046872.61  4007500.  ]
 [ 5779091.99  6930000.  ]
 [ 3179226.84  2940000.  ]
 [ 6885488.2   6195000.  ]
 [ 3536058.63  3535000.  ]
 [ 3120252.96  2940000.  ]
 [ 3412351.84  3500000.  ]
 [ 8126863.47  7980000.  ]
 [ 6374431.11  6755000.  ]
 [ 3834613.46  3990000.  ]
 [ 4166073.17  3150000.  ]
 [ 5397971.75  3290000.  ]
 [ 4663754.19  4130000.  ]
 [ 2662073.57  2660000.  ]
 [ 4251577.9   4410000.  ]
 [ 4096776.74  3710000.  ]
 [ 3561189.66  3360000.  ]
 [ 4214584.95  4270000.  ]
 [ 5523307.34  5005000.  ]
 [ 6307066.25  5383000.  ]
 [ 4470800.43  6440000.  ]
 [ 2880915.86  1890000.  ]
 [ 5777239.5   6125000.  ]
 [ 6458146.9   5460000.  ]
 [ 5622159.63  5803000.  ]
 [ 5699356.56  4620000.  ]
 [ 4688037.5   5530000.  ]
 [ 5683527.48  5950000.  ]
 [ 3503191.35  4305000.  ]
 [ 5768364.39  3640000.  ]
 [ 7210092.17  5250000.  ]
 [ 2995316.57  3325000.  ]
 [ 4788075.74  3703000.  ]
 [ 4283457.64  4753000.  ]
 [ 4316344.2   9100000.  ]
 

In [None]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.6045259725327574

### Decision Tree Regression

In [None]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state = 0)
regressor.fit(X_train, y_train)

In [None]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[ 4907000.  4585000.]
 [ 5873000.  6083000.]
 [ 5145000.  4007500.]
 [ 5600000.  6930000.]
 [ 2940000.  2940000.]
 [ 6650000.  6195000.]
 [ 3430000.  3535000.]
 [ 4193000.  2940000.]
 [ 5250000.  3500000.]
 [ 6790000.  7980000.]
 [ 5775000.  6755000.]
 [ 3500000.  3990000.]
 [ 4473000.  3150000.]
 [ 4480000.  3290000.]
 [ 4480000.  4130000.]
 [ 2100000.  2660000.]
 [ 4319000.  4410000.]
 [ 4480000.  3710000.]
 [ 3640000.  3360000.]
 [ 3640000.  4270000.]
 [ 5250000.  5005000.]
 [ 9681000.  5383000.]
 [ 5740000.  6440000.]
 [ 2100000.  1890000.]
 [ 6510000.  6125000.]
 [ 6405000.  5460000.]
 [ 5229000.  5803000.]
 [ 5810000.  4620000.]
 [ 4767000.  5530000.]
 [ 5110000.  5950000.]
 [ 3605000.  4305000.]
 [ 4900000.  3640000.]
 [ 5950000.  5250000.]
 [ 3423000.  3325000.]
 [ 3640000.  3703000.]
 [ 5950000.  4753000.]
 [ 6020000.  9100000.]
 [ 3640000.  3500000.]
 [ 2660000.  3150000.]
 [ 4200000.  4270000.]
 [ 8575000.  8960000.]
 [ 5250000.  4060000.]
 [ 5740000.  5740000.]
 [ 2940000.

In [None]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.30716046750026493

### Random Forest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
regressor.fit(X_train, y_train)

In [None]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[ 4369400.  4585000.]
 [ 6140400.  6083000.]
 [ 4655000.  4007500.]
 [ 6605200.  6930000.]
 [ 3004750.  2940000.]
 [ 7051800.  6195000.]
 [ 3146675.  3535000.]
 [ 3768100.  2940000.]
 [ 3544800.  3500000.]
 [ 7602000.  7980000.]
 [ 6068300.  6755000.]
 [ 3496500.  3990000.]
 [ 4378500.  3150000.]
 [ 5031600.  3290000.]
 [ 4933600.  4130000.]
 [ 2187500.  2660000.]
 [ 4329500.  4410000.]
 [ 4873400.  3710000.]
 [ 3479000.  3360000.]
 [ 3944500.  4270000.]
 [ 6979000.  5005000.]
 [ 6568100.  5383000.]
 [ 4567500.  6440000.]
 [ 2338000.  1890000.]
 [ 6215300.  6125000.]
 [ 4885300.  5460000.]
 [ 5683300.  5803000.]
 [ 5257000.  4620000.]
 [ 4412100.  5530000.]
 [ 5958400.  5950000.]
 [ 3628800.  4305000.]
 [ 5101250.  3640000.]
 [ 6867000.  5250000.]
 [ 3229800.  3325000.]
 [ 5397000.  3703000.]
 [ 4021500.  4753000.]
 [ 5910800.  9100000.]
 [ 3824100.  3500000.]
 [ 3467800.  3150000.]
 [ 4088000.  4270000.]
 [ 8438094.  8960000.]
 [ 6569500.  4060000.]
 [ 6543250.  5740000.]
 [ 3804500.

In [None]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.551917541419373

### CatBoost

In [None]:
!pip install catboost
from catboost import CatBoostClassifier
classifier = CatBoostClassifier()
classifier.fit(X_train, y_train)

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.5
Learning rate set to 0.075847
0:	learn: 5.2505825	total: 939ms	remaining: 15m 37s
1:	learn: 5.2198020	total: 1.34s	remaining: 11m 9s
2:	learn: 5.1918582	total: 1.79s	remaining: 9m 55s
3:	learn: 5.1602698	total: 2.29s	remaining: 9m 29s
4:	learn: 5.1320039	total: 2.59s	remaining: 8m 35s
5:	learn: 5.1055793	total: 2.95s	remaining: 8m 9s
6:	learn: 5.0776942	total: 3.35s	remaining: 7m 54s
7:	learn: 5.0506404	total: 3.74s	remaining: 7m 44s
8:	learn: 5.0240562	total: 4.11s	remaining: 7m 33s
9:	learn: 4.9966759	total: 4.54s	remaining: 7m 29s
10:	learn: 4.9697696	total: 4.87s	remaining: 7m 18s
11:	learn: 4.9445003	total: 

<catboost.core.CatBoostClassifier at 0x7ab466663c70>

In [None]:
from sklearn.metrics import confusion_matrix
y_pred = classifier.predict(X_test)

In [None]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.5023780878577142

### Artificial Neural Network

In [25]:
import tensorflow as tf
X_train_ann = tf.convert_to_tensor(X_train, dtype=tf.float32)
y_train_ann = tf.convert_to_tensor(y_train, dtype=tf.float32)
y_test_ann = tf.convert_to_tensor(y_test, dtype=tf.float32)
X_test_ann = tf.convert_to_tensor(X_test, dtype=tf.float32)

In [26]:
ann = tf.keras.models.Sequential()
ann.add(tf.keras.layers.Dense(units=20, activation='relu'))
ann.add(tf.keras.layers.Dense(units=512, activation='relu'))
ann.add(tf.keras.layers.Dense(units=1024, activation='relu'))
ann.add(tf.keras.layers.Dense(units=2048, activation='relu'))
ann.add(tf.keras.layers.Dense(units=4096, activation='relu'))
ann.add(tf.keras.layers.Dense(units=8192, activation='relu'))
ann.add(tf.keras.layers.Dense(units=16384, activation='relu'))
ann.add(tf.keras.layers.Dense(units=32768, activation='relu'))
ann.add(tf.keras.layers.Dense(units=65536, activation='relu'))
ann.add(tf.keras.layers.Dense(units=1))

In [27]:
ann.compile(optimizer = 'adam', loss = 'mean_squared_error')

In [28]:
ann.fit(X_train_ann, y_train_ann, batch_size = 32, epochs = 5000)

Epoch 1/2000
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 2s/step - loss: 16615245611008.0000
Epoch 2/2000
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 2s/step - loss: 8969104916480.0000
Epoch 3/2000
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 2s/step - loss: 5624463097856.0000
Epoch 4/2000
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 2s/step - loss: 7668771061760.0000
Epoch 5/2000
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 2s/step - loss: 5344789528576.0000
Epoch 6/2000
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 2s/step - loss: 4103051149312.0000
Epoch 7/2000
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 2s/step - loss: 3687667728384.0000
Epoch 8/2000
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 2s/step - loss: 4225903886336.0000
Epoch 9/2000
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 2s/step - loss: 6

KeyboardInterrupt: 

In [None]:
y_pred = ann.predict(X_test_ann)

In [None]:
from sklearn.metrics import r2_score
r2_score(y_test_ann, y_pred)