In [118]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize
from scipy.cluster.vq import whiten
from sklearn.model_selection import train_test_split, cross_val_score
import tensorflow as tf

In [2]:
def print_nan_percentage(df: pd.DataFrame, onlyNaNs=False):
    nan_percentage = df.isna().mean() * 100
    for feature, percentage in nan_percentage.items():
        if onlyNaNs and percentage == 0:
            continue
        print(f"{feature}: {percentage:.2f}% NaN values")

In [39]:
def test_model(model, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
    model.fit(X_train, y_train)
    print('Accuracy:', model.score(X_test, y_test))
    
    score = cross_val_score(model, X, y, cv=5)
    print('CV Score:', np.mean(score))

In [3]:
df = pd.read_csv("Bengaluru_House_Data.csv")
df

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.00
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.00
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.00
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.00
...,...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,ArsiaEx,3453,4.0,0.0,231.00
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,,3600,5.0,,400.00
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,Mahla T,1141,2.0,1.0,60.00
13318,Super built-up Area,18-Jun,Padmanabhanagar,4 BHK,SollyCl,4689,4.0,1.0,488.00


In [4]:
print_nan_percentage(df, True)

location: 0.01% NaN values
size: 0.12% NaN values
society: 41.31% NaN values
bath: 0.55% NaN values
balcony: 4.57% NaN values


In [5]:
df = df.dropna(subset=["location"])

In [6]:
df["size"].mode()

0    2 BHK
Name: size, dtype: object

In [7]:
df["size"] = df["size"].fillna(df["size"].mode()[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["size"] = df["size"].fillna(df["size"].mode()[0])


In [8]:
df = df.drop(columns=["society"])

In [9]:
df["bath"] = df["bath"].fillna(df["bath"].median())

In [10]:
df["balcony"] = df["balcony"].fillna(df["balcony"].median())

In [11]:
print_nan_percentage(df, True)

In [12]:
data_numeric = df._get_numeric_data()
data_numeric

Unnamed: 0,bath,balcony,price
0,2.0,1.0,39.07
1,5.0,3.0,120.00
2,2.0,3.0,62.00
3,3.0,1.0,95.00
4,2.0,1.0,51.00
...,...,...,...
13315,4.0,0.0,231.00
13316,5.0,2.0,400.00
13317,2.0,1.0,60.00
13318,4.0,1.0,488.00


In [13]:
data_whitened = whiten(data_numeric)
data_normal = pd.DataFrame(normalize(data_whitened), columns=data_numeric.columns)
data_normal

Unnamed: 0,bath,balcony,price
0,0.761256,0.634531,0.133641
1,0.698943,0.699109,0.150747
2,0.369344,0.923579,0.102894
3,0.848258,0.471367,0.241394
4,0.756515,0.630579,0.173362
...,...,...,...
13314,0.887588,0.000000,0.460638
13315,0.714010,0.476120,0.513323
13316,0.752186,0.626971,0.202789
13317,0.648788,0.270392,0.711310


In [14]:
for i in df.columns.difference(data_normal.columns):
    data_normal[i] = pd.factorize(df[i])[0]

In [15]:
data_normal

Unnamed: 0,bath,balcony,price,area_type,availability,location,size,total_sqft
0,0.761256,0.634531,0.133641,0,0,0,0,0
1,0.698943,0.699109,0.150747,1,1,1,1,1
2,0.369344,0.923579,0.102894,2,1,2,2,2
3,0.848258,0.471367,0.241394,0,1,3,2,3
4,0.756515,0.630579,0.173362,0,1,4,0,4
...,...,...,...,...,...,...,...,...
13314,0.887588,0.000000,0.460638,2,1,5,15,1965
13315,0.714010,0.476120,0.513323,0,1,865,3,171
13316,0.752186,0.626971,0.202789,2,1,15,0,377
13317,0.648788,0.270392,0.711310,0,20,128,3,2116


In [92]:
y = data_normal["price"]
X = data_normal.drop(columns=["price"])

# ML

## Linear Regression

In [94]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
test_model(model, X, y)

Accuracy: 0.5036930672760438
CV Score: 0.5009808832854664


## CatBoost (Gradient-Boosting)

In [106]:
from catboost import CatBoostRegressor

model = CatBoostRegressor(verbose=0)
test_model(model, X, y)

Accuracy: 0.9976712104368113
CV Score: 0.9980659239417722


In [110]:
final_ML_model = CatBoostRegressor()
model.fit(X, y)

<catboost.core.CatBoostRegressor at 0x1b61fe27890>

# DL

In [114]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [132]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation="relu", input_shape=(7,)),  # reflect 7 input features
    tf.keras.layers.Dense(8, activation="relu"),
    tf.keras.layers.Dense(1) # no activation function for regression output
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [134]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), 
              loss="mean_squared_error", 
              metrics=["mae"])

In [136]:
history = model.fit(X_train, y_train, epochs=100, batch_size=16, validation_split=0.2, verbose=1)

Epoch 1/100
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 18706.2793 - mae: 90.4096 - val_loss: 1032.3341 - val_mae: 18.9961
Epoch 2/100
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 749.2242 - mae: 16.9804 - val_loss: 383.5142 - val_mae: 13.4179
Epoch 3/100
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 332.1571 - mae: 12.1761 - val_loss: 146.2363 - val_mae: 8.0176
Epoch 4/100
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 115.9929 - mae: 6.9967 - val_loss: 52.1649 - val_mae: 4.4143
Epoch 5/100
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 43.5942 - mae: 3.9517 - val_loss: 24.3397 - val_mae: 2.7943
Epoch 6/100
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 23.4224 - mae: 2.7594 - val_loss: 15.2203 - val_mae: 2.2986
Epoch 7/100
[1m533/533[0m [32m━━━━━━━━━━━━━━━━━━━━[