In [2]:
import tqdm as notebook_tqdm
import os 
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import kagglehub

In [3]:
# Fetch the latest version of the housing-prices dataset through kagglehub and
# report the local directory it was downloaded to.
dataset_handle = "yasserh/housing-prices-dataset"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/yasserh/housing-prices-dataset?dataset_version_number=1...


100%|█████████████████████████████████████████| 4.63k/4.63k [00:00<00:00, 4.31MB/s]

Extracting files...
Path to dataset files: /home/zhan/.cache/kagglehub/datasets/yasserh/housing-prices-dataset/versions/1





In [3]:
# Load the housing CSV and inspect its columns.
# The file location is built from `path` (the directory returned by
# kagglehub.dataset_download) instead of a machine-specific absolute path, so
# the notebook runs unchanged on any machine or user account.
csv_path = os.path.join(path, "Housing.csv")
df = pd.read_csv(csv_path)

# Last expression of the cell: display the column index.
df.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

In [4]:
# Print the schema summary: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking
count,545.0,545.0,545.0,545.0,545.0,545.0
mean,4766729.0,5150.541284,2.965138,1.286239,1.805505,0.693578
std,1870440.0,2170.141023,0.738064,0.50247,0.867492,0.861586
min,1750000.0,1650.0,1.0,1.0,1.0,0.0
25%,3430000.0,3600.0,2.0,1.0,1.0,0.0
50%,4340000.0,4600.0,3.0,1.0,2.0,0.0
75%,5740000.0,6360.0,3.0,2.0,2.0,1.0
max,13300000.0,16200.0,6.0,4.0,4.0,3.0


In [6]:
# Preview the first five rows of the raw data.
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [8]:
# Missing-value audit. The boolean mask is computed once and reused.
null_mask = df.isnull()
# Overall flag: True if any cell in the frame is missing. It is not displayed,
# because a notebook cell only renders its final expression.
has_missing = null_mask.any().any()
# Per-column count of missing cells (this is the cell's displayed output).
null_mask.sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [9]:
# Replace price with log(1 + price) via np.log1p, so the target is modeled on a
# log scale.
# WARNING: this overwrites df["price"] in place, so running this cell a second
# time would take the log of already-logged values. Re-load df before re-running.
# Predictions made on this scale map back to prices with np.expm1, the exact
# inverse of np.log1p.
df["price"] = np.log1p(df["price"])

In [10]:
# Sanity check: count the rows that report fewer than one bedroom.
has_no_bedroom = df['bedrooms'] < 1
bedroom_invals = df.loc[has_no_bedroom]
print(len(bedroom_invals))

0


In [11]:
# One-hot encode the categorical features.
# drop_first=True drops the first level of every categorical feature, so a
# feature with k levels yields k-1 indicator columns.
num_vars = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking']
cat_vars = [
    'mainroad',
    'guestroom',
    'basement',
    'hotwaterheating',
    'airconditioning',
    'prefarea',
    'furnishingstatus',
]

df_encoded_dropped = pd.get_dummies(data=df, columns=cat_vars, drop_first=True)

In [13]:
# Preview the encoded frame: the categorical columns are replaced by indicator
# columns such as mainroad_yes and furnishingstatus_unfurnished.
df_encoded_dropped.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,mainroad_yes,guestroom_yes,basement_yes,hotwaterheating_yes,airconditioning_yes,prefarea_yes,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,16.403275,7420,4,2,3,2,True,False,False,False,True,True,False,False
1,16.321037,8960,4,4,4,3,True,False,False,False,True,False,False,False
2,16.321037,9960,3,2,2,2,True,False,True,False,False,True,True,False
3,16.318175,7500,4,2,2,3,True,False,True,False,True,True,False,False
4,16.250001,7420,4,1,2,2,True,True,True,False,True,False,False,False


In [17]:
# NOTE: the describe() result on the next line is discarded. A notebook cell
# only renders its final expression, and this cell ends with a print().
df_encoded_dropped.describe()
# Print every column's dtype; the indicator columns are bool at this point.
print(df_encoded_dropped.dtypes)

price                              float64
area                                 int64
bedrooms                             int64
bathrooms                            int64
stories                              int64
parking                              int64
mainroad_yes                          bool
guestroom_yes                         bool
basement_yes                          bool
hotwaterheating_yes                   bool
airconditioning_yes                   bool
prefarea_yes                          bool
furnishingstatus_semi-furnished       bool
furnishingstatus_unfurnished          bool
dtype: object


In [20]:
# Convert the one-hot indicator columns from bool to float (True -> 1.0,
# False -> 0.0) before the train/test split, so every feature is numeric.
# NOTE: cat_vars is rebound here to the *encoded* column names; the cells that
# follow rely on this meaning.

cat_vars = ['mainroad_yes', 'guestroom_yes', 'basement_yes', 'hotwaterheating_yes', 'airconditioning_yes', 'prefarea_yes', 
            'furnishingstatus_semi-furnished', 'furnishingstatus_unfurnished']

# A single vectorized cast covers all indicator columns. Casting columns that
# are already float leaves them unchanged, so the cell is safe to re-run.
df_encoded_dropped = df_encoded_dropped.astype({col: float for col in cat_vars})

In [21]:
# Confirm the cast: the indicator columns should now report float64.
print(df_encoded_dropped.dtypes)

price                              float64
area                                 int64
bedrooms                             int64
bathrooms                            int64
stories                              int64
parking                              int64
mainroad_yes                       float64
guestroom_yes                      float64
basement_yes                       float64
hotwaterheating_yes                float64
airconditioning_yes                float64
prefarea_yes                       float64
furnishingstatus_semi-furnished    float64
furnishingstatus_unfurnished       float64
dtype: object


In [22]:
# Summary statistics of the fully numeric frame. For the 0/1 indicator columns
# the mean is the fraction of rows where the indicator is set.
df_encoded_dropped.describe()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,mainroad_yes,guestroom_yes,basement_yes,hotwaterheating_yes,airconditioning_yes,prefarea_yes,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
count,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0
mean,15.306987,5150.541284,2.965138,1.286239,1.805505,0.693578,0.858716,0.177982,0.350459,0.045872,0.315596,0.234862,0.416514,0.326606
std,0.372165,2170.141023,0.738064,0.50247,0.867492,0.861586,0.348635,0.382849,0.477552,0.209399,0.46518,0.424302,0.493434,0.469402
min,14.375127,1650.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,15.048071,3600.0,2.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,15.283385,4600.0,3.0,1.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,15.56297,6360.0,3.0,2.0,2.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
max,16.403275,16200.0,6.0,4.0,4.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [24]:
# Verify the encoding: print the distinct values held by each indicator column
# (only 0.0 and 1.0 are expected).
for indicator in cat_vars:
    distinct_values = df_encoded_dropped[indicator].unique()
    print(distinct_values)

[1. 0.]
[0. 1.]
[0. 1.]
[0. 1.]
[1. 0.]
[1. 0.]
[0. 1.]
[0. 1.]


In [25]:
# Separate the features from the (log-transformed) price target, then hold out
# 20% of the rows as a test set. random_state pins the shuffle so the split is
# reproducible.
target_col = 'price'
y = df_encoded_dropped[target_col]
x = df_encoded_dropped.drop(columns=[target_col])
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [26]:
# Cap outliers in the numeric features using 1.5 * IQR fences.
# The fences are computed from the training split only and then applied to both
# splits, so no statistic from the test set leaks into preprocessing.
# Note this caps values at the fences; nothing is dropped or imputed.

print(num_vars)

for col in num_vars:
    q1 = X_train[col].quantile(0.25)
    q3 = X_train[col].quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - (1.5 * iqr)
    upper_bound = q3 + (1.5 * iqr)

    # Values above upper_bound become upper_bound and values below lower_bound
    # become lower_bound. Casting to float first makes the column float64
    # whether or not any value is actually clipped.
    X_train[col] = X_train[col].astype(float).clip(lower=lower_bound, upper=upper_bound)
    X_test[col] = X_test[col].astype(float).clip(lower=lower_bound, upper=upper_bound)

['area', 'bedrooms', 'bathrooms', 'stories', 'parking']


In [27]:
# Standardize the numeric features.
# The scaler is fitted on the training split only; fit() returns the fitted
# scaler itself, so construction and fitting are chained.
scaler = StandardScaler().fit(X_train[num_vars])

# Apply the statistics learned from the training split to both splits, so the
# test data is scaled with the training mean and std.
X_train_scaled = scaler.transform(X_train[num_vars])
X_test_scaled = scaler.transform(X_test[num_vars])

In [28]:
# Rebuild DataFrames from the scaled arrays and re-attach the indicator columns.

# The scaler returns plain arrays, so the column names and the original row
# index are restored to keep the rows aligned with X_train / X_test.
X_train_scaled_df = pd.DataFrame(X_train_scaled, index=X_train.index, columns=num_vars)
X_test_scaled_df = pd.DataFrame(X_test_scaled, index=X_test.index, columns=num_vars)

print(cat_vars) # make sure it is the OHE columns 

# Every column of the result is float: the scaled numeric features are float,
# and the indicator columns were cast to float before the train/test split.
train_parts = [X_train_scaled_df, X_train[cat_vars]]
test_parts = [X_test_scaled_df, X_test[cat_vars]]
final_X_train = pd.concat(train_parts, axis=1)
final_X_test = pd.concat(test_parts, axis=1)

['mainroad_yes', 'guestroom_yes', 'basement_yes', 'hotwaterheating_yes', 'airconditioning_yes', 'prefarea_yes', 'furnishingstatus_semi-furnished', 'furnishingstatus_unfurnished']


In [29]:
# Ensure both target Series are float.
y_train, y_test = y_train.astype(float), y_test.astype(float)

In [30]:
# Final check: print the dtypes of both feature frames and both target Series,
# in that order.
for prepared in (final_X_train, final_X_test, y_train, y_test):
    print(prepared.dtypes)

area                               float64
bedrooms                           float64
bathrooms                          float64
stories                            float64
parking                            float64
mainroad_yes                       float64
guestroom_yes                      float64
basement_yes                       float64
hotwaterheating_yes                float64
airconditioning_yes                float64
prefarea_yes                       float64
furnishingstatus_semi-furnished    float64
furnishingstatus_unfurnished       float64
dtype: object
area                               float64
bedrooms                           float64
bathrooms                          float64
stories                            float64
parking                            float64
mainroad_yes                       float64
guestroom_yes                      float64
basement_yes                       float64
hotwaterheating_yes                float64
airconditioning_yes                float

In [33]:
# Preview the processed test features: scaled numeric columns followed by the
# 0/1 indicator columns, with the original row index preserved.
final_X_test.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,parking,mainroad_yes,guestroom_yes,basement_yes,hotwaterheating_yes,airconditioning_yes,prefarea_yes,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
316,0.396318,1.505467,1.561266,0.32899,0.391688,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
77,0.693546,0.081641,1.561266,1.627088,-0.815209,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
360,-0.52509,-1.342184,-0.56264,-0.969107,-0.815209,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
90,-0.049524,0.081641,-0.56264,0.32899,-0.815209,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
493,-0.56472,0.081641,-0.56264,-0.969107,-0.815209,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# Write the processed splits to CSV in the working directory.
# index=False omits the row index from every file. The target Series are
# converted to single-column frames before writing.
exports = (
    (final_X_train, "housing_X_train_processed.csv"),
    (final_X_test, "housing_X_test_processed.csv"),
    (y_train.to_frame(), "housing_y_train.csv"),
    (y_test.to_frame(), "housing_y_test.csv"),
)
for frame, filename in exports:
    frame.to_csv(filename, index=False)

In [None]:
# NOTE: the target was transformed with np.log1p, so model predictions are on the log(1 + price) scale.
# Apply np.expm1 (the exact inverse of np.log1p) to predictions to convert them back to original price units.