In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Housing.csv')
data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [4]:
data.shape

(545, 13)

In [5]:
encoded_data = pd.get_dummies(data,dtype= 'int',drop_first = True)
encoded_data.columns = encoded_data.columns.str.replace('_yes','')
encoded_data.columns = encoded_data.columns.str.replace('furnishingstatus_','')
encoded_data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,mainroad,guestroom,basement,hotwaterheating,airconditioning,prefarea,semi-furnished,unfurnished
0,13300000,7420,4,2,3,2,1,0,0,0,1,1,0,0
1,12250000,8960,4,4,4,3,1,0,0,0,1,0,0,0
2,12250000,9960,3,2,2,2,1,0,1,0,0,1,1,0
3,12215000,7500,4,2,2,3,1,0,1,0,1,1,0,0
4,11410000,7420,4,1,2,2,1,1,1,0,1,0,0,0


In [6]:
# train test split
x = encoded_data.drop(['price'],axis =1)
y = encoded_data['price']

x_train,x_test,y_train,y_test = train_test_split(x,y, test_size = 0.3, random_state = 42)

# data dimensions
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(381, 13)
(164, 13)
(381,)
(164,)


In [7]:
# Standardizing the train data
scale = StandardScaler()
x_train_scaled = scale.fit_transform(x_train)
x_train_scaled_df = pd.DataFrame(x_train_scaled, columns = scale.get_feature_names_out())

# standardize the test data
x_test_scaled = scale.transform(x_test)
x_test_scaled_df = pd.DataFrame(x_test_scaled ,columns = scale.get_feature_names_out())


In [8]:
x_train_scaled_df.head()


Unnamed: 0,area,bedrooms,bathrooms,stories,parking,mainroad,guestroom,basement,hotwaterheating,airconditioning,prefarea,semi-furnished,unfurnished
0,0.934301,0.055861,-0.553238,-0.90766,1.591603,0.397561,-0.478573,1.334549,-0.235376,-0.682191,1.798147,-0.8647,1.492921
1,-0.710246,-1.274325,-0.553238,-0.90766,-0.800511,0.397561,-0.478573,-0.749317,4.248529,-0.682191,-0.556128,1.15647,-0.669828
2,-0.390167,-1.274325,-0.553238,-0.90766,1.591603,0.397561,-0.478573,-0.749317,-0.235376,1.465865,-0.556128,1.15647,-0.669828
3,0.860719,0.055861,-0.553238,2.573733,1.591603,0.397561,-0.478573,-0.749317,-0.235376,1.465865,-0.556128,1.15647,-0.669828
4,2.065617,0.055861,-0.553238,-0.90766,1.591603,0.397561,-0.478573,1.334549,-0.235376,-0.682191,1.798147,-0.8647,-0.669828


In [9]:
# stats of train data
round(x_train_scaled_df.describe(),2)

Unnamed: 0,area,bedrooms,bathrooms,stories,parking,mainroad,guestroom,basement,hotwaterheating,airconditioning,prefarea,semi-furnished,unfurnished
count,381.0,381.0,381.0,381.0,381.0,381.0,381.0,381.0,381.0,381.0,381.0,381.0,381.0
mean,-0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,0.0,0.0,-0.0
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.6,-2.6,-0.55,-0.91,-0.8,-2.52,-0.48,-0.75,-0.24,-0.68,-0.56,-0.86,-0.67
25%,-0.7,-1.27,-0.55,-0.91,-0.8,0.4,-0.48,-0.75,-0.24,-0.68,-0.56,-0.86,-0.67
50%,-0.29,0.06,-0.55,0.25,-0.8,0.4,-0.48,-0.75,-0.24,-0.68,-0.56,-0.86,-0.67
75%,0.55,0.06,-0.55,0.25,0.4,0.4,-0.48,1.33,-0.24,1.47,-0.56,1.16,1.49
max,5.09,4.05,5.77,2.57,2.79,0.4,2.09,1.33,4.25,1.47,1.8,1.16,1.49


# Ml Model building process (emprical classical data)

    # 0.Do all necessary EDA (missing value, data type, outlier, etc..,)
    # 1.encoding >> dep & indep split >> train-test-split
    # 2 model training: train_data >> fit_transform >> model-training (build_model)
    # 3.model testing: test_data >> transform >> model-testing (test pred) >> evaluate model

# new incoming real time data for prediction
    # incoming_x_real >> encoding >> transform >> build_model >> real-time-prediction