 # FE for the housing dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing  import StandardScaler,OrdinalEncoder,OneHotEncoder,Normalizer,MinMaxScaler,PowerTransformer
from sklearn.impute import KNNImputer,SimpleImputer
from sklearn.linear_model import LinearRegression,LogisticRegression,Ridge,ElasticNet,Lasso,PoissonRegressor,SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,root_mean_squared_error
from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from xgboost import XGBRFRegressor

In [None]:
df=pd.read_csv('nepal_hosuing_datetset.csv')
df.head()

Unnamed: 0,price,location,area,facing,no_of_bedroom,no_of_bathroom,no_of_livingroom,no_of_kitchen,parking,road_size,furniture,no_of_flat
0,32000000.0,"Shital height, Imadol",1069.53,east,5.0,4.0,2.0,3.0,4.0,13.0,semi-furnished,3.0
1,28000000.0,Shital Height,1026.75,west,5.0,4.0,2.0,3.5,2.0,13.0,semi-furnished,3.5
2,40000000.0,"Shital height, Imadol",1369.0,north,6.0,4.0,2.0,2.5,4.0,13.0,fully-furnished,2.5
3,28000000.0,Bhaisepati,1369.0,south,6.0,3.0,2.0,2.5,4.0,16.0,fully-furnished,2.5
4,40000000.0,Harisiddhi,2395.75,east,6.0,4.0,2.0,2.5,4.0,13.0,fully-furnished,2.5


In [None]:
df.duplicated().sum()

1835

In [4]:
features=df.columns

In [5]:
for feature in features:
    try:
        print(f'the min value of {feature}:{df[feature].min()}')
    except:
        pass

the min value of price:0.0
the min value of location:  
the min value of area:0.0
the min value of no_of_bedroom:0.0
the min value of no_of_bathroom:0.0
the min value of no_of_livingroom:0.0
the min value of no_of_kitchen:0.0
the min value of parking:1.0
the min value of road_size:0.0
the min value of no_of_flat:0.0


In [6]:
#collecting the catagorecal and numerical features.
numerical_features=[features for features in df.columns if df[features].dtype !='O']
categorical_features=[features for features in df.columns if df[features].dtype =='O']

print(f'The number of numerical features is {len(numerical_features)}:{numerical_features}')
print(f'The number of categorical features is {len(categorical_features)}:{categorical_features}')

The number of numerical features is 9:['price', 'area', 'no_of_bedroom', 'no_of_bathroom', 'no_of_livingroom', 'no_of_kitchen', 'parking', 'road_size', 'no_of_flat']
The number of categorical features is 3:['location', 'facing', 'furniture']


In [7]:
feature_with_zeros=[]
for  feature in features:
    try:
        min_val=df[feature].min()
        if min_val ==0:
            feature_with_zeros.append(feature)
        else :
            pass
    except: pass

print(feature_with_zeros)

['price', 'area', 'no_of_bedroom', 'no_of_bathroom', 'no_of_livingroom', 'no_of_kitchen', 'road_size', 'no_of_flat']


In [8]:
for feature in feature_with_zeros:
    df[feature]=df[feature].replace(0,np.nan)



In [9]:
for feature in features:
    try:
        print(f'The min value of {feature}:{df[feature].min()}')
    except:
        pass

The min value of price:9500000.0
The min value of location:  
The min value of area:278.07
The min value of no_of_bedroom:1.0
The min value of no_of_bathroom:1.0
The min value of no_of_livingroom:1.0
The min value of no_of_kitchen:1.0
The min value of parking:1.0
The min value of road_size:3.0
The min value of no_of_flat:1.0


In [10]:
df.isnull().sum()


price                296
location               0
area                 209
facing              1224
no_of_bedroom         58
no_of_bathroom        58
no_of_livingroom     176
no_of_kitchen         68
parking             1264
road_size           1256
furniture           1224
no_of_flat            34
dtype: int64

In [11]:
null_numerical_features=[]
null_categorical_features=[]

for feature in df.columns :
    if df[feature].dtype != "O" and df[feature].isnull().sum()>=1:
        null_numerical_features.append(feature)
    elif df[feature].dtype == "O" and df[feature].isnull().sum()>=1:
        null_categorical_features.append(feature)
    else : pass


print(null_categorical_features)
print(null_numerical_features)

['facing', 'furniture']
['price', 'area', 'no_of_bedroom', 'no_of_bathroom', 'no_of_livingroom', 'no_of_kitchen', 'parking', 'road_size', 'no_of_flat']


### Filling NAN for all the numerical  feature excpet for price and area.

In [12]:
null_numerical_features.remove('price')
null_numerical_features.remove('area')


###  Imputing null values in all numerical feature withh null value.

In [13]:
knn=KNNImputer()
for feature in null_numerical_features:
    df[feature]=knn.fit_transform(df[[feature]])

### OneHotEncoding the catg feature with null values.

In [14]:
null_categorical_features

['facing', 'furniture']

In [15]:
df.head()

Unnamed: 0,price,location,area,facing,no_of_bedroom,no_of_bathroom,no_of_livingroom,no_of_kitchen,parking,road_size,furniture,no_of_flat
0,32000000.0,"Shital height, Imadol",1069.53,east,5.0,4.0,2.0,3.0,4.0,13.0,semi-furnished,3.0
1,28000000.0,Shital Height,1026.75,west,5.0,4.0,2.0,3.5,2.0,13.0,semi-furnished,3.5
2,40000000.0,"Shital height, Imadol",1369.0,north,6.0,4.0,2.0,2.5,4.0,13.0,fully-furnished,2.5
3,28000000.0,Bhaisepati,1369.0,south,6.0,3.0,2.0,2.5,4.0,16.0,fully-furnished,2.5
4,40000000.0,Harisiddhi,2395.75,east,6.0,4.0,2.0,2.5,4.0,13.0,fully-furnished,2.5


In [16]:
oe=OrdinalEncoder()
ohe=OneHotEncoder(sparse_output=False)
for feature in null_categorical_features:
    df[feature]=oe.fit_transform(df[[feature]])

location_ohe=ohe.fit_transform(df[['location']])
location=pd.DataFrame(location_ohe)

df = pd.concat([df.drop('location', axis=1), location], axis=1)



In [17]:
df.sample(5)

Unnamed: 0,price,area,facing,no_of_bedroom,no_of_bathroom,no_of_livingroom,no_of_kitchen,parking,road_size,furniture,...,624,625,626,627,628,629,630,631,632,633
607,23000000.0,941.18,,5.0,4.0,2.0,2.0,1.0,14.46899,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
671,50000000.0,1711.25,,7.0,5.0,2.0,2.0,1.0,14.46899,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2551,49500000.0,3272.76,17.0,4.0,1.0,1.0,1.0,1.577435,6.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1360,29000000.0,1069.53,,5.0,4.0,1.0,2.0,1.0,14.46899,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2245,27500000.0,,17.0,3.0,2.0,1.0,1.0,1.577435,4.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Imputing the categorical -OrdinalENdoded earlier.

In [18]:
null_categorical_features

['facing', 'furniture']

In [19]:
knn=KNNImputer()
for feature in null_categorical_features:
    df[feature]=knn.fit_transform(df[[feature]])

In [20]:
df.shape

(2691, 645)

In [21]:
df.duplicated().sum()

1835

In [22]:
#f=df.drop_duplicates()

In [23]:
df.shape

(2691, 645)

In [24]:
df.isnull().sum()

price             296
area              209
facing              0
no_of_bedroom       0
no_of_bathroom      0
                 ... 
629                 0
630                 0
631                 0
632                 0
633                 0
Length: 645, dtype: int64

### Droping all the rows with null values.

In [25]:
df=df.dropna()

In [26]:
df.shape

(2220, 645)

In [27]:
#train test Split
X=df.drop(columns='price')
y=df['price']

X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42,test_size=0.2)

In [28]:
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

In [29]:
#pca
pca=PCA(n_components=10)

pca.fit(X_train)

X_train=pca.transform(X_train)
X_test=pca.transform(X_test)


In [30]:
#power transformer
transformer=PowerTransformer(method="yeo-johnson")

transformer.fit(X_train)

X_train=transformer.transform(X_train)
X_test=transformer.transform(X_test)

In [31]:
#standarization
#sr=StandardScaler()

#sr.fit(X_train)

#X_train=sr.transform(X_train)
#X_test=sr.transform(X_test)

In [32]:
#normalization
mr=MinMaxScaler()

mr.fit(X_train)

X_train=mr.transform(X_train)
X_test=mr.transform(X_test)

In [33]:

# Models to evaluate
models = [
    LogisticRegression(),
    LinearRegression(),
    DecisionTreeRegressor(),
    ElasticNet(),
    Ridge(),
    Lasso(),
    PoissonRegressor()
]

# Metrics to evaluate
accuracy_metrics = [mean_absolute_error, root_mean_squared_error, r2_score]


for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    for metric in accuracy_metrics:
        score = metric(y_test, y_pred)
        result = {"model":model.__class__.__name__,"metric":metric.__name__,"score":score}
        print(f'The {metric.__name__} of {model.__class__.__name__} is: {score}')
    print('-----------------------------------------------------------------')
        



The mean_absolute_error of LogisticRegression is: 10488513.511261262
The root_mean_squared_error of LogisticRegression is: 22914723.914081063
The r2_score of LogisticRegression is: 0.02952453031948843
-----------------------------------------------------------------
The mean_absolute_error of LinearRegression is: 7994071.096085471
The root_mean_squared_error of LinearRegression is: 20196040.59505621
The r2_score of LinearRegression is: 0.24614507888973403
-----------------------------------------------------------------
The mean_absolute_error of DecisionTreeRegressor is: 1871846.8468468469
The root_mean_squared_error of DecisionTreeRegressor is: 6639567.10571942
The r2_score of DecisionTreeRegressor is: 0.9185230368170912
-----------------------------------------------------------------
The mean_absolute_error of ElasticNet is: 12606525.642781328
The root_mean_squared_error of ElasticNet is: 23196019.808077756
The r2_score of ElasticNet is: 0.0055516164954089176
----------------------

In [34]:
#grid search cv

param_grid = {
    'max_depth': [1,2,3,4,5,10,20,15,None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['squared_error', 'absolute_error', 'poisson','friedman_mse']
}

grid_search=GridSearchCV(estimator=DecisionTreeRegressor(),param_grid=param_grid,cv=10)

grid_search.fit(X_train,y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Score (MSE):", grid_search.best_score_)


Best Parameters: {'criterion': 'poisson', 'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Score (MSE): 0.7425294215626772


In [35]:
grid_search