# Dependency

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# Load dataset

In [3]:
df = pd.read_csv("datasets/Housing.csv")
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [5]:
cols = ["mainroad","guestroom","basement","hotwaterheating","airconditioning","prefarea","furnishingstatus"]
le = LabelEncoder()
for col in cols:
    df[col] = le.fit_transform(df[col])

df.head()  

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,0


In [6]:
sc = StandardScaler()
df['area'] =  sc.fit_transform(df[['area']])
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,1.046726,4,2,3,1,0,0,0,1,2,1,0
1,12250000,1.75701,4,4,4,1,0,0,0,1,3,0,0
2,12250000,2.218232,3,2,2,1,0,1,0,0,2,1,1
3,12215000,1.083624,4,2,2,1,0,1,0,1,3,1,0
4,11410000,1.046726,4,1,2,1,1,1,0,1,2,0,0


In [7]:
Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)

IQR = Q3 - Q1

lower_bound = Q1 - 1.5*IQR
upper_bound = Q3 + 1.5*IQR

df = df[(df['area'] >= lower_bound) & (df['area'] <= upper_bound)]
df

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,1.046726,4,2,3,1,0,0,0,1,2,1,0
1,12250000,1.757010,4,4,4,1,0,0,0,1,3,0,0
2,12250000,2.218232,3,2,2,1,0,1,0,0,2,1,1
3,12215000,1.083624,4,2,2,1,0,1,0,1,3,1,0
4,11410000,1.046726,4,1,2,1,1,1,0,1,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,-0.991879,2,1,1,1,0,1,0,0,2,0,2
541,1767150,-1.268613,3,1,1,0,0,0,0,0,0,0,1
542,1750000,-0.705921,2,1,1,1,0,0,0,0,0,0,2
543,1750000,-1.033389,3,1,1,0,0,0,0,0,0,0,0


In [8]:
x = df.drop('price', axis = 1)
y = df['price']

In [9]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [10]:
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.1),
    "Elastic Net": ElasticNet(alpha=0.1, l1_ratio=0.5),
    "Decision Tree Regressor": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting Regressor": GradientBoostingRegressor(n_estimators=100),

    "SVR": Pipeline([
        ("scaler", StandardScaler()),
        ("svr", SVR(kernel="rbf"))
    ]),

    "KNN": Pipeline([
        ("scaler", StandardScaler()),
        ("knn", KNeighborsRegressor(n_neighbors=5))
    ])
}

In [11]:
cv = KFold(n_splits = 5, shuffle = True, random_state = 42)

In [12]:
from tqdm import tqdm
from sklearn.model_selection import cross_val_score

result = []

# Wrap the loop with tqdm
for name, model in tqdm(models.items(), desc="Training Models"):
    score = cross_val_score(
        model,
        x_train,
        y_train,
        cv=cv,
        scoring='r2'
    )

    result.append({
        "model": name,
        "mean_r2_score": score.mean(),
        "std_r2_score": score.std()
    })

Training Models: 100%|██████████| 9/9 [00:01<00:00,  4.60it/s]


In [14]:
result_df = pd.DataFrame(result).sort_values(
    by="mean_r2_score",
    ascending=False
)

result_df

Unnamed: 0,model,mean_r2_score,std_r2_score
1,Ridge Regression,0.648977,0.04585
0,Linear Regression,0.648554,0.04693
2,Lasso Regression,0.648554,0.04693
3,Elastic Net,0.646761,0.033704
8,KNN,0.618804,0.032884
6,Gradient Boosting Regressor,0.611717,0.038915
5,Random Forest Regressor,0.598633,0.070989
4,Decision Tree Regressor,0.245848,0.323814
7,SVR,-0.075666,0.069745


In [15]:
model = Ridge(alpha=1.0)
model.fit(x_train, y_train)

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [16]:
print(model.predict(x_test))

[5209333.65610583 7238784.00789367 3066216.97422262 4558068.2995105
 3340741.4028412  3569794.2770604  5649215.19880907 6406207.00943786
 2764754.38532172 2681433.17537624 9548709.20618889 2829537.90839535
 3192875.16187883 3364627.52414417 3720135.31106176 5289015.59349102
 2991803.71668268 4810444.18364794 4392119.64142561 3529448.9969388
 5788361.36563121 5829061.01329469 2762180.19080652 4762452.20520371
 5218562.5277435  7509623.77967267 3255717.15906878 5232339.02314616
 8177324.28863266 3439773.30715701 6421131.64798691 3354345.88144365
 6731966.93593168 4164331.19225991 3599638.72842568 5776298.83689336
 4779944.18990157 4390660.42139248 3217958.00975858 4638150.49659783
 4534532.48769782 3533120.51655458 7219876.57839505 4032849.6231582
 3706706.14848836 4293733.4226932  6694971.50532264 4007018.90003463
 3802325.15609934 3456670.87163309 7287269.98925441 2836453.40614525
 4387062.33348364 4470938.38062399 3723947.30075557 2731065.17781012
 7506754.24822437 2956519.27880979 42

In [17]:
import joblib
joblib.dump(model, "E:\Projects\AI_playground\models\House_prediction_model.pkl")

  joblib.dump(model, "E:\Projects\AI_playground\models\House_prediction_model.pkl")


['E:\\Projects\\AI_playground\\models\\House_prediction_model.pkl']