In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
data = pd.read_csv('../data/Housing_raw.csv')
Xtrain,Xrest,ytrain,yrest = train_test_split(data.drop(columns=['price']),data['price'],test_size=0.4,random_state=42)
Xval,Xtest,yval,ytest = train_test_split(Xrest,yrest,test_size=0.5,random_state=42)

In [2]:
Xtrain.info()


<class 'pandas.core.frame.DataFrame'>
Index: 327 entries, 519 to 102
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   area              327 non-null    int64 
 1   bedrooms          327 non-null    int64 
 2   bathrooms         327 non-null    int64 
 3   stories           327 non-null    int64 
 4   mainroad          327 non-null    object
 5   guestroom         327 non-null    object
 6   basement          327 non-null    object
 7   hotwaterheating   327 non-null    object
 8   airconditioning   327 non-null    object
 9   parking           327 non-null    int64 
 10  prefarea          327 non-null    object
 11  furnishingstatus  327 non-null    object
dtypes: int64(5), object(7)
memory usage: 33.2+ KB


In [3]:
Xtrain[Xtrain.select_dtypes(include=['object']).columns].nunique()

mainroad            2
guestroom           2
basement            2
hotwaterheating     2
airconditioning     2
prefarea            2
furnishingstatus    3
dtype: int64

In [4]:
Xtrain

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
519,4840,2,1,2,yes,no,no,no,no,0,no,unfurnished
118,6420,3,1,1,yes,no,yes,no,yes,0,yes,furnished
407,2145,3,1,3,yes,no,no,no,no,0,yes,furnished
114,6800,2,1,1,yes,yes,yes,no,no,2,no,furnished
515,3210,3,1,2,yes,no,yes,no,no,0,no,unfurnished
...,...,...,...,...,...,...,...,...,...,...,...,...
71,6000,4,2,4,yes,no,no,no,yes,0,no,unfurnished
106,5450,4,2,1,yes,no,yes,no,yes,0,yes,semi-furnished
270,4500,3,2,3,yes,no,no,yes,no,1,no,furnished
435,4040,2,1,1,yes,no,no,no,no,0,no,unfurnished


In [5]:
def data_preprocessing(data_to_preprocess: pd.DataFrame) -> pd.DataFrame:
    data_to_preprocess = data_to_preprocess.copy()
    for numeric_column in data_to_preprocess.select_dtypes(include=['number']).columns:
        if numeric_column == 'price':
            data_to_preprocess[numeric_column] = data_to_preprocess[numeric_column].fillna(data[numeric_column].mean().round())
        else:
            data_to_preprocess[numeric_column] = data_to_preprocess[numeric_column].fillna(Xtrain[numeric_column].mean().round())
        
    for categorical_column in data_to_preprocess.select_dtypes(include=['object']).columns:
        data_to_preprocess[categorical_column] = data_to_preprocess[categorical_column].fillna(Xtrain[categorical_column].mode()[0])
        if categorical_column == 'furnishingstatus':
            categories_order = ['unfurnished', 'semi-furnished', 'furnished']
        else:
            categories_order = ['no', 'yes']
        category = pd.api.types.CategoricalDtype(categories=categories_order, ordered=True)
        data_to_preprocess[categorical_column] = data_to_preprocess[categorical_column].astype(category).cat.codes.astype('int64')
    return data_to_preprocess

Xtrain = data_preprocessing(Xtrain)
Xval = data_preprocessing(Xval)
Xtest = data_preprocessing(Xtest)
data = data_preprocessing(data)


In [6]:
Xtrain.info()

<class 'pandas.core.frame.DataFrame'>
Index: 327 entries, 519 to 102
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   area              327 non-null    int64
 1   bedrooms          327 non-null    int64
 2   bathrooms         327 non-null    int64
 3   stories           327 non-null    int64
 4   mainroad          327 non-null    int64
 5   guestroom         327 non-null    int64
 6   basement          327 non-null    int64
 7   hotwaterheating   327 non-null    int64
 8   airconditioning   327 non-null    int64
 9   parking           327 non-null    int64
 10  prefarea          327 non-null    int64
 11  furnishingstatus  327 non-null    int64
dtypes: int64(12)
memory usage: 33.2 KB


In [7]:
Xtrain

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
519,4840,2,1,2,1,0,0,0,0,0,0,0
118,6420,3,1,1,1,0,1,0,1,0,1,2
407,2145,3,1,3,1,0,0,0,0,0,1,2
114,6800,2,1,1,1,1,1,0,0,2,0,2
515,3210,3,1,2,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
71,6000,4,2,4,1,0,0,0,1,0,0,0
106,5450,4,2,1,1,0,1,0,1,0,1,1
270,4500,3,2,3,1,0,0,1,0,1,0,2
435,4040,2,1,1,1,0,0,0,0,0,0,0


In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
standart_scaler = StandardScaler()
normal_scaler = MinMaxScaler()
Xtrain_standart = pd.DataFrame(standart_scaler.fit_transform(Xtrain), columns=Xtrain.columns)
Xval_standart = pd.DataFrame(standart_scaler.transform(Xval), columns=Xval.columns)
Xtest_standart = pd.DataFrame(standart_scaler.transform(Xtest), columns=Xtest.columns)
Xtrain_normal = pd.DataFrame(normal_scaler.fit_transform(Xtrain), columns=Xtrain.columns)
Xval_normal = pd.DataFrame(normal_scaler.transform(Xval), columns=Xval.columns)
Xtest_normal = pd.DataFrame(normal_scaler.transform(Xtest), columns=Xtest.columns)
standart_data = pd.DataFrame(standart_scaler.fit_transform(data), columns=data.columns)
normal_data = pd.DataFrame(normal_scaler.fit_transform(data), columns=data.columns)
standart_data.to_csv('../data/Housing_standart.csv', index=False)
normal_data.to_csv('../data/Housing_normal.csv', index=False)

In [9]:
import sys
sys.path.append('../')
from src.linear_regression import LinearRegression
from src.knn import KNN
from sklearn.linear_model import LinearRegression as sklearn_LinearRegression
from sklearn.model_selection import ParameterGrid
from sklearn.neighbors import KNeighborsRegressor as sklearn_KNeighborsRegressor
from src.tree_regression import TreeRegression
from sklearn.tree import DecisionTreeRegressor as sklearn_DecisionTreeRegressor
def model_tuning(model_name,Xdata,Xvaldata,sklearn_model):
    if model_name == 'LinearRegression':
        if sklearn_model == False:
            parameters = {
                'solver': ['normal', 'gd'],
                'learning_rate': [0.001, 0.005, 0.01],
                'n_iters': [100, 200, 300]
            }
            parameters_grid = ParameterGrid(parameters)
            best_params = None
            best_score = 0
            for params in parameters_grid:
                model = LinearRegression(**params)
                model.fit(Xdata,ytrain)
                score = model.score(Xvaldata,yval)
                if score > best_score:
                    best_score = score
                    best_params = params
            print(f"Best score  for all variants in non-sklearn LinearRegression: {best_score}")
            print(f"Best params for all variants in non-sklearn LinearRegression: {best_params}")
            return best_params, best_score  
        else:
            model = sklearn_LinearRegression()
            model.fit(Xdata,ytrain)
            score = model.score(Xvaldata,yval)  
            print(f"Best score for all variants in sklearn LinearRegression: {score}")
            print(f"Best params for all variants in sklearn LinearRegression: {model.get_params()}")
            return model, score
    elif model_name == 'KNN':
        if sklearn_model == True:
            parameters = {
                'n_neighbors': [3, 5, 7],
                'p': [1, 2],
                'weights': ['uniform', 'distance']
            }
            parameters_grid = ParameterGrid(parameters)
            best_params = None
            best_score = 0
            for params in parameters_grid:
                model = sklearn_KNeighborsRegressor(**params)
                model.fit(Xdata,ytrain)
                score = model.score(Xvaldata,yval)
                if score > best_score:
                    best_score = score
                    best_params = params
            print(f"Best score for all variants in sklearn KNN: {best_score}")
            print(f"Best params for all variants in sklearn KNN: {best_params}")
            return best_params, best_score
        else:
            parameters = {
                'n_neighbors': [3, 5, 7],
                'p': [1, 2],
                'weights': ['uniform', 'distance'],
                'task_class': ['r']
            }
            parameters_grid = ParameterGrid(parameters)
            best_params = None
            best_score = 0
            for params in parameters_grid:
                model = KNN(**params)
                model.fit(Xdata,ytrain)
                score = model.score(Xvaldata,yval)
                if score > best_score:
                    best_score = score
                    best_params = params
            print(f"Best score for all variants in non-sklearn KNN: {best_score}")
            print(f"Best params for all variants in non-sklearn KNN: {best_params}")
            return best_params, best_score
    else:
        parameters = {
            'max_depth': range(2,10),
            'min_samples_split': [2, 3, 4,5],
            'criterion': ['squared_error', 'absolute_error']
        }
        parameters_grid = ParameterGrid(parameters)
        best_params = None
        best_score = 0
        for params in parameters_grid:
            if sklearn_model == True:
                model = sklearn_DecisionTreeRegressor(**params)
            else:
                model = TreeRegression(**params)
            model.fit(Xdata,ytrain)
            score = model.score(Xvaldata,yval)
            if score > best_score:
                best_score = score
                best_params = params
        if sklearn_model == True:
            print(f"Best score for all variants in sklearn TreeRegression: {best_score}")
            print(f"Best params for all variants in sklearn TreeRegression: {best_params}")
        else:
            print(f"Best score for all variants in non-sklearn TreeRegression: {best_score}")
            print(f"Best params for all variants in non-sklearn TreeRegression: {best_params}")
        return best_params, best_score

In [10]:
model_tuning('LinearRegression',Xtrain_standart,Xval_standart,True)
model_tuning('LinearRegression',Xtrain_normal,Xval_normal,True)
model_tuning('LinearRegression',Xtrain_standart,Xval_standart,False)
model_tuning('LinearRegression',Xtrain_normal,Xval_normal,False)

Best score for all variants in sklearn LinearRegression: 0.6720494865963875
Best params for all variants in sklearn LinearRegression: {'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False, 'tol': 1e-06}
Best score for all variants in sklearn LinearRegression: 0.6720494865963874
Best params for all variants in sklearn LinearRegression: {'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False, 'tol': 1e-06}
Best score  for all variants in non-sklearn LinearRegression: 0.6720494865963872
Best params for all variants in non-sklearn LinearRegression: {'learning_rate': 0.001, 'n_iters': 100, 'solver': 'normal'}
Best score  for all variants in non-sklearn LinearRegression: 0.6720494865963862
Best params for all variants in non-sklearn LinearRegression: {'learning_rate': 0.001, 'n_iters': 100, 'solver': 'normal'}


({'learning_rate': 0.001, 'n_iters': 100, 'solver': 'normal'},
 np.float64(0.6720494865963862))

In [11]:
model_tuning('KNN',Xtrain_standart,Xval_standart,True)
model_tuning('KNN',Xtrain_normal,Xval_normal,True)
model_tuning('KNN',Xtrain_standart,Xval_standart,False)
model_tuning('KNN',Xtrain_normal,Xval_normal,False)

Best score for all variants in sklearn KNN: 0.618514498964774
Best params for all variants in sklearn KNN: {'n_neighbors': 7, 'p': 2, 'weights': 'distance'}
Best score for all variants in sklearn KNN: 0.5633379234068205
Best params for all variants in sklearn KNN: {'n_neighbors': 5, 'p': 1, 'weights': 'distance'}
Best score for all variants in non-sklearn KNN: 0.6086411664181817
Best params for all variants in non-sklearn KNN: {'n_neighbors': 7, 'p': 2, 'task_class': 'r', 'weights': 'uniform'}
Best score for all variants in non-sklearn KNN: 0.5617069219774375
Best params for all variants in non-sklearn KNN: {'n_neighbors': 5, 'p': 1, 'task_class': 'r', 'weights': 'uniform'}


({'n_neighbors': 5, 'p': 1, 'task_class': 'r', 'weights': 'uniform'},
 np.float64(0.5617069219774375))

In [12]:
model_tuning('RegressionTree',Xtrain_standart,Xval_standart,True)
model_tuning('RegressionTree',Xtrain_normal,Xval_normal,True)
model_tuning('RegressionTree',Xtrain_standart,Xval_standart,False)
model_tuning('RegressionTree',Xtrain_normal,Xval_normal,False)


Best score for all variants in sklearn TreeRegression: 0.5288803391647945
Best params for all variants in sklearn TreeRegression: {'criterion': 'squared_error', 'max_depth': 5, 'min_samples_split': 5}
Best score for all variants in sklearn TreeRegression: 0.5085123295453016
Best params for all variants in sklearn TreeRegression: {'criterion': 'squared_error', 'max_depth': 6, 'min_samples_split': 2}
Best score for all variants in non-sklearn TreeRegression: 0.45244902637735696
Best params for all variants in non-sklearn TreeRegression: {'criterion': 'squared_error', 'max_depth': 9, 'min_samples_split': 5}
Best score for all variants in non-sklearn TreeRegression: 0.45244902637735696
Best params for all variants in non-sklearn TreeRegression: {'criterion': 'squared_error', 'max_depth': 9, 'min_samples_split': 5}


({'criterion': 'squared_error', 'max_depth': 9, 'min_samples_split': 5},
 np.float64(0.45244902637735696))

In [13]:
best_params_sklearn_lr = {'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False, 'tol': 1e-06}
best_params_custom_lr = {'solver': 'normal', 'learning_rate': 0.001, 'n_iters': 100}

best_params_sklearn_knn = {'n_neighbors': 7, 'p': 2, 'weights': 'distance'}
best_params_custom_knn = {'n_neighbors': 7, 'p': 2, 'weights': 'uniform', 'task_class': 'r'}

best_params_sklearn_tree = {'criterion': 'squared_error', 'max_depth': 5, 'min_samples_split': 2}
best_params_custom_tree = {'criterion': 'squared_error', 'max_depth': 9, 'min_samples_split': 5}

sklearn_lr = sklearn_LinearRegression(**best_params_sklearn_lr)
sklearn_lr.fit(Xtrain, ytrain)
sklearn_test_score = sklearn_lr.score(Xtest, ytest)

custom_lr = LinearRegression(**best_params_custom_lr)
custom_lr.fit(Xtrain, ytrain)
custom_test_score = custom_lr.score(Xtest, ytest)

print(f"Final test R² (sklearn LinearRegression): {sklearn_test_score:.4f} | Params: {best_params_sklearn_lr}")
print(f"Final test R² (custom LinearRegression): {custom_test_score:.4f} | Params: {best_params_custom_lr}")

sklearn_knn = sklearn_KNeighborsRegressor(**best_params_sklearn_knn)
sklearn_knn.fit(Xtrain, ytrain)
sklearn_knn_test_score = sklearn_knn.score(Xtest, ytest)

custom_knn = KNN(**best_params_custom_knn)
custom_knn.fit(Xtrain, ytrain)
custom_knn_test_score = custom_knn.score(Xtest, ytest)

print(f"Final test R² (sklearn KNN): {sklearn_knn_test_score:.4f} | Params: {best_params_sklearn_knn}")
print(f"Final test R² (custom KNN): {custom_knn_test_score:.4f} | Params: {best_params_custom_knn}")

sklearn_tree = sklearn_DecisionTreeRegressor(**best_params_sklearn_tree)
sklearn_tree.fit(Xtrain, ytrain)
sklearn_tree_test_score = sklearn_tree.score(Xtest, ytest)

custom_tree = TreeRegression(**best_params_custom_tree)
custom_tree.fit(Xtrain, ytrain)
custom_tree_test_score = custom_tree.score(Xtest, ytest)

print(f"Final test R² (sklearn TreeRegression): {sklearn_tree_test_score:.4f} | Params: {best_params_sklearn_tree}")
print(f"Final test R² (custom TreeRegression): {custom_tree_test_score:.4f} | Params: {best_params_custom_tree}")

Final test R² (sklearn LinearRegression): 0.6718 | Params: {'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False, 'tol': 1e-06}
Final test R² (custom LinearRegression): 0.6718 | Params: {'solver': 'normal', 'learning_rate': 0.001, 'n_iters': 100}
Final test R² (sklearn KNN): 0.1548 | Params: {'n_neighbors': 7, 'p': 2, 'weights': 'distance'}
Final test R² (custom KNN): 0.2308 | Params: {'n_neighbors': 7, 'p': 2, 'weights': 'uniform', 'task_class': 'r'}
Final test R² (sklearn TreeRegression): 0.3753 | Params: {'criterion': 'squared_error', 'max_depth': 5, 'min_samples_split': 2}
Final test R² (custom TreeRegression): 0.2989 | Params: {'criterion': 'squared_error', 'max_depth': 9, 'min_samples_split': 5}
