# Dataset downloading

In [2]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00363/Dataset.zip > /devnull
!unzip Dataset.zip > /devnull

--2021-10-12 11:55:53--  https://archive.ics.uci.edu/ml/machine-learning-databases/00363/Dataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19055526 (18M) [application/x-httpd-php]
Saving to: ‘Dataset.zip’


2021-10-12 11:55:53 (46.4 MB/s) - ‘Dataset.zip’ saved [19055526/19055526]



# Imports

In [11]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error

# Dataset reading

In [9]:
# Load the training CSV; it has no header row, so columns are labelled by integer position.
features = pd.read_csv('Dataset/Training/Features_Variant_1.csv', header=None)
# Column 53 is used as the regression target; every other column is a predictor.
y_train = features.loc[:, 53]
x_train = features.drop(53, axis=1)

In [7]:
# Preview the first rows of the full table, the predictors and the target.
tuple(frame.head() for frame in (features, x_train, y_train))

(       0   1    2   3    4      5          6   ...  47  48  49  50  51  52  53
 0  634995   0  463   1  0.0  806.0  11.291045  ...   0   0   0   0   0   1   0
 1  634995   0  463   1  0.0  806.0  11.291045  ...   0   0   0   0   1   0   0
 2  634995   0  463   1  0.0  806.0  11.291045  ...   0   0   0   0   0   1   0
 3  634995   0  463   1  0.0  806.0  11.291045  ...   1   0   0   0   0   0   0
 4  634995   0  463   1  0.0  806.0  11.291045  ...   0   0   1   0   0   0   0
 
 [5 rows x 54 columns],
        0   1    2   3    4      5          6   ...  46  47  48  49  50  51  52
 0  634995   0  463   1  0.0  806.0  11.291045  ...   0   0   0   0   0   0   1
 1  634995   0  463   1  0.0  806.0  11.291045  ...   0   0   0   0   0   1   0
 2  634995   0  463   1  0.0  806.0  11.291045  ...   0   0   0   0   0   0   1
 3  634995   0  463   1  0.0  806.0  11.291045  ...   0   1   0   0   0   0   0
 4  634995   0  463   1  0.0  806.0  11.291045  ...   0   0   0   1   0   0   0
 
 [5 rows x 5

# Solution

In [13]:
# Rescale every predictor column with a MinMaxScaler (default settings) and wrap the
# resulting array in a fresh DataFrame (default integer index and column labels).
# NOTE(review): the scaler is fit on the full training table before the K-fold split below,
# so held-out folds influence the scaling — consider fitting it per fold.
min_max_scaler = MinMaxScaler().fit(x_train)
x_train_norm = pd.DataFrame(min_max_scaler.transform(x_train))

In [46]:
class Regression:
    """Linear regression trained with full-batch gradient descent on mean squared error.

    Parameters
    ----------
    rate : float, default=0.05
        Step size applied to every gradient update.
    iters : int, default=50
        Number of gradient-descent passes over the whole training set.

    Attributes
    ----------
    W : numpy.ndarray of shape (n_features,)
        Feature weights, set by ``fit``.
    b : float
        Intercept, set by ``fit``.
    """

    def __init__(self, rate=0.05, iters=50):
        self.rate = rate
        self.iters = iters

    def fit(self, X, y):
        """Estimate ``W`` and ``b`` from training data, starting from zeros.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features (NumPy array, DataFrame, nested list, ...).
        y : array-like of shape (n_samples,) or (n_samples, 1)
            Training targets; matched to the rows of ``X`` by position.

        Returns
        -------
        Regression
            The fitted instance, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, has no rows, or ``y`` does not hold one value per row.
        """
        # Work on plain float arrays: avoids pandas label alignment inside the loop and
        # keeps ``W`` a genuine ndarray regardless of the input container.
        X_arr = np.asarray(X, dtype=float)
        # Flatten so an (n, 1) target cannot silently broadcast the residual to (n, n).
        y_arr = np.asarray(y, dtype=float).ravel()

        if X_arr.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X_arr.ndim} dimension(s)")
        s_num, feat_num = X_arr.shape
        if s_num == 0:
            raise ValueError("cannot fit on an empty dataset")
        if y_arr.shape[0] != s_num:
            raise ValueError(
                f"X has {s_num} rows but y has {y_arr.shape[0]} values"
            )

        self.b = 0.0
        self.W = np.zeros(feat_num)

        for _ in range(self.iters):
            # Both gradients use the same residual, i.e. the parameters are updated
            # simultaneously from the predictions made at the start of the iteration.
            residual = X_arr @ self.W + self.b - y_arr

            b1 = 2 * np.sum(residual) / s_num
            W1 = 2 * (X_arr.T @ residual) / s_num

            self.b -= self.rate * b1
            self.W -= self.rate * W1

        return self

    def predict(self, X):
        """Return ``X @ W + b`` for the fitted parameters.

        The container type follows ``X``: a DataFrame input yields a pandas Series
        (indexed like ``X``), a NumPy array yields a NumPy array.
        """
        return X @ self.W + self.b


# Shuffled 5-way split with a fixed seed, so the fold assignment is repeatable.
folds = KFold(n_splits=5, shuffle=True, random_state=777)

# 1-based fold counter; the evaluation loop uses it to build the "T<i>" column labels.
i = 1

def get_results(y_pred_fold, y_fold):
    """Summarise prediction quality for one fold.

    Parameters
    ----------
    y_pred_fold : array-like
        Predicted target values (pandas Series or NumPy array).
    y_fold : array-like
        True target values, in the same order as ``y_pred_fold``.

    Returns
    -------
    dict
        ``'e'``    mean of the errors (true minus predicted),
        ``'std'``  standard deviation of those errors (NumPy default, ``ddof=0``),
        ``'r2'``   coefficient of determination from ``r2_score``,
        ``'rmse'`` root mean squared error.
    """
    # Convert once; np.asarray accepts pandas objects and plain arrays alike.
    y_true = np.asarray(y_fold, dtype=float)
    y_hat = np.asarray(y_pred_fold, dtype=float)
    errors = y_true - y_hat

    return {
        'e': errors.mean(),
        'std': errors.std(),
        'r2': r2_score(y_true, y_hat),
        # The `squared=False` flag of mean_squared_error is deprecated/removed in recent
        # scikit-learn releases; taking the square root explicitly works on every version.
        'rmse': np.sqrt(mean_squared_error(y_true, y_hat)),
    }

# Results table: one row per metric/split pair, one column per fold ("T1", "T2", ...).
indexes = ["E-train", "STD-train", "R2-train", "RMSE-train", "E-test", "STD-test", "R2-test", "RMSE-test"]
# Derive the column labels from the splitter so they always match the number of folds.
cols = [f"T{fold_no}" for fold_no in range(1, folds.get_n_splits() + 1)]
res = pd.DataFrame(index=indexes, columns=cols)

# Key in the dict returned by get_results -> row-label prefix used in `res`.
metric_rows = {'e': 'E', 'std': 'STD', 'r2': 'R2', 'rmse': 'RMSE'}

# enumerate(..., start=1) supplies the 1-based fold number directly instead of a hand-maintained counter.
for i, (train_i, test_i) in enumerate(folds.split(x_train_norm), start=1):
    X_train_fold = x_train_norm.iloc[train_i]
    y_train_fold = y_train.iloc[train_i]
    X_test_fold = x_train_norm.iloc[test_i]
    y_test_fold = y_train.iloc[test_i]

    # A fresh model per fold, so nothing learned on one split carries over to the next.
    reg = Regression(rate=0.3, iters=400)
    reg.fit(X_train_fold, y_train_fold)

    y_pred_train_fold = reg.predict(X_train_fold)
    y_pred_test_fold = reg.predict(X_test_fold)

    fold_scores = {
        'train': get_results(y_pred_train_fold, y_train_fold),
        'test': get_results(y_pred_test_fold, y_test_fold),
    }
    for split_name, scores in fold_scores.items():
        for key, row_prefix in metric_rows.items():
            res.at[f"{row_prefix}-{split_name}", f"T{i}"] = scores[key]

res

Unnamed: 0,T1,T2,T3,T4,T5
E-train,-0.0576251,-0.0578978,-0.0606521,-0.0556527,-0.0579882
STD-train,28.2547,31.3916,31.7544,30.1048,31.6044
R2-train,0.279974,0.242955,0.253153,0.251945,0.247304
RMSE-train,28.2547,31.3916,31.7545,30.1049,31.6044
E-test,0.669603,-0.262504,-0.47932,0.0799893,-0.299171
STD-test,38.8938,27.6646,25.5733,32.9046,26.4586
R2-test,0.18816,0.299298,0.271931,0.254811,0.29352
RMSE-test,38.8996,27.6659,25.5778,32.9047,26.4603
