# Experiments with Normalizing Data
## Linear Regression

We ran simple linear regression on the training data using a simple training/validation split, normalizing and unnormalizing different parts of the data. The results we got were that normalizing either the X or the Y or both was necessary to obtain reasonable results. However, if the inputs were normalized, then normalizing the outputs was unnecessary, and vice versa.

In [88]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import KFold, train_test_split

In [5]:
# Load the regression training table from parquet into a DataFrame.
Z = pd.read_parquet(path='../data/train_regression.parquet')

In [75]:
# Load the training table; every column except the last is used as an input
# feature and the last column is used as the regression target.
Z = pd.read_parquet('../data/train_regression.parquet')
# Slice out inputs/outputs and convert them straight to NumPy arrays.
# No global standardization is applied here: X and y keep their raw values.
X = np.array(Z.iloc[:, :-1])
y = np.array(Z.iloc[:, -1])

In [72]:
# 4-fold cross-validated linear regression on all features.
# Uses X and y as currently defined in the notebook; no scaling is applied
# inside this cell.
r2_vals = []
mse_vals = []
kf = KFold(n_splits=4)  # default KFold: consecutive folds, no shuffling
for train, test in kf.split(X):
    X_train = X[train]
    y_train = y[train]
    X_test = X[test]
    y_test = y[test]
    # A fresh estimator is created for every fold so no fitted state carries
    # over between folds.
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2_vals.append(r2_score(y_test, y_pred))
    mse_vals.append(mean_squared_error(y_test, y_pred))
# Report the mean and spread of each metric across the folds.
print("Mean validation R2-score: ", np.mean(r2_vals))
print("STD validation R2-score: ", np.std(r2_vals))
print("Mean validation MSE: ", np.mean(mse_vals))
print("Std dev of validation MSE: ", np.std(mse_vals))

Mean validation R2-score:  0.9999254116610202
STD validation R2-score:  2.481548168859912e-06
Mean validation MSE:  0.24865689314970998
Std dev of validation MSE:  0.009117647896531965


# Linear Regression
## (Unnormalized X, Unnormalized Y, float64)

In [76]:
# Baseline: cross-validated linear regression on all features with X and y
# used as-is (no scaling is applied inside this cell).
# NOTE(review): this cell uses 4 folds while the float32 and normalized cells
# further down use 5, so the fold-averaged metrics are not strictly
# like-for-like across sections.
r2_vals = []
mse_vals = []
kf = KFold(n_splits=4)  # default KFold: consecutive folds, no shuffling
for train, test in kf.split(X):
    X_train = X[train]
    y_train = y[train]
    X_test = X[test]
    y_test = y[test]
    # A fresh estimator is created for every fold so no fitted state carries
    # over between folds.
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2_vals.append(r2_score(y_test, y_pred))
    mse_vals.append(mean_squared_error(y_test, y_pred))
# Report the mean and spread of each metric across the folds.
print("Mean validation R2-score: ", np.mean(r2_vals))
print("STD validation R2-score: ", np.std(r2_vals))
print("Mean validation MSE: ", np.mean(mse_vals))
print("Std dev of validation MSE: ", np.std(mse_vals))

Mean validation R2-score:  0.9999254116610202
STD validation R2-score:  2.481548168859912e-06
Mean validation MSE:  0.24865689314970998
Std dev of validation MSE:  0.009117647896531965


## Unnormalized X, Unnormalized Y, float32

In [79]:
# 5-fold cross-validated linear regression with the inputs and targets cast
# to single precision and no normalization applied in this cell.
r2_vals = []
mse_vals = []
kf = KFold(n_splits=5)
# float32 copies of the data; X and y themselves are left untouched.
X_32 = np.array(X, dtype=np.float32)
Y_32 = np.array(y, dtype=np.float32)

# Split on the float32 array that is actually being modelled.
for train, test in kf.split(X_32):
    X_train = X_32[train]
    y_train = Y_32[train]
    X_test = X_32[test]
    y_test = Y_32[test]
    # A fresh estimator is created for every fold so no fitted state carries
    # over between folds.
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2_vals.append(r2_score(y_test, y_pred))
    mse_vals.append(mean_squared_error(y_test, y_pred))
# Report the mean and spread of each metric across the folds.
print("Mean validation R2-score: ", np.mean(r2_vals))
print("STD validation R2-score: ", np.std(r2_vals))
print("Mean validation MSE: ", np.mean(mse_vals))
print("Std dev of validation MSE: ", np.std(mse_vals))

Mean validation R2-score:  0.9983756794482023
STD validation R2-score:  0.00029663842760977436
Mean validation MSE:  5.4159727
Std dev of validation MSE:  0.99981517


## Normalized X, Normalized Y, float64

In [83]:
# 5-fold cross-validated linear regression with both the inputs and the
# targets standardized (zero mean, unit standard deviation).
r2_vals = []
mse_vals = []
kf = KFold(n_splits=5)

for train, test in kf.split(X):
    X_train = X[train]
    y_train = y[train]
    X_test = X[test]
    y_test = y[test]

    # Scaling statistics are taken from the training fold only and then
    # reused for the validation fold.
    X_train_mean = X_train.mean(axis=0)
    X_train_std = X_train.std(axis=0)
    y_train_mean = y_train.mean(axis=0)
    y_train_std = y_train.std(axis=0)
    X_train = (X_train - X_train_mean)/X_train_std
    y_train = (y_train - y_train_mean)/y_train_std
    X_test = (X_test - X_train_mean)/X_train_std

    # A fresh estimator is created for every fold.
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # The model was fit on standardized targets, so map its predictions back
    # to the original target scale before scoring.
    y_pred = y_pred * y_train_std + y_train_mean

    # Score against the untouched validation targets: standardizing y_test
    # and immediately inverting it would only add floating-point rounding.
    r2_vals.append(r2_score(y_test, y_pred))
    mse_vals.append(mean_squared_error(y_test, y_pred))
# Report the mean and spread of each metric across the folds.
print("Mean validation R2-score: ", np.mean(r2_vals))
print("STD validation R2-score: ", np.std(r2_vals))
print("Mean validation MSE: ", np.mean(mse_vals))
print("Std dev of validation MSE: ", np.std(mse_vals))

Mean validation R2-score:  0.9999254357376184
STD validation R2-score:  2.3411258828265707e-06
Mean validation MSE:  0.24856790767479026
Std dev of validation MSE:  0.008534127680323723


## Normalized X, Unnormalized Y, float64

In [84]:
# 5-fold cross-validated linear regression with standardized inputs and the
# targets left on their original scale.
r2_vals = []
mse_vals = []
kf = KFold(n_splits=5)

for train, test in kf.split(X):
    X_train = X[train]
    y_train = y[train]
    X_test = X[test]
    y_test = y[test]

    # Input scaling statistics are taken from the training fold only and
    # then reused for the validation fold.
    X_train_mean = X_train.mean(axis=0)
    X_train_std = X_train.std(axis=0)
    X_train = (X_train - X_train_mean)/X_train_std
    X_test = (X_test - X_train_mean)/X_train_std

    # Targets are not transformed in this variant, so no target statistics
    # or inverse transform of the predictions are needed.
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    r2_vals.append(r2_score(y_test, y_pred))
    mse_vals.append(mean_squared_error(y_test, y_pred))
# Report the mean and spread of each metric across the folds.
print("Mean validation R2-score: ", np.mean(r2_vals))
print("STD validation R2-score: ", np.std(r2_vals))
print("Mean validation MSE: ", np.mean(mse_vals))
print("Std dev of validation MSE: ", np.std(mse_vals))

Mean validation R2-score:  0.9999254357375473
STD validation R2-score:  2.341125877041843e-06
Mean validation MSE:  0.2485679079108051
Std dev of validation MSE:  0.008534127618196703


## Unnormalized X, Normalized Y, float64

In [87]:
# 5-fold cross-validated linear regression with the inputs left on their
# original scale and only the targets standardized.
r2_vals = []
mse_vals = []
kf = KFold(n_splits=5)

for train, test in kf.split(X):
    X_train = X[train]
    y_train = y[train]
    X_test = X[test]
    y_test = y[test]

    # Target scaling statistics are taken from the training fold only.
    # Inputs are not transformed in this variant, so no input statistics
    # are computed.
    y_train_mean = y_train.mean(axis=0)
    y_train_std = y_train.std(axis=0)
    y_train = (y_train - y_train_mean)/y_train_std

    # A fresh estimator is created for every fold.
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # The model was fit on standardized targets, so map its predictions back
    # to the original target scale before scoring.
    y_pred = y_pred * y_train_std + y_train_mean

    # Score against the untouched validation targets: standardizing y_test
    # and immediately inverting it would only add floating-point rounding.
    r2_vals.append(r2_score(y_test, y_pred))
    mse_vals.append(mean_squared_error(y_test, y_pred))
# Report the mean and spread of each metric across the folds.
print("Mean validation R2-score: ", np.mean(r2_vals))
print("STD validation R2-score: ", np.std(r2_vals))
print("Mean validation MSE: ", np.mean(mse_vals))
print("Std dev of validation MSE: ", np.std(mse_vals))

Mean validation R2-score:  0.9999254357868622
STD validation R2-score:  2.3410439817496452e-06
Mean validation MSE:  0.24856774233031106
Std dev of validation MSE:  0.008533842295182127
