# Machine Learning - Sheet 1

## a) Load two datasets and fit OLS

In [30]:
from statsmodels.regression.linear_model import OLS
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from itertools import combinations_with_replacement

In [31]:
# Read the inputs. The feature file holds 4 lines with 50 entries each and the
# target file 1 line with 50 entries; the features are transposed right away so
# that they match the input format of OLS.
data_x = np.loadtxt("sheet1x.txt").T
data_y = np.loadtxt("sheet1y.txt")

# Show both shapes, one per line.
print(data_x.shape, data_y.shape, sep="\n")

# Hold out 33% of the samples for testing; random_state pins the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    data_x,
    data_y,
    random_state=2,
    test_size=0.33,
)
print(x_test.shape)

(50, 4)
(50,)
(17, 4)


In [32]:
# Fit ordinary least squares on the training split; x_train is passed to OLS
# exactly as produced by the split above.
model = OLS(endog=y_train, exog=x_train)
results = model.fit()
print(results.summary())

# Score the held-out split: predict with the fitted coefficients, then compare
# the predictions against the true targets via the mean squared error.
params = results.params
new_y = results.predict(exog=x_test)
mse = mean_squared_error(y_true=y_test, y_pred=new_y)
print("MSE: ", mse)

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.674
Model:                            OLS   Adj. R-squared (uncentered):              0.629
Method:                 Least Squares   F-statistic:                              14.97
Date:                Wed, 29 Apr 2020   Prob (F-statistic):                    9.51e-07
Time:                        16:08:56   Log-Likelihood:                          5.8353
No. Observations:                  33   AIC:                                     -3.671
Df Residuals:                      29   BIC:                                      2.315
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

## b) Interpret summaries:
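- The first three variables have p-values < 0.05, so their coefficients are statistically significant at the 5% level.
- R-squared (uncentered) is 0.674, i.e. the model explains about two thirds of the uncentered variation in the training targets: a reasonable but not perfect fit.
- The F-statistic is 14.97 with Prob (F-statistic) = 9.51e-07, so the regressors are jointly significant.
- The MSE on the held-out test split is small.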

## c) Explore interactions

In [33]:
# Feature-major view of the design matrix (one row per feature), so that a whole
# feature can be selected with a single index when multiplying columns.
# Note: transposing yields a view of data_x's buffer, not an independent copy.
data_x_transposed = np.transpose(data_x)

In [34]:
# Enumerate every unordered pair of feature indices, including a feature paired
# with itself, so that each product term (squares included) is listed once.
n_features = data_x_transposed.shape[0]
combinations = list(combinations_with_replacement(range(n_features), 2))
print(combinations)


[(0, 0), (0, 1), (0, 2), (0, 3), (1, 1), (1, 2), (1, 3), (2, 2), (2, 3), (3, 3)]


In [35]:
# Build one interaction column per index pair by multiplying the two feature
# rows element-wise; a pair (i, i) yields the squared feature.
interaction_cols = [
    data_x_transposed[i1] * data_x_transposed[i2] for (i1, i2) in combinations
]

# Rebuild data_x from the transposed features plus the new columns instead of
# appending to data_x itself. Appending made this cell non-idempotent (every
# re-run added another set of columns) and re-allocated the matrix once per
# pair. Column order is unchanged: base features first, then the products in
# the order given by `combinations`.
data_x = np.column_stack([data_x_transposed.T, *interaction_cols])

In [36]:
# Sanity check: the design matrix should now carry the extra product columns.
print(data_x.shape)

# Same split settings as for the first model (33% test, random_state=2), now
# applied to the extended feature matrix.
x_train, x_test, y_train, y_test = train_test_split(
    data_x,
    data_y,
    random_state=2,
    test_size=0.33,
)
print(x_test.shape)


(50, 14)
(17, 14)


In [37]:
# Fit ordinary least squares on the training split of the extended features
# (original columns plus their pairwise products).
model_2 = OLS(endog=y_train, exog=x_train)
results_2 = model_2.fit()
print(results_2.summary())

# Score the held-out split: predict with the fitted coefficients, then compare
# the predictions against the true targets via the mean squared error.
params = results_2.params
new_y = results_2.predict(exog=x_test)
mse = mean_squared_error(y_true=y_test, y_pred=new_y)
print("MSE: ", mse)

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          5.376e+27
Date:                Wed, 29 Apr 2020   Prob (F-statistic):                   4.21e-259
Time:                        16:08:56   Log-Likelihood:                          1035.9
No. Observations:                  33   AIC:                                     -2044.
Df Residuals:                      19   BIC:                                     -2023.
Df Model:                          14                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

## b) Interpret summaries (model with interaction terms):
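- Five variables have p-values < 0.05: $x_1, x_3, x_1^2, x_1x_2, x_2x_3$ are statistically significant at the 5% level.
- R-squared (uncentered) is 1.000, i.e. the model fits the training targets essentially perfectly.
- The F-statistic is 5.376e+27 with Prob (F-statistic) = 4.21e-259, far larger than for the model without interaction terms (14.97).
- The MSE on the held-out test split is very small, which indicates that the near-perfect fit is not merely overfitting of the 33 training samples with 14 regressors.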