# ColumbiaX-01-Linear-Regression

## Set Up Session

In [1]:
import numpy as np

## Data Generation

In [2]:
# Model parameters:
#   w_lambda -- scales the weight covariance as 1 / w_lambda
#   y_sigma2 -- variance placed on the diagonal of the observation covariance
w_lambda, y_sigma2 = 2, 3

In [3]:
# Dataset dimensions: N samples, each with D random features
# (a constant column is appended to the samples later on).
N = 1024
D = 42

In [4]:
# Draw the ground-truth weights (D features + 1 bias term) from a Gaussian
# whose mean is itself random and whose covariance is I / w_lambda.
w_mu = np.random.randn(D + 1)
w_cov = np.eye(D + 1) / w_lambda
w = np.random.multivariate_normal(w_mu, w_cov)
# The last component multiplies the constant column of X; shift it by 3.
w[-1] += 3

In [5]:
# Build the design matrix: N x D standard-normal features followed by a
# trailing column of ones.
features = np.random.randn(N, D)
bias_column = np.ones(shape=(N, 1))
X = np.concatenate((features, bias_column), axis=1)

In [6]:
# Draw the observations from a Gaussian centred on X w with covariance
# y_sigma2 * I.
y_mu = np.matmul(X, w)
y_cov = np.eye(N) * y_sigma2
y = np.random.multivariate_normal(mean=y_mu, cov=y_cov)

In [7]:
# Train-test split:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as the test set; random_state pins the shuffle.
split_parts = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [8]:
# Save data for command-line application:
import pandas as pd

# Dump each array to a bare CSV (no index column, no header row).
for csv_name, csv_values in (
    ('X_train.csv', X_train),
    ('y_train.csv', y_train),
    ('X_test.csv', X_test),
):
    pd.DataFrame(data=csv_values).to_csv(csv_name, index=False, header=False)

## Posterior Estimation

In [9]:
# Posterior covariance matrix of the weights:
#     Sigma = (lambda * I + X^T X / sigma^2)^{-1}
# The lambda term has to enter as lambda * I. Adding the bare scalar would
# broadcast it onto every entry of X^T X / sigma^2, not just the diagonal.
w_cov_posterior = np.linalg.pinv(
    w_lambda * np.eye(X_train.shape[1])
    + np.matmul(X_train.T, X_train) / y_sigma2
)

## Ridge Regression

Given the samples and observations, solve the following ridge regression problem, where $\lambda$ scales the covariance of the generated weights as $\lambda^{-1}I$ and $\sigma^2$ is the variance of the observation noise:

$$
w_{RR} = \arg\min_w \|y - Xw\|^2 + \lambda\sigma^2\|w\|^2.
$$

The closed-form solution is

$$
w_{RR} = (\lambda\sigma^2I + X^TX)^{-1}X^Ty
$$

In [10]:
# Closed-form solution: scale the posterior covariance by 1 / y_sigma2 and
# apply it to X^T y.
xt_y = np.matmul(X_train.T, y_train)
w_rr = np.matmul(w_cov_posterior / y_sigma2, xt_y)

In [11]:
# Relative error of the estimated weights against the true weights, l2-norm.
# print is called as a function with a single argument so the cell runs on
# Python 3 and still prints the same text on Python 2.
print(
    "[Relative Error (L2-Norm)]: {:.2f}%".format(
        100.0 * np.linalg.norm(w_rr - w) / np.linalg.norm(w)
    )
)

[Relative Error (L2-Norm)]: 6.16%


## Active Learning

In [12]:
# Posterior estimation variance: the quadratic form x^T Sigma x, evaluated
# for every row x of the test set.
y_sigma2_posterior = np.asarray(
    [np.matmul(x, np.matmul(w_cov_posterior, x)) for x in X_test]
)

In [13]:
# Rank the test rows by posterior variance, largest first, and keep the
# first 10; the +1 turns the 0-based positions into 1-based indices.
variance_order = np.argsort(y_sigma2_posterior)
probe_sequence = variance_order[::-1][:10] + 1

## Generate Output

In [14]:
# Write the ridge regression weights, one value per line, to a file named
# after w_lambda.
w_rr_output_name = "wRR_{w_lambda}.csv".format(w_lambda=w_lambda)
w_rr_lines = [str(w_rr_val) for w_rr_val in w_rr]
with open(w_rr_output_name, "w") as w_rr_output:
    w_rr_output.write("\n".join(w_rr_lines))

In [15]:
# Write the probe sequence as a single comma-separated line to a file named
# after w_lambda and y_sigma2.
probe_seq_output_name = "active_{w_lambda}_{y_sigma2}.csv".format(
    w_lambda=w_lambda,
    y_sigma2=y_sigma2,
)
probe_seq_fields = [str(probe_sequence_idx) for probe_sequence_idx in probe_sequence]
with open(probe_seq_output_name, "w") as probe_seq_output:
    probe_seq_output.write(",".join(probe_seq_fields))