# Exercise 1: Linear Regression

In [None]:
import numpy as np 
import h5py
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression

In [None]:
# Download the tutorial data archive from CERNBox into the working directory.
! curl https://cernbox.cern.ch/s/6Ec5pGFEpFWeH6S/download -o ./Data-MLtutorial.tar.gz
# Unpack the archive into the current directory.
! tar -xvzf ./Data-MLtutorial.tar.gz -C ./
# Delete the archive once it has been extracted.
! rm ./Data-MLtutorial.tar.gz 

In [None]:
# Load the dataset: two jet features as regression inputs, one as the target.

target = np.array([])
inputs = np.array([])

datafiles = ['./Data-MLtutorial/JetDataset/jetImage_7_100p_0_10000.h5']

for file_ in datafiles:
    with h5py.File(file_, 'r') as f:
        print("Appending {}".format(file_))
        # Read the 'jets' table from disk once, then slice the in-memory copy
        # instead of re-reading the dataset for every column selection.
        jets = np.array(f.get('jets'))
        tmp_inputs = jets[:, [5, 10]] # That's `j_tau2_b1` and `j_tau32_b1`
        tmp_target = jets[:, 6] # That's `j_tau3_b1`
        # The first file seeds the arrays; later files are appended row-wise.
        inputs = np.concatenate([inputs, tmp_inputs], axis=0) if inputs.size else tmp_inputs
        target = np.concatenate([target, tmp_target], axis=0) if target.size else tmp_target

inputs = inputs / 100. # Prevents numerical issues
print(target.shape, inputs.shape)

Let's try to implement a linear regression that uses [gradient descent](https://ml-cheatsheet.readthedocs.io/en/latest/gradient_descent.html) to optimize its parameters. Use the **Mean Square Error (MSE)** as the objective to minimize, given by: $L(y, f(x)) = \frac{1}{2n} \sum\limits_{i=1}^{n} \left(y_i - (w^T x_i + b)\right)^2$, where $w$ is the weight vector (`coef_`), $b$ is the bias (`intercept_`) and $n$ is the number of samples.

**Methods to implement (in order):**

- `__init__` - initializes class with `coef_` and `intercept_` attributes set to `None`.
- `init_weights` - accepts an input `num_features`. Sets the `coef_` and `intercept_` attributes to random values drawn from a normal distribution with mean 0 and standard deviation 1 using `np.random.normal`. Use a numpy array for `coef_` and a float for `intercept_`.
- `predict` - accepts an input `X` and outputs a prediction numpy array using the current parameters.
- `score` - accepts inputs `X` and `y_true` and outputs the $R^2$ score of the model.
- `calc_loss` - accepts inputs `X` and `y_true` as numpy arrays. Calculates and returns the Mean Square Error loss metric with the current parameters.
- `calc_grad` - accepts inputs `X` and `y_true` and, using the current parameters, outputs the gradients ([coef_grad, intercept_grad]). Does not update the parameters.
- `fit` - accepts inputs `X`, `y_true` and default kwargs `max_iter`, `learning_rate` and fits a model using gradient descent. Should call `calc_grad` to get gradients from current parameters and update the parameters using the gradient.

In [None]:
class LinearRegressionModel():
    """Linear regression trained with batch gradient descent.

    The model predicts ``X . coef_ + intercept_`` and minimizes the halved
    mean square error ``1 / (2n) * sum((y - prediction) ** 2)``.
    """

    def __init__(self):
        """Initializes class with coef_ and intercept_ attributes set to None."""
        self.coef_ = None
        self.intercept_ = None

    def init_weights(self, num_features):
        """Randomly initialize the parameters.

        Sets coef_ to a numpy array of ``num_features`` values and
        intercept_ to a float, all drawn from a normal distribution with
        mean 0 and standard deviation 1 using np.random.normal.
        """
        self.coef_ = np.random.normal(loc=0.0, scale=1.0, size=num_features)
        self.intercept_ = float(np.random.normal(loc=0.0, scale=1.0))

    def _residuals(self, X, y_true):
        """Return ``y_true - predict(X)`` as a float numpy array."""
        return np.asarray(y_true, dtype=float) - self.predict(X)

    def predict(self, X):
        """Creates a prediction with the current coef/intercept values.

        Raises:
            ValueError: if the parameters have not been initialized yet.
        """
        if self.coef_ is None or self.intercept_ is None:
            raise ValueError(
                "Model parameters are not initialized; "
                "call init_weights() or fit() first."
            )
        return np.dot(np.asarray(X, dtype=float), self.coef_) + self.intercept_

    def score(self, X, y_true):
        """Accepts inputs X and y_true and outputs the R2 score of the model.

        R2 is ``1 - SS_res / SS_tot``. When y_true is constant (SS_tot == 0)
        the ratio is undefined; 1.0 is returned for a perfect fit and 0.0
        otherwise.
        """
        y_true = np.asarray(y_true, dtype=float)
        ss_res = np.sum(self._residuals(X, y_true) ** 2)
        ss_tot = np.sum((y_true - y_true.mean()) ** 2)
        if ss_tot == 0:
            return 1.0 if ss_res == 0 else 0.0
        return float(1.0 - ss_res / ss_tot)

    def calc_loss(self, X, y_true):
        """Calculates the loss value using current coef_ and intercept_ values.

        Returns the halved mean square error ``1 / (2n) * sum(residual ** 2)``.
        """
        residuals = self._residuals(X, y_true)
        return float(np.mean(residuals ** 2) / 2.0)

    def calc_grad(self, X, y_true):
        """Calculates gradients for coef/intercept values.

        Returns ``[coef_grad, intercept_grad]``, the derivatives of the loss
        with respect to coef_ and intercept_. Does not update the parameters.
        """
        X = np.asarray(X, dtype=float)
        residuals = self._residuals(X, y_true)
        n_samples = residuals.shape[0]
        # d/dw of 1/(2n) * sum(r^2) with r = y - (Xw + b) is -X^T r / n.
        coef_grad = -np.dot(X.T, residuals) / n_samples
        # d/db of the same loss is -sum(r) / n.
        intercept_grad = -np.sum(residuals) / n_samples
        return [coef_grad, intercept_grad]

    def fit(self, X, y_true, max_iter=10000, learning_rate=0.01):
        """
        Accepts inputs X, y_true and default kwargs max_iter,
        learning_rate. Fits a model using gradient descent.
        Calls calc_grad to get gradients from the current
        parameters and updates the parameters using the gradient.

        If the weights have not been initialized, they are drawn with
        init_weights using the number of columns of X. Returns self.
        """
        X = np.asarray(X, dtype=float)
        y_true = np.asarray(y_true, dtype=float)
        if self.coef_ is None or self.intercept_ is None:
            self.init_weights(X.shape[1])

        for _ in range(max_iter):
            coef_grad, intercept_grad = self.calc_grad(X, y_true)
            # Step against the gradient; rebind rather than mutate in place so
            # arrays assigned to coef_ by the caller are left untouched.
            self.coef_ = self.coef_ - learning_rate * coef_grad
            self.intercept_ = float(self.intercept_ - learning_rate * intercept_grad)
        return self

In [None]:
# Build the hand-written model, draw random starting weights and train it.
my_regressor = LinearRegressionModel()
# Take the feature count from the data rather than hard-coding it.
my_regressor.init_weights(inputs.shape[1])
my_regressor.fit(inputs, target)

In [None]:
# Scatter the true targets (x-axis) against the hand-written model's predictions (y-axis).
plt.scatter(target, my_regressor.predict(inputs))
# Red diagonal from (0, 0) to (100, 100): points on this line are predicted exactly.
plt.plot([0, 100], [0, 100], 'r')
plt.xlabel('Expected')
plt.ylabel('Predicted');

In [None]:
# Report the fitted coefficients, the intercept and the R2 score of the
# hand-written model, evaluated on the same data it was fitted on.
print("The resulting equation: {}x + {} has a R2 score of {}".format(
    my_regressor.coef_,
    my_regressor.intercept_,
    my_regressor.score(inputs, target)))

**Expected R2: 0.73**

In [None]:
# Fit scikit-learn's LinearRegression on the same data as a reference.
reg = LinearRegression().fit(inputs, target)
print("The resulting equation: {}x + {} has a R2 score of {}".format(
    reg.coef_,
    reg.intercept_,
    reg.score(inputs, target)))
# Plot the scikit-learn model's predictions; the cell previously re-plotted
# my_regressor's predictions, duplicating the earlier figure.
plt.scatter(target, reg.predict(inputs))
# Red diagonal from (0, 0) to (100, 100): points on this line are predicted exactly.
plt.plot([0, 100], [0, 100], 'r')
plt.xlabel('Expected')
plt.ylabel('Predicted');