In [870]:
# Import all packages

import scipy as sp
import scipy.linalg as spla
import scipy.optimize as spopt
import scipy.stats as spst

import numpy as np
import numpy.linalg as npla

import statsmodels as sm
import statsmodels.api as sma
smfOLS = sma.regression.linear_model.OLS.from_formula
smfLGT = sm.discrete.discrete_model.Logit.from_formula

import seaborn as sb
import pandas as pd

import matplotlib.pyplot as plt

from inspect import signature as sig

from sklearn.linear_model import Ridge as skRidge
from sklearn.linear_model import Lasso as skLasso
from sklearn.linear_model import ElasticNet as skEN

from sklearn.linear_model import LogisticRegression as skLGT

import statsmodels.formula.api as smf

from inspect import signature

import statsmodels

from sklearn.model_selection import KFold
import itertools

### Assignment 1 (1 point): Prove the equivalence of the two approaches, that is, $\hat \beta$ would be the same if we redefine $y$

### From the lecture we know that:
- CS  : $\mathcal{L}=\prod F^{\frac{1+y_i}{2}}(\beta' x_i /\sigma) \cdot (1-F(\beta' x_i /\sigma))^{\frac{1-y_i}{2}} = \prod F(y_i \cdot \beta' x_i /\sigma)$ and $y_i \in \{-1,1\}$
- ECON: $\mathcal{L}=\prod F^{y_i}(\beta' x_i /\sigma) \cdot (1-F(\beta' x_i /\sigma))^{1-y_i}$ and $y_i \in \{0,1\}$

## ASSIGNMENT 1 - SOLUTION

Proof of equivalence of two formulas

##### Let's firstly assume the case when $y_i = -1$ (cs) or $y_i = 0$ (econ):
- CS: $\mathcal{L} = \prod F(y_i \cdot \beta' x_i /\sigma) = \prod F(-1 \cdot \beta' x_i /\sigma)$ 
- Econ: $\mathcal{L}=\prod F^{y_i}(\beta' x_i /\sigma) \cdot (1-F(\beta' x_i /\sigma))^{1-y_i} = \prod F^{0}(\beta' x_i /\sigma) \cdot (1-F(\beta' x_i /\sigma))^{1-0} = \prod F^{0}(\beta' x_i /\sigma) \cdot (1-F(\beta' x_i /\sigma))^{1} = \prod 1-F(\beta' x_i /\sigma)$

From the lecture we know that $F_{log}(\varepsilon)=\frac{1}{1+\exp^{-\varepsilon}}=1-F_{log}(-\varepsilon)$

$\Rightarrow \prod F(-1 \cdot \beta' x_i /\sigma) = \prod 1-F(\beta' x_i /\sigma)$

##### Then let's also check the case when $y_i = 1$ (cs) or $y_i = 1$ (econ):
- CS: $\mathcal{L} = \prod F(y_i \cdot \beta' x_i /\sigma) = \prod F(1 \cdot \beta' x_i /\sigma) = \prod F(\beta' x_i /\sigma)$ 
- Econ: $\mathcal{L}=\prod F^{y_i}(\beta' x_i /\sigma) \cdot (1-F(\beta' x_i /\sigma))^{1-y_i} = \prod F^{0}(\beta' x_i /\sigma) \cdot (1-F(\beta' x_i /\sigma))^{1-0} = \prod F^{1}(\beta' x_i /\sigma) \cdot (1-F(\beta' x_i /\sigma))^{0} = \prod F^{1}(\beta' x_i /\sigma)$

Actually we got the same results $\Rightarrow$ for $y_i = 1$ (cs) or $y_i = 1$ (econ) formulas are also equal

##### So, we can conclude that if we redefine $y$, then both approaches will be equal in all the cases: if likelihood functions are equal, then optimizing them, we will get the same betas

### Assignment 2 (1 point): Find and fix the error in the code (moved the code below)

### From the lecture we know that
Read what regularization parameters are because they can be called differently in different packages
- ols: $$ \quad Loss = \frac{1}{2n}\sum (y_i - \beta' x_i)^2 + a * ||w||_1 + b * \frac{||w||^2_2}{2} \to \min_{\beta}, \quad \text{where} \quad l1_{ratio} = \frac{a}{a+b}, \quad \alpha=a+b$$
- logit: $$ \quad Loss = \sum \log(1+\exp(-y_i \beta' x_i)) + a * ||w||_1 + b * \frac{||w||^2_2}{2} \to \min_{\beta}, \quad \text{where} \quad l1_{ratio} = \frac{a}{a+b}, \quad C=a+b$$

## ASSIGNMENT 2 - SOLUTION

Idea is that logaddexp = log(exp(o) + exp(z)) and if we pass o = np.ones(), then we get vector of exp(1) = exp
But according to the formula of loss (below), we need to get the vector of ones. So, we need to pass np.zeros() to the function because exp(0) = 1 and this is exactly the vector of ones that we need to get!
$$ \quad Loss = \sum \log(1+\exp(-y_i \beta' x_i)) + a * ||w||_1 + b * \frac{||w||^2_2}{2} \to \min_{\beta}, \quad \text{where} \quad l1_{ratio} = \frac{a}{a+b}, \quad C=a+b$$

In [882]:
# Original code
def OLS_loss(Y, X, beta, a, b):
    z = Y - X@beta
    return np.square(z).sum()/(2*Y.size) + a*np.abs(beta).sum() + b*np.square(beta).sum()/2

def LGT_loss(Y, X, beta, a, b):
    # important for #3 - here is how to calculate value inside the exponent    
    z = -Y*X@beta # need to be not very large numbers, loagddexp needed to make computation easier
    o = np.ones(Y.size)
    return np.logaddexp(o, z).sum() + a*np.abs(beta).sum() + b*np.square(beta).sum()/2

In [883]:
# Correct code
def OLS_loss(Y, X, beta, a, b):
    z = Y - X@beta
    return np.square(z).sum()/(2*Y.size) + a*np.abs(beta).sum() + b*np.square(beta).sum()/2

def LGT_loss(Y, X, beta, a, b):
    z = -Y*X@beta
    # np.zeros instead of np.ones
    o = np.zeros(Y.size)
    return np.logaddexp(o, z).sum() + a*np.abs(beta).sum() + b*np.square(beta).sum()/2

### Assignment 3 (3 point): Write your own logistic regression that replicates the output of the function below

## ASSIGNMENT 3 - SOLUTION

Idea is to firstly define the loss correctly and then minimize it globally. This is actually the same as previous assignment

In [884]:
# Create dataset
x=np.random.normal(size=1000)
eps=np.random.logistic(size=1000)

# careful here goes -1,1
y = np.sign(1 + 2*x + eps)
x = x.reshape(1000, 1)
yecon = (y+1)/2

# put columns together
data = np.hstack((x, yecon.reshape(1000, 1)))
data.shape

# put data into dataframe
df = pd.DataFrame(data, columns = ['x', 'y'])
df.head()

Unnamed: 0,x,y
0,-0.808529,0.0
1,0.016261,1.0
2,-1.814209,0.0
3,0.522363,0.0
4,-0.155434,1.0


In [885]:
# my function should give the same output as this one
def assignment_3(formula, data):
    model = smf.logit(formula, data=df).fit()
    return model.params
    
x = assignment_3('y ~ x', df)

Optimization terminated successfully.
         Current function value: 0.453747
         Iterations 7


In [886]:
x

Intercept    1.017969
x            1.781618
dtype: float64

In [887]:
def LGT_loss_3(Y, X, beta):
    
    z = -Y * X @ beta
    o = np.zeros(Y.size)
    
    return np.logaddexp(o, z).sum()

In [888]:
def assignment_3_answer(formula, data):
    
    ycolumn = formula.split(' ~ ')[0]
    xcolumns = formula.split(' ~ ')[1].split(' + ')
    Y = data[ycolumn].values
    X = data[xcolumns].values
    X = statsmodels.tools.tools.add_constant(X)
    
    Y = Y * 2 - 1
    
    Y = Y.reshape(len(Y),1)
    
    covs = 2
    bounds = [(None, None)]*covs
    betas = spopt.shgo(lambda b: LGT_loss_3(Y, X, b), bounds).x
    
    return betas

In [889]:
assignment_3_answer('y ~ x', df)

array([1.01796925, 1.78161848])

PS: я потратил 4 часа, удивляясь, почему результаты не сходятся

Оказывается, в лекции была указана формула для CS, а не для Econ :))

Добавив одну строку, приводящую Y в нужный вид, все получилось

### Assignment 4 (2 points): Write your own logistic regression with $\alpha$ and $l1ratio$ as parameters

## ASSIGNMENT 4 - SOLUTION

- I simply changed the loss function from the previous assignment, adding regularization
- Interesting thing here is that parameteres a and b were not explicitly passed to the function
    - So, I used the formula from the seminar to derive them from this: $\quad l1_{ratio} = \frac{a}{a+b}, \quad \alpha=a+b$

In [890]:
def LGT_loss_4(Y, X, beta, alpha, l1ratio):
        
    # we can derrive a and b parameters from alpha and l1ratio
    # alpha = a + b
    # l1ratio = a / (a + b)
    # a = l1ratio * (a + alpha - a) -> we put b = alpha - a instead of b here
    
    a = l1ratio * alpha
    b = alpha - a
    
    z = -Y*X@beta
    o = np.zeros(Y.size)
    return np.logaddexp(o, z).sum() + a*np.abs(beta).sum() + b*np.square(beta).sum()/2

In [891]:
def assignment_4_answer(formula, data, alpha, l1ratio):
    
    ycolumn = formula.split(' ~ ')[0]
    xcolumns = formula.split(' ~ ')[1].split(' + ')
    Y = data[ycolumn].values
    X = data[xcolumns].values
    X = statsmodels.tools.tools.add_constant(X)
    Y = Y * 2 - 1
    Y = Y.reshape(len(Y),1)
    
    covs = 2
    bounds = [(None, None)]*covs
    betas = spopt.shgo(lambda beta: LGT_loss_4(Y, X, beta, alpha, l1ratio), bounds).x
    
    return betas

In [892]:
assignment_4_answer(formula = 'y ~ x', data = df, alpha = 2, l1ratio = 0.1)

array([0.98856345, 1.72435463])

### Assignment 5 (3 points): Write your own logistic regression with number of folds as parameter

## ASSIGNMENT 5 - SOLUTION

- I took the code from the previous homework to run cross-validation to get optimal hyperparameters
- Used loss function from the previous assignment (#4)
- Found optimal alpha and l1ratio, put them into loss function and minimized by beta (vector of coefficients)
- Got the estimation of beta from minimization problem, this is actually the answer

In [897]:
def LGT_loss_5(Y, X, beta, alpha, l1ratio):
        
    # we can derrive a and b parameters from alpha and l1ratio
    # alpha = a + b
    # l1ratio = a / (a + b)
    # a = l1ratio / (a + alpha - a) -> we put b = alpha - a instead of b here
    
    a = l1ratio * alpha
    b = alpha - a
    
    z = -Y*X@beta
    o = np.zeros(Y.size)
    
    return np.logaddexp(o, z).sum() + a*np.abs(beta).sum() + b*np.square(beta).sum()/2

In [898]:
def run(alpha, l1ratio, Y, X, kf):
    
    avg = 0
    
    for train_index, test_index in kf.split(X):
        
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        
        covs = 2
        bounds = [(-20, 20)]*covs
        betas = spopt.shgo(lambda beta: LGT_loss_5(Y_train, X_train, beta, alpha, l1ratio), bounds).x
          
        Y_pred = np.round(1/(1+np.exp(X@betas)))    
        accuracy = (Y_pred == Y_test).sum() / len(Y_test)
    
        avg = avg + accuracy
        
    # get average from all our folds
    return avg/kf.n_splits

In [899]:
def assignment_5_answer(formula, data, folds):
    
    ycolumn = formula.split(' ~ ')[0]
    xcolumns = formula.split(' ~ ')[1].split(' + ')
    Y = data[ycolumn].values
    X = data[xcolumns].values
    X = statsmodels.tools.tools.add_constant(X)
    Y = Y * 2 - 1
    Y = Y.reshape(len(Y),1)

    alphas = []
    l1ratios = []
    results = []
        
    kf = KFold(n_splits=folds)
    
    for alpha, l1ratio in itertools.product(np.linspace(0,2,20), np.linspace(0,2,20)):
        results.append(run(alpha, l1ratio, Y, X, kf))
        alphas.append(alpha)
        l1ratios.append(l1ratio)
    
    opt_l1ratio = l1ratios[np.argmin(results)]
    opt_alpha = alphas[np.argmin(results)]
    
    covs = 2
    bounds = [(-20, 20)]*covs
    betas = spopt.shgo(lambda beta: LGT_loss_5(Y, X, beta, opt_alpha, opt_l1ratio), bounds).x
    
    return betas

In [901]:
betas = assignment_5_answer(formula = 'y ~ x', data = df, folds=3)
betas

array([0.99710526, 1.74029156])

### Assignment 6 (bonus 5 points): Write the ordered/latent/rand logit and compare to existing library

## ASSIGNMENT 6 - NO SOLUTION

### Assignment 7 (bonus 5 points): Write confident Logit with crossvalidation for window size

## ASSIGNMENT 7 - NO SOLUTION