Lasso Crime


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Convergence tolerance read by breakCheck(): coordinate descent keeps
# sweeping while any weight moved by more than this amount in the last sweep.
breakCheckValue = 1e-3

# NOTE(review): everything below calls functions that are defined further down
# in this cell. Run top-to-bottom in a fresh interpreter this would raise
# NameError -- presumably the definitions were already present in the notebook
# session. TODO confirm, or move the definitions above this point.

# A.5 - violent crimes
X_train, Y_train, X_test, Y_test, df_train, df_test = load_data()

# Solve the Lasso for a sequence of λ values that is halved at every step;
# each returned list holds one entry per λ.
lambdas_, omegas_, b_ = Lasso_crime(X_train, Y_train)
plotNonZero(lambdas_, omegas_, 'Plot 3 (A.5 a)')

# Coefficient paths for a hand-picked subset of the features.
variables_ = ['agePct12t29', 'pctWSocSec', 'pctUrban', 'agePct65up', 'householdsize']
regPath(lambdas_, omegas_, variables_)

# Train/test error of every solution on the path.
plotSE(X_train, Y_train, X_test, Y_test, omegas_, b_, lambdas_)

# inspecting values for λ = 30

# Refit from an all-zero start at λ = 30. Note that this rebinds b_ from the
# per-λ list returned above to a single offset value.
w_, b_ = coordDescend(X_train, Y_train, 30, np.zeros(np.shape(X_train)[1]))
# load_data() builds X from every DataFrame column after the first, so weight
# index k belongs to column k + 1. Print the names of the features with the
# largest and the smallest (most negative) weight.
print(df_train.columns.values[np.argmax(w_) + 1])
print(df_train.columns.values[np.argmin(w_) + 1])

# Coordinate descent solver for the Lasso objective
def coordDescend(X, Y, lambda_, w_initial):
    """Fit Lasso weights by cyclic coordinate descent.

    Minimizes ``sum((Y - X @ w - b) ** 2) + lambda_ * sum(abs(w))``. Every
    sweep first refreshes the offset ``b`` for the current weights and then
    soft-thresholds each weight in turn. Sweeps repeat until ``breakCheck``
    reports that no weight changed by more than the module-level tolerance.

    Args:
        X: 2-D NumPy array of features, one row per sample.
        Y: Targets, one per row of ``X``. Assumed to be 1-D; a column
            vector would broadcast against the predictions incorrectly.
        lambda_: Weight of the L1 penalty.
        w_initial: Starting weights, one per column of ``X``. It is copied,
            never modified.

    Returns:
        Tuple ``(w, b)``: the final weights and the offset that was computed
        at the start of the last sweep.
    """
    d_ = np.shape(X)[1]

    # Copy as float so that an integer-typed start vector cannot silently
    # truncate the updates written into it.
    w = np.array(w_initial, dtype=float)

    def _objective(weights, offset):
        # Penalized residual sum of squares for the given parameters.
        residual = Y - np.matmul(X, weights) - offset
        return np.dot(residual, residual) + lambda_ * np.sum(np.abs(weights))

    b = np.mean(Y - np.matmul(X, w))
    oldVal = _objective(w, b)

    # a[k] = 2 * ||X[:, k]||^2, the denominator of every update of w[k].
    a = 2 * np.sum(X * X, axis=0)

    breakCondition = True
    while breakCondition:
        b = np.mean(Y - np.matmul(X, w))
        w_old = np.copy(w)

        # Full residual for the current (w, b). It is rebuilt once per sweep
        # and patched per coordinate below, instead of copying X without
        # column k for every single coordinate.
        residual = Y - b - np.matmul(X, w)
        for k in range(d_):
            x_k = X[:, k]
            # Residual with feature k's own contribution taken back out.
            partial = residual + x_k * w[k]
            c_k = 2 * np.dot(partial, x_k)

            # Soft-thresholding step.
            if c_k < -lambda_:
                w[k] = (c_k + lambda_) / a[k]
            elif c_k > lambda_:
                w[k] = (c_k - lambda_) / a[k]
            else:
                w[k] = 0

            # Put feature k back in with its updated weight.
            residual = partial - x_k * w[k]

        # Each sweep should not increase the objective; report it if it does.
        newVal = _objective(w, b)
        if newVal > oldVal:
            print("Sanity Check Error!")
        oldVal = newVal
        breakCondition = breakCheck(w, w_old)

    # Progress trace: the λ that has just been solved.
    print(lambda_)
    return w, b

# Checks to see if values of w are changing enough to continue
# the coordinate Descend algorithm.
def breakCheck(w_1, w_2):
    """Tell coordinate descent whether another sweep is needed.

    Returns True when at least one entry of ``w_1`` differs from the matching
    entry of ``w_2`` by more than the module-level ``breakCheckValue``, and
    False otherwise.
    """
    per_weight_change = np.abs(w_1 - w_2)
    return bool(np.any(per_weight_change > breakCheckValue))
    
def lambdaMax(X, Y):
    """Return the smallest λ at which the Lasso solution is all zeros.

    With every weight at zero the offset is ``mean(Y)``, and coordinate
    descent leaves weight k at zero exactly when
    ``|2 * X[:, k] . (Y - mean(Y))| <= λ``. The threshold is therefore the
    largest of those magnitudes. The absolute value matters: a feature that is
    strongly *negatively* correlated with the target must count as well.

    Args:
        X: 2-D array of features, one row per sample.
        Y: 1-D array of targets, one per row of ``X``.

    Returns:
        The non-negative value ``max_k |2 * X[:, k] . (Y - mean(Y))|``.
    """
    centered_targets = Y - np.mean(Y)
    return np.max(np.abs(2 * np.dot(centered_targets, X)))
    
def plotNonZero(lambdas, omegas, title):
    """Plot the number of non-zero weights of each solution against its λ.

    Args:
        lambdas: Sequence of λ values, used as x coordinates.
        omegas: Sequence of weight vectors, one per λ.
        title: Title shown above the figure.
    """
    # One count per weight vector, in the same order as the λ values.
    support_sizes = [np.count_nonzero(weights) for weights in omegas]

    plt.plot(lambdas, support_sizes)
    plt.xscale('log')
    plt.xlabel('log(λ)')
    plt.ylabel('Non-Zeros')
    plt.title(title)
    plt.show()

def load_data():
    """Read the crime train/test tables and split them into features/targets.

    Both files are read from the current working directory with
    ``pd.read_table``. The target is the ``ViolentCrimesPerPop`` column; the
    features are every column after the first one.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test, df_train, df_test)`` where
        the first four items are NumPy arrays and the last two are the
        DataFrames they were taken from.
    """
    target_name = "ViolentCrimesPerPop"

    def _features_and_target(frame):
        # Column 0 is skipped when building the feature matrix.
        return frame.iloc[:, 1:].values, frame[target_name].values

    train_frame = pd.read_table("crime-train.txt")
    test_frame = pd.read_table("crime-test.txt")

    train_x, train_y = _features_and_target(train_frame)
    test_x, test_y = _features_and_target(test_frame)
    return train_x, train_y, test_x, test_y, train_frame, test_frame


def Lasso_crime(X, Y, lambda_min=0.01, decay=2):
    """Compute a Lasso regularization path by warm-started coordinate descent.

    Starts at ``lambdaMax(X, Y)``, solves with ``coordDescend``, records the
    solution, divides λ by ``decay`` and repeats. Every solve starts from the
    previous solution. The loop stops once the next λ would fall below
    ``lambda_min``; at least one solve is always performed.

    Args:
        X: 2-D array of features, one row per sample.
        Y: 1-D array of targets, one per row of ``X``.
        lambda_min: Positive floor for λ. Defaults to 0.01.
        decay: Factor (> 1) by which λ shrinks between solves. Defaults to 2.

    Returns:
        Tuple ``(lambdas, omegas, b_list)`` of equal-length lists: the λ
        values used, the weight vector found for each, and the matching
        offsets.

    Raises:
        ValueError: If ``lambda_min`` is not positive or ``decay`` is not
            greater than 1, since the loop could then never terminate.
    """
    if lambda_min <= 0:
        raise ValueError("lambda_min must be positive")
    if decay <= 1:
        raise ValueError("decay must be greater than 1")

    lambda_ = lambdaMax(X, Y)
    # The first solve starts from all zeros; later ones are warm-started.
    w = np.zeros(np.shape(X)[1])

    lambdas = []
    # a list containing all the w values calculated by lasso
    omegas = []
    b_list = []
    while True:
        w, b = coordDescend(X, Y, lambda_, w)
        lambdas.append(lambda_)
        omegas.append(np.copy(w))
        b_list.append(b)

        lambda_ /= decay
        if lambda_ < lambda_min:
            break
    return lambdas, omegas, b_list


def regPath(lambdas, omegas, variables):
    """Plot the weights of selected features as a function of λ.

    Args:
        lambdas: Sequence of λ values, used as x coordinates.
        omegas: Sequence of weight vectors, one per λ.
        variables: Column names of the module-level ``df_train`` whose
            weights should be drawn; also used as the legend entries.
    """
    # Weight index = DataFrame column position - 1 (the feature matrix is
    # assumed to start at the second column of df_train).
    weight_positions = []
    for column_name in variables:
        weight_positions.append(df_train.columns.get_loc(column_name) - 1)

    plt.title('Plot 4 (A.5 b)')
    plt.xscale('log')
    plt.xlabel('log(λ)')
    plt.ylabel('coefficients')

    # Rows follow the λ values, columns follow the requested features.
    weight_table = np.array(omegas)
    selected_paths = weight_table[:, weight_positions]
    plt.plot(lambdas, selected_paths)
    plt.legend(variables)
    plt.show()


def plotSE(x_train, y_train, x_test, y_test, omegas, b, lambdas):
    """Plot the mean squared train and test error of every solution against λ.

    Args:
        x_train, y_train: Training features and targets.
        x_test, y_test: Test features and targets.
        omegas: Sequence of weight vectors, one per λ.
        b: Sequence of offsets, one per λ.
        lambdas: Sequence of λ values, used as x coordinates.
    """
    def _mean_squared_error(features, targets, weights, offset):
        # Average squared difference between targets and linear predictions.
        predictions = np.matmul(features, weights) + offset
        return np.mean(np.square(targets - predictions))

    weight_table = np.array(omegas)
    train_errors, test_errors = [], []
    for position in range(len(lambdas)):
        weights = weight_table[position]
        offset = b[position]
        train_errors.append(_mean_squared_error(x_train, y_train, weights, offset))
        test_errors.append(_mean_squared_error(x_test, y_test, weights, offset))

    plt.xscale('log')
    plt.xlabel('log(λ)')
    plt.ylabel('Squared Error')
    plt.title('Plot 5 (A.5 c)')
    plt.plot(lambdas, train_errors)
    plt.plot(lambdas, test_errors)
    plt.legend(['train-set error', 'test-set error'])
    plt.show()