#### Question 6

In [29]:
import numpy as np
from sklearn.linear_model import LassoCV, Lasso

## read mystery.dat
## every column except the last is treated as a feature; the last is the target
data = np.loadtxt('mystery.dat', delimiter=',')
x = data[:, :-1]
y = data[:, -1]

## normalize x by the single largest value found anywhere in x
## note: the divisor is taken from `x` itself; a capitalised `X` is not defined
## in this cell and would raise NameError (or reuse a stale array) on a fresh kernel
x_norm = x / np.max(x)

## pick the regularization strength with 5-fold cross-validation
## candidate grid: 100 log-spaced alphas between 1e-4 and 1e4
alpha_grid = np.logspace(-4, 4, 100)
lasso_cv = LassoCV(alphas=alpha_grid, cv=5).fit(x_norm, y)
alpha_best = lasso_cv.alpha_
print(f"best alpha: {alpha_best}")

## refit a plain Lasso on the same data using the selected alpha
lasso = Lasso(alpha=alpha_best).fit(x_norm, y)

## rank coordinates by the magnitude of their lasso coefficient
coefs = np.abs(lasso.coef_)

## argsort is ascending, so flip it and keep the 10 leading positions
order_desc = np.flip(np.argsort(coefs))
indices = order_desc[:10]

## report each selected position in coefs together with its magnitude
print("indices of the 10 largest coefficients:")
for i in indices:
    print(f"coordinate number: {i}, coefficient: {coefs[i]}")


best alpha: 0.026560877829466867
indices of the 10 largest coefficients:
coordinate number: 4, coefficient: 4.7131842771824
coordinate number: 6, coefficient: 4.705164100703595
coordinate number: 1, coefficient: 4.310931741925735
coordinate number: 22, coefficient: 4.286751680551222
coordinate number: 10, coefficient: 4.002280617715773
coordinate number: 26, coefficient: 3.9016126507383193
coordinate number: 2, coefficient: 3.7254083754072687
coordinate number: 12, coefficient: 3.703791161879716
coordinate number: 18, coefficient: 3.3730434729455077
coordinate number: 16, coefficient: 3.2806820617254204


#### Question 7

In [92]:
## import libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
import os

## read heart.csv
## note: the first row is the header, so skiprows=1 skips it
## col names are age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
data = np.loadtxt('heart.csv', delimiter=',', skiprows=1)

## take the column names from the header row
## note: plain file I/O instead of shelling out to `head`, so this works without
## a Unix shell and the file handle is closed; strip() removes the newline that
## would otherwise stay attached to the last name
with open('heart.csv') as csv_file:
    column_names = [name.strip() for name in csv_file.readline().split(',')]

## every column except the last is treated as a feature; the last is the label
x = data[:, :-1]
y = data[:, -1]

## normalize each x column by the maximum value in that column
## note: this caps every feature at 1; the range is [0,1] only when a column has
## no negative values and a positive maximum
## note: the maxima are computed over all rows, i.e. before the train/test split
x_norm = x / np.max(x, axis=0)

## hold out a test set
## note: train_test_split shuffles before splitting; random_state fixes the shuffle
## note: test_size=0.339 gives 103 test points and 200 train points
x_train, x_test, y_train, y_test = train_test_split(
    x_norm,
    y,
    test_size=0.339,
    random_state=33,
)

## train the logistic regression classifier on the training split
log_reg = LogisticRegression(solver='liblinear', max_iter=1000).fit(x_train, y_train)

## score the predictions made on the held-out split
y_pred = log_reg.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred)

## magnitude of the weights stored in the first row of coef_
coefs = np.abs(log_reg.coef_[0])

## argsort is ascending: reverse it, then keep the 3 leading positions
indices = np.flip(np.argsort(coefs))[:3]

## report position, magnitude and column name of the 3 largest weights
print("top 3 coefficients and features:")
for i in indices:
    print(f"index: {i} coefficient: {coefs[i]}, feature: {column_names[i]}")

## misclassification rate on the held-out test split
test_error = 1 - test_accuracy
print("log_reg test error:", test_error)

## misclassification rate averaged over 5 folds of the training split
cv_scores = cross_val_score(
    log_reg,
    x_train,
    y_train,
    scoring='accuracy',
    cv=5,
)
cv_error = 1 - np.mean(cv_scores)
print("mean 5-fold cross-validation error:", cv_error)

## gap between the two error estimates (test error minus cross-validation error)
error_gap = test_error - cv_error
print(f"Difference (log_reg - cv): {error_gap:.4f}")

top 3 coefficients and features:
index: 11 coefficient: 1.778677470015258, feature: ca
index: 2 coefficient: 1.6573955754020404, feature: cp
index: 7 coefficient: 1.5275059507432016, feature: thalach
log_reg test error: 0.18446601941747576
mean 5-fold cross-validation error: 0.19999999999999996
Difference (log_reg - cv): -0.0155
