In [1]:
from sklearn import linear_model
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

In [2]:
# Feature tables: X is what the model is fitted on, X_test is the held-out set.
# NOTE(review): a later cell filters out a column named 'Unnamed: 0' — if the
# CSVs were written with their index, that column is still part of X here.
X, X_test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/data_preprocessed.csv', 'data/test_data_preprocessed.csv')
)

# Targets: only the 'HeartDisease' column of each outcome file is kept.
y, y_test = (
    pd.read_csv(csv_path)['HeartDisease']
    for csv_path in ('data/outcome.csv', 'data/outcome_test.csv')
)

In [3]:
# L1-regularised linear regression; alpha=0.1 is the penalty strength.
lasso = linear_model.Lasso(alpha=0.1)
# Fit on the full training table. The bare call is kept as the last expression
# so the notebook still displays the fitted estimator.
lasso.fit(X=X, y=y)

In [4]:
# Lasso is a regressor, so these are continuous scores rather than class
# labels; later cells threshold them at 0.5 where hard labels are required.
pred, test_pred = map(lasso.predict, (X, X_test))

In [5]:
# ROC AUC on the training set, then on the test set. The raw (unthresholded)
# Lasso outputs are passed as the ranking scores.
print(roc_auc_score(y, pred))
print(roc_auc_score(y_test, test_pred))

0.9066313711414213
0.8818181818181818


In [6]:
# F1 needs hard labels, so the continuous predictions are cut at 0.5.
# Training set first, test set second.
print(f1_score(y_true=y, y_pred=pred > 0.5))
print(f1_score(y_true=y_test, y_pred=test_pred > 0.5))

0.8484076433121019
0.8372093023255814


In [7]:
# Confusion matrices at the same 0.5 cut-off (rows: true class, columns:
# predicted class). Training set first, test set second.
print(confusion_matrix(y_true=y, y_pred=pred > 0.5))
print(confusion_matrix(y_true=y_test, y_pred=test_pred > 0.5))

[[282  54]
 [ 65 333]]
[[59 15]
 [20 90]]


# Lasso coefficient analysis

In [8]:
# Report the fitted coefficients by decreasing magnitude, split into the ones
# Lasso kept and the ones it shrank to (numerically) zero.
#
# lasso.coef_ holds one entry per column of the X the model was fitted on, so
# names and coefficients must be paired on the *full* X.columns. Dropping
# 'Unnamed: 0' before pairing would shift every label by one position whenever
# that column is present in X.
coefs = lasso.coef_
assert len(coefs) == len(X.columns), (
    f"expected one coefficient per column, got {len(coefs)} coefficients "
    f"for {len(X.columns)} columns"
)

# The index-like 'Unnamed: 0' column is left out of the report only; if it is
# present in X, the model was still fitted with it.
named_coefs = [
    (name, coef) for name, coef in zip(X.columns, coefs) if name != 'Unnamed: 0'
]
columns = [name for name, _ in named_coefs]

# Stable sort: ties (e.g. all the exact zeros) keep their column order.
features_sorted = sorted(named_coefs, key=lambda pair: -abs(pair[1]))

# Magnitudes at or below THRESHOLD are reported as zero. The two buckets are
# complementary, so no feature can fall between them.
THRESHOLD = 1e-16
non_zero = [(f, c) for f, c in features_sorted if np.absolute(c) > THRESHOLD]
zero = [(f, c) for f, c in features_sorted if np.absolute(c) <= THRESHOLD]

# Print with clear sections
print("---[ NON-ZERO COEFFICIENTS ]---")
for feature, coef in non_zero:
    print(f"{feature}: {coef:.6f}")

print("\n---[ ZERO COEFFICIENTS ]---")
for feature, coef in zero:
    print(f"{feature}: 0")

---[ NON-ZERO COEFFICIENTS ]---
ChestPainType_ASY: -0.122621
ChestPainType_ATA: 0.095720
Oldpeak: 0.050092
RestingBP: 0.013700
ExerciseAngina: -0.004096
Age: 0.000088

---[ ZERO COEFFICIENTS ]---
Sex: 0
Cholesterol: 0
FastingBS: 0
MaxHR: 0
ST_Slope: 0
ChestPainType_NAP: 0
ChestPainType_TA: 0
RestingECG_LVH: 0
RestingECG_Normal: 0
RestingECG_ST: 0
