In [16]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

In [18]:
# Load the Framingham dataset and keep only rows with no missing values.
# dropna() is chained instead of being called with inplace=True, so `data`
# is bound exactly once and re-running this cell always rebuilds the same
# frame from the CSV without mutating an existing object.
data = pd.read_csv('framingham_heart_study.csv').dropna()


In [20]:
# Target is the 'TenYearCHD' column; every other column is used as a feature.
y = data['TenYearCHD']
X = data.drop('TenYearCHD', axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the recorded test split is imbalanced (610 vs 122 rows) --
# consider passing stratify=y; confirm before changing, since it alters results.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [22]:
# Learn the scaling statistics from the training rows only, then apply the
# same fitted scaler to both splits so the test set never influences scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the logistic regression classifier on the standardized training data.
log_reg = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)

# Hard class labels for the held-out rows.
y_pred = log_reg.predict(X_test_scaled)

# Second column of predict_proba: probability of the positive class (CHD risk).
y_prob = log_reg.predict_proba(X_test_scaled)[:, 1]

In [24]:
# Score the held-out predictions: accuracy uses the hard labels, ROC-AUC
# uses the positive-class probabilities.
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_prob)

# Emit the whole summary with a single print; sep="\n" puts each piece on
# its own line, matching one print() call per item.
print(
    "Logistic Regression Evaluation:",
    f"Accuracy: {accuracy * 100:.2f}%",
    f"ROC-AUC Score: {roc_auc:.2f}",
    "\nClassification Report:",
    classification_report(y_test, y_pred),
    sep="\n",
)

Logistic Regression Evaluation:
Accuracy: 84.02%
ROC-AUC Score: 0.73

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.99      0.91       610
           1       0.69      0.07      0.13       122

    accuracy                           0.84       732
   macro avg       0.77      0.53      0.52       732
weighted avg       0.82      0.84      0.78       732



In [26]:
# Example: probability score for a single hand-written sample.
sample_data = pd.DataFrame({
    'male': [1],
    'age': [55],
    'education': [1],
    'currentSmoker': [1],
    'cigsPerDay': [10],
    'BPMeds': [0],
    'prevalentStroke': [0],
    'prevalentHyp': [1],
    'diabetes': [0],
    'totChol': [240],
    'sysBP': [140],
    'diaBP': [90],
    'BMI': [30],
    'heartRate': [75],
    'glucose': [100]
})

# Put the columns in exactly the order the scaler and model were fitted on,
# rather than relying on the dict above happening to match it. Selecting by
# X_train.columns raises a KeyError if a required feature is missing, so a
# mistake in the sample fails loudly instead of being scaled with the wrong
# per-feature statistics.
sample_data = sample_data[X_train.columns]

# Standardize the sample with the scaler fitted on the training data.
sample_scaled = scaler.transform(sample_data)

# Predict the positive-class (CHD risk) probability for the sample.
sample_prob = log_reg.predict_proba(sample_scaled)[:, 1]
print(f"\nSample Data Probability of CHD Risk: {sample_prob[0]:.2f}")



Sample Data Probability of CHD Risk: 0.29


In [28]:
# Tabulate each feature name next to its fitted weight (first row of coef_).
# The model was trained on standardized inputs, so the weights are expressed
# per one standard deviation of each feature.
coefficients = pd.DataFrame({'Feature': X.columns}).assign(
    Coefficient=log_reg.coef_[0]
)
print(coefficients)


            Feature  Coefficient
0              male     0.307671
1               age     0.571186
2         education    -0.045600
3     currentSmoker     0.064952
4        cigsPerDay     0.177517
5            BPMeds     0.052768
6   prevalentStroke     0.069755
7      prevalentHyp     0.055539
8          diabetes     0.033621
9           totChol     0.116975
10            sysBP     0.338067
11            diaBP    -0.061224
12              BMI     0.055556
13        heartRate    -0.007110
14          glucose     0.115263
