In [None]:
import pandas as pd

# Read the dataset CSV (path is relative to the kernel's working directory)
file_path = 'heart-2.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows as the cell output
df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Baseline model: logistic regression on the standardized raw features.

# Separate features and target
X = df.drop('target', axis=1)
y = df['target']

# Train-test split FIRST, on the unscaled features. Fitting the scaler on the
# full dataset before splitting lets the held-out rows influence the mean/std
# used to transform the training data (preprocessing data leakage), which
# makes the reported test metrics optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Normalize: learn the scaling statistics from the training rows only,
# then apply that same fitted transform to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so the name `X_scaled` is still defined for any later use: the full
# feature matrix, scaled with the training-set statistics.
X_scaled = scaler.transform(X)

# Logistic Regression Model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predictions and Evaluation on the held-out 20%
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

accuracy, report

(0.7951219512195122,
 '              precision    recall  f1-score   support\n\n           0       0.85      0.72      0.78       102\n           1       0.76      0.87      0.81       103\n\n    accuracy                           0.80       205\n   macro avg       0.80      0.79      0.79       205\nweighted avg       0.80      0.80      0.79       205\n')

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Feature-engineered model: interaction terms, binned age/chol and degree-2
# polynomial terms, followed by standardization and logistic regression.

# Copy original features so the raw frame `X` is left untouched
X_fe = X.copy()

# Interaction features
# NOTE(review): 'age_chol' is the same product as the 'age chol' cross-term
# that PolynomialFeatures generates below, so the final matrix carries that
# column twice — consider dropping one of them.
X_fe['age_chol'] = X_fe['age'] * X_fe['chol']
X_fe['thalach_age'] = X_fe['thalach'] / (X_fe['age'] + 1e-5)  # Avoid division by zero

# Binning 'age' and 'chol'.
# The top bin is open-ended (+inf). With a finite upper edge, any value above
# it falls outside every interval, pd.cut yields NaN, and the fillna(0) further
# down would silently relabel it as bin 0 — i.e. the LOWEST bin. Values inside
# the previous edges receive exactly the same bin codes as before.
X_fe['age_bin'] = pd.cut(X_fe['age'], bins=[0, 40, 50, 60, 70, float('inf')], labels=False)
X_fe['chol_bin'] = pd.cut(X_fe['chol'], bins=[0, 200, 240, 280, float('inf')], labels=False)

# Polynomial features (only on selected continuous variables)
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(X_fe[['age', 'chol', 'thalach', 'oldpeak']])
poly_feature_names = poly.get_feature_names_out(['age', 'chol', 'thalach', 'oldpeak'])

# Combine everything: the four base columns are dropped here because the
# polynomial block already contains them as its degree-1 terms.
X_fe = X_fe.drop(['age', 'chol', 'thalach', 'oldpeak'], axis=1)
X_fe_poly = pd.DataFrame(poly_features, columns=poly_feature_names)
# reset_index so both frames share a 0..n-1 index and concat aligns row-by-row
X_fe_final = pd.concat([X_fe.reset_index(drop=True), X_fe_poly], axis=1)

# Fill any remaining NaNs after feature engineering
X_fe_final_clean = X_fe_final.fillna(0)

# Train-test split BEFORE scaling, so the held-out rows never contribute to
# the scaler's mean/std (avoids preprocessing data leakage).
# This rebinds y_train / y_test using the same test_size and random_state as
# the baseline split.
X_train_fe_raw, X_test_fe_raw, y_train, y_test = train_test_split(
    X_fe_final_clean, y, test_size=0.2, random_state=42
)

# Normalize the features again: re-fit the shared `scaler` on the engineered
# TRAINING rows only, then apply it to the test rows. After this cell `scaler`
# holds the engineered-feature statistics and no longer matches `model`.
X_train_fe = scaler.fit_transform(X_train_fe_raw)
X_test_fe = scaler.transform(X_test_fe_raw)

# Kept so the name `X_scaled_fe_clean` is still defined for any later use:
# the full engineered matrix, scaled with the training-set statistics.
X_scaled_fe_clean = scaler.transform(X_fe_final_clean)

# Retrain logistic regression
model_fe = LogisticRegression(max_iter=1000)
model_fe.fit(X_train_fe, y_train)

# Predict and evaluate on the held-out 20%
y_pred_fe = model_fe.predict(X_test_fe)
accuracy_fe = accuracy_score(y_test, y_pred_fe)
report_fe = classification_report(y_test, y_pred_fe)

accuracy_fe, report_fe

(0.824390243902439,
 '              precision    recall  f1-score   support\n\n           0       0.89      0.74      0.81       102\n           1       0.78      0.91      0.84       103\n\n    accuracy                           0.82       205\n   macro avg       0.83      0.82      0.82       205\nweighted avg       0.83      0.82      0.82       205\n')

In [None]:
# Show the test-set accuracy of the feature-engineered model
accuracy_message = " ".join(["Accuracy = ", str(accuracy_fe)])
print(accuracy_message)

Accuracy =  0.824390243902439


In [None]:
import joblib

# Save the trained logistic regression model
model_path = 'logistic_regression_fe_model.pkl'
joblib.dump(model_fe, model_path)

# Also persist the fitted scaler. model_fe was trained on standardized
# features, so the pickled model on its own cannot score new data: inputs must
# go through the same feature engineering and then this scaler first.
# Guard against notebook hidden state: `scaler` must be the one fitted on the
# same feature matrix as model_fe (re-running an earlier cell re-fits it on a
# different set of columns).
if scaler.n_features_in_ != model_fe.n_features_in_:
    raise ValueError(
        "`scaler` was fitted on %d features but `model_fe` expects %d; "
        "re-run the feature-engineering cell before saving."
        % (scaler.n_features_in_, model_fe.n_features_in_)
    )
scaler_path = 'logistic_regression_fe_scaler.pkl'
joblib.dump(scaler, scaler_path)

# Displayed as the cell output
model_path

'logistic_regression_fe_model.pkl'