In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv('heart_disease_uci.csv')

# Replace missing values
df['trestbps'].fillna(df['trestbps'].mean(), inplace=True)
# df['chol'].replace(0, pd.NA, inplace=True)
df['chol'].fillna(df['chol'].mean(), inplace=True)
df['fbs'].fillna('N/A', inplace=True)
df['restecg'].fillna('N/A', inplace=True)
df['thalch'].fillna(df['thalch'].mean(), inplace=True)
df['exang'].fillna('N/A', inplace=True)
df['oldpeak'].fillna(df['oldpeak'].mean(), inplace=True)
df['slope'].fillna('N/A', inplace=True)
df['num'] = (df['num'] > 0).astype(int)

# Drop 'ca' and 'thal' columns
df.drop(['ca', 'thal'], axis=1, inplace=True)

# Drop rows where 'chol' value is 0
df = df[df['chol'] != 0]

# Save the processed DataFrame back to a CSV file
df.to_csv('preprocessed.csv', index=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['trestbps'].fillna(df['trestbps'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['chol'].fillna(df['chol'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we ar

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np

# Load the preprocessed CSV file
df = pd.read_csv('preprocessed.csv')

# Separate features (X) and target variable (y)
X = df.drop('num', axis=1)
y = df['num']

In [4]:
# Identify categorical columns
categorical_columns = ['sex', 'cp','dataset','fbs','restecg','exang','slope']

# Create a ColumnTransformer to apply one-hot encoding to categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_columns)
    ],
    remainder='passthrough'
)

In [6]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocess the data (one-hot encoding)
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Standardize the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_transformed)
X_test_scaled = scaler.transform(X_test_transformed)

In [8]:
print(X_train_scaled[0])

[ 1.72819752 -1.72819752 -0.96709621 -0.56061191  1.86982368 -0.24167557
 -0.81877412  1.22988009 -0.50104493  0.45615875 -0.43186565 -0.12361285
 -0.55288051 -1.19619995  2.16684981 -0.04092728 -1.20449976  1.36427417
 -0.24933304 -0.26049404  1.33954937 -0.52963565 -0.74923808  0.68831955
  0.12060484 -0.15090457  0.8326323  -1.6938712  -0.83143989]


In [6]:
model = LogisticRegression(random_state=30)

# Train the model
model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)
y_pred = np.round(y_pred)

In [7]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)

acc = accuracy_score(y_test, y_pred)


print(f'Mean Squared Error: {mse: .2f}')
print(f'Accuracy: {acc:.2f}')


# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Extract values from the confusion matrix
true_negative, false_positive, false_negative, true_positive = conf_matrix.ravel()

# Calculate specificity
specificity = true_negative / (true_negative + false_positive)
print(f'Specificity: {specificity:.2f}')

# Calculate precision
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision:.2f}')

# Calculate recall
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall:.2f}')

# Calculate false positive rate
false_positive_rate = false_positive / (false_positive + true_negative)
print(f'False Positive Rate: {false_positive_rate:.2f}')

# Calculate F1 score
f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {f1:.2f}')

Mean Squared Error:  0.15
Accuracy: 0.85
Specificity: 0.78
Precision: 0.78
Recall: 0.93
False Positive Rate: 0.22
F1 Score: 0.84


In [8]:
from joblib import dump
dump(model, 'model.pkl')
dump(preprocessor, 'preprocessor.pkl')

['preprocessor.pkl']