In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

In [2]:
df = pd.read_csv("/home/ichigo/Desktop/Project2_files/Medical diagnosis using AI/Datasets/heart_disease_data.csv")

In [3]:
df = df.drop(columns=['fbs', 'restecg', 'slope'])

In [4]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,thalach,exang,oldpeak,ca,thal,target
0,63,1,3,145,233,150,0,2.3,0,1,1
1,37,1,2,130,250,187,0,3.5,0,2,1
2,41,0,1,130,204,172,0,1.4,0,2,1
3,56,1,1,120,236,178,0,0.8,0,2,1
4,57,0,0,120,354,163,1,0.6,0,2,1


In [5]:
# Splitting features and target
X = df.drop(columns=['target'])  # Adjust target column name as needed
y = df['target']

In [8]:
# Handling class imbalance with SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [9]:
# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [10]:
# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [11]:
# Define models
models = {
    "Logistic Regression": LogisticRegression(class_weight='balanced', max_iter=1000),
    "Random Forest": RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=42),
    "SVM": SVC(kernel='linear', class_weight='balanced'),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', scale_pos_weight=y_train.value_counts()[0] / y_train.value_counts()[1])
}


In [12]:
# Train and evaluate models
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = {
        "Accuracy": accuracy,
        "Classification Report": classification_report(y_test, y_pred)
    }
    print(f"\n---- {name} ----")
    print("Accuracy:", accuracy)
    print("Classification Report:\n", classification_report(y_test, y_pred))


---- Logistic Regression ----
Accuracy: 0.8181818181818182
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.78      0.81        32
           1       0.81      0.85      0.83        34

    accuracy                           0.82        66
   macro avg       0.82      0.82      0.82        66
weighted avg       0.82      0.82      0.82        66


---- Random Forest ----
Accuracy: 0.8181818181818182
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.78      0.81        32
           1       0.81      0.85      0.83        34

    accuracy                           0.82        66
   macro avg       0.82      0.82      0.82        66
weighted avg       0.82      0.82      0.82        66


---- SVM ----
Accuracy: 0.803030303030303
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.75      0.79        32
        

Parameters: { "use_label_encoder" } are not used.

