In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv("C:/Users/rachel/Desktop/School/Summer 2024/DSA2020_Artificial Intelligence/Project/Hypertension data.csv")
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,57,1.0,3,145,233,1,0,150,0,2.3,0,0,1,1
1,64,0.0,2,130,250,0,1,187,0,3.5,0,0,2,1
2,52,1.0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,0.0,1,120,236,0,1,178,0,0.8,2,0,2,1
4,66,0.0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
# EDA
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26083 entries, 0 to 26082
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       26083 non-null  int64  
 1   sex       26058 non-null  float64
 2   cp        26083 non-null  int64  
 3   trestbps  26083 non-null  int64  
 4   chol      26083 non-null  int64  
 5   fbs       26083 non-null  int64  
 6   restecg   26083 non-null  int64  
 7   thalach   26083 non-null  int64  
 8   exang     26083 non-null  int64  
 9   oldpeak   26083 non-null  float64
 10  slope     26083 non-null  int64  
 11  ca        26083 non-null  int64  
 12  thal      26083 non-null  int64  
 13  target    26083 non-null  int64  
dtypes: float64(2), int64(12)
memory usage: 2.8 MB


In [4]:
# Check for missing values

def missing (df):
    missing_number = df.isnull().sum().sort_values(ascending=False)
    missing_percent = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)
    missing_values = pd.concat([missing_number, missing_percent], axis=1, keys=['Missing_Number'])
    return missing_values

missing(df)

Unnamed: 0,Missing_Number
sex,25
age,0
cp,0
trestbps,0
chol,0
fbs,0
restecg,0
thalach,0
exang,0
oldpeak,0


In [5]:
# Handle missing values
imputer = SimpleImputer(strategy='mean')
df['sex'] = imputer.fit_transform(df[['sex']])

In [6]:
def missing (df):
    missing_number = df.isnull().sum().sort_values(ascending=False)
    missing_percent = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)
    missing_values = pd.concat([missing_number, missing_percent], axis=1, keys=['Missing_Number'])
    return missing_values

missing(df)

Unnamed: 0,Missing_Number
age,0
sex,0
cp,0
trestbps,0
chol,0
fbs,0
restecg,0
thalach,0
exang,0
oldpeak,0


In [7]:
# Define features and target
X = df.drop('target', axis=1)
y = df['target']


In [8]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
# Define models
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'SVM': SVC()
}


In [11]:
# Train and evaluate models
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f'{name} Performance:')
    print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
    print(f'Precision: {precision_score(y_test, y_pred)}')
    print(f'Recall: {recall_score(y_test, y_pred)}')
    print(f'F1 Score: {f1_score(y_test, y_pred)}\n')

Logistic Regression Performance:
Accuracy: 0.8558558558558559
Precision: 0.8371356147021546
Recall: 0.9173611111111111
F1 Score: 0.8754141815772035

Decision Tree Performance:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0

Random Forest Performance:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0

Gradient Boosting Performance:
Accuracy: 0.9842821544949204
Precision: 0.9723160027008778
Recall: 1.0
F1 Score: 0.9859637110578569

SVM Performance:
Accuracy: 0.984857197623155
Precision: 0.9801165581076449
Recall: 0.9927083333333333
F1 Score: 0.9863722615145766



In [None]:
age = input("What is the patient's age?")
sex = input("What is the patient's gender?")
cp = input("What is the chest pain type?")
trestbps = input("What is the resting blood pressure in mm Hg?")
chol = input("What is serum cholestorol in mg/dl?")
fbs = input("Is the fasting blood sugar > 120 mg/dl?")
restecg = input("What is the patient's resting ECG results?")
thalach = input("What is the patient's maximum heart rate achieved?")
exang = input("Does the patient have exercise induced angina?")
oldpeak = input("What is ST depression induced by exercise relative to rest?")
Slope = input("What is the patient's slope of the peak exercise ST segment?")
Ca = input("What is the number of major vessels (0-3) colored by fluoroscopy?")
Thal = input("Does the patient have thalassemia?")