In [31]:
#Importing and Data Pre-Processing
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import joblib
from sklearn.multioutput import MultiOutputClassifier
from sklearn.datasets import make_multilabel_classification

data = pd.read_csv("Desktop/heart_2020_cleaned.csv")

data = data.drop('GenHealth', axis=1)
data = data.drop('PhysicalHealth', axis=1)
data = data.drop('MentalHealth', axis=1)

#Diabetes Convert to Just Yes and No
replacement_dict = {
    'No, borderline diabetes': 'No',
    'Yes (during pregnancy)': 'Yes'
}
data['Diabetic'] = data['Diabetic'].replace(replacement_dict)

# Label Encoding for binary variables
binary_columns = ['HeartDisease', 'Smoking', 'Stroke', 'Diabetic', 'Asthma', 'KidneyDisease', 'AlcoholDrinking', 'DiffWalking', 'PhysicalActivity','SkinCancer', 'Sex', "AgeCategory"]
label_encoders = {}

for col in binary_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Convert categorical variables using get_dummies (One-Hot Encoding)
categorical_columns = ['Race']
data = pd.get_dummies(data, columns=categorical_columns)

# Normalize numeric columns
numeric_columns = ['BMI','SleepTime']
scaler = StandardScaler()
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])

joblib.dump(scaler, 'scaler.save')

#Output Processed Data

df = pd.DataFrame(data) 

# Save DataFrame to a CSV file
df.to_csv('processed_data.csv', index=False)

# Split data into features and target
columns_to_drop = ['HeartDisease', 'Diabetic', 'Stroke', 'Asthma', 'KidneyDisease']
X = data.drop(columns=columns_to_drop, axis=1)
y = data[['HeartDisease', 'Stroke', 'Diabetic', 'Asthma', 'KidneyDisease']]

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("pre-steps done")

X.head()

pre-steps done


Unnamed: 0,BMI,Smoking,AlcoholDrinking,DiffWalking,Sex,AgeCategory,PhysicalActivity,SleepTime,SkinCancer,Race_American Indian/Alaskan Native,Race_Asian,Race_Black,Race_Hispanic,Race_Other,Race_White
0,-1.84475,1,0,0,0,7,1,-1.460354,1,0,0,0,0,0,1
1,-1.256338,0,0,0,0,12,1,-0.067601,0,0,0,0,0,0,1
2,-0.274603,1,0,0,1,9,1,0.628776,0,0,0,0,0,0,1
3,-0.647473,0,0,0,0,11,0,-0.763977,1,0,0,0,0,0,1
4,-0.726138,0,0,1,0,4,1,0.628776,0,0,0,0,0,0,1




In [23]:
#MULTI TARGET LOGISTIC REGRESSION
log_model = LogisticRegression(max_iter=300, random_state=42)
multi_target_logreg = MultiOutputClassifier(log_model, n_jobs=-1)
multi_target_logreg.fit(X_train, y_train)

# Predict on the test data
y_pred = multi_target_logreg.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Multi Target Logistic Regression")
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

print(y_pred)

Logistic Regression
Accuracy: 0.6748385684579183
Classification Report:
               precision    recall  f1-score   support

           0       0.43      0.03      0.05      5592
           1       0.00      0.00      0.00      2490
           2       0.47      0.06      0.11      8628
           3       0.39      0.00      0.00      8542
           4       0.00      0.00      0.00      2348

   micro avg       0.46      0.03      0.05     27600
   macro avg       0.26      0.02      0.03     27600
weighted avg       0.36      0.03      0.05     27600
 samples avg       0.01      0.01      0.01     27600



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#SVM MODEL
# Initialize the SVM model with probability estimation enabled
print("Before start")
svm_model = SVC(probability=True, random_state=42)

# Train the model
print("Start train")
svm_model.fit(X_train, y_train)

# Predict on the test data
print("Start predict")
y_pred = svm_model.predict(X_test)
probabilities = svm_model.predict_proba(X_test)

# Evaluate the model
print("Evaluating")
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("SVM Model")
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)
print("Probabilities:\n", probabilities[:, 1])  # Probability of having heart disease

Before start)
Start train


In [25]:
#RANDOM FOREST
# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)  # You can adjust parameters

# Train the model
rf_model.fit(X_train, y_train)

# Predict on the test data
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("RANDOM FOREST")
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)



RANDOM FOREST
Accuracy: 0.587438827999187
Classification Report:
               precision    recall  f1-score   support

           0       0.20      0.12      0.15      5592
           1       0.08      0.03      0.05      2490
           2       0.29      0.19      0.23      8628
           3       0.18      0.10      0.13      8542
           4       0.08      0.04      0.05      2348

   micro avg       0.21      0.12      0.15     27600
   macro avg       0.17      0.10      0.12     27600
weighted avg       0.20      0.12      0.15     27600
 samples avg       0.04      0.04      0.04     27600



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
#XGBoost Model

import xgboost as xgb

# Initialize the XGBoost Classifier
xgb_model = xgb.XGBClassifier(objective='binary:logistic', n_estimators=100, learning_rate=0.05, max_depth=5, random_state=42)

# Train the model
xgb_model.fit(X_train, y_train)

# Predict on the test data
y_pred = xgb_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("XGBoost Model")
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

XGBoost Model
Accuracy: 0.676886755577792
Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.00      0.00      5592
           1       0.00      0.00      0.00      2490
           2       0.47      0.01      0.01      8542
           3       0.52      0.04      0.07      8628
           4       0.00      0.00      0.00      2348

   micro avg       0.51      0.01      0.03     27600
   macro avg       0.33      0.01      0.02     27600
weighted avg       0.44      0.01      0.03     27600
 samples avg       0.01      0.00      0.00     27600



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#Neural Networks

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Scale the features (neural networks perform better with scaled data)
print("Before Start")
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build the neural network model
print("Build Neural Network")
model = Sequential()
model.add(Dense(32, input_dim=X_train.shape[1], activation='relu'))  # Input layer + hidden layer
model.add(Dense(16, activation='relu'))  # Another hidden layer
model.add(Dense(1, activation='sigmoid'))  # Output layer

# Compile the model
print("Compile Model")
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.01), metrics=['accuracy'])

# Train the model
print("Train")
model.fit(X_train, y_train, epochs=50, batch_size=10, verbose=1)

# Evaluate the model
print("Evaluate")
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

In [33]:
def categorize_age(age):

# Define the age categories and their corresponding labels
    age_categories = {
        '0': range(18, 25),
        '1': range(25, 30),
        '2': range(30, 35),
        '3': range(35, 40),
        '4': range(40, 45),
        '5': range(45, 50),
        '6': range(50, 55),
        '7': range(55, 60),
        '8': range(60, 65),
        '9': range(65, 70),
        '10': range(70, 75),
        '11': range(75, 80),
        '12': range(80, 150)  
    }

    # Loop through the dictionary and find the matching age category
    for category, age_range in age_categories.items():
        if age in age_range:
            return category

    return "Unknown"  # Return 'Unknown' if age does not fit any category

def categorize_race(race):

    # Initialize all race category flags as a list of zeros
    races = [0] * 6  # One slot for each race category

    # Only set the appropriate index based on user input
    if 0 <= race < len(races):
        races[race] = 1

    return races

def get_user_input(): 
    sex = int(input("Enter your sex assigned at birth. Female = 0. Male = 1.")) 
    age = int(input("Enter your age: ")) 
    print("American Indian/Alaskan Native = 0. Asian = 1. Black = 2. White = 3. Hispanic = 4. Other = 5.") 
    race = int(input("Enter your race: ")) 
    BMI = float(input("Enter your BMI: ")) 
    smoking = int(input("Do you smoke? 1 = Yes. 0 = No.")) 
    sleepTime = float(input("Enter daily sleep duration in hours: ")) 
    alcohol = int(input("Do you drink alcohol? 1 = Yes. 0 = No.")) 
    physicalActivity = int(input("Do you exercise regularly? 1 = Yes. 0 = No.")) 
    skinCancer = int(input("Do you have skin cancer? 1 = Yes. 0 = No.")) 
    diffWalking = int(input("Do you have difficulty walking? 1 = Yes. 0 = No."))

    #Categorize
    age = categorize_age(age)
    races = categorize_race(race)
    scaler = joblib.load('scaler.save')

    # Reshape data for scaling
    #BMI = np.array(BMI).reshape(-1, 1)
    #sleepTime = np.array(sleepTime).reshape(-1, 1)

    #Normalize Numeric Variables
    #BMI = scaler.transform(BMI)
    #sleepTime = scaler.transform(sleepTime)

    #BMI = BMI.flatten()[0]
    #sleepTime = sleepTime.flatten()[0]
    #print(BMI)
    #print(sleepTime)
    return np.array([[BMI, smoking, alcohol, diffWalking, sex, age, physicalActivity, sleepTime, skinCancer] + races])

user_input = get_user_input()
user_input = user_input.astype(np.float32)
print(user_input)
predicted = multi_target_logreg.predict(user_input) 
print("Predicted:", predicted)

Enter your sex assigned at birth. Female = 0. Male = 1. 1
Enter your age:  64


American Indian/Alaskan Native = 0. Asian = 1. Black = 2. White = 3. Hispanic = 4. Other = 5.


Enter your race:  3
Enter your BMI:  30
Do you smoke? 1 = Yes. 0 = No. 1
Enter daily sleep duration in hours:  8
Do you drink alcohol? 1 = Yes. 0 = No. 1
Do you exercise regularly? 1 = Yes. 0 = No. 0
Do you have skin cancer? 1 = Yes. 0 = No. 1
Do you have difficulty walking? 1 = Yes. 0 = No. 1




ValueError: X has 1 features, but StandardScaler is expecting 2 features as input.