In [6]:
#Importing and Data Pre-Processing
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

# Load the raw data. The path is relative to the kernel's working directory.
data = pd.read_csv("Desktop/heart_2020_cleaned.csv")

# Label-encode the binary columns (this includes the target, 'HeartDisease').
# Every fitted encoder is kept in `label_encoders`, keyed by column name, so the
# integer codes can be mapped back to the original labels later.
binary_columns = ['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'PhysicalActivity', 
                  'Asthma', 'KidneyDisease', 'SkinCancer']
label_encoders = {}

for col in binary_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# One-hot encode the multi-level categorical columns.
categorical_columns = ['Sex', 'AgeCategory', 'Race', 'Diabetic', 'GenHealth']
data = pd.get_dummies(data, columns=categorical_columns)

# Decide which rows belong to the training and test sets BEFORE learning any
# statistics from the data. Splitting the positional indices with the same
# test_size / random_state selects exactly the rows that splitting X and y
# directly would select.
train_idx, test_idx = train_test_split(np.arange(len(data)), test_size=0.2, random_state=42)

# Standardize the numeric columns. The scaler is fitted on the training rows
# only, so the mean / variance of the test rows never leak into the features;
# the fitted transform is then applied to every row.
numeric_columns = ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']
scaler = StandardScaler()
scaler.fit(data[numeric_columns].iloc[train_idx])
data[numeric_columns] = scaler.transform(data[numeric_columns])

# Export the fully processed frame (encoded + standardized) for reuse.
df = pd.DataFrame(data)
df.to_csv('processed_data.csv', index=False)

# Separate the features from the target.
X = data.drop('HeartDisease', axis=1)
y = data['HeartDisease']

# Materialize the train / test sets from the indices chosen above.
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

print("pre steps done")

pre steps done


In [None]:
#LOGISTIC REGRESSION
# Build the classifier and train it in one step (`fit` returns the estimator).
model = LogisticRegression(max_iter=300, random_state=42).fit(X_train, y_train)

# Class predictions for the held-out rows.
y_pred = model.predict(X_test)

# Score the predictions: per-class report and overall accuracy.
classification_rep = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Report the results.
print("Logistic Regression")
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

In [None]:
#SVM MODEL
# The progress prints below bracket each stage so a stalled stage is visible.
# NOTE(review): the recorded output of this cell stops after "Start train", so
# fitting presumably did not finish on this dataset - confirm, and consider
# whether a kernel SVC with probability=True is practical at this data size.
print("Before start")
# probability=True is required for the predict_proba call further down.
svm_model = SVC(probability=True, random_state=42)

# Fit on the training split.
print("Start train")
svm_model.fit(X_train, y_train)

# Class probabilities and hard labels for the held-out rows.
print("Start predict")
y_pred = svm_model.predict(X_test)
probabilities = svm_model.predict_proba(X_test)

# Score the hard predictions.
print("Evaluating")
classification_rep = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Report the results; column 1 of `probabilities` is printed as the
# probability of having heart disease.
print("SVM Model")
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)
print("Probabilities:\n", probabilities[:, 1])  # Probability of having heart disease

Before start
Start train


In [None]:
#RANDOM FOREST
# Build a 100-tree forest and train it in one step (`fit` returns the estimator).
# n_estimators and the other hyperparameters are left open for tuning.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Class predictions for the held-out rows.
y_pred = rf_model.predict(X_test)

# Score the predictions: per-class report and overall accuracy.
classification_rep = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Report the results.
print("RANDOM FOREST")
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

In [None]:
#XGBoost Model

# No visible cell imports `xgb`, so without this import the cell fails with
# NameError on the first line that uses it. The import lives in this cell,
# like the Keras imports in the neural-network cell, so only this cell
# depends on xgboost being installed.
import xgboost as xgb

# Initialize the XGBoost classifier (binary logistic objective, 100 trees of
# depth <= 5, learning rate 0.05, fixed seed).
xgb_model = xgb.XGBClassifier(objective='binary:logistic', n_estimators=100, learning_rate=0.05, max_depth=5, random_state=42)

# Train the model
xgb_model.fit(X_train, y_train)

# Predict on the test data
y_pred = xgb_model.predict(X_test)

# Evaluate the model: overall accuracy and per-class report
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("XGBoost Model")
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

In [None]:
#Neural Networks

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam

# Seed Python, NumPy and TensorFlow so weight initialization and batch
# shuffling are repeatable; the other models in this notebook all pin
# random_state=42.
tf.keras.utils.set_random_seed(42)

# Scale every feature column (neural networks perform better with scaled data).
# The scaler is fitted on the training split only and then applied to the test split.
# NOTE: this rebinds `scaler`, `X_train` and `X_test` (now NumPy arrays) and,
# below, `model`. Any earlier cell re-run after this one will see these
# rescaled arrays instead of the original DataFrames.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build the network: input -> 32 ReLU -> 16 ReLU -> 1 sigmoid unit.
# An explicit Input layer declares the feature count instead of passing the
# deprecated `input_dim` keyword to the first Dense layer.
model = Sequential()
model.add(Input(shape=(X_train.shape[1],)))  # Input layer
model.add(Dense(32, activation='relu'))  # First hidden layer
model.add(Dense(16, activation='relu'))  # Another hidden layer
model.add(Dense(1, activation='sigmoid'))  # Output layer

# Compile the model for binary classification
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.01), metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=10, verbose=1)

# Evaluate the model on the held-out split
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")