## Data Loading and Exploration

In [7]:
import hvplot.pandas
import numpy as np
import pandas as pd
import panel as pn
from ucimlrepo import fetch_ucirepo 

# Download the "Heart Disease" dataset (UCI repository id 45).
heart_disease = fetch_ucirepo(id=45)

# Pull the feature table and the target table out of the fetched payload.
dataset_frames = heart_disease.data
X, y = dataset_frames.features, dataset_frames.targets

# Print the dataset-level metadata, then the per-variable information.
for dataset_info in (heart_disease.metadata, heart_disease.variables):
    print(dataset_info)

# Preview the leading rows of the features, then of the targets.
for frame in (X, y):
    print(frame.head())

# Summary statistics for the feature columns.
print(X.describe())


{'uci_id': 45, 'name': 'Heart Disease', 'repository_url': 'https://archive.ics.uci.edu/dataset/45/heart+disease', 'data_url': 'https://archive.ics.uci.edu/static/public/45/data.csv', 'abstract': '4 databases: Cleveland, Hungary, Switzerland, and the VA Long Beach', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 303, 'num_features': 13, 'feature_types': ['Categorical', 'Integer', 'Real'], 'demographics': ['Age', 'Sex'], 'target_col': ['num'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1989, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C52P4X', 'creators': ['Andras Janosi', 'William Steinbrunn', 'Matthias Pfisterer', 'Robert Detrano'], 'intro_paper': {'ID': 231, 'type': 'NATIVE', 'title': 'International application of a new probability algorithm for the diagnosis of coronary artery disease.', 'authors': 'R. Detrano, A. Jánosi, W. Steinbrunn, M

## Data Cleaning

In [8]:
# Start from the untouched feature table every time so this cell can be re-run
# safely: one-hot encoding removes the original categorical columns, so
# applying it again to an already-encoded X would raise a KeyError.
features_raw = heart_disease.data.features

# Check for missing values
print(features_raw.isnull().sum())

# Columns that hold category codes rather than measurements
categorical_features = ['cp', 'restecg', 'exang', 'slope', 'thal']
numeric_features = [col for col in features_raw.columns if col not in categorical_features]

# Fill missing values (if any).
# Category codes are filled with the most frequent code: a column mean is
# generally not one of the existing codes, and get_dummies would turn it into
# its own bogus dummy column. The remaining columns keep the mean fill.
# NOTE(review): `ca` looks integer-valued in the printed sample; confirm whether
# a fractional mean is acceptable there or a median/mode fill is preferred.
fill_values = pd.concat([
    features_raw[categorical_features].mode().iloc[0],
    features_raw[numeric_features].mean(),
])
X = features_raw.fillna(fill_values)

# Encode categorical variables
X = pd.get_dummies(X, columns=categorical_features)

# Every gap must be filled before the data reaches a model
assert not X.isnull().any().any(), "unexpected missing values after cleaning"

# Verify the cleaning process
print(X.head())


age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
dtype: int64
   age  sex  trestbps  chol  fbs  thalach  oldpeak   ca   cp_1   cp_2  ...  \
0   63    1       145   233    1      150      2.3  0.0   True  False  ...   
1   67    1       160   286    0      108      1.5  3.0  False  False  ...   
2   67    1       120   229    0      129      2.6  2.0  False  False  ...   
3   37    1       130   250    0      187      3.5  0.0  False  False  ...   
4   41    0       130   204    0      172      1.4  0.0  False   True  ...   

   restecg_2  exang_0  exang_1  slope_1  slope_2  slope_3  thal_3.0  \
0       True     True    False    False    False     True     False   
1       True    False     True    False     True    False      True   
2       True    False     True    False     True    False     False   
3      False     True    False    False    False    

## Feature Engineering

In [1]:
from sklearn.preprocessing import StandardScaler

# Example of creating a new feature (age category).
# pd.cut bins are right-inclusive by default: (0, 30], (30, 50], (50, 70], (70, 100].
# `assign` returns a new frame instead of writing into whatever object X
# currently refers to, so the upstream table is never modified in place.
X = X.assign(age_cat=pd.cut(X['age'], bins=[0, 30, 50, 70, 100], labels=[1, 2, 3, 4]))

# Normalize/Standardize features
# NOTE(review): the scaler is fitted on every row, before the train/test split
# is made, so held-out rows influence the scaling statistics; consider fitting
# on the training portion only.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X)

# Convert back to DataFrame, keeping X's row index as well as its column names
# so the scaled rows stay label-aligned with X (and anything indexed like X).
X_scaled = pd.DataFrame(scaled_values, columns=X.columns, index=X.index)

# Verify the feature engineering process
print(X_scaled.head())


NameError: name 'pd' is not defined

## Splitting the Data

In [2]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation with a fixed seed for reproducibility.
# `stratify=y` keeps the target's class proportions the same in both
# partitions, so a rare class is not left out of one of them by chance.
# NOTE: stratification requires at least two rows for every class in y.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# Verify the split
assert len(X_train) + len(X_test) == len(X_scaled), "rows lost or duplicated by the split"
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)


NameError: name 'X_scaled' is not defined

## Modeling

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Initialize the model
model = LogisticRegression()

# Train the model.
# The targets may arrive as a single-column table; scikit-learn expects a 1-D
# label vector and emits a DataConversionWarning otherwise, so flatten first.
model.fit(X_train, np.ravel(y_train))

# Predictions
y_pred = model.predict(X_test)

# Evaluate the model against the flattened true labels
y_true = np.ravel(y_test)
accuracy = accuracy_score(y_true, y_pred)
conf_matrix = confusion_matrix(y_true, y_pred)
# zero_division=0 reports 0 (instead of warning) for any class that the model
# never predicts, which keeps the report readable when some classes are rare.
class_report = classification_report(y_true, y_pred, zero_division=0)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)


NameError: name 'X_train' is not defined