---
### <center>Task 3 : Heart Disease Prediction</center>
---

### 1. <u>Install Required Libraries</u>

In [3]:
pip install pandas scikit-learn matplotlib

Note: you may need to restart the kernel to use updated packages.


---

###  <u>Load and Inspect the Dataset</u>

In [4]:
import pandas as pd

# Load the dataset. The path is relative, so "heart.csv" must sit in the
# kernel's working directory or read_csv raises FileNotFoundError.
df = pd.read_csv("heart.csv")

# Display first few rows to understand the data structure
print("\nFirst 5 rows of the dataset:")
print(df.head())

# Display column names and data types
print("\nDataset information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

FileNotFoundError: [Errno 2] No such file or directory: 'heart.csv'

---

###  <u>Data Cleaning</u>

In [None]:
# Report how many values are missing in each column of the raw frame.
print("\nMissing values per column:")
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Keep only fully populated rows; `df` itself is left untouched.
# (Imputing the gaps would be the alternative to dropping.)
df_clean = df.dropna()

# Count rows that are exact repeats of an earlier row in the cleaned frame.
duplicate_count = df_clean.duplicated().sum()
print("\nNumber of duplicate rows:", duplicate_count)



Missing values per column:
id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64

Number of duplicate rows: 0


---

###  <u>Exploratory Data Analysis</u>

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns.
print("\nDescriptive statistics for numerical features:")
numeric_summary = df_clean.describe()
print(numeric_summary)

# Frequency table for every categorical column that the frame actually has.
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal', 'num']
print("\nValue counts for categorical features:")
for col in categorical_cols:
    # Skip names that are absent instead of raising a KeyError.
    if col not in df_clean.columns:
        continue
    print(f"\n{col}:")
    print(df_clean[col].value_counts())



Descriptive statistics for numerical features:
               id         age    trestbps        chol      thalch     oldpeak  \
count  299.000000  299.000000  299.000000  299.000000  299.000000  299.000000   
mean   153.872910   54.521739  131.715719  246.785953  149.327759    1.058528   
std     95.896287    9.030264   17.747751   52.532582   23.121062    1.162769   
min      1.000000   29.000000   94.000000  100.000000   71.000000    0.000000   
25%     75.500000   48.000000  120.000000  211.000000  132.500000    0.000000   
50%    151.000000   56.000000  130.000000  242.000000  152.000000    0.800000   
75%    227.500000   61.000000  140.000000  275.500000  165.500000    1.600000   
max    749.000000   77.000000  200.000000  564.000000  202.000000    6.200000   

               ca         num  
count  299.000000  299.000000  
mean     0.672241    0.946488  
std      0.937438    1.230409  
min      0.000000    0.000000  
25%      0.000000    0.000000  
50%      0.000000    0.000000 

---
### <u>Data Processing</u>

In [None]:
from sklearn.model_selection import train_test_split

# One-hot encode the listed columns (dropping the first level of each), then
# append the binary target derived from `num` (0 = no disease, >0 = disease).
df_processed = pd.get_dummies(
    df_clean,
    columns=['sex', 'cp', 'restecg', 'slope', 'thal'],
    drop_first=True,
).assign(target=lambda frame: frame['num'].gt(0).astype(int))

# Target vector, and the feature matrix without the label and bookkeeping
# columns; errors='ignore' tolerates any of those columns being absent.
y = df_processed['target']
X = df_processed.drop(columns=['num', 'target', 'dataset', 'id'], errors='ignore')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


---
### <u>Model Training</u>

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same
# transformation to both splits so the test rows do not influence it.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic regression on the standardized training features.
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train_scaled, y_train)


---
### <u>Model Interpretation</u>

In [None]:
print("\nFeature coefficients from logistic regression:")

# Pair each feature name with its coefficient from the first row of coef_.
# The model was fit on standardized inputs, so the values are on that scale.
coef_table = pd.DataFrame({'Feature': X.columns, 'Coefficient': model.coef_[0]})

# Largest positive coefficient first, most negative last.
coef_df = coef_table.sort_values(by='Coefficient', ascending=False)

print(coef_df)


Feature coefficients from logistic regression:
                     Feature  Coefficient
7                         ca     1.094731
8                   sex_Male     0.688870
5                      exang     0.449508
6                    oldpeak     0.439013
17    thal_reversable defect     0.401521
1                   trestbps     0.370461
14                slope_flat     0.346192
2                       chol     0.192695
13  restecg_st-t abnormality    -0.007176
0                        age    -0.014355
16               thal_normal    -0.102524
12            restecg_normal    -0.137909
15           slope_upsloping    -0.181778
3                        fbs    -0.219021
9         cp_atypical angina    -0.224814
4                     thalch    -0.323420
11         cp_typical angina    -0.519460
10            cp_non-anginal    -0.792993


---
### <u>Model Evaluation</u>

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict labels for the held-out, already standardized test features.
y_pred = model.predict(X_test_scaled)

# Compute every metric up front, then print them under their headings.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

print("\nModel Evaluation:")
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(report)
print("\nConfusion Matrix:")
print(matrix)


Model Evaluation:
Accuracy: 0.90

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.91      0.91        35
           1       0.88      0.88      0.88        25

    accuracy                           0.90        60
   macro avg       0.90      0.90      0.90        60
weighted avg       0.90      0.90      0.90        60


Confusion Matrix:
[[32  3]
 [ 3 22]]


---