In [1]:
pwd

'C:\\Users\\asus\\Heart diseases prediction'

In [42]:
# Import pandas for data handling
import pandas as pd

# Load the dataset
df = pd.read_csv("heart.csv")
print("Step 1: Dataset Loaded")
print(df.head())

Step 1: Dataset Loaded
   Age Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  MaxHR  \
0   40   M           ATA        140          289          0     Normal    172   
1   49   F           NAP        160          180          0     Normal    156   
2   37   M           ATA        130          283          0         ST     98   
3   48   F           ASY        138          214          0     Normal    108   
4   54   M           NAP        150          195          0     Normal    122   

  ExerciseAngina  Oldpeak ST_Slope  HeartDisease  
0              N      0.0       Up             0  
1              N      1.0     Flat             1  
2              N      0.0       Up             0  
3              Y      1.5     Flat             1  
4              N      0.0       Up             0  


In [44]:
from scipy.stats import zscore

# Remove outliers for numerical columns using Z-score thresholding
df_no_outliers = df[(zscore(df.select_dtypes(include=[float, int])) < 3).all(axis=1)]
print("Step 2: Removed Outliers")
print("Data after removing outliers:")
print(df_no_outliers.describe())


Step 2: Removed Outliers
Data after removing outliers:
              Age   RestingBP  Cholesterol   FastingBS       MaxHR  \
count  902.000000  902.000000   902.000000  902.000000  902.000000   
mean    53.487805  131.854767   197.347007    0.232816  136.848115   
std      9.444115   17.682612   107.585613    0.422860   25.451226   
min     28.000000    0.000000     0.000000    0.000000   60.000000   
25%     47.000000  120.000000   173.000000    0.000000  120.000000   
50%     54.000000  130.000000   222.000000    0.000000  138.000000   
75%     60.000000  140.000000   266.000000    0.000000  156.000000   
max     77.000000  185.000000   518.000000    1.000000  202.000000   

          Oldpeak  HeartDisease  
count  902.000000    902.000000  
mean     0.857428      0.548780  
std      1.013157      0.497891  
min     -2.600000      0.000000  
25%      0.000000      0.000000  
50%      0.500000      1.000000  
75%      1.500000      1.000000  
max      4.000000      1.000000  


In [56]:
# Step 3: Encoded Categorical Columns
from sklearn.preprocessing import LabelEncoder

# Create a deep copy to avoid modifying a view of the original DataFrame
df_no_outliers = df_no_outliers.copy()

# Columns to be label encoded
text_columns = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
label_encoders = {}

# Encode categorical columns
for column in text_columns:
    le = LabelEncoder()
    df_no_outliers.loc[:, column] = le.fit_transform(df_no_outliers[column])
    label_encoders[column] = le

print("Step 3: Encoded Categorical Columns")
print(df_no_outliers.head())


Step 3: Encoded Categorical Columns
   Age  Sex  ChestPainType  RestingBP  Cholesterol  FastingBS  RestingECG  \
0   40    1              1        140          289          0           1   
1   49    0              2        160          180          0           1   
2   37    1              1        130          283          0           2   
3   48    0              0        138          214          0           1   
4   54    1              2        150          195          0           1   

   MaxHR  ExerciseAngina  Oldpeak  ST_Slope  HeartDisease  
0    172               0      0.0         2             0  
1    156               0      1.0         1             1  
2     98               0      0.0         2             0  
3    108               1      1.5         1             1  
4    122               0      0.0         2             0  


In [69]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Apply scaling to the entire dataset
scaled_df = df_no_outliers.copy()
scaled_df[scaled_df.columns] = scaler.fit_transform(df_no_outliers)
print("Step 4: Applied Scaling")
print(scaled_df.head())


Step 4: Applied Scaling
        Age       Sex  ChestPainType  RestingBP  Cholesterol  FastingBS  \
0 -1.428963  0.514856       0.224990   0.460891     0.852380  -0.550879   
1 -0.475460 -1.942289       1.271075   1.592573    -0.161329  -0.550879   
2 -1.746797  0.514856       0.224990  -0.104950     0.796580  -0.550879   
3 -0.581404 -1.942289      -0.821096   0.347722     0.154874  -0.550879   
4  0.054264  0.514856       1.271075   1.026732    -0.021827  -0.550879   

   RestingECG     MaxHR  ExerciseAngina   Oldpeak  ST_Slope  HeartDisease  
0    0.012337  1.381913       -0.820652 -0.846763  1.045634     -1.102822  
1    0.012337  0.752911       -0.820652  0.140799 -0.620730      0.906765  
2    1.601989 -1.527222       -0.820652 -0.846763  1.045634     -1.102822  
3    0.012337 -1.134095        1.218544  0.634579 -0.620730      0.906765  
4    0.012337 -0.583719       -0.820652 -0.846763  1.045634     -1.102822  


In [71]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Define features and target variable
X = scaled_df.drop(columns='HeartDisease')
y = scaled_df['HeartDisease']

# Check unique values in y to ensure it's binary or categorical
print("Unique values in target variable (y) before conversion:", y.unique())

# Convert `y` to binary if necessary
if y.dtype not in [int, bool] or len(y.unique()) > 2:
    y = y.apply(lambda x: 1 if x > 0 else 0)

# Confirm unique values after conversion
print("Unique values in target variable (y) after conversion:", y.unique())

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models
models = {
    'SVM': SVC(),
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier()
}

# Dictionary to store model accuracies
accuracies = {}

# Train and evaluate each model
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies[model_name] = accuracy
    print(f"{model_name} Accuracy without PCA: {accuracy:.4f}")

print("Step 5: Model Accuracies without PCA:", accuracies)


Unique values in target variable (y) before conversion: [-1.10282193  0.9067647 ]
Unique values in target variable (y) after conversion: [0 1]
SVM Accuracy without PCA: 0.8895
Logistic Regression Accuracy without PCA: 0.8508
Random Forest Accuracy without PCA: 0.8729
Step 5: Model Accuracies without PCA: {'SVM': 0.8895027624309392, 'Logistic Regression': 0.850828729281768, 'Random Forest': 0.8729281767955801}


In [75]:
from sklearn.decomposition import PCA

# Initialize PCA to keep 95% of the variance
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Dictionary to store accuracies after PCA
accuracies_pca = {}

# Train and evaluate each model with PCA-transformed data
for model_name, model in models.items():
    model.fit(X_train_pca, y_train)
    y_pred_pca = model.predict(X_test_pca)
    accuracy_pca = accuracy_score(y_test, y_pred_pca)
    accuracies_pca[model_name] = accuracy_pca
    print(f"{model_name} Accuracy with PCA: {accuracy_pca:.4f}")

print("Step 6: Model Accuracies with PCA:", accuracies_pca)


SVM Accuracy with PCA: 0.8785
Logistic Regression Accuracy with PCA: 0.8453
Random Forest Accuracy with PCA: 0.8398
Step 6: Model Accuracies with PCA: {'SVM': 0.8784530386740331, 'Logistic Regression': 0.8453038674033149, 'Random Forest': 0.8397790055248618}
