<a href="https://colab.research.google.com/github/AnanyaCSE-039/ML-LAB/blob/main/1BM22CS039_ML_Lab_Exam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# Step 1: Load the dataset
# Assuming the dataset is stored in 'heart.csv'
df = pd.read_csv('heart.csv')

# Step 2: Remove outliers using Z-score
# **Update numeric_cols based on the actual column names printed above**
# For example, if 'Age' is 'age' in the dataframe, change 'Age' to 'age'
numeric_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak'] # Example correction, replace with actual names
z_scores = stats.zscore(df[numeric_cols])
abs_z_scores = np.abs(z_scores)
filtered_entries = (abs_z_scores < 3).all(axis=1)
df_clean = df[filtered_entries].reset_index(drop=True)

# Step 3: Encode categorical variables and apply scaling
# Identify categorical and numerical columns
# **Update categorical_cols and numeric_cols based on the actual column names**
categorical_cols = ['sex', 'cp', 'restecg', 'exang', 'slope'] # Example correction, replace with actual names
numeric_cols = ['age', 'trestbps', 'chol', 'fbs', 'thalach', 'oldpeak'] # Example correction, replace with actual names

# Create preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_cols),
        ('num', StandardScaler(), numeric_cols)
    ])

# Apply preprocessing
X = df_clean.drop('target', axis=1) # **Assuming 'target' is the name for HeartDisease in your CSV**
y = df_clean['target'] # **Assuming 'target' is the name for HeartDisease in your CSV**
X_processed = preprocessor.fit_transform(X)

# Get feature names after one-hot encoding
cat_encoded_cols = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)
feature_names = list(cat_encoded_cols) + numeric_cols

# Convert processed features back to DataFrame for clarity
X_processed_df = pd.DataFrame(X_processed, columns=feature_names)

# Step 4: Build classification models and evaluate accuracy
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed_df, y, test_size=0.2, random_state=42)

# Define models
models = {
    'SVM': SVC(kernel='rbf', random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

# Train and evaluate each model
accuracies = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies[name] = accuracy
    print(f"{name} Accuracy: {accuracy:.4f}")

# Find the best model
best_model_name = max(accuracies, key=accuracies.get)
print(f"\nBest Model: {best_model_name} with Accuracy: {accuracies[best_model_name]:.4f}")

# Step 5: Apply PCA and evaluate impact on model accuracy
# Determine number of components to retain 95% variance
pca = PCA()
pca.fit(X_processed_df)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
n_components = np.argmax(cumulative_variance >= 0.95) + 1
print(f"\nNumber of PCA components to retain 95% variance: {n_components}")

# Create pipeline with PCA and the best model
best_model = models[best_model_name]
pipeline = Pipeline([
    ('pca', PCA(n_components=n_components)),
    ('classifier', best_model)
])

# Train and evaluate model with PCA
pipeline.fit(X_train, y_train)
y_pred_pca = pipeline.predict(X_test)
accuracy_pca = accuracy_score(y_test, y_pred_pca)
print(f"{best_model_name} Accuracy with PCA: {accuracy_pca:.4f}")
print(f"Accuracy Change with Impact: {accuracy_pca - accuracies[best_model_name]:.4f}")

# Save the processed dataset (optional)
X_processed_df['target'] = y # **Assuming 'target' is the name for HeartDisease in your CSV**
X_processed_df.to_csv('processed_heart.csv', index=False)


SVM Accuracy: 0.7797
Logistic Regression Accuracy: 0.8136
Random Forest Accuracy: 0.7458

Best Model: Logistic Regression with Accuracy: 0.8136

Number of PCA components to retain 95% variance: 10
Logistic Regression Accuracy with PCA: 0.7966
Accuracy Change with Impact: -0.0169
