<a href="https://colab.research.google.com/github/Aasthapriy44/ML_6A_003/blob/main/1BM22CS003_LAB_11_PCA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
from sklearn.decomposition import PCA
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the digits dataset
digits = load_digits()

# Split the data into features (X) and target (y)
X = digits.data
y = digits.target

# Split the dataset into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform scaling (standardization) using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build PCA model with 2 components
pca = PCA(n_components=2)

# Fit the PCA model on the training data and transform it
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Initialize Logistic Regression model
log_reg = LogisticRegression(max_iter=10000)

# Train the Logistic Regression model using the PCA-transformed data
log_reg.fit(X_train_pca, y_train)

# Predict using the trained model
y_pred = log_reg.predict(X_test_pca)

# Calculate and print the accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using PCA with 2 components: {accuracy:.4f}")


Accuracy using PCA with 2 components: 0.5167


In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer

try:
    df = pd.read_csv('heart.csv')
    print("Dataset loaded successfully.")
    print(df.head())
except FileNotFoundError:
    print("Error: 'heart_disease.csv' not found. Please make sure the file is in the correct directory.")
    exit()

# 2. Remove outliers using Z-score

numerical_cols = df.select_dtypes(include=np.number).columns

for col in numerical_cols:
    z = np.abs((df[col] - df[col].mean()) / df[col].std())
    df = df[z <= 3]

print(f"Shape of DataFrame after outlier removal: {df.shape}")

# Identify categorical and numerical columns
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
# Separate target variable (assuming the last column is the target)
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# 3. Convert text columns to numbers using label encoding and one-hot encoding
#print("\nConverting categorical columns...")
if len(categorical_cols) > 0:
    # Apply one-hot encoding to categorical columns
    preprocessor = ColumnTransformer(
        transformers=[
            ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_cols)],
        remainder='passthrough')

    X_processed = preprocessor.fit_transform(X)
    feature_names = preprocessor.get_feature_names_out(X.columns)
    X = pd.DataFrame(X_processed, columns=feature_names)
    #print("Categorical columns one-hot encoded.")
#else:
    #print("No categorical columns found for encoding.")

# 4. Apply scaling
#print("\nApplying scaling...")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
#print("Scaling applied.")

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled_df, y, test_size=0.2, random_state=42)

# 5. Build a classification model using various methods
#print("\nBuilding and evaluating classification models...")
models = {
    'SVM': SVC(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, solver='liblinear'),
    'Random Forest': RandomForestClassifier(random_state=42)
}

results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(f"{name} Accuracy: {accuracy:.4f}")

best_model_name = max(results, key=results.get)
best_accuracy = results[best_model_name]
print(f"\nThe best performing model is {best_model_name} with an accuracy of {best_accuracy:.4f}")

# 6. Now use PCA to reduce dimensions, retrain your model and see what impact it has

pca = PCA(n_components=0.95) # Retain 95% of variance
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

print(f"Original number of features: {X_train.shape[1]}")
print(f"Number of features after PCA: {X_train_pca.shape[1]}")

results_pca = {}
for name, model in models.items():
    model.fit(X_train_pca, y_train)
    y_pred_pca = model.predict(X_test_pca)
    accuracy_pca = accuracy_score(y_test, y_pred_pca)
    results_pca[name] = accuracy_pca
    print(f"{name} Accuracy with PCA: {accuracy_pca:.4f}")

print("\nComparison of accuracies:")
for model_name in results:
    print(f"{model_name}: Original Accuracy = {results[model_name]:.4f}, Accuracy with PCA = {results_pca[model_name]:.4f}")



Dataset loaded successfully.
   Age Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  MaxHR  \
0   40   M           ATA        140          289          0     Normal    172   
1   49   F           NAP        160          180          0     Normal    156   
2   37   M           ATA        130          283          0         ST     98   
3   48   F           ASY        138          214          0     Normal    108   
4   54   M           NAP        150          195          0     Normal    122   

  ExerciseAngina  Oldpeak ST_Slope  HeartDisease  
0              N      0.0       Up             0  
1              N      1.0     Flat             1  
2              N      0.0       Up             0  
3              Y      1.5     Flat             1  
4              N      0.0       Up             0  
Shape of DataFrame after outlier removal: (899, 12)
SVM Accuracy: 0.8778
Logistic Regression Accuracy: 0.8889
Random Forest Accuracy: 0.8667

The best performing model is Logisti