<a href="https://colab.research.google.com/github/2303A52083/2303A52083-23CSBTB39-40-SML/blob/main/SML11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Compare SVM, KNN and logistic regression on the data in 'bcs.csv': once on
# standardized features, then again after PCA dimensionality reduction.

# Step 1: Load the data
data = pd.read_csv('bcs.csv')

# Step 2: Identify features and target variable
# Drop rows with missing values in 'Patient_Status'. The explicit copy makes
# sure the column assignment below writes to an independent frame rather than
# to something pandas may treat as a slice of the original.
data = data.dropna(subset=['Patient_Status']).copy()

# Encode the target variable 'Patient_Status'. LabelEncoder numbers classes in
# sorted order (i.e. Alive=0, Dead=1 if those are the two labels present --
# check label_encoder.classes_ to confirm).
label_encoder = LabelEncoder()
data['Patient_Status'] = label_encoder.fit_transform(data['Patient_Status'])

# Drop the date columns, which are not used as features
data = data.drop(columns=['Date_of_Surgery', 'Date_of_Last_Visit'])

# Separate features and target
X = data.drop(columns=['Patient_Status'])
y = data['Patient_Status']

# One-hot encode categorical variables (first level of each column dropped).
# NOTE(review): this expands every non-numeric column, including any
# identifier-like column that may be present -- confirm against bcs.csv.
X = pd.get_dummies(X, drop_first=True)

# Split data into training and test sets (80/20, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data. The scaler is fitted on the training split only and
# then applied unchanged to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


def _fit_and_score(model, X_fit, y_fit, X_eval, y_eval):
    """Fit *model* on the training data and score it on the evaluation data.

    Args:
        model: Estimator exposing ``fit`` and ``predict``. It is fitted in
            place, so any earlier fit of the same object is replaced.
        X_fit: Training feature matrix.
        y_fit: Training labels.
        X_eval: Evaluation feature matrix.
        y_eval: True labels for ``X_eval``.

    Returns:
        Tuple ``(accuracy, predictions)`` where ``accuracy`` is the value of
        ``accuracy_score(y_eval, predictions)``.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    return accuracy_score(y_eval, predictions), predictions


# Step 3: Train models and find accuracy
# Initialize models
svm = SVC(random_state=42)
knn = KNeighborsClassifier()
log_reg = LogisticRegression(random_state=42)

# Train and evaluate models on the standardized features
models = {'SVM': svm, 'KNN': knn, 'Logistic Regression': log_reg}
accuracies = {}

for model_name, model in models.items():
    accuracies[model_name], y_pred = _fit_and_score(
        model, X_train, y_train, X_test, y_test
    )
    print(f'{model_name} Accuracy (before PCA): {accuracies[model_name]:.2f}')

# Step 4: Reduce dimensionality with PCA
# PCA raises ValueError when asked for more components than the training
# matrix has rows or columns, so cap the requested count at its shape.
PCA_COMPONENTS = 5  # Adjust the number of components as needed
n_components = min(PCA_COMPONENTS, *X_train.shape)
# random_state only matters if a randomized SVD solver is selected; it is set
# so the projection is repeatable in that case too.
pca = PCA(n_components=n_components, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Step 5: Retrain models on PCA-transformed data and evaluate accuracy
# The same estimator objects are refitted, so after this loop they hold the
# PCA-based fit.
pca_accuracies = {}

for model_name, model in models.items():
    pca_accuracies[model_name], y_pred_pca = _fit_and_score(
        model, X_train_pca, y_train, X_test_pca, y_test
    )
    print(f'{model_name} Accuracy (after PCA): {pca_accuracies[model_name]:.2f}')

# Display all accuracies
print("\nAccuracies before PCA:", accuracies)
print("Accuracies after PCA:", pca_accuracies)


SVM Accuracy (before PCA): 0.77
KNN Accuracy (before PCA): 0.77
Logistic Regression Accuracy (before PCA): 0.77
SVM Accuracy (after PCA): 0.77
KNN Accuracy (after PCA): 0.71
Logistic Regression Accuracy (after PCA): 0.77

Accuracies before PCA: {'SVM': 0.7692307692307693, 'KNN': 0.7692307692307693, 'Logistic Regression': 0.7692307692307693}
Accuracies after PCA: {'SVM': 0.7692307692307693, 'KNN': 0.7076923076923077, 'Logistic Regression': 0.7692307692307693}
