In [22]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

# Load the dataset (fixing the file path issue)
data = pd.read_csv(r"C:\Users\ASWANTH\Downloads\New folder (2)\data.csv")

# Drop 'Unnamed: 32' column explicitly if it contains all missing values
data = data.drop(columns=['Unnamed: 32'], axis=1)

# Handle missing values
imputer = SimpleImputer(strategy='most_frequent')
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# Encode categorical target (assuming 'diagnosis' is the target column)
label_encoder = LabelEncoder()
data_imputed['diagnosis'] = label_encoder.fit_transform(data_imputed['diagnosis'])

# Separate features and target
X = data_imputed.drop('diagnosis', axis=1)
y = data_imputed['diagnosis']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Logistic Regression
log_reg_clf = LogisticRegression(max_iter=1000, random_state=42)
log_reg_clf.fit(X_train, y_train)
log_reg_pred = log_reg_clf.predict(X_test)
log_reg_accuracy = accuracy_score(y_test, log_reg_pred)

# Bagging Classifier
bagging_clf = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=10, random_state=42)
bagging_clf.fit(X_train, y_train)
bagging_pred = bagging_clf.predict(X_test)
bagging_accuracy = accuracy_score(y_test, bagging_pred)

# Boosting Classifier (AdaBoost)
boosting_clf = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1), n_estimators=50, algorithm='SAMME', random_state=42)
boosting_clf.fit(X_train, y_train)
boosting_pred = boosting_clf.predict(X_test)
boosting_accuracy = accuracy_score(y_test, boosting_pred)

# Voting Classifier
knn_clf = KNeighborsClassifier()
voting_clf = VotingClassifier(
    estimators=[('lr', log_reg_clf), ('dt', DecisionTreeClassifier()), ('knn', knn_clf)],
    voting='hard'
)
voting_clf.fit(X_train, y_train)
voting_pred = voting_clf.predict(X_test)
voting_accuracy = accuracy_score(y_test, voting_pred)

# Print accuracy scores
print("Logistic Regression Accuracy:", log_reg_accuracy)
print("Bagging Classifier Accuracy:", bagging_accuracy)
print("Boosting Classifier Accuracy:", boosting_accuracy)
print("Voting Classifier (Ensemble) Accuracy:", voting_accuracy)


Logistic Regression Accuracy: 0.9590643274853801
Bagging Classifier Accuracy: 0.9532163742690059
Boosting Classifier Accuracy: 0.9824561403508771
Voting Classifier (Ensemble) Accuracy: 0.9590643274853801
