# BREAST CANCER CLASSIFICATION

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report


In [7]:
# Load the dataset
# NOTE(review): absolute, machine-specific path — this only resolves on a
# machine where this exact file exists; confirm before running elsewhere.
file_path = '/home/rguktongole/Downloads/data.csv'
data = pd.read_csv(file_path)  # parse the CSV into a DataFrame

In [8]:
# Drop the 'id' and 'Unnamed: 32' columns, which are not used as features.
# errors='ignore' skips any of these columns that is already absent, so the
# cell can be re-run (or fed a CSV without 'Unnamed: 32') without raising
# KeyError.
data = data.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

In [9]:
# Replace the 'diagnosis' labels with integer codes.  The fitted encoder stays
# available as `label_encoder`.
label_encoder = LabelEncoder()
encoded_diagnosis = label_encoder.fit_transform(data['diagnosis'])
data['diagnosis'] = encoded_diagnosis

In [10]:
# Separate the target column (y) from the remaining feature columns (X).
y = data['diagnosis']
X = data.drop(columns=['diagnosis'])

In [11]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [12]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to the test split.
scaler = StandardScaler()
X_train, X_test = scaler.fit_transform(X_train), scaler.transform(X_test)


In [13]:
# Define a function to evaluate models
def evaluate_model(model):
    """Fit ``model`` on the training split and score it on the test split.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``.  ``model`` is fitted in place.

    Returns:
        An ``(accuracy, report)`` tuple holding the results of
        ``accuracy_score`` and ``classification_report`` for the test-split
        predictions.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return (
        accuracy_score(y_test, predictions),
        classification_report(y_test, predictions),
    )

In [14]:
# Instantiate the estimators to compare.  Those given random_state=42 use a
# fixed seed.
lin_reg = LinearRegression()  # a regressor; its output is thresholded later
log_reg = LogisticRegression(random_state=42, max_iter=10000)
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)


In [15]:
# Classifiers to score with evaluate_model, keyed by display name.
models = {
    'K-Nearest Neighbors': knn,
    'Random Forest': rf,
    'Logistic Regression': log_reg,
    'Decision Tree': dt,
}


In [16]:
# Score every classifier and collect its accuracy and text report by name.
results = {}
for name, model in models.items():
    accuracy, report = evaluate_model(model)
    results[name] = dict(Accuracy=accuracy, Report=report)


In [17]:
# LinearRegression is not a classifier: it predicts continuous values, so its
# predictions are cut at 0.5 to obtain 0/1 labels before scoring.
y_pred_lin = lin_reg.fit(X_train, y_train).predict(X_test)
y_pred_lin_class = (y_pred_lin > 0.5).astype(int)
lin_reg_report = classification_report(y_test, y_pred_lin_class)
lin_reg_accuracy = accuracy_score(y_test, y_pred_lin_class)

results['Linear Regression'] = {
    'Accuracy': lin_reg_accuracy,
    'Report': lin_reg_report,
}

results

{'K-Nearest Neighbors': {'Accuracy': 0.9473684210526315,
  'Report': '              precision    recall  f1-score   support\n\n           0       0.96      0.96      0.96        71\n           1       0.93      0.93      0.93        43\n\n    accuracy                           0.95       114\n   macro avg       0.94      0.94      0.94       114\nweighted avg       0.95      0.95      0.95       114\n'},
 'Random Forest': {'Accuracy': 0.9649122807017544,
  'Report': '              precision    recall  f1-score   support\n\n           0       0.96      0.99      0.97        71\n           1       0.98      0.93      0.95        43\n\n    accuracy                           0.96       114\n   macro avg       0.97      0.96      0.96       114\nweighted avg       0.97      0.96      0.96       114\n'},
 'Logistic Regression': {'Accuracy': 0.9736842105263158,
  'Report': '              precision    recall  f1-score   support\n\n           0       0.97      0.99      0.98        71\n        