####  Breast Cancer Detection using Wisconsin (Diagnostic) Data Set

### Data Acquisition

In [1]:
import pandas as pd


In [2]:
# Using the pandas library to load the dataset

In [3]:

# Location of the WDBC data file. It is read with header=None, so column names
# are supplied here: an id, the diagnosis label, then 30 generically named features.
url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "breast-cancer-wisconsin/wdbc.data"
)
column_names = ["id", "diagnosis", *(f"feature_{i}" for i in range(30))]
data = pd.read_csv(url, header=None, names=column_names)


### Data Preprocessing

In [4]:
# Drop the 'id' column, as it seems to be only an identifier

In [5]:
# Remove the identifier column. errors='ignore' makes this cell safe to re-run:
# once 'id' is gone, a second execution is a no-op instead of a KeyError.
data = data.drop(columns='id', errors='ignore')


In [6]:
# Mapping the 'diagnosis' column to binary values:

In [7]:
# Encode the diagnosis label as binary: 'M' -> 1, 'B' -> 0.
# The identity entries (1 -> 1, 0 -> 0) keep already-encoded values intact, so
# re-running this cell does not turn the whole column into NaN.
diagnosis_encoding = {'M': 1, 'B': 0, 1: 1, 0: 0}
data['diagnosis'] = data['diagnosis'].map(diagnosis_encoding)


In [8]:
# Split the dataset into train and test sets using sklearn's split function

In [9]:
from sklearn.model_selection import train_test_split

In [10]:


# Separate the target column from the feature columns, then hold out 30% of the
# rows as a test set. The fixed random_state keeps the split reproducible.
y = data['diagnosis']
X = data.drop(columns='diagnosis')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


### Model Training & Evaluation

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

#####  Compare the performance of well-known algorithms:

##### * Logistic Regression
##### * Decision Tree Classifier
##### * Random Forest Classifier
##### * Support Vector Machines (SVM)
##### * k-Nearest Neighbors (kNN)

In [12]:

# Initialize the models.
# The decision tree and random forest are randomized estimators, so their seed
# is pinned (same value as the train/test split) to make the accuracy
# comparison reproducible from one run to the next.
models = {
    "Logistic Regression": LogisticRegression(max_iter=10000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(),
    "kNN": KNeighborsClassifier()
}

# Fit each candidate on the training split and record its accuracy on the
# held-out test split, keyed by the model's display name.
results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    results[name] = accuracy_score(y_test, model.predict(X_test))

# Show the accuracy obtained by each algorithm
print(results)


{'Logistic Regression': 0.9766081871345029, 'Decision Tree': 0.935672514619883, 'Random Forest': 0.9649122807017544, 'SVM': 0.935672514619883, 'kNN': 0.9590643274853801}


### Compare & Choose Best Model

In [13]:
# Pick the name whose recorded accuracy is highest (the first one wins on ties).
best_model_name = max(results, key=lambda candidate: results[candidate])

print(f"Best performing model is: {best_model_name} with an accuracy of {results[best_model_name]}")


Best performing model is: Logistic Regression with an accuracy of 0.9766081871345029


In [14]:
# Reuse the estimator that actually won the comparison rather than hard-coding
# its class. It was already fitted on the training split in the evaluation loop,
# so no second fit is needed.
best_model = models[best_model_name]
best_model

### Prediction

In [15]:
# Create a prediction function that uses the best model to make predictions

In [16]:
def predict_with_best_model(input_data, model=None):
    """
    Use the best model to make a prediction.

    Parameters:
    - input_data: A 2D array or DataFrame representing the features of the data point(s)
    - model: Optional object exposing a ``predict`` method. When omitted, the
      notebook-level ``best_model`` is used, which keeps existing one-argument
      calls working unchanged.

    Returns:
    - predictions: Predicted labels for the input data
    """
    if model is None:
        # Fall back to the model selected earlier in the notebook.
        model = best_model
    return model.predict(input_data)


### Model Usage

In [17]:
# Usage of the prediction function: take the first 5 samples of the test set and predict their labels
# using the best-performing model.

# Given the diagnosis encoding used above, a result of 1 indicates a malignant tumor and 0 a benign tumor.

In [18]:
# Run the prediction helper on the first five rows of the test features.
sample_data = X_test.head(5)
predictions = predict_with_best_model(sample_data)

print(predictions)


[0 1 1 0 0]
