In [40]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

# Read the WDBC file straight from the UCI repository.
# header=None: no row is treated as a header, so columns get integer labels 0, 1, 2, ...
data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data',
    header=None,
)
# Bare last expression -> rich display of the loaded frame
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [41]:
# Assumed column layout: position 0 = sample ID, position 1 = diagnosis label
# (M = malignant, B = benign), positions 2 onward = the features.
# The frame was loaded with header=None, so positional and label-based indexing coincide.
y = data.iloc[:, 1].to_numpy()
X = data.iloc[:, 2:].to_numpy()

In [42]:
# Convert labels to integers for the model: 'M' (malignant) -> 1, anything else -> 0.
# The encoding is derived from the raw label column of `data` rather than from `y`
# itself, so this cell is idempotent: re-running it can no longer compare the
# already-encoded integers against 'M' and silently produce all-zero labels.
y = (data.loc[:, 1] == 'M').to_numpy().astype(int)


In [43]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [44]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so the test rows never influence the scaling.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)


In [45]:
# Create a logistic regression classifier.
# No constructor arguments are passed, so every hyperparameter is left at the
# library default; the estimator is unfitted until .fit() is called.
model = LogisticRegression()


In [46]:
# Fit on the scaled training split, then predict labels for the scaled test split.
# fit() hands back the fitted estimator, so the two steps can be chained; `model`
# itself is fitted in place and stays available for later cells.
predictions = model.fit(X_train, y_train).predict(X_test)

In [47]:
# Score the held-out predictions against the true test labels
accuracy = accuracy_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)
class_report = classification_report(y_test, predictions)

# Emit the three summaries in the same order and format as before
print(f'Accuracy: {accuracy}')
print(f'Confusion Matrix: \n {conf_matrix}')
print(f'Classification Report: \n {class_report}')

Accuracy: 0.9736842105263158
Confusion Matrix: 
 [[70  1]
 [ 2 41]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.97      0.99      0.98        71
           1       0.98      0.95      0.96        43

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



In [50]:
# Example inference on a single sample.
# NOTE(review): the values visible in the displayed frame for row 0 (ID 842302, label M)
# match this vector, so it appears to be a row of the dataset itself rather than unseen
# data. Treat the result as a smoke test of the scale -> predict path, not as an
# out-of-sample prediction. A real sample must provide the same 30 features in the same
# column order as the training data.
# The row is reshaped to (1, n_features) because the scaler and model expect 2-D input.
new_sample_raw = np.array([1.79900000e+01, 1.03800000e+01, 1.22800000e+02, 1.00100000e+03,
       1.18400000e-01, 2.77600000e-01, 3.00100000e-01, 1.47100000e-01,
       2.41900000e-01, 4.87100000e-02, 1.09500000e+00, 9.05300000e-01,
       8.58900000e+00, 1.53400000e+02, 6.39900000e-03, 4.90400000e-02,
       5.37300000e-02, 1.58700000e-02, 3.00300000e-02, 6.19300000e-03,
       2.53800000e+01, 1.73300000e+01, 1.84600000e+02, 2.01900000e+03,
       1.62200000e-01, 6.65600000e-01, 7.11900000e-01, 2.65400000e-01,
       4.60100000e-01, 1.18900000e-01]).reshape(1, -1)

# Scale with the scaler already fitted on the training split (never re-fit here).
# The unscaled row stays available as `new_sample_raw`; `new_sample` holds the scaled
# (1, n_features) row, as it did at the end of the original cell.
new_sample = sc.transform(new_sample_raw)

# Use the model to make a prediction for the new sample
new_prediction = model.predict(new_sample)

# Labels were encoded as 1 = malignant ('M') and 0 = benign
if new_prediction[0] == 0:
    print("The model predicts that the tumor is benign.")
else:
    print("The model predicts that the tumor is malignant.")


The model predicts that the tumor is malignant.
