In [1]:
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import confusion_matrix

import pandas as pd

from matplotlib.colors import ListedColormap



# Import dataset
#
# Build "breast-cancer-wisconsin-filtered.csv" from the raw data file, keeping
# only the lines that do not contain "?", then load it into a DataFrame.
# The raw data is downloaded when possible; if anything goes wrong, a local
# copy named "breast-cancer-wisconsin.data" is filtered instead.
try:
    import requests

    # The timeout stops a stalled connection from hanging the script, and
    # raise_for_status() turns HTTP errors (404, 500, ...) into exceptions so
    # that an error page is never written out as if it were data.
    r = requests.get(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
        timeout=30,
    )
    r.raise_for_status()

    with open("breast-cancer-wisconsin-filtered.csv", mode='w') as out:
        # splitlines() copes with both "\n" and "\r\n" line endings and does
        # not produce a spurious empty entry after the final newline.
        for line in r.text.splitlines():
            if line and "?" not in line:
                out.write(line + "\n")
    print("Downloaded data from Internet")
# Fall back to reading file from local disk.
# The broad catch is deliberate: a missing `requests` package, a network
# failure, or an HTTP error should all lead to the same fallback.
except Exception:
    with open("breast-cancer-wisconsin.data") as f:
        with open("breast-cancer-wisconsin-filtered.csv", mode='w') as out:
            for line in f:
                if "?" not in line:
                    out.write(line)
    print("Failed to read data from Internet... loaded from local disk instead.")

# The file has no header row, so columns are addressed by position (0-10).
dataset = pd.read_csv("breast-cancer-wisconsin-filtered.csv", header=None)



# Dataset contains (attribute numbers below are 1-based; the code uses
# 0-based column positions, so attribute N is column N-1):
'''

   #  Attribute                     Domain

   -- -----------------------------------------

   1. Sample code number            id number

   2. Clump Thickness               1 - 10

   3. Uniformity of Cell Size       1 - 10

   4. Uniformity of Cell Shape      1 - 10

   5. Marginal Adhesion             1 - 10

   6. Single Epithelial Cell Size   1 - 10

   7. Bare Nuclei                   1 - 10

   8. Bland Chromatin               1 - 10

   9. Normal Nucleoli               1 - 10

  10. Mitoses                       1 - 10

  11. Class:                        (2 for benign, 4 for malignant)

'''

# Independent variables: column positions 1 through 9 (position 0, the sample
# code number, is not used as a feature).
feature_columns = list(range(1, 10))
X = dataset.iloc[:, feature_columns].values

# Dependent variable: column position 10 (the class label).
class_column = 10
y = dataset.iloc[:, class_column].values



# Hold out 25% of the rows for testing; the remaining 75% are used for training.
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# Scale the features.
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
sc_X = StandardScaler()
sc_X.fit(X_train)  # Learn the scaling parameters from the training rows only
X_train = sc_X.transform(X_train)  # Apply that scaling to the training rows
X_test = sc_X.transform(X_test)  # Re-use the same fitted scaler for the test rows



# Create the logistic-regression model and train it on the scaled training data
# (fit() returns the fitted estimator itself).
classifier = LogisticRegression(random_state=0).fit(X_train, y_train)

# Predict a class for every row of the test set
y_pred = classifier.predict(X_test)

# Confusion matrix: actual classes (y_test) against predicted classes (y_pred)
cm = confusion_matrix(y_test, y_pred)

# Accuracy = correctly classified test rows / all test rows.
# The correct predictions are the two diagonal entries of the matrix.
correct_predictions = cm[0, 0] + cm[1, 1]
accuracy = correct_predictions / len(y_test)
print("Accuracy:", accuracy)



# Prompt the user for one value per feature; the values are collected in
# userVals in the same order as the labels in items.
items = [
    "Clump Thickness               1 - 10",
    "Uniformity of Cell Size       1 - 10",
    "Uniformity of Cell Shape      1 - 10",
    "Marginal Adhesion             1 - 10",
    "Single Epithelial Cell Size   1 - 10",
    "Bare Nuclei                   1 - 10",
    "Bland Chromatin               1 - 10",
    "Normal Nucleoli               1 - 10",
    "Mitoses                       1 - 10",
]


def _prompt_for_score(label, low=1, high=10):
    """Ask for one feature value until a valid integer is entered.

    Args:
        label: Text shown to the user after "Enter ".
        low: Smallest accepted value (inclusive).
        high: Largest accepted value (inclusive).

    Returns:
        The entered value as an int, guaranteed to lie in [low, high].
    """
    while True:
        res = input("Enter " + label + " : ")
        try:
            value = int(res)
        except ValueError:
            # A typo should not abort the whole session; ask again instead.
            print("Please enter a whole number between {} and {}.".format(low, high))
            continue
        if low <= value <= high:
            return value
        print("Value must be between {} and {}.".format(low, high))


userVals = [_prompt_for_score(item) for item in items]



# Predict the class for the feature values the user entered (userVals).
# The values must go through the same scaler that was fitted earlier (sc_X)
# before they are passed to the model.
userInput_X = np.array(userVals).reshape(1, -1)  # One row holding the user's values
userInput_X = sc_X.transform(userInput_X)  # Scale the input

# Using the model ("classifier") that was previously created, predict the outcome
user_pred_y = classifier.predict(userInput_X)
predicted_class = user_pred_y[0]

# Class codes: 2 is benign, 4 is malignant; anything else is reported as unknown.
outcome_labels = {
    2: " (Benign)",
    4: " (Malignant)",
}
print("Predicted outcome for user data: {}".format(predicted_class), end='')
print(outcome_labels.get(predicted_class, " (Error/Unknown)"))
    

Downloaded data from Internet
Accuracy: 0.9473684210526315
Enter Clump Thickness               1 - 10 : 6
Enter Uniformity of Cell Size       1 - 10 : 7
Enter Uniformity of Cell Shape      1 - 10 : 3
Enter Marginal Adhesion             1 - 10 : 2
Enter Single Epithelial Cell Size   1 - 10 : 1
Enter Bare Nuclei                   1 - 10 : 9
Enter Bland Chromatin               1 - 10 : 8
Enter Normal Nucleoli               1 - 10 : 7
Enter Mitoses                       1 - 10 : 1
Predicted outcome for user data: 4 (Malignant)
