In [1]:
#This cell starts the code and imports all the libraries I'll be using in my program

import pandas as pd
#Used as a general data analysis and pre-processing tool throughout the program

import matplotlib.pyplot as plt
#I will use this most likely after training to give visual representations of the model's performance over time

#The following classes are from the Sci-kit Learn library used for training and testing the model using different methods
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

#The confusion matrix helps us evaluate the performances of various models and algorithms
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score


ModuleNotFoundError: No module named 'pandas'

In [2]:
#Load the dataset CSV into a DataFrame.
#NOTE: this is an absolute path on the author's machine; change DATA_PATH to run the notebook elsewhere.
DATA_PATH = "/Users/adwit/Downloads/archive/data.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
#Preview the first rows of the freshly loaded data to check the columns that were read in
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [4]:
#First step of data pre-processing: data cleaning by removing attributes that carry no predictive information.
#"Unnamed: 32" is an empty column (all NaN in the preview above) and "id" is only a record identifier.
#Both columns are dropped in one call; errors="ignore" keeps the cell safe to re-run after they are already gone.
df = df.drop(columns=["Unnamed: 32", "id"], errors="ignore")

In [5]:
#'diagnosis' is the dependent variable: "M" marks a malignant (cancerous) case, "B" a benign (non-cancerous) one.
#Encode it numerically (M -> 1, B -> 0) so the classifiers can be trained on and predict binary labels.
#NOTE: map() turns any value that is not a key of the dictionary into NaN, so run this cell only once per load.
diagnosis_codes = {"M": 1, "B": 0}
df["diagnosis"] = df["diagnosis"].map(diagnosis_codes)

In [6]:
#Preview again to confirm the unused columns are gone and 'diagnosis' is now numeric
df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [7]:
#To make it less overwhelming, the features are split into 3 groups by the statistic they record:
#mean, standard error (se) and worst. Each group can then be checked separately to see which
#one influences the diagnosis attribute the most.
#The groups are picked by column-name suffix rather than by position, so the selection does not
#depend on the column order or on exactly which columns were dropped earlier.
#'diagnosis' is added to every group so that each group's correlation table also shows how its
#features correlate with the diagnosis.
worst_features = [name for name in df.columns if name.endswith("_worst")] + ["diagnosis"]
se_features = [name for name in df.columns if name.endswith("_se")] + ["diagnosis"]
mean_features = [name for name in df.columns if name.endswith("_mean")] + ["diagnosis"]

In [8]:
#Correlation coefficient of every attribute in one feature group with every other attribute in it
#(including 'diagnosis'). Swap se_features for mean_features or worst_features to inspect another group.
corr = df.loc[:, se_features].corr()
corr

Unnamed: 0,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,diagnosis
radius_se,1.0,0.213247,0.972794,0.95183,0.164514,0.356065,0.332358,0.513346,0.240567,0.227754,0.567134
texture_se,0.213247,1.0,0.223171,0.111567,0.397243,0.2317,0.194998,0.230283,0.411621,0.279723,-0.008303
perimeter_se,0.972794,0.223171,1.0,0.937655,0.151075,0.416322,0.362482,0.556264,0.266487,0.244143,0.556141
area_se,0.95183,0.111567,0.937655,1.0,0.07515,0.28484,0.270895,0.41573,0.134109,0.127071,0.548236
smoothness_se,0.164514,0.397243,0.151075,0.07515,1.0,0.336696,0.268685,0.328429,0.413506,0.427374,-0.067016
compactness_se,0.356065,0.2317,0.416322,0.28484,0.336696,1.0,0.801268,0.744083,0.394713,0.803269,0.292999
concavity_se,0.332358,0.194998,0.362482,0.270895,0.268685,0.801268,1.0,0.771804,0.309429,0.727372,0.25373
concave points_se,0.513346,0.230283,0.556264,0.41573,0.328429,0.744083,0.771804,1.0,0.31278,0.611044,0.408042
symmetry_se,0.240567,0.411621,0.266487,0.134109,0.413506,0.394713,0.309429,0.31278,1.0,0.369078,-0.006522
fractal_dimension_se,0.227754,0.279723,0.244143,0.127071,0.427374,0.803269,0.727372,0.611044,0.369078,1.0,0.077972


In [9]:
#Features chosen to train the model, picked from the correlation tables (stated threshold: |r| >= 0.5 with diagnosis).
#Only a subset of the features is used, to reduce the risk of overfitting.
#NOTE(review): perimeter_se shows 0.556 against diagnosis in the se table above but is not listed here - confirm intended.
prediction_attr = (
    ['radius_mean', 'perimeter_mean', 'area_mean', 'concave points_mean']  #mean group
    + ['radius_se', 'area_se']                                             #standard error group
    + ['radius_worst', 'perimeter_worst', 'area_worst']                    #worst group
)

In [10]:
#Hold back 15% of the records for model validation (testing) using scikit-learn's train_test_split.
#Fixing random_state means the same split is produced every time this cell is run.
#The call returns 2 separate dataframes, 'train' and 'test'. Both keep every column of df;
#they only differ in which individual records they contain.
train, test = train_test_split(df, random_state=1, test_size=0.15)

In [11]:
#Separate each dataframe into the selected features (*_x) and the dependent variable vector (*_y)

#Training portion
train_x, train_y = train[prediction_attr], train['diagnosis']

#Testing portion
test_x, test_y = test[prediction_attr], test['diagnosis']

In [12]:
#The model trained here is a random forest classifier (imported from sklearn earlier).
#Its hyperparameters are left at the scikit-learn defaults.
#random_state is fixed because the forest is built with randomness; without a seed every run
#trains a slightly different model and the scores below change from run to run.
model = RandomForestClassifier(random_state=1)
model.fit(train_x, train_y)

RandomForestClassifier()

In [13]:
#Ask the trained model for a predicted label (0 or 1) for every record in the test features
predictions = model.predict(test_x)

In [14]:
#Confusion matrix: a 2D array counting correct and incorrect predictions per class
#(rows are the true labels, columns are the predicted labels)
performance = confusion_matrix(y_true=test_y, y_pred=predictions)
print(performance)

#Accuracy: correct predictions / total predictions, printed as a percentage
accuracy = accuracy_score(y_true=test_y, y_pred=predictions)
print("The accuracy is", accuracy * 100, "%")

[[52  0]
 [ 5 29]]
The accuracy is 94.18604651162791 %


In [15]:
#This gives a more visual understanding of what's going on:
#the first five and last five predicted labels, followed by the true labels.
#Slices are used instead of fixed indices so the cell still runs when there are fewer than five predictions.
print("PREDICTIONS BY AI")
for label in predictions[:5]:
    print("Benign" if label == 0 else "Malignant")
print(". .")
for label in predictions[-5:]:
    print("Benign" if label == 0 else "Malignant")

#The true labels are printed as a pandas Series, so pandas abbreviates the middle rows itself
print("\nACTUAL ANSWERS")
print(test_y.map({0: "Benign", 1: "Malignant"}))

PREDICTIONS BY AI
Benign
Malignant
Benign
Malignant
Benign
. .
Malignant
Benign
Malignant
Benign
Benign

ACTUAL ANSWERS
421       Benign
47     Malignant
292       Benign
186    Malignant
414    Malignant
         ...    
335    Malignant
308       Benign
370    Malignant
403       Benign
120       Benign
Name: diagnosis, Length: 86, dtype: object


In [None]:
##### USER INPUT PREDICTIONS (for demo)
#Number of selected features (the length of prediction_attr)
FNum= len(prediction_attr)
def Mapped(parameter):
    """Translate a numeric diagnosis label into display text.

    Returns "Benign" when parameter equals 0 and "Malignant (Cancerous)"
    for any other value.
    """
    return "Benign" if parameter == 0 else "Malignant (Cancerous)"

#Interactive demo: classify either hand-entered measurements or one row of the held-out test set.
#The answer is normalised so "Y" or " y " are also accepted as yes.
Ans = input("Hi Adwit, would you like to enter your own data today? y/n ").strip().lower()
if Ans == "y":
    print("Enter your value for: ")
    #The measurements are real-valued (e.g. radius_mean = 17.99), so they are parsed as floats;
    #int() would raise ValueError on any decimal input.
    subFeatures = [float(input(name)) for name in prediction_attr]
    #Wrap the single sample in a DataFrame with the training column names so the model
    #receives the same feature layout (and names) it was fitted on.
    Features = pd.DataFrame([subFeatures], columns=prediction_attr)
    prediction = model.predict(Features)
    #predict() returns one label per input row; take the single label before mapping it to text
    print("I predict this case is", Mapped(prediction[0]))

else:
    print("Here's a sample of a pre-existing dataset\n", test_x)
    #Row numbers are entered 1-based and converted to a 0-based position
    row = int(input("\n\nEnter the row number. ")) - 1
    if not 0 <= row < len(test_x):
        #Without this check, entering 0 (or a number past the end) selects no rows and predict() fails
        print("Row number must be between 1 and", len(test_x))
    else:
        #Double brackets keep the selection as a one-row DataFrame, which is what predict() expects
        Features = test_x.iloc[[row]]
        print(Features)
        prediction = model.predict(Features)
        print("I predict this case is", Mapped(prediction[0]))

        Actual = test_y.iloc[row]
        print("The true diagnosis from the dataset is", Mapped(Actual))