In [1]:
#!pip install --upgrade networkx

In [2]:
#!pip install pybbn

In [3]:

import pandas as pd # for data manipulation 
import networkx as nx # for drawing graphs
import matplotlib.pyplot as plt # for drawing graphs
import numpy as np

# for creating Bayesian Belief Networks (BBN)
from pybbn.graph.dag import Bbn
from pybbn.graph.edge import Edge, EdgeType
from pybbn.graph.jointree import EvidenceBuilder
from pybbn.graph.node import BbnNode
from pybbn.graph.variable import Variable
from pybbn.pptc.inferencecontroller import InferenceController

In [4]:
# Read in the data csv
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data', header = None, encoding='utf-8')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       699 non-null    int64 
 1   1       699 non-null    int64 
 2   2       699 non-null    int64 
 3   3       699 non-null    int64 
 4   4       699 non-null    int64 
 5   5       699 non-null    int64 
 6   6       699 non-null    object
 7   7       699 non-null    int64 
 8   8       699 non-null    int64 
 9   9       699 non-null    int64 
 10  10      699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.2+ KB


In [6]:
df.columns = ['ID', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape', 'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 
              'Mitosis', 'Class (Benign/malignant)']

In [7]:
df.head()

Unnamed: 0,ID,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitosis,Class (Benign/malignant)
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [8]:
df.nunique()

ID                             645
Clump Thickness                 10
Uniformity of Cell Size         10
Uniformity of Cell Shape        10
Marginal Adhesion               10
Single Epithelial Cell Size     10
Bare Nuclei                     11
Bland Chromatin                 10
Normal Nucleoli                 10
Mitosis                          9
Class (Benign/malignant)         2
dtype: int64

In [9]:
df['Bare Nuclei'].nunique()

11

In [10]:
df.isna().sum()

ID                             0
Clump Thickness                0
Uniformity of Cell Size        0
Uniformity of Cell Shape       0
Marginal Adhesion              0
Single Epithelial Cell Size    0
Bare Nuclei                    0
Bland Chromatin                0
Normal Nucleoli                0
Mitosis                        0
Class (Benign/malignant)       0
dtype: int64

In [11]:
df.drop_duplicates()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   ID                           699 non-null    int64 
 1   Clump Thickness              699 non-null    int64 
 2   Uniformity of Cell Size      699 non-null    int64 
 3   Uniformity of Cell Shape     699 non-null    int64 
 4   Marginal Adhesion            699 non-null    int64 
 5   Single Epithelial Cell Size  699 non-null    int64 
 6   Bare Nuclei                  699 non-null    object
 7   Bland Chromatin              699 non-null    int64 
 8   Normal Nucleoli              699 non-null    int64 
 9   Mitosis                      699 non-null    int64 
 10  Class (Benign/malignant)     699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.2+ KB


In [12]:
df['Bare Nuclei'].unique()

array(['1', '10', '2', '4', '3', '9', '7', '?', '5', '8', '6'],
      dtype=object)

In [24]:
df['Bare Nuclei']= df['Bare Nuclei'].replace('?', np.nan)

In [25]:
df.isna().sum()

ID                              0
Clump Thickness                 0
Uniformity of Cell Size         0
Uniformity of Cell Shape        0
Marginal Adhesion               0
Single Epithelial Cell Size     0
Bare Nuclei                    16
Bland Chromatin                 0
Normal Nucleoli                 0
Mitosis                         0
Class (Benign/malignant)        0
dtype: int64

In [29]:
#replace all NaN values with zeros
df['Bare Nuclei'] = df['Bare Nuclei'].fillna(0)

df['Bare Nuclei'] = df['Bare Nuclei'].astype(int)
#Finding the mean of the column having NaN
mean_value= df['Bare Nuclei'].mean().round()
mean_value  

# Replace NaNs in column with the mean of values in the same column
df['Bare Nuclei'].fillna(value= mean_value, inplace=True)


In [None]:
df=df.fillna(df.mean())

In [34]:

# calculate the conditional probability for clump_thickness
clump_thickness_cpd = df.groupby(['Clump Thickness', 'Class (Benign/malignant)']).size().reset_index(name='counts')
clump_thickness_cpd = clump_thickness_cpd.pivot(index='Clump Thickness', columns='Class (Benign/malignant)', values='counts')
clump_thickness_cpd = clump_thickness_cpd.div(clump_thickness_cpd.sum(axis=1), axis=0)

# calculate the conditional probability for uniformity_of_cell_size
uniformity_of_cell_size_cpd = df.groupby(['Uniformity of Cell Size', 'Class (Benign/malignant)']).size().reset_index(name='counts')
uniformity_of_cell_size_cpd = uniformity_of_cell_size_cpd.pivot(index='Uniformity of Cell Size', columns='Class (Benign/malignant)', values='counts')
uniformity_of_cell_size_cpd = uniformity_of_cell_size_cpd.div(uniformity_of_cell_size_cpd.sum(axis=1), axis=0)

# Calculate the conditional probability for uniformity_of_cell_shape
uniformity_of_cell_shape_cpd = df.groupby(['Uniformity of Cell Shape', 'Class (Benign/malignant)']).size().reset_index(name='counts')
uniformity_of_cell_shape_cpd = uniformity_of_cell_shape_cpd.pivot(index='Uniformity of Cell Shape', columns='Class (Benign/malignant)', values='counts')
uniformity_of_cell_shape_cpd = uniformity_of_cell_shape_cpd.div(uniformity_of_cell_shape_cpd.sum(axis=1), axis=0)

# Calculate the conditional probability for marginal_adhesion
marginal_adhesion_cpd = df.groupby(['Marginal Adhesion', 'Class (Benign/malignant)']).size().reset_index(name='counts')
marginal_adhesion_cpd = marginal_adhesion_cpd.pivot(index='Marginal Adhesion', columns='Class (Benign/malignant)', values='counts')
marginal_adhesion_cpd = marginal_adhesion_cpd.div(marginal_adhesion_cpd.sum(axis=1), axis=0)


# Calculate the conditional probability for single_epithelial_cell_size
single_epithelial_cell_size_cpd = df.groupby(['Single Epithelial Cell Size', 'Class (Benign/malignant)']).size().reset_index(name='counts')
single_epithelial_cell_size_cpd = single_epithelial_cell_size_cpd.pivot(index='Single Epithelial Cell Size', columns='Class (Benign/malignant)', values='counts')
single_epithelial_cell_size_cpd = single_epithelial_cell_size_cpd.div(single_epithelial_cell_size_cpd.sum(axis=1), axis=0)


# Calculate the conditional probability for bare_nuclei
bare_nuclei_cpd = df.groupby(['Bare Nuclei', 'Class (Benign/malignant)']).size().reset_index(name='counts')
bare_nuclei_cpd = bare_nuclei_cpd.pivot(index='Bare Nuclei', columns='Class (Benign/malignant)', values='counts')
bare_nuclei_cpd = bare_nuclei_cpd.div(bare_nuclei_cpd.sum(axis=1), axis=0)


# Calculate the conditional probability for bland_chromatin
bland_chromatin_cpd = df.groupby(['Bland Chromatin', 'Class (Benign/malignant)']).size().reset_index(name='counts')
bland_chromatin_cpd = bland_chromatin_cpd.pivot(index='Bland Chromatin', columns='Class (Benign/malignant)', values='counts')
bland_chromatin_cpd = bland_chromatin_cpd.div(bland_chromatin_cpd.sum(axis=1), axis=0)

# Calculate the conditional probability for bland_chromatin
normal_nucleoli_cpd = df.groupby(['Normal Nucleoli', 'Class (Benign/malignant)']).size().reset_index(name='counts')
normal_nucleoli_cpd = normal_nucleoli_cpd.pivot(index='Normal Nucleoli', columns='Class (Benign/malignant)', values='counts')
normal_nucleoli_cpd = normal_nucleoli_cpd.div(normal_nucleoli_cpd.sum(axis=1), axis=0)

# Calculate the conditional probability for Mitosis
Mitosis_cpd = df.groupby(['Mitosis', 'Class (Benign/malignant)']).size().reset_index(name='counts')
Mitosis_cpd = Mitosis_cpd.pivot(index='Mitosis', columns='Class (Benign/malignant)', values='counts')
Mitosis_cpd = Mitosis_cpd.div(Mitosis_cpd.sum(axis=1), axis=0)


In [35]:

from pybbn.graph.dag import Bbn
from pybbn.graph.edge import Edge
from pybbn.graph.node import BbnNode
from pybbn.graph.variable import Variable
from pybbn.pptc.inferencecontroller import InferenceController

# Define the variables
clump_thickness = Variable('Clump Thickness', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
uniformity_of_cell_size = Variable('Uniformity of Cell Size', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
uniformity_of_cell_shape = Variable('Uniformity of Cell Shape', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
marginal_adhesion = Variable('Marginal Adhesion', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
single_epithelial_cell_size = Variable('Single Epithelial Cell Size', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
bare_nuclei = Variable('Bare Nuclei', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
bland_chromatin = Variable('Bland Chromatin', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
normal_nucleoli = Variable('Normal Nucleoli', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
mitoses = Variable('Mitoses', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
class_label = Variable('Class Label', [2, 4])

# Create the BBN
bbn = Bbn() \
    .add_node(BbnNode(clump_thickness)) \
    .add_node(BbnNode(uniformity_of_cell_size)) \
    .add_node(BbnNode(uniformity_of_cell_shape)) \
    .add_node(BbnNode(marginal_adhesion)) \
    .add_node(BbnNode(single_epithelial_cell_size)) \
    .add_node(BbnNode(bare_nuclei)) \
    .add_node(BbnNode(bland_chromatin)) \
    .add_node(BbnNode(normal_nucleoli)) \
    .add_node(BbnNode(mitoses)) \
    .add_node(BbnNode(class_label)) \
    .add_edge(Edge(clump_thickness, class_label)) \
    .add_edge(Edge(uniformity_of_cell_size, class_label)) \
    .add_edge(Edge(uniformity_of_cell_shape, class_label)) \
    .add_edge(Edge(marginal_adhesion, class_label)) \
    .add_edge(Edge(single_epithelial_cell_size, class_label)) \
    .add_edge(Edge(bare_nuclei, class_label)) \
    .add_edge(Edge(bland_chromatin, class_label)) \
    .add_edge(Edge(normal_nucleoli, class_label)) \
    .add_edge(Edge(mitoses, class_label))


TypeError: Variable.__init__() missing 1 required positional argument: 'values'

In [None]:
# train the network using the dataset

# make a diagnosis on a new patient
new_patient = [3, 3, 3, 3, 3, 3, 3, 3, 3]

ic = InferenceController(bbn)
class_label_posterior = ic.posterior(class_label, 
                                    evidence={clump_thickness: new_patient[0],
                                              uniformity_of_cell_size: new_patient[1],
                                              uniformity_of_cell_shape: new_patient[2],
                                              marginal_adhesion: new_patient[3],
                                              single_epithelial_cell_size: new_patient[4],
                                              bare_nuclei: new_patient[5],
                                              bland_chromatin: new_patient[6],
                                              normal_nucleoli: new_patient[7],
                                              mitoses: new_patient[8]})

diagnosis = class_label_posterior.argmax()
print('Diagnosis:', diagnosis)