# Actividad PBL 2. Redes Bayesianas

### **Integrantes**
* Salette Noemi Villalobos
* Samuel Méndez Villegas
* Paola Vega Ortega
* Ethan Verduzco Pérez

### **Docente**
* Dr. Daniel Otero Fadul

In [4]:
import networkx as nx # for drawing graphs
import matplotlib.pyplot as plt # for drawing graphs

# for creating Bayesian Belief Networks (BBN)
from pybbn.graph.dag import Bbn ## Recurden instalarla desde la termanl "pip install pybbn"

from pybbn.graph.edge import Edge, EdgeType
from pybbn.graph.jointree import EvidenceBuilder
from pybbn.graph.node import BbnNode
from pybbn.graph.variable import Variable
from pybbn.pptc.inferencecontroller import InferenceController

In [2]:
import pandas as pd
import numpy as np ## Se utilizará para acceder a los valores de los cuartiles

In [19]:
# Read the diabetes dataset from a CSV file located next to the notebook
# and preview its first rows.
diabetes = pd.read_csv('diabetes-dataset.csv')
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,2,138,62,35,0,33.6,0.127,47,1
1,0,84,82,31,125,38.2,0.233,23,0
2,0,145,0,0,0,44.2,0.63,31,1
3,0,135,68,42,250,42.3,0.365,24,1
4,1,139,62,41,480,40.7,0.536,21,0


# Procesamiento de la base de datos

## Agregar la columna 'Overweight' a la base de datos

In [20]:
# Flag every record: 1 when its BMI is 25 or more (overweight), 0 otherwise.
overweight = [1 if bmi >= 25 else 0 for bmi in diabetes['BMI']]

# Store the flags as a new column and display the resulting frame.
diabetes['Overweight'] = overweight
diabetes

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Overweight
0,2,138,62,35,0,33.6,0.127,47,1,1
1,0,84,82,31,125,38.2,0.233,23,0,1
2,0,145,0,0,0,44.2,0.630,31,1,1
3,0,135,68,42,250,42.3,0.365,24,1,1
4,1,139,62,41,480,40.7,0.536,21,0,1
5,0,173,78,32,265,46.5,1.159,58,0,1
6,4,99,72,17,0,25.6,0.294,28,0,1
7,8,194,80,0,0,26.1,0.551,67,0,1
8,2,83,65,28,66,36.8,0.629,24,0,1
9,2,89,90,30,0,33.5,0.292,42,0,1


## Discretizar variables continuas

In [21]:
def discretize(df):
    """Return a copy of ``df`` whose continuous columns are binned into quartile labels 1-4.

    Columns 'Pregnancies', 'Outcome' and 'Overweight' are already discrete and
    are copied through untouched. For every other column the three quartile cut
    points (q1, q2, q3) are printed and each value is mapped to:

        1 if value < q1, 2 if q1 <= value < q2, 3 if q2 <= value < q3, 4 otherwise.

    Cut points are computed from the strictly positive values only, except for
    'DiabetesPedigreeFunction', where every value is used. Values that are not
    positive still receive a label (they fall below q1, so they get 1).

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is NOT modified, so the cell calling this function can be
        re-run safely.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns; binned columns hold integer labels.
    """
    already_discrete = ('Pregnancies', 'Outcome', 'Overweight')
    result = df.copy()

    for column in df.columns:
        if column in already_discrete:
            continue

        values = df[column]
        if column == 'DiabetesPedigreeFunction':
            # Kept so the reported quartiles stay on the x100 scale used by
            # earlier runs; the binning below no longer depends on it.
            values = values * 100
            reference = values
        else:
            reference = values[values > 0]

        q1, q2, q3 = np.percentile(reference, [25, 50, 75])

        print('Variable:', column)
        print('Cuartil 1:', q1)
        print('Cuartil 2:', q2)
        print('Cuartil 3:', q3)

        # All labels are derived from the ORIGINAL values in a single pass.
        # Relabelling step by step on the column itself would let an already
        # assigned label (1, 2 or 3) fall into a later range whenever a cut
        # point is <= 3.
        raw = values.to_numpy()
        result[column] = np.select([raw < q1, raw < q2, raw < q3], [1, 2, 3], default=4)

    return result

In [22]:
# Bin the continuous columns into quartile labels; the quartile cut points of
# each processed column are printed as a side effect of the call.
df = discretize(diabetes) 
df

Variable: Glucose
Cuartil 1: 99.0
Cuartil 2: 117.0
Cuartil 3: 141.0
Variable: BloodPressure
Cuartil 1: 64.0
Cuartil 2: 72.0
Cuartil 3: 80.0
Variable: SkinThickness
Cuartil 1: 22.0
Cuartil 2: 29.0
Cuartil 3: 36.0
Variable: Insulin
Cuartil 1: 76.75
Cuartil 2: 126.0
Cuartil 3: 190.0
Variable: BMI
Cuartil 1: 27.5
Cuartil 2: 32.4
Cuartil 3: 36.8
Variable: DiabetesPedigreeFunction
Cuartil 1: 24.4
Cuartil 2: 37.6
Cuartil 3: 62.4
Variable: Age
Cuartil 1: 24.0
Cuartil 2: 29.0
Cuartil 3: 40.0


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Overweight
0,2,3,1,3,1,3.0,1.0,4,1,1
1,0,1,4,3,2,4.0,1.0,1,0,1
2,0,4,1,1,1,4.0,4.0,3,1,1
3,0,3,2,4,4,4.0,2.0,2,1,1
4,1,3,1,4,4,4.0,3.0,1,0,1
5,0,4,3,3,4,4.0,4.0,4,0,1
6,4,2,3,1,1,1.0,2.0,2,0,1
7,8,4,4,1,1,1.0,3.0,4,0,1
8,2,1,2,2,1,4.0,4.0,2,0,1
9,2,1,4,3,1,3.0,2.0,4,0,1


In [None]:
# Scratch cell (it has no execution count): replaces every
# DiabetesPedigreeFunction value below 0.244 with the label 1.
# NOTE(review): 0.244 looks like the first-quartile cut point reported by
# discretize (24.4 on its x100 scale) — confirm, or remove this cell.
diabetes['DiabetesPedigreeFunction'] = diabetes['DiabetesPedigreeFunction'].replace(diabetes['DiabetesPedigreeFunction'][diabetes['DiabetesPedigreeFunction'] < 0.244], 1)
diabetes

## Función para calcular las tablas de probabilidades para cada uno de los nodos

In [23]:
def is_list_empty(list):
    """Return True when the given sequence has no elements, False otherwise.

    The parameter keeps its original name (which shadows the built-in ``list``
    inside this function) so that keyword callers continue to work.
    """
    return not len(list)

In [24]:
# Structure of the diabetes network: each key is a node (a DataFrame column)
# and its value lists that node's parents. An empty list marks a node that has
# no parents.
# NOTE(review): 'Glucose' is discretized earlier but has no entry here —
# confirm that leaving it out of the network is intended.
dib_dict = {'Pregnancies' : [], 
            'Age' : [], 
            'DiabetesPedigreeFunction' : [], 
            'Overweight' : [], 
            'Insulin' : ['Outcome'], 
            'SkinThickness' : ['Overweight'], 
            'BMI' : ['Overweight'], 
            'BloodPressure' : ['Outcome', 'Overweight'], 
            'Outcome' : ['Pregnancies', 'Age', 'DiabetesPedigreeFunction', 'Overweight']}


In [25]:
def probability_tables(df, dictionary):
    """Build the marginal probability table of every parentless node.

    Parameters
    ----------
    df : pandas.DataFrame
        Discretized data; it must contain a column for each parentless node.
    dictionary : dict
        Maps node name -> list of parent node names. An empty list marks a
        node without parents. This mapping is NOT modified.

    Returns
    -------
    dict
        A new dict with the same keys. For a parentless node the value is the
        list of relative frequencies of its distinct values, ordered by value.
        For a node with parents the value is a copy of its parent list
        (conditional tables are not computed by this function).
    """
    tables = {}

    for key, parents in dictionary.items():
        if len(parents) > 0:
            # Node has parents: carry its structure over unchanged.
            tables[key] = list(parents)
            continue

        column = df[key]
        total = column.shape[0]
        # Writing into a fresh list (instead of appending to the caller's
        # parent list) keeps the structure dict reusable after the call.
        tables[key] = [
            df[column == level].shape[0] / total
            for level in sorted(column.unique())
        ]

    return tables

In [26]:
# Show the node -> table mapping computed for the diabetes network structure.
print(probability_tables(df, dib_dict))

{'Pregnancies': [0.1505, 0.178, 0.142, 0.0975, 0.0955, 0.0705, 0.0655, 0.05, 0.048, 0.035, 0.027, 0.012, 0.0115, 0.011, 0.0035, 0.001, 0.0015], 'Age': [0.2305, 0.2595, 0.247, 0.263], 'DiabetesPedigreeFunction': [0.2485, 0.251, 0.2495, 0.251], 'Overweight': [0.1465, 0.8535], 'Insulin': ['Outcome'], 'SkinThickness': ['Overweight'], 'BMI': ['Overweight'], 'BloodPressure': ['Outcome', 'Overweight'], 'Outcome': ['Pregnancies', 'Age', 'DiabetesPedigreeFunction', 'Overweight']}


In [27]:
## Probabilities of the classes without parents

# Empty placeholders; nothing in this cell fills them.
overweight_prob = []
insulin_prob = []
age_prob = []
pregnant_prob = []

# Spot check: fraction of rows whose 'Age' label equals 4.
df[df['Age'] == 4].shape[0] / df['Age'].shape[0]

0.263

In [28]:
def parents_probabily(df):
    """Compute the marginal distribution of four columns of ``df``.

    For each of 'Age', 'DiabetesPedigreeFunction', 'Overweight' and
    'Pregnancies' the relative frequency of every distinct value is computed,
    ordered by value.

    Returns
    -------
    tuple of four lists, in this order:
        Overweight, DiabetesPedigreeFunction, Age, Pregnancies.
    """
    def marginal(column):
        # Share of rows taking each distinct value of the column.
        total = df[column].shape[0]
        return [df[df[column] == level].shape[0] / total
                for level in sorted(df[column].unique())]

    age_prob = marginal('Age')
    dpf_prob = marginal('DiabetesPedigreeFunction')
    overweight_prob = marginal('Overweight')
    pregnant_prob = marginal('Pregnancies')

    return overweight_prob, dpf_prob, age_prob, pregnant_prob
                

In [29]:
# Unpack in the order returned by parents_probabily. Note that the second
# element (bound to insulin_prob) is built from the 'DiabetesPedigreeFunction'
# column, not from 'Insulin'.
overweight_prob, insulin_prob, age_prob, pregnant_prob = parents_probabily(df)

In [30]:
# Print each marginal distribution on its own line.
for marginal in (overweight_prob, insulin_prob, age_prob, pregnant_prob):
    print(marginal)

[0.1465, 0.8535]
[0.2485, 0.251, 0.2495, 0.251]
[0.2305, 0.2595, 0.247, 0.263]
[0.1505, 0.178, 0.142, 0.0975, 0.0955, 0.0705, 0.0655, 0.05, 0.048, 0.035, 0.027, 0.012, 0.0115, 0.011, 0.0035, 0.001, 0.0015]


In [None]:
# Inspect the 'DiabetesPedigreeFunction' column of the discretized frame.
df['DiabetesPedigreeFunction']

## Visualización de la red Bayesiana

In [None]:
# Probability tables handed to the nodes below.
# NOTE(review): every list is reset to empty here, which discards the marginals
# computed earlier in the notebook; the nodes are therefore built without any
# probabilities. Fill these in before building the join tree.
pregnant_prob = []
age_prob = []
insulin_prob = []
diabetes_prob = []
dfp_prob = []
overweight_prob = []
skin_prob = []
bmi_prob = []
bp_prob = []

# One node per variable: Variable(id, name, list of value labels) plus its table.
# NOTE(review): 'Pregnant' is declared with an empty value list, while the
# recorded marginal for 'Pregnancies' has 17 entries.
# NOTE(review): `diabetes` and `overweight` are rebound here from the
# DataFrame / flag list used earlier to BbnNode objects, so earlier cells
# cannot be re-run after this one without reloading the data.
pregnant = BbnNode(Variable(0, 'Pregnant', []), pregnant_prob) # Can it take 16 values?
age = BbnNode(Variable(1, 'Age', ['1', '2', '3', '4']), age_prob)
diabetes = BbnNode(Variable(2, 'Diabetes', ['0', '1']), diabetes_prob)
insulin = BbnNode(Variable(3, 'Insulin', ['1', '2', '3', '4']), insulin_prob)
dfp = BbnNode(Variable(4, 'DFP', ['1', '2', '3', '4']), dfp_prob)
bp = BbnNode(Variable(5, 'BP', ['1', '2', '3', '4']), bp_prob)
bmi = BbnNode(Variable(6, 'BMI', ['1', '2', '3', '4']), bmi_prob)
overweight = BbnNode(Variable(7, 'Overweight', ['0', '1']), overweight_prob)
skin = BbnNode(Variable(8, 'Skin', ['1', '2', '3', '4']), skin_prob)

![](diabetes.png)

In [None]:
# Assemble the diabetes network: register the nine nodes, then the directed
# edges (parent -> child). The chain is wrapped in parentheses rather than
# joined with trailing backslashes: a backslash followed by a blank line ends
# the statement, which made the `.add_edge(...)` lines a syntax error.
bbn = (
    Bbn()
    .add_node(pregnant)
    .add_node(age)
    .add_node(diabetes)
    .add_node(insulin)
    .add_node(dfp)
    .add_node(bp)
    .add_node(bmi)
    .add_node(overweight)
    .add_node(skin)
    .add_edge(Edge(pregnant, diabetes, EdgeType.DIRECTED))
    .add_edge(Edge(age, diabetes, EdgeType.DIRECTED))
    .add_edge(Edge(dfp, diabetes, EdgeType.DIRECTED))
    .add_edge(Edge(diabetes, insulin, EdgeType.DIRECTED))
    .add_edge(Edge(diabetes, bp, EdgeType.DIRECTED))
    .add_edge(Edge(overweight, diabetes, EdgeType.DIRECTED))
    .add_edge(Edge(overweight, bp, EdgeType.DIRECTED))
    .add_edge(Edge(overweight, bmi, EdgeType.DIRECTED))
    .add_edge(Edge(overweight, skin, EdgeType.DIRECTED))
)

# Convert the BBN to a join tree
# NOTE(review): presumably every node needs a complete probability table
# before this conversion can succeed — verify once the tables are filled in.
join_tree = InferenceController.apply(bbn)

In [None]:
# Set node positions (node id -> (x, y)); leave empty to use an automatic layout
pos = {}

# Set options for graph looks
options = {"font_size" : 16, "node_size" : 11000, "node_color" : "yellow", 
           "edgecolors" : "black", "edge_color" : "red", "linewidths" : 5, 
           "width": 5}
    
# Generate graph
n, d = bbn.to_nx_graph()

# nx.draw needs a coordinate for every node it draws, so an empty mapping
# cannot be passed through as-is; fall back to a computed layout instead.
if not pos:
    pos = nx.circular_layout(n)

nx.draw(n, with_labels=True, labels=d, pos=pos, **options)

# Update margins and print the graph
ax = plt.gca()
ax.margins(0.3)
plt.axis("off")
plt.show()

In [None]:
# Define a function for printing marginal probabilities
def print_probs():
    """Print, for every node of the module-level ``join_tree``, its potential."""
    for bbn_node in join_tree.get_bbn_nodes():
        node_potential = join_tree.get_bbn_potential(bbn_node)
        print("Node:", bbn_node)
        # One call emits the heading, the potential and the separator line.
        print("Values:", node_potential, '----------------', sep='\n')

# Use the above function to print marginal probabilities
print_probs()

In [None]:
# To add evidence of events that happened so probability distribution can be recalculated
def evidence(ev, nod, cat, val):
    """Set an observation on the module-level ``join_tree``.

    Parameters
    ----------
    ev : any
        Not read; the incoming value is discarded.
    nod : str
        Name used to look the node up in the join tree.
    cat, val
        Passed straight through to ``EvidenceBuilder.with_evidence``.
    """
    builder = EvidenceBuilder()
    builder = builder.with_node(join_tree.get_bbn_node_by_name(nod))
    builder = builder.with_evidence(cat, val)
    join_tree.set_observation(builder.build())

In [None]:
# Structure of the diabetes network keyed by the short node names used when the
# BbnNode objects are created: node name -> list of parent node names (an empty
# list means the node has no parents).
diabetes_problem_dictionary = {'Pregnant' : [], 'Age' : [], 'DFP' : [], 'Overweight' : [], 
                            'Insulin' : ['Diabetes'], 'Skin' : ['Overweight'], 
                              'BMI' : ['Overweight'], 'BP' : ['Diabetes', 'Overweight'], 
                              'Diabetes' : ['Pregnant', 'Age', 'DFP', 'Overweight']}

In [None]:
# Monty Hall problem: the contestant's pick and the prize location are each
# uniform over the three doors.
concursante_probabilities = [1/3, 1/3, 1/3]
premio_probabilities = [1/3, 1/3, 1/3]
# Table for the door Monty opens: nine rows of three values (doors 1, 2, 3),
# each row summing to 1.
# NOTE(review): rows appear to be ordered by (Concursante, Premio) =
# (1,1), (1,2), (1,3), (2,1), ... — confirm against pybbn's expected ordering.
monty_probabilities = [0, 1/2, 1/2, 
                       0, 0, 1, 
                       0, 1, 0, 
                       0, 0, 1, 
                       1/2, 0, 1/2, 
                       1, 0, 0, 
                       0, 1, 0, 
                       1, 0, 0, 
                       1/2, 1/2, 0]

# One node per variable, each taking the door labels '1', '2', '3'.
concursante = BbnNode(Variable(0, 'Concursante', ['1', '2', '3']), concursante_probabilities)
premio = BbnNode(Variable(1, 'Premio', ['1', '2', '3']), premio_probabilities)
monty = BbnNode(Variable(2, 'Monty', ['1', '2', '3']), monty_probabilities)

In [None]:
# Monty Hall network: both the contestant's pick and the prize location are
# parents of the door Monty opens.
bbn = (
    Bbn()
    .add_node(concursante)
    .add_node(premio)
    .add_node(monty)
    .add_edge(Edge(concursante, monty, EdgeType.DIRECTED))
    .add_edge(Edge(premio, monty, EdgeType.DIRECTED))
)

# Convert the BBN to a join tree
join_tree = InferenceController.apply(bbn)

In [None]:
# Fixed layout keyed by node id: ids 0 and 1 on the top row, id 2 centred below.
pos = {0: (-1, 0.5), 1: (1, 0.5), 2: (0, 0)}

# Appearance of nodes, labels and edges.
options = dict(
    font_size=16,
    node_size=11000,
    node_color="yellow",
    edgecolors="black",
    edge_color="red",
    linewidths=5,
    width=5,
)

# Export the network to a networkx graph plus its label mapping and draw it.
n, d = bbn.to_nx_graph()
nx.draw(n, pos=pos, labels=d, with_labels=True, **options)

# Widen the margins, hide the axes and render the figure.
ax = plt.gca()
ax.margins(0.3)
plt.axis("off")
plt.show()

In [None]:
# Define a function for printing marginal probabilities
# (same definition as in the diabetes section; redefined here for the Monty network)
def print_probs():
    """Print the potential of each node held by the module-level ``join_tree``."""
    for current in join_tree.get_bbn_nodes():
        values = join_tree.get_bbn_potential(current)
        print("Node:", current)
        # Heading, potential and separator, one per line.
        print("Values:", values, '----------------', sep='\n')

# Use the above function to print marginal probabilities
print_probs()

In [None]:
# To add evidence of events that happened so probability distribution can be recalculated
# (same definition as in the diabetes section; redefined here for the Monty network)
def evidence(ev, nod, cat, val):
    """Record an observation for node ``nod`` on the module-level ``join_tree``.

    ``ev`` is never read. ``cat`` and ``val`` are forwarded unchanged to
    ``EvidenceBuilder.with_evidence``.
    """
    pending = EvidenceBuilder()
    pending = pending.with_node(join_tree.get_bbn_node_by_name(nod))
    pending = pending.with_evidence(cat, val)
    join_tree.set_observation(pending.build())

In [None]:
# Use above function to add evidence
# Observations: node 'Concursante' takes value '1' and node 'Monty' takes value '2'.
# The first argument is only a label; evidence() does not read it.
evidence('ev1', 'Concursante', '1', 1)
evidence('ev2', 'Monty', '2', 1)

# Print marginal probabilities
print_probs()

In [None]:
# Build a new join tree from bbn (presumably to discard the observations set
# in the previous cell — confirm).
join_tree = InferenceController.apply(bbn)

In [None]:
# Structure of the Monty Hall network: node name -> list of parent node names
# (an empty list means the node has no parents).
monty_problem_dictionary = {'Concursante' : [], 'Premio' : [], 
                            'Monty' : ['Concursante', 'Premio']}

In [None]:
# Probability tables of the Monty Hall network keyed by node name. The values
# repeat the three lists passed to the BbnNode constructors earlier on.
monty_problem_tables = {'Concursante' : [1/3, 1/3, 1/3], 'Premio' : [1/3, 1/3, 1/3], 
                            'Monty' : [0, 1/2, 1/2, 
                                       0, 0, 1, 
                                       0, 1, 0, 
                                       0, 0, 1, 
                                       1/2, 0, 1/2, 
                                       1, 0, 0, 
                                       0, 1, 0, 
                                       1, 0, 0, 
                                       1/2, 1/2, 0]}