# Education Network

## Imports and setup

In [2]:
import numpy as np
from pgmpy.models import BayesianNetwork
from pgmpy.factors.discrete import TabularCPD
from pgmpy.inference import VariableElimination

In [20]:
# Short aliases for the node labels, so the arc list and the CPD definitions
# further down stay compact.
# Nodes of the baseline network:
A, S, E = "Age", "Sex", "Education"
O, R, T = "Occupation", "Residence", "Travel"

# Nodes added by the extension:
F, W, D = "Field", "Working Style", "Distance"

# NETWORK STRUCTURE
# Each arc is a (parent, child) pair. The baseline network consisted of
#   (A,E),(S,E),(E,O),(E,R),(O,T),(R,T)
# and the extended network below adds Field, Working Style and Distance.
# The arcs are listed in the same order as before so node ordering is unchanged.
education_model = BayesianNetwork(
    [
        (A, E),
        (S, E),
        (E, O),
        (E, R),
        (F, O),
        (F, W),
        (O, T),
        (O, W),
        (W, D),
        (R, T),
        (D, T),
    ]
)

# Conditional probability distributions (CPDs): one table per node.
# Every column is one configuration of the node's parents and sums to 1;
# a node without parents has a single column (its prior).
age_cpd = TabularCPD(
    variable=A,
    variable_card=3,
    values=[[0.3],   # young
            [0.5],   # adult
            [0.2]],  # old
    state_names={A: ["young", "adult", "old"]},
)

sex_cpd = TabularCPD(
    variable=S,
    variable_card=2,
    values=[[0.6],   # male
            [0.4]],  # female
    state_names={S: ["male", "female"]},
)

# Columns enumerate (Sex, Age) with Age varying fastest:
#   male/young, male/adult, male/old, female/young, female/adult, female/old
edu_cpd = TabularCPD(
    variable=E,
    variable_card=2,
    values=[[0.75, 0.72, 0.88, 0.64, 0.7, 0.9],   # high
            [0.25, 0.28, 0.12, 0.36, 0.3, 0.1]],  # uni
    evidence=[S, A],
    evidence_card=[2, 3],
    state_names={E: ["high", "uni"], S: ["male", "female"], A: ["young", "adult", "old"]},
)

# Baseline Occupation table, P(O | E). It is NOT used: ocu_cpd is defined
# further down with Field as an additional parent. Kept for reference only.
#            E=high  E=uni
#   emp       .96     .92
#   self      .04     .08

res_cpd = TabularCPD(
    variable=R,
    variable_card=2,
    values=[[0.25, 0.2],   # small
            [0.75, 0.8]],  # big
    evidence=[E],
    evidence_card=[2],
    state_names={R: ["small", "big"], E: ["high", "uni"]},
)

# Baseline Travel table, P(T | O, R). It is NOT used: tra_cpd is defined
# further down with Distance as an additional parent. Kept for reference only.
# Columns: emp/small, emp/big, self/small, self/big
#   car     .48  .58  .56  .70
#   train   .42  .24  .36  .21
#   other   .10  .18  .08  .09
# The disabled snippet listed a third Occupation state ("uemp") although the
# table has columns for two states only; that state is dropped here.

# CPDs FOR THE EXTENDED NETWORK
# Field, Working Style and Distance are new; Occupation and Travel are rebuilt
# because they gained parents. In every table the columns enumerate the parent
# states in the order given by `evidence`, the last listed parent varying fastest.

fie_cpd = TabularCPD(
    variable=F,
    variable_card=4,
    values=[[0.38],   # Scientific
            [0.22],   # Humanistic
            [0.09],   # Artistic
            [0.31]],  # Social
    state_names={F: ["Scientific", "Humanistic", "Artistic", "Social"]},
)

# Columns: (Education, Field) -> high x 4 fields, then uni x 4 fields
ocu_cpd = TabularCPD(
    variable=O,
    variable_card=2,
    values=[[0.80, 0.90, 0.65, 0.86, 0.76, 0.86, 0.61, 0.82],   # emp
            [0.20, 0.10, 0.35, 0.14, 0.24, 0.14, 0.39, 0.18]],  # self
    evidence=[E, F],
    evidence_card=[2, 4],
    state_names={O: ["emp", "self"], E: ["high", "uni"], F: ["Scientific", "Humanistic", "Artistic", "Social"]},
)

# Columns: (Occupation, Field) -> emp x 4 fields, then self x 4 fields
# NOTE(review): "sw"/"off" presumably mean smart working vs. office - confirm.
wor_cpd = TabularCPD(
    variable=W,
    variable_card=2,
    values=[[0.52, 0.17, 0.69, 0.22, 0.90, 0.83, 0.96, 0.85],   # sw
            [0.48, 0.83, 0.31, 0.78, 0.10, 0.17, 0.04, 0.15]],  # off
    evidence=[O, F],
    evidence_card=[2, 4],
    state_names={W: ["sw", "off"], O: ["emp", "self"], F: ["Scientific", "Humanistic", "Artistic", "Social"]},
)

# Columns: Working Style -> sw, off
dis_cpd = TabularCPD(
    variable=D,
    variable_card=2,
    values=[[0.94, 0.65],   # near
            [0.06, 0.35]],  # far
    evidence=[W],
    evidence_card=[2],
    state_names={D: ["near", "far"], W: ["sw", "off"]},
)

# Columns: (Occupation, Residence, Distance) ->
#   emp/small/near, emp/small/far, emp/big/near, emp/big/far,
#   self/small/near, self/small/far, self/big/near, self/big/far
tra_cpd = TabularCPD(
    variable=T,
    variable_card=3,
    values=[[0.45, 0.48, 0.10, 0.53, 0.55, 0.58, 0.15, 0.60],   # car
            [0.08, 0.42, 0.02, 0.38, 0.06, 0.33, 0.01, 0.32],   # train
            [0.47, 0.10, 0.88, 0.09, 0.39, 0.09, 0.84, 0.08]],  # other
    evidence=[O, R, D],
    evidence_card=[2, 2, 2],
    state_names={T: ["car", "train", "other"], O: ["emp", "self"], R: ["small", "big"], D: ["near", "far"]},
)

# Attach all nine CPDs to the extended model. The baseline model registered
# only age, sex, edu, ocu, res and tra.
education_model.add_cpds(
    age_cpd, sex_cpd, edu_cpd,
    ocu_cpd, fie_cpd, res_cpd,
    wor_cpd, dis_cpd, tra_cpd,
)

In [21]:
# Validate the assembled network before querying it; the recorded run of this
# cell returned True.
education_model.check_model()

True

How to read a CPD (illustrative example from a student/grade network, not from this model): a smart student in an easy class is 90% likely to get an A, 8% likely to get a B, and 2% likely to get a C.
Conversely, a smart student in a hard class is only 50% likely to get an A.

In [9]:
# Show the six tables that also exist in the baseline network, one after another.
print(age_cpd, sex_cpd, edu_cpd, res_cpd, ocu_cpd, tra_cpd, sep="\n")

+------------+-----+
| Age(young) | 0.3 |
+------------+-----+
| Age(adult) | 0.5 |
+------------+-----+
| Age(old)   | 0.2 |
+------------+-----+
+-------------+-----+
| Sex(male)   | 0.6 |
+-------------+-----+
| Sex(female) | 0.4 |
+-------------+-----+
+-----------------+------------+-----+-------------+-------------+
| Sex             | Sex(male)  | ... | Sex(female) | Sex(female) |
+-----------------+------------+-----+-------------+-------------+
| Age             | Age(young) | ... | Age(adult)  | Age(old)    |
+-----------------+------------+-----+-------------+-------------+
| Education(high) | 0.75       | ... | 0.7         | 0.9         |
+-----------------+------------+-----+-------------+-------------+
| Education(uni)  | 0.25       | ... | 0.3         | 0.1         |
+-----------------+------------+-----+-------------+-------------+
+------------------+-----------------+----------------+
| Education        | Education(high) | Education(uni) |
+------------------+--------

In [10]:
# List the independence statements of the model. The listing is long: the
# recorded output of this cell is cut off mid-statement.
education_model.get_independencies()

(Sex ⟂ Age)
(Sex ⟂ Occupation, Travel, Residence | Education)
(Sex ⟂ Occupation, Residence | Education, Travel)
(Sex ⟂ Residence, Travel | Occupation, Education)
(Sex ⟂ Occupation, Travel, Residence | Education, Age)
(Sex ⟂ Occupation, Travel | Education, Residence)
(Sex ⟂ Travel | Occupation, Residence)
(Sex ⟂ Residence | Occupation, Education, Travel)
(Sex ⟂ Occupation, Residence | Education, Age, Travel)
(Sex ⟂ Occupation | Education, Travel, Residence)
(Sex ⟂ Residence, Travel | Occupation, Education, Age)
(Sex ⟂ Travel | Occupation, Education, Residence)
(Sex ⟂ Occupation, Travel | Education, Age, Residence)
(Sex ⟂ Travel | Occupation, Age, Residence)
(Sex ⟂ Residence | Occupation, Education, Age, Travel)
(Sex ⟂ Occupation | Education, Age, Travel, Residence)
(Sex ⟂ Travel | Occupation, Education, Age, Residence)
(Travel ⟂ Sex, Age | Education)
(Travel ⟂ Age | Education, Sex)
(Travel ⟂ Sex, Age | Occupation, Education)
(Travel ⟂ Sex | Education, Age)
(Travel ⟂ Sex, Age | Education

In [25]:
# Causal inference (prediction): evidence on upstream nodes (Age, Sex),
# query on a downstream node (Travel).
# The inference engine built here is reused by the query cells that follow.
education_inference = VariableElimination(education_model)

print(
    "What is the most probable method of transport used by a young female?",
    'P(Travel | Age=young , Sex=female)',
    sep="\n",
)
q_1 = education_inference.query([T], evidence={A: "young", S: "female"})
print(q_1)


What is the most probable method of transport used by a young female?
P(travel | age=young , sex=female)
+---------------+---------------+
| Travel        |   phi(Travel) |
| Travel(car)   |        0.5628 |
+---------------+---------------+
| Travel(train) |        0.2798 |
+---------------+---------------+
| Travel(other) |        0.1574 |
+---------------+---------------+


In [40]:
# Evidential inference (explanation): observe a downstream node (Travel) and
# query an upstream one (Education).
print("Has the transport a significant impact on the education?")

# Same query for every travel mode. Each label now closes its parenthesis and
# every table is followed by one blank line.
travel_posteriors = []
for travel_mode in ("car", "train", "other"):
    print(f'P(Education|Travel={travel_mode})')
    posterior = education_inference.query(variables=[E], evidence={T: travel_mode})
    print(posterior)
    print()
    travel_posteriors.append(posterior)

# Keep the original result names bound in case other cells refer to them.
q_2, q_3, q_4 = travel_posteriors

Has the transport a significant impact on the education?
P(Education|Travel=car
+-----------------+------------------+
| Education       |   phi(Education) |
| Education(high) |           0.7422 |
+-----------------+------------------+
| Education(uni)  |           0.2578 |
+-----------------+------------------+

P(Education|Travel=train
+-----------------+------------------+
| Education       |   phi(Education) |
| Education(high) |           0.7524 |
+-----------------+------------------+
| Education(uni)  |           0.2476 |
+-----------------+------------------+
P(Education|Travel=other
+-----------------+------------------+
| Education       |   phi(Education) |
| Education(high) |           0.7444 |
+-----------------+------------------+
| Education(uni)  |           0.2556 |
+-----------------+------------------+


In [43]:
# Intercausal inference (explaining away)
# NOTE(review): in this network Occupation and Residence are both children of
# Education, so evidence on Occupation alone reaches Residence through that
# common parent. An explaining-away query would also fix their shared child,
# e.g. evidence={T: "car", O: "self"} - confirm which query is intended.
print("What is the most probable residence for a self-employed person?")
print('P(Residence|Occupation=self)')
q_5 = education_inference.query(variables=[R], evidence={O: "self"})
print(q_5)

P(Letter|Intelligence=normal,Difficulty=easy)
+------------------+------------------+
| Residence        |   phi(Residence) |
| Residence(small) |           0.2297 |
+------------------+------------------+
| Residence(big)   |           0.7703 |
+------------------+------------------+
