In [16]:
## Loading the dataset

import pandas as pd

# The survey CSV is given as a relative path, so it is resolved against the
# notebook's working directory.
survey_csv = 'survey_lung_cancer.csv'
df = pd.read_csv(survey_csv)


In [17]:
## Preprocessing

import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

# Columns of the raw survey that are removed before modelling.
UNUSED_COLUMNS = ['GENDER', 'YELLOW_FINGERS', 'CHRONIC DISEASE', 'ALCOHOL CONSUMING',
                  'SWALLOWING DIFFICULTY', 'SHORTNESS OF BREATH']

# Columns recoded to 'YES'/'NO' below. They are selected by name rather than by
# position, so the recoding does not depend on the column order of the CSV and a
# missing column fails loudly with a KeyError instead of mis-encoding a neighbour.
BINARY_COLUMNS = ['SMOKING', 'ANXIETY', 'PEER_PRESSURE', 'FATIGUE', 'ALLERGY',
                  'WHEEZING', 'COUGHING', 'CHEST PAIN']

# df1 is always rebuilt from df (which is left untouched), so this cell can be
# re-run safely. Two headers of the raw file carry a trailing space; it is removed here.
df1 = (
    df.drop(columns=UNUSED_COLUMNS)
      .rename(columns={"FATIGUE ": "FATIGUE", "ALLERGY ": "ALLERGY"})
)

# Discretise AGE into 3 equal-width bins, encoded as ordinal integer codes.
est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
# fit_transform expects a 2-D input and returns a 2-D (n, 1) array; ravel() makes
# the value assigned to the column explicitly 1-D.
df1['AGE'] = est.fit_transform(df1[['AGE']].to_numpy()).astype(int).ravel()

# The value 2 becomes 'YES'; every other value becomes 'NO'.
# NOTE(review): presumably the raw file codes 1 = no and 2 = yes -- confirm against
# the dataset description.
for column in BINARY_COLUMNS:
    df1[column] = np.where(df1[column] == 2, 'YES', 'NO')

df1.head()

Unnamed: 0,AGE,SMOKING,ANXIETY,PEER_PRESSURE,FATIGUE,ALLERGY,WHEEZING,COUGHING,CHEST PAIN,LUNG_CANCER
0,2,NO,YES,NO,YES,NO,YES,YES,YES,YES
1,2,YES,NO,NO,YES,YES,NO,NO,YES,YES
2,1,NO,NO,YES,YES,NO,YES,YES,YES,NO
3,1,YES,YES,NO,NO,NO,NO,NO,YES,NO
4,1,NO,NO,NO,NO,NO,YES,YES,NO,NO


In [18]:
## Constructing the Bayesian Network

from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.models import BayesianNetwork

# Directed edges of the network as (parent, child) pairs.
# Note: despite its name, `nodes` holds edges; the name is kept unchanged.
nodes = [
    # parents of SMOKING
    ('ANXIETY', 'SMOKING'),
    ('PEER_PRESSURE', 'SMOKING'),
    # parents of LUNG_CANCER
    ('SMOKING', 'LUNG_CANCER'),
    ('AGE', 'LUNG_CANCER'),
    # children of LUNG_CANCER
    ('LUNG_CANCER', 'WHEEZING'),
    ('LUNG_CANCER', 'COUGHING'),
    ('LUNG_CANCER', 'CHEST PAIN'),
    # children of ALLERGY
    ('ALLERGY', 'WHEEZING'),
    ('ALLERGY', 'COUGHING'),
    # parents of FATIGUE
    ('WHEEZING', 'FATIGUE'),
    ('COUGHING', 'FATIGUE'),
]

model = BayesianNetwork(nodes)

# Estimate every conditional probability table from the preprocessed frame.
model.fit(df1, estimator=MaximumLikelihoodEstimator)

# Print the learned tables one after another.
for table in model.get_cpds():
    print(table)
   

+--------------+----------+
| ANXIETY(NO)  | 0.501618 |
+--------------+----------+
| ANXIETY(YES) | 0.498382 |
+--------------+----------+
[0.50161812 0.49838188]
+---------------+-------------------+--------------------+-------------------+---------------------+
| ANXIETY       | ANXIETY(NO)       | ANXIETY(NO)        | ANXIETY(YES)      | ANXIETY(YES)        |
+---------------+-------------------+--------------------+-------------------+---------------------+
| PEER_PRESSURE | PEER_PRESSURE(NO) | PEER_PRESSURE(YES) | PEER_PRESSURE(NO) | PEER_PRESSURE(YES)  |
+---------------+-------------------+--------------------+-------------------+---------------------+
| SMOKING(NO)   | 0.425531914893617 | 0.6557377049180327 | 0.4               | 0.32978723404255317 |
+---------------+-------------------+--------------------+-------------------+---------------------+
| SMOKING(YES)  | 0.574468085106383 | 0.3442622950819672 | 0.6               | 0.6702127659574468  |
+---------------+-----------

In [19]:
## Doing inference

from pgmpy.inference import VariableElimination

lungcancer_infer = VariableElimination(model)

# Simple query used to demonstrate the variable elimination algorithm.
q1 = lungcancer_infer.query(
    variables=['LUNG_CANCER'],
    evidence={'PEER_PRESSURE': 'YES'},
)
print('query used to show how to make inference with variable elimination')
print(q1)


# Queries meant to show the conditional independence derived from the Markov blanket:
# q2 conditions only on the Markov blanket of COUGHING in the network defined above
# (its parents LUNG_CANCER and ALLERGY, its child FATIGUE, and FATIGUE's other parent
# WHEEZING); q3 additionally conditions on every remaining node.
blanket_evidence = {
    'LUNG_CANCER': 'YES',
    'ALLERGY': 'NO',
    'FATIGUE': 'NO',
    'WHEEZING': 'YES',
}
remaining_evidence = {
    'ANXIETY': 'NO',
    'AGE': 2,
    'CHEST PAIN': 'YES',
    'SMOKING': 'YES',
    'PEER_PRESSURE': 'NO',
}

q2 = lungcancer_infer.query(variables=['COUGHING'], evidence=blanket_evidence)
q3 = lungcancer_infer.query(
    variables=['COUGHING'],
    evidence={**blanket_evidence, **remaining_evidence},
)

print('query with markov blanket of coughing as evidence')
print(q2, '\n')

print('query with markov blanket of coughing and all the remaining nodes as evidence')
print(q3)

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

query used to show how to make inference with variable elimination
+------------------+--------------------+
| LUNG_CANCER      |   phi(LUNG_CANCER) |
| LUNG_CANCER(NO)  |             0.1289 |
+------------------+--------------------+
| LUNG_CANCER(YES) |             0.8711 |
+------------------+--------------------+


0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

query with markov blanket of coughing as evidence
+---------------+-----------------+
| COUGHING      |   phi(COUGHING) |
| COUGHING(NO)  |          0.5560 |
+---------------+-----------------+
| COUGHING(YES) |          0.4440 |
+---------------+-----------------+ 

query with markov blanket of coughing and all the remaining nodes as evidence
+---------------+-----------------+
| COUGHING      |   phi(COUGHING) |
| COUGHING(NO)  |          0.5560 |
+---------------+-----------------+
| COUGHING(YES) |          0.4440 |
+---------------+-----------------+
