# Task 1.4.1

Create a Bayesian Network on the same data with two different hierarchies.

# Hierarchy 1:

In [9]:
# Including the necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from pgmpy.models import BayesianNetwork
from pgmpy.factors.discrete import TabularCPD

In [10]:
# Import data, make a copy of the original

df0 = pd.read_csv('seattle-weather.csv')
dfc1 = df0.copy()
dfc1.head()

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle
1,2012-01-02,10.9,10.6,2.8,4.5,rain
2,2012-01-03,0.8,11.7,7.2,2.3,rain
3,2012-01-04,20.3,12.2,5.6,4.7,rain
4,2012-01-05,1.3,8.9,2.8,6.1,rain


In [11]:
# Create new df with variables we want to work with:
new_cols = ['date', 'precipitation', 'temp_max', 'temp_min', 'wind', 'weather']

df = df0[new_cols]

df[df.isnull().any(axis=1)] # any missing data in columns
df.isnull().any()

date             False
precipitation    False
temp_max         False
temp_min         False
wind             False
weather          False
dtype: bool

In [12]:
num_stdv = 1

# Define the labels dictionary
labels = {
    'precipitation': ['low', 'moderate', 'high'],
    'temp_max': ['low', 'moderate', 'high'],
    'temp_min': ['low', 'moderate', 'high'],
    'wind': ['low', 'moderate', 'high']
}

# Create bounds for continuous labels
for col in df.columns:
    if col in labels:
        col_mean = df[col].mean()
        col_stvd = df[col].std()
        lower = col_mean - col_stvd * num_stdv
        upper = col_mean + col_stvd * num_stdv
        bins = [-float('inf'), lower, upper, float('inf')]
        df[col] = pd.cut(df[col], bins=bins, labels=labels[col])


df.head()

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,2012-01-01,moderate,moderate,moderate,high,drizzle
1,2012-01-02,high,moderate,low,moderate,rain
2,2012-01-03,moderate,moderate,moderate,moderate,rain
3,2012-01-04,high,moderate,moderate,high,rain
4,2012-01-05,moderate,low,low,high,rain


In [13]:
# Define the hierarchy
weather_model_h1 = BayesianNetwork([
       ('weather', 'precipitation'), ('weather', 'wind'),
       ('precipitation', 'temp_max'), ('precipitation', 'temp_min'),
       ('wind', 'temp_min'), ('wind', 'temp_max')
])

# And, the states for each variables
precipitation_states = ['low', 'mid', 'high']
temp_max_states = ['low', 'mid', 'high']
temp_min_states = ['low', 'mid', 'high']
wind_states = ['low', 'mid', 'high']
weather_states = ['drizzle', 'rain', 'sun', 'snow', 'fog']

In [14]:
# Calculate Probabilities

# Weather does not have any parents so all we need are the marginal probabilities of observing each weather type
weather_marginal = (df['weather'].value_counts()/len(df['weather'])).round(3)
weather_marginal = np.array([[value] for value in weather_marginal])


# Joint Propabilities
# Create dict where key=parent, value=child
var_dict = {
           ('weather',): ['precipitation', 'wind'],
           ('precipitation', 'wind'): ['temp_max'],
           ('wind', 'precipitation'): ['temp_min'],
           }

# Create conditional distributions and store results in a list
cpd_lst = []
for key, value in var_dict.items():
   length = len(value)
   for i in range(length):
       value_given_key = df.groupby(list(key))[value[i]].value_counts(normalize=True).sort_index()
       cpd = value_given_key.unstack(fill_value=0).to_numpy().T
       cpd_lst.append(cpd)


# Note that we get 3 Nan values in the above conditional distributions. This is because one of the type of precipitation (low) did not contain any relation with temp_max.
# Therefore, normalization, does not produce the intended result.
# To mitigate this, we replace Nan with the equal probability within the three values, i.e., 0.33
for cpd in cpd_lst:
    for i in range(len(cpd[0])):
        col = cpd[:,i]
        if np.array_equal(col, np.array([0., 0., 0.])):
            cpd[:,i] = .33

# print(cpd_lst)

  value_given_key = df.groupby(list(key))[value[i]].value_counts(normalize=True).sort_index()
  value_given_key = df.groupby(list(key))[value[i]].value_counts(normalize=True).sort_index()


In [15]:
# Creating tabular conditional probability distribution
weather_cpd = TabularCPD(variable='weather', variable_card=5, values=weather_marginal, state_names={'weather': weather_states})

precipitation_cpd = TabularCPD(variable='precipitation', variable_card=3, evidence=['weather'], evidence_card=[5],
                              values=cpd_lst[0], state_names={'precipitation': precipitation_states, 'weather': weather_states})

wind_cpd = TabularCPD(variable='wind', variable_card=3, evidence=['weather'], evidence_card=[5],
                              values=cpd_lst[1], state_names={'wind': wind_states, 'weather': weather_states})

temp_max_cpd = TabularCPD(variable='temp_max', variable_card=3, evidence=['precipitation', 'wind'], evidence_card=[3, 3],
                              values=cpd_lst[2], state_names={'temp_max': temp_max_states, 'precipitation': precipitation_states, 'wind': wind_states})

temp_min_cpd = TabularCPD(variable='temp_min', variable_card=3, evidence=['wind', 'precipitation'], evidence_card=[3, 3],
                              values=cpd_lst[3], state_names={'temp_min': temp_min_states, 'wind': wind_states, 'precipitation': precipitation_states})

In [16]:
# Add CPDs and factors to the model
weather_model_h1.add_cpds(weather_cpd, precipitation_cpd, wind_cpd, temp_max_cpd, temp_min_cpd)
# Check if model is consistent
weather_model_h1.check_model()

# Store the model
%store weather_model_h1

Stored 'weather_model_h1' (BayesianNetwork)
