# INF-2600-1 24V Artificial Intelligence: Assignment 3 Pre-code
This code implements a Bayesian Network model for Analyzing Sensor Data for Weather Prediction using the pgmpy library in Python.


### BUILDING THE STRUCTURE OF BAYESIAN NETWORK: Using PgmPy
#### Install Package: `!pip install pgmpy`

In [255]:
# Including the necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from pgmpy.models import BayesianNetwork
from pgmpy.factors.discrete import TabularCPD

# Factoring the dataset

In [256]:
# Import data, make a copy of the original

df0 = pd.read_csv('seattle-weather.csv')
dfc1 = df0.copy()
dfc1.head()

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle
1,2012-01-02,10.9,10.6,2.8,4.5,rain
2,2012-01-03,0.8,11.7,7.2,2.3,rain
3,2012-01-04,20.3,12.2,5.6,4.7,rain
4,2012-01-05,1.3,8.9,2.8,6.1,rain


In [257]:
# Get characteristics of dataset including columns with missing data as well:
dfc1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1461 entries, 0 to 1460
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           1461 non-null   object 
 1   precipitation  1461 non-null   float64
 2   temp_max       1461 non-null   float64
 3   temp_min       1461 non-null   float64
 4   wind           1461 non-null   float64
 5   weather        1461 non-null   object 
dtypes: float64(4), object(2)
memory usage: 68.6+ KB


In [258]:
# Checking the unique values in the 'weather' column
unique_fields = dfc1['weather'].unique()
print(unique_fields)

['drizzle' 'rain' 'sun' 'snow' 'fog']


In [259]:
dfc1.describe()

Unnamed: 0,precipitation,temp_max,temp_min,wind
count,1461.0,1461.0,1461.0,1461.0
mean,3.029432,16.439083,8.234771,3.241136
std,6.680194,7.349758,5.023004,1.437825
min,0.0,-1.6,-7.1,0.4
25%,0.0,10.6,4.4,2.2
50%,0.0,15.6,8.3,3.0
75%,2.8,22.2,12.2,4.0
max,55.9,35.6,18.3,9.5


In [260]:
# Put categorical varaibles in a list
categorical_lst = ['date','weather']
# Create a seperate & smaller dataframe for categorical variables
dfc2a = pd.DataFrame(dfc1, columns=categorical_lst, copy=True)
dfc2a.head()

Unnamed: 0,date,weather
0,2012-01-01,drizzle
1,2012-01-02,rain
2,2012-01-03,rain
3,2012-01-04,rain
4,2012-01-05,rain


In [261]:
# Put all continuous variables into a list
continuous_lst = ['precipitation', 'temp_max', 'temp_min', 'wind']
# Create a seperate & smaller dataframe for our chosen variables. Use 'copy=True' so changes wont affect original
dfc2b = pd.DataFrame(dfc1, columns=continuous_lst, copy=True)
dfc2b.head()

Unnamed: 0,precipitation,temp_max,temp_min,wind
0,0.0,12.8,5.0,4.7
1,10.9,10.6,2.8,4.5
2,0.8,11.7,7.2,2.3
3,20.3,12.2,5.6,4.7
4,1.3,8.9,2.8,6.1


### Create new dataframe

In [262]:
# Create new df with variables we want to work with:
new_cols = ['date', 'precipitation', 'temp_max', 'temp_min', 'wind', 'weather']

df = df0[new_cols]
df.head()

Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,2012-01-01,0.0,12.8,5.0,4.7,drizzle
1,2012-01-02,10.9,10.6,2.8,4.5,rain
2,2012-01-03,0.8,11.7,7.2,2.3,rain
3,2012-01-04,20.3,12.2,5.6,4.7,rain
4,2012-01-05,1.3,8.9,2.8,6.1,rain


In [263]:
# Let's show all columns with missing data as well:
df[df.isnull().any(axis=1)] # any missing data in columns
df.isnull().any()

date             False
precipitation    False
temp_max         False
temp_min         False
wind             False
weather          False
dtype: bool

In [264]:
num_stdv = 1

# Define the labels dictionary
labels = {
    'precipitation': ['low', 'moderate', 'high'],
    'temp_max': ['low', 'moderate', 'high'],
    'temp_min': ['low', 'moderate', 'high'],
    'wind': ['low', 'moderate', 'high']
}

# Create bounds for continuous labels
for col in df.columns:
    if col in labels:
        col_mean = df[col].mean()
        col_stvd = df[col].std()
        lower = col_mean - col_stvd * num_stdv
        upper = col_mean + col_stvd * num_stdv
        bins = [-float('inf'), lower, upper, float('inf')]
        df[col] = pd.cut(df[col], bins=bins, labels=labels[col])

df.head()


Unnamed: 0,date,precipitation,temp_max,temp_min,wind,weather
0,2012-01-01,moderate,moderate,moderate,high,drizzle
1,2012-01-02,high,moderate,low,moderate,rain
2,2012-01-03,moderate,moderate,moderate,moderate,rain
3,2012-01-04,high,moderate,moderate,high,rain
4,2012-01-05,moderate,low,low,high,rain


# Creating Bayes Nets

In [265]:
# Define the hierarchy
weather_model = BayesianNetwork([
       ('weather', 'precipitation'), ('weather', 'wind'),
       ('precipitation', 'temp_max'),
       ('wind', 'temp_min')
])

# And, the states for each variables
precipitation_states = ['low', 'mid', 'high']
temp_max_states = ['low', 'mid', 'high']
temp_min_states = ['low', 'mid', 'high']
wind_states = ['low', 'mid', 'high']
weather_states = ['drizzle', 'rain', 'sun', 'snow', 'fog']

print(weather_states)

['drizzle', 'rain', 'sun', 'snow', 'fog']


In [266]:
# Calculate Probabilities

# Weather does not have any parents so all we need are the marginal probabilities of observing each weather type
weather_marginal = (df['weather'].value_counts()/len(df['weather'])).round(3)
weather_marginal = np.array([[value] for value in weather_marginal])


# Joint Propabilities
# Create dict where key=parent, value=child
var_dict = {
           'weather': ['precipitation', 'wind'],
           'precipitation': ['temp_max'],
           'wind': ['temp_min'],
           }

# Create conditional distributions and store results in a list
cpd_lst = []
for key, value in var_dict.items():
   length = len(value)
   for i in range(length):
       value_given_key = df.groupby(key)[value[i]].value_counts(normalize=True).sort_index()
       cpd = value_given_key.unstack(fill_value=0).to_numpy().T
       cpd_lst.append(cpd)

# Note that we get 3 Nan values in the above conditional distributions. This is because one of the type of precipitation (low) did not contain any relation with temp_max.
# Therefore, normalization, does not produce the intended result.
# To mitigate this, we replace Nan with the equal probability within the three values, i.e., 0.33
cpd_lst[2][:,0] = .33

# print(cpd_lst)

  value_given_key = df.groupby(key)[value[i]].value_counts(normalize=True).sort_index()
  value_given_key = df.groupby(key)[value[i]].value_counts(normalize=True).sort_index()


In [267]:
# Creating tabular conditional probability distribution
weather_cpd = TabularCPD(variable='weather', variable_card=5, values=weather_marginal, state_names={'weather': weather_states})

precipitation_cpd = TabularCPD(variable='precipitation', variable_card=3, evidence=['weather'], evidence_card=[5],
                              values=cpd_lst[0], state_names={'precipitation': precipitation_states, 'weather': weather_states})
wind_cpd = TabularCPD(variable='wind', variable_card=3, evidence=['weather'], evidence_card=[5],
                              values=cpd_lst[1], state_names={'wind': wind_states, 'weather': weather_states})
temp_max_cpd = TabularCPD(variable='temp_max', variable_card=3, evidence=['precipitation'], evidence_card=[3],
                              values=cpd_lst[2], state_names={'temp_max': temp_max_states, 'precipitation': precipitation_states})
temp_min_cpd = TabularCPD(variable='temp_min', variable_card=3, evidence=['wind'], evidence_card=[3],
                              values=cpd_lst[3], state_names={'temp_min': temp_min_states, 'wind': wind_states})

In [268]:
# Add CPDs and factors to the model
weather_model.add_cpds(weather_cpd, precipitation_cpd, wind_cpd, temp_max_cpd, temp_min_cpd)
# Check if model is consistent
weather_model.check_model()

# Store the model
%store weather_model

Stored 'weather_model' (BayesianNetwork)


In [269]:
# Viewing nodes of the model
weather_model.nodes()

NodeView(('weather', 'precipitation', 'wind', 'temp_max', 'temp_min'))

In [270]:
# Viewing edges of the model
weather_model.edges()

OutEdgeView([('weather', 'precipitation'), ('weather', 'wind'), ('precipitation', 'temp_max'), ('wind', 'temp_min')])

In [271]:
# Print the probability table of the weather node
print(weather_cpd)

# Print the probability table of the wind node
print(wind_cpd)

+------------------+-------+
| weather(drizzle) | 0.439 |
+------------------+-------+
| weather(rain)    | 0.438 |
+------------------+-------+
| weather(sun)     | 0.069 |
+------------------+-------+
| weather(snow)    | 0.036 |
+------------------+-------+
| weather(fog)     | 0.018 |
+------------------+-------+
+------------+----------------------+-----+--------------+
| weather    | weather(drizzle)     | ... | weather(fog) |
+------------+----------------------+-----+--------------+
| wind(low)  | 0.3018867924528302   | ... | 0.140625     |
+------------+----------------------+-----+--------------+
| wind(mid)  | 0.6792452830188679   | ... | 0.775        |
+------------+----------------------+-----+--------------+
| wind(high) | 0.018867924528301886 | ... | 0.084375     |
+------------+----------------------+-----+--------------+


In [272]:
# Independcies in the model
weather_model.get_independencies()

# Checking independcies of a particular node
weather_model.local_independencies("temp_max")

(temp_max ⟂ temp_min, weather, wind | precipitation)

# Task 1.2

In [273]:
from pgmpy.inference import VariableElimination

inference = VariableElimination(weather_model)

In [274]:
# Question 1:

# (a) What is the probability of high wind when the weather is sunny?
phi_query = inference.query(variables=['wind'], evidence={'weather' : 'sun'})
print("Probability of high wind when the weather is sunny:", phi_query.values[2])

# (b) What is the probability of sunny weather when the wind is high?
phi_query = inference.query(variables=['weather'], evidence={'wind' : 'high'})
print("Probability of sunny weather when the wind is high:", phi_query.values[2])


Probability of high wind when the weather is sunny: 0.2542901716068643
Probability of sunny weather when the wind is high: 0.24117511076942655


In [275]:
# Question 2:
# (a) Calculate all the possible joint probability and determine the best probable condition. Explain your results?

# Returns the highest probable state in the joint distribution of variables.
phi_query = inference.map_query(variables=['weather', 'precipitation', 'wind', 'temp_max', 'temp_min'], show_progress=False)
print("The most probable condition", phi_query)

    
# (b) What is the most probable condition for precipitation, wind and weather, combined?

# Returns the highest probable state in the joint distribution of variables.
phi_query = inference.map_query(variables=['weather', 'precipitation', 'wind'], show_progress=False)
print("The most probable condition for precipitation, wind and weather", phi_query)



The most probable condition {'temp_min': 'mid', 'precipitation': 'mid', 'weather': 'drizzle', 'wind': 'mid', 'temp_max': 'mid'}
The most probable condition for precipitation, wind and weather {'weather': 'drizzle', 'precipitation': 'mid', 'wind': 'mid'}


In [276]:
# Question 3. Find the probability associated with each weather, given that the precipitation is medium? Explain your result.
phi_query = inference.query(variables=['weather'], evidence={'precipitation' : 'mid'})

print("The probability associated with each weather state, given that the precipitation is medium")
print(phi_query)


The probability associated with each weather state, given that the precipitation is medium
+------------------+----------------+
| weather          |   phi(weather) |
| weather(drizzle) |         0.4508 |
+------------------+----------------+
| weather(rain)    |         0.4498 |
+------------------+----------------+
| weather(sun)     |         0.0553 |
+------------------+----------------+
| weather(snow)    |         0.0256 |
+------------------+----------------+
| weather(fog)     |         0.0185 |
+------------------+----------------+


In [277]:
# Question 4. What is the probability of each weather condition given that precipitation is medium and wind is low or medium? 
# Explain your method and results. How does the result change with the addition of wind factor compared to question 3 of Task 1.2?
phi_query = inference.query(variables=['weather'], evidence={'precipitation' : 'mid', 'wind' : 'low', 'wind' : 'mid'})

print("The probability associated with each weather state, given that precipitation is medium and wind is low or medium")
print(phi_query)


The probability associated with each weather state, given that precipitation is medium and wind is low or medium
+------------------+----------------+
| weather          |   phi(weather) |
| weather(drizzle) |         0.4872 |
+------------------+----------------+
| weather(rain)    |         0.4180 |
+------------------+----------------+
| weather(sun)     |         0.0564 |
+------------------+----------------+
| weather(snow)    |         0.0157 |
+------------------+----------------+
| weather(fog)     |         0.0228 |
+------------------+----------------+


# Task 1.3 - Approximate Inference

## Likelihood Weighted Sample

Generates weighted sample(s) from joint distribution of the Bayesian Network, that comply with the given evidence.

In [278]:
from pgmpy.factors.discrete import State
from pgmpy.sampling import BayesianModelSampling

inference = BayesianModelSampling(weather_model)

In [279]:
# Repeat Q.1. (a) of Task 1.2 - What is the probability of high wind when the weather is sunny?
evidence = [State('weather', 'sun')]

# Generates weighted sample(s) from joint distribution of the Bayesian Network, that comply with the given evidence
weighted_sample = inference.likelihood_weighted_sample(evidence=evidence, size=100000, show_progress=False)

weighted_probability = weighted_sample['wind'].value_counts(normalize=True)['high']
print("The probability of high wind when the weather is sunny: ", weighted_probability)

The probability of high wind when the weather is sunny:  0.25411


  df = pd.DataFrame.from_records(samples)


In [280]:
# Repeat Q.1. (b) of Task 1.2 - What is the probability of sunny weather when the wind is high?
evidence = [State('wind', 'high')]

# Generates weighted sample(s) from joint distribution of the Bayesian Network, that comply with the given evidence
weighted_sample = inference.likelihood_weighted_sample(evidence=evidence, size=100000, show_progress=False)

weighted_probability = weighted_sample['weather'].value_counts(normalize=True)['sun']
print("The probability of sunny weather when the wind is high: ", weighted_probability)

The probability of sunny weather when the wind is high:  0.06887


  df = pd.DataFrame.from_records(samples)


## Rejection Sampling

In [281]:
# Repeat Q.2 . (a) of Task 1.2 - Calculate all the possible joint probability and determine the best probable condition. Explain your results?
rejection_sample = inference.rejection_sample(size=10000, show_progress=False)

rejection_counts = rejection_sample.value_counts(normalize=True)

conditions = rejection_counts.idxmax()

condition = {}
for index, node in enumerate(weather_model.nodes()):
    condition[node] = conditions[index]

print("The most probable condition", condition)

Generating for node: temp_min: 100%|██████████| 5/5 [00:00<00:00, 128.21it/s]

The most probable condition {'weather': 'drizzle', 'precipitation': 'mid', 'wind': 'mid', 'temp_max': 'mid', 'temp_min': 'mid'}



  df = pd.DataFrame.from_records(samples)


In [282]:
# Repeat Q.2 . (b) of Task 1.2 - What is the most probable condition for precipitation, wind and weather, combined?
combination = ['precipitation', 'wind', 'weather']
rejection_sample = inference.rejection_sample(size=10000, show_progress=False)[combination]

rejection_counts = rejection_sample.value_counts(normalize=True)

conditions = rejection_counts.idxmax()

condition = {}
for index, node in enumerate(combination):
    condition[node] = conditions[index]

print("The most probable condition for precipitation, wind and weather", condition)

Generating for node: temp_min: 100%|██████████| 5/5 [00:00<00:00, 126.40it/s]

The most probable condition for precipitation, wind and weather {'precipitation': 'mid', 'wind': 'mid', 'weather': 'drizzle'}



  df = pd.DataFrame.from_records(samples)


## Approx Inference

In [283]:
from pgmpy.inference import ApproxInference

inference = ApproxInference(weather_model)

In [284]:
# Repeat Q.3 of Task 1.2 - Find the probability associated with each weather, given that the precipitation is medium? Explain your result.
phi_query = inference.query(variables=['weather'], evidence={'precipitation' : 'mid'}, show_progress=False)

print("The probability associated with each weather state, given that the precipitation is medium")
print(phi_query)


The probability associated with each weather state, given that the precipitation is medium
+------------------+----------------+
| weather          |   phi(weather) |
| weather(drizzle) |         0.4472 |
+------------------+----------------+
| weather(sun)     |         0.0564 |
+------------------+----------------+
| weather(rain)    |         0.4532 |
+------------------+----------------+
| weather(fog)     |         0.0168 |
+------------------+----------------+
| weather(snow)    |         0.0264 |
+------------------+----------------+


  df = pd.DataFrame.from_records(samples)


# Normal Sampling

In [285]:
# Repeat Q.4 of Task 1.2 - What is the probability of each weather condition given that precipitation is medium and wind is low or medium? Explain your method and results. How does the result change with the addition of wind factor compared to question 3 of Task 1.2?
inference = BayesianModelSampling(weather_model)

forward_sample = inference.forward_sample(size=10000, show_progress=False)

conditioned_samples = forward_sample[(forward_sample['precipitation'] == 'mid') & (forward_sample['wind'].isin(['low', 'mid']))]

weather_probabilities = conditioned_samples['weather'].value_counts(normalize=True)

print("The probability associated with each weather state, given that precipitation is medium and wind is low or medium")
print(weather_probabilities)

  df = pd.DataFrame.from_records(samples)


The probability associated with each weather state, given that precipitation is medium and wind is low or medium
weather
drizzle    0.470228
rain       0.451729
sun        0.045315
fog        0.019921
snow       0.012806
Name: proportion, dtype: float64
