# Install libraries

In [1]:
!pip install pgmpy

Collecting pgmpy
[?25l  Downloading https://files.pythonhosted.org/packages/a3/0e/d9fadbfaa35e010c04d43acd3ae9fbefec98897dd7d61a6b7eb5a8b34072/pgmpy-0.1.14-py3-none-any.whl (331kB)
[K     |█                               | 10kB 16.8MB/s eta 0:00:01[K     |██                              | 20kB 19.9MB/s eta 0:00:01[K     |███                             | 30kB 23.6MB/s eta 0:00:01[K     |████                            | 40kB 27.4MB/s eta 0:00:01[K     |█████                           | 51kB 28.8MB/s eta 0:00:01[K     |██████                          | 61kB 29.4MB/s eta 0:00:01[K     |███████                         | 71kB 31.1MB/s eta 0:00:01[K     |████████                        | 81kB 31.5MB/s eta 0:00:01[K     |█████████                       | 92kB 31.7MB/s eta 0:00:01[K     |█████████▉                      | 102kB 32.0MB/s eta 0:00:01[K     |██████████▉                     | 112kB 32.0MB/s eta 0:00:01[K     |███████████▉                    | 122kB 32.0MB/

In [2]:
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pgmpy.models import BayesianModel
from pgmpy.sampling import BayesianModelSampling

# Collect data

The URL below lists the top 1,000 names in England and Wales in 2015. Only the boys' names are used to build the model here.

In [3]:
# Page listing the 2015 top-1000 baby names for England and Wales.
url = 'https://www.britishbabynames.com/blog/top-1000-names-in-england-and-wales-2015.html'

# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops on an HTTP error instead of parsing an error page
# as if it were the data.
r = requests.get(url, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.content, 'html.parser')

# Take every cell of the second <table> nested inside the page's first <table>
# (presumably the boys' list; verify against the page if its layout changes).
html_text = soup.find('table').find_all('table')[1].find_all('td')

In [4]:
# Turn each table cell into a list of characters, one list per name.
name_list = []

for i in html_text:
  # get_text() returns '' where .string would return None (cells wrapping
  # nested markup), so the replace() below cannot fail on a None value.
  tmp = i.get_text().replace('\xa0', '').strip()
  # Skip blank cells: they would otherwise become rows made only of padding.
  if not tmp:
    continue
  name_list.append(list(tmp))

In [5]:
# One row per name, one column per letter position; names shorter than the
# longest one are left with missing values in their trailing columns.
df = pd.DataFrame(name_list)

In [6]:
# Preview the first rows of the letter table
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,O,L,I,V,E,R,,,,,,
1,J,A,C,K,,,,,,,,
2,H,A,R,R,Y,,,,,,,
3,G,E,O,R,G,E,,,,,,
4,J,A,C,O,B,,,,,,,


In [7]:
# Fill the unused trailing positions with '*' so no cell is left missing
df = df.fillna('*')

In [8]:
# Re-check the preview: missing positions should now show '*'
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,O,L,I,V,E,R,*,*,*,*,*,*
1,J,A,C,K,*,*,*,*,*,*,*,*
2,H,A,R,R,Y,*,*,*,*,*,*,*
3,G,E,O,R,G,E,*,*,*,*,*,*
4,J,A,C,O,B,*,*,*,*,*,*,*


In [9]:
# Relabel the 12 positional columns 0..11 as w1..w12 (1-based letter position)
new_column_names = dict((idx, 'w{}'.format(idx + 1)) for idx in range(12))
df = df.rename(columns=new_column_names)

In [10]:
# Display the frame to confirm the w1..w12 column labels
df

Unnamed: 0,w1,w2,w3,w4,w5,w6,w7,w8,w9,w10,w11,w12
0,O,L,I,V,E,R,*,*,*,*,*,*
1,J,A,C,K,*,*,*,*,*,*,*,*
2,H,A,R,R,Y,*,*,*,*,*,*,*
3,G,E,O,R,G,E,*,*,*,*,*,*
4,J,A,C,O,B,*,*,*,*,*,*,*
...,...,...,...,...,...,...,...,...,...,...,...,...
1018,S,T,U,A,R,T,*,*,*,*,*,*
1019,T,A,D,H,G,*,*,*,*,*,*,*
1020,T,U,D,O,R,*,*,*,*,*,*,*
1021,U,M,A,I,R,*,*,*,*,*,*,*


# Create the model

In [11]:
# Define the network: each letter position points at the next two positions
# (w1 -> w2, w1 -> w3, w2 -> w3, w2 -> w4, ..., w11 -> w12), so every letter
# is conditioned on up to two preceding letters. The edges are produced in the
# same order as writing them out by hand.
model = BayesianModel([
    ('w' + str(pos), 'w' + str(pos + gap))
    for pos in range(1, 12)
    for gap in (1, 2)
    if pos + gap <= 12
])

# Fit the conditional probability distributions from the letter columns
model.fit(df)
cpds = model.get_cpds()

# Sanity check: print the distribution over first letters
print(model.get_cpds('w1'))

  import pandas.util.testing as tm


+-------+------------+
| w1(A) | 0.129032   |
+-------+------------+
| w1(B) | 0.0400782  |
+-------+------------+
| w1(C) | 0.0625611  |
+-------+------------+
| w1(D) | 0.0459433  |
+-------+------------+
| w1(E) | 0.0488759  |
+-------+------------+
| w1(F) | 0.0332356  |
+-------+------------+
| w1(G) | 0.0127077  |
+-------+------------+
| w1(H) | 0.0469208  |
+-------+------------+
| w1(I) | 0.0293255  |
+-------+------------+
| w1(J) | 0.0625611  |
+-------+------------+
| w1(K) | 0.0615836  |
+-------+------------+
| w1(L) | 0.0430108  |
+-------+------------+
| w1(M) | 0.0723363  |
+-------+------------+
| w1(N) | 0.0234604  |
+-------+------------+
| w1(O) | 0.0234604  |
+-------+------------+
| w1(P) | 0.0175953  |
+-------+------------+
| w1(Q) | 0.00195503 |
+-------+------------+
| w1(R) | 0.0752688  |
+-------+------------+
| w1(S) | 0.0576735  |
+-------+------------+
| w1(T) | 0.0420332  |
+-------+------------+
| w1(U) | 0.00391007 |
+-------+------------+
| w1(V) | 0

# Sampling

In [14]:
# Fix the seed so the generated names are repeatable across runs.
# NOTE(review): assumes this pgmpy release samples through NumPy's global
# random state; confirm, since other releases may need a seed passed to
# forward_sample instead.
np.random.seed(0)

# Draw five new names by forward sampling from the fitted network
sampler = BayesianModelSampling(model)
new_data = sampler.forward_sample(size=5)

# Output: '*' entries are the padding symbol used for unused letter positions
print(new_data)

  "Found unknown state name. Trying to switch to using all state names as state numbers"
Generating for node: w12: 100%|██████████| 12/12 [00:00<00:00, 16.66it/s]

  w1 w2 w3 w4 w5 w6 w7 w8 w9 w10 w11 w12
0  M  I  L  F  R  E  D  *  *   *   *   *
1  R  A  F  A  E  L  *  *  *   *   *   *
2  U  Z  A  I  S  *  *  *  *   *   *   *
3  A  A  R  R  Y  *  *  *  *   *   *   *
4  K  U  R  T  I  S  *  *  *   *   *   *



