In [74]:
import random
from collections import defaultdict
from typing import Optional

from pyro.distributions import constraints
from pyroapi import pyro
import pyro.distributions as dist
import pyro.distributions.constraints

import torch
from pandas import DataFrame
from sklearn.model_selection import train_test_split
from sklearn import datasets
from tqdm import trange, tqdm

# Load/Gen Data

In [75]:
words = ["word" + str(i) for i in range(5000)]
words

['word0',
 'word1',
 'word2',
 'word3',
 'word4',
 'word5',
 'word6',
 'word7',
 'word8',
 'word9',
 'word10',
 'word11',
 'word12',
 'word13',
 'word14',
 'word15',
 'word16',
 'word17',
 'word18',
 'word19',
 'word20',
 'word21',
 'word22',
 'word23',
 'word24',
 'word25',
 'word26',
 'word27',
 'word28',
 'word29',
 'word30',
 'word31',
 'word32',
 'word33',
 'word34',
 'word35',
 'word36',
 'word37',
 'word38',
 'word39',
 'word40',
 'word41',
 'word42',
 'word43',
 'word44',
 'word45',
 'word46',
 'word47',
 'word48',
 'word49',
 'word50',
 'word51',
 'word52',
 'word53',
 'word54',
 'word55',
 'word56',
 'word57',
 'word58',
 'word59',
 'word60',
 'word61',
 'word62',
 'word63',
 'word64',
 'word65',
 'word66',
 'word67',
 'word68',
 'word69',
 'word70',
 'word71',
 'word72',
 'word73',
 'word74',
 'word75',
 'word76',
 'word77',
 'word78',
 'word79',
 'word80',
 'word81',
 'word82',
 'word83',
 'word84',
 'word85',
 'word86',
 'word87',
 'word88',
 'word89',
 'word90',
 'word91'

In [76]:
sentences = []

for _s in range(1000):
    length = random.randint(3, 7)
    sentences.append(random.choices(population=words, k=length))

sentences

[['word4761', 'word177', 'word1688'],
 ['word790', 'word1779', 'word3371', 'word901', 'word1656', 'word4557'],
 ['word581',
  'word1723',
  'word1463',
  'word4353',
  'word2796',
  'word1134',
  'word37'],
 ['word19', 'word603', 'word2602', 'word4325', 'word776', 'word2047'],
 ['word2942', 'word1195', 'word1117'],
 ['word4666', 'word4294', 'word3735', 'word1929'],
 ['word68',
  'word2250',
  'word197',
  'word4510',
  'word1957',
  'word1074',
  'word329'],
 ['word862', 'word245', 'word3020', 'word4851'],
 ['word934', 'word121', 'word161', 'word3105', 'word1133', 'word75'],
 ['word1916', 'word2963', 'word2218', 'word2638', 'word632'],
 ['word738',
  'word4980',
  'word733',
  'word2118',
  'word3340',
  'word1746',
  'word1574'],
 ['word3305', 'word858', 'word654'],
 ['word119', 'word2499', 'word2006'],
 ['word2586', 'word1388', 'word2611'],
 ['word2578',
  'word3322',
  'word1610',
  'word1601',
  'word1320',
  'word2472',
  'word1424'],
 ['word2337', 'word3484', 'word2302', 'word474

In [77]:
data = [(sentence, random.getrandbits(1)) for sentence in sentences]
df = DataFrame(data)
data_X, data_y = df[0], df[1]
data_X, data_y

(0                          [word4761, word177, word1688]
 1      [word790, word1779, word3371, word901, word165...
 2      [word581, word1723, word1463, word4353, word27...
 3      [word19, word603, word2602, word4325, word776,...
 4                         [word2942, word1195, word1117]
                              ...                        
 995     [word983, word1048, word3980, word656, word4894]
 996              [word3515, word3550, word465, word4416]
 997              [word4569, word448, word3400, word3153]
 998    [word1347, word3925, word4679, word1504, word1...
 999                [word393, word4256, word88, word1908]
 Name: 0, Length: 1000, dtype: object,
 0      1
 1      1
 2      1
 3      0
 4      0
       ..
 995    1
 996    1
 997    1
 998    1
 999    1
 Name: 1, Length: 1000, dtype: int64)

# Split Data

In [78]:
X_learn, X_evaluate, y_learn, y_evaluate = train_test_split(data_X, data_y, test_size=0.1, random_state=0)

In [79]:
X_train, X_test, y_train, y_test = train_test_split(X_learn, y_learn, test_size=0.2, random_state=0)
data_X.shape, X_train.shape, X_test.shape, X_evaluate.shape

((1000,), (720,), (180,), (100,))

# Build lexicon

In [80]:
def build_lexicon(X, y):
    lexicon = defaultdict(lambda: {0: 0, 1: 0})
    for i in X.index.values:
        sentence = X[i]
        label = y[i]
        for word in sentence:
            lexicon[word][label] += 1
    return lexicon

In [81]:
global_lexicon = build_lexicon(X_train, y_train)
global_lexicon

defaultdict(<function __main__.build_lexicon.<locals>.<lambda>()>,
            {'word721': {0: 0, 1: 3},
             'word1014': {0: 1, 1: 1},
             'word2982': {0: 0, 1: 1},
             'word1144': {0: 0, 1: 2},
             'word1579': {0: 0, 1: 1},
             'word2220': {0: 0, 1: 1},
             'word2046': {0: 3, 1: 1},
             'word1245': {0: 1, 1: 0},
             'word4522': {0: 1, 1: 0},
             'word3592': {0: 1, 1: 0},
             'word1993': {0: 1, 1: 1},
             'word4893': {0: 1, 1: 1},
             'word1725': {0: 0, 1: 1},
             'word3860': {0: 0, 1: 1},
             'word2486': {0: 0, 1: 3},
             'word3223': {0: 0, 1: 2},
             'word1787': {0: 2, 1: 0},
             'word2741': {0: 1, 1: 0},
             'word342': {0: 1, 1: 0},
             'word3459': {0: 1, 1: 0},
             'word1540': {0: 1, 1: 0},
             'word4871': {0: 2, 1: 1},
             'word4966': {0: 1, 1: 0},
             'word137': {0: 1, 1: 1},


# Build params

In [82]:
def model(alpha, beta):
    latent_prediction = pyro.sample("latent_prediction", dist.Beta(torch.tensor(1.), torch.tensor(1.)))
    data = torch.concat((torch.zeros(alpha), torch.ones(beta)))
    with pyro.plate("N", alpha + beta):
        # observe datapoint i using the bernoulli
        # likelihood Bernoulli(f)
        return pyro.sample("obs", dist.Bernoulli(latent_prediction), obs=data)

In [83]:
def guide(alpha, beta):
    alpha_posterior = pyro.param("alpha_posterior", torch.as_tensor(alpha, dtype=torch.float),
                                 constraint=constraints.positive)
    beta_posterior = pyro.param("beta_posterior", torch.as_tensor(beta, dtype=torch.float),
                                constraint=constraints.positive)
    return pyro.sample("latent_prediction", dist.Beta(concentration0=alpha_posterior,
                                                      concentration1=beta_posterior))


In [84]:
def bayesian_update(alpha, beta, max_opt_steps=100, loss_threshold=None, clear_param_store=True,
                    pbar: Optional[tqdm] = None):
    if clear_param_store:
        pyro.clear_param_store()
    adam = pyro.optim.Adam({"lr": 0.0025})
    elbo = pyro.infer.Trace_ELBO()
    svi = pyro.infer.SVI(model, guide, adam, elbo)
    steps = 0
    loss = (loss_threshold or 0) + 1
    while steps < max_opt_steps and loss > (loss_threshold or loss - 1):
        loss = svi.step(alpha, beta)
        steps += 1
        if pbar is not None:
            pbar.update()
    pbar.update(max_opt_steps - steps)
    return pyro.param("alpha_posterior"), pyro.param("beta_posterior")

In [85]:
for word in global_lexicon:
    global_lexicon[word]['alpha'] = global_lexicon[word][0] + 1
    global_lexicon[word]['beta'] = global_lexicon[word][1] + 1

global_lexicon = defaultdict(lambda: {
    0: 0,
    1: 0,
    'alpha': 1,
    'beta': 1}, global_lexicon)
global_lexicon

defaultdict(<function __main__.<lambda>()>,
            {'word721': {0: 0, 1: 3, 'alpha': 1, 'beta': 4},
             'word1014': {0: 1, 1: 1, 'alpha': 2, 'beta': 2},
             'word2982': {0: 0, 1: 1, 'alpha': 1, 'beta': 2},
             'word1144': {0: 0, 1: 2, 'alpha': 1, 'beta': 3},
             'word1579': {0: 0, 1: 1, 'alpha': 1, 'beta': 2},
             'word2220': {0: 0, 1: 1, 'alpha': 1, 'beta': 2},
             'word2046': {0: 3, 1: 1, 'alpha': 4, 'beta': 2},
             'word1245': {0: 1, 1: 0, 'alpha': 2, 'beta': 1},
             'word4522': {0: 1, 1: 0, 'alpha': 2, 'beta': 1},
             'word3592': {0: 1, 1: 0, 'alpha': 2, 'beta': 1},
             'word1993': {0: 1, 1: 1, 'alpha': 2, 'beta': 2},
             'word4893': {0: 1, 1: 1, 'alpha': 2, 'beta': 2},
             'word1725': {0: 0, 1: 1, 'alpha': 1, 'beta': 2},
             'word3860': {0: 0, 1: 1, 'alpha': 1, 'beta': 2},
             'word2486': {0: 0, 1: 3, 'alpha': 1, 'beta': 4},
             'word3223': {0

In [86]:
with tqdm(total=len(global_lexicon) * 10) as pbar:
    for word in global_lexicon:
        alpha_posterior, beta_posterior = bayesian_update(global_lexicon[word]['alpha'],
                                                          global_lexicon[word]['beta'], max_opt_steps=10, pbar=pbar)
        global_lexicon[word]['alpha_posterior'] = alpha_posterior
        global_lexicon[word]['beta_posterior'] = beta_posterior

global_lexicon = defaultdict(lambda: {
    0: 0,
    1: 0,
    'alpha': 1,
    'beta': 1,
    'alpha_posterior': torch.tensor(1.0),
    'beta_posterior': torch.tensor(1.0),
}, global_lexicon)
global_lexicon

100%|██████████| 25970/25970 [02:05<00:00, 207.60it/s]


defaultdict(<function __main__.<lambda>()>,
            {'word721': {0: 0,
              1: 3,
              'alpha': 1,
              'beta': 4,
              'alpha_posterior': tensor(1.0141, grad_fn=<AddBackward0>),
              'beta_posterior': tensor(3.9068, grad_fn=<AddBackward0>)},
             'word1014': {0: 1,
              1: 1,
              'alpha': 2,
              'beta': 2,
              'alpha_posterior': tensor(2.0261, grad_fn=<AddBackward0>),
              'beta_posterior': tensor(1.9781, grad_fn=<AddBackward0>)},
             'word2982': {0: 0,
              1: 1,
              'alpha': 1,
              'beta': 2,
              'alpha_posterior': tensor(0.9908, grad_fn=<AddBackward0>),
              'beta_posterior': tensor(2.0183, grad_fn=<AddBackward0>)},
             'word1144': {0: 0,
              1: 2,
              'alpha': 1,
              'beta': 3,
              'alpha_posterior': tensor(1.0169, grad_fn=<AddBackward0>),
              'beta_posterior': te

# Classify

In [87]:
def classify(sentence, lexicon):
    expectations = []
    for word in sentence:
        alpha = lexicon[word]['alpha_posterior']
        beta = lexicon[word]['beta_posterior']
        distribution = dist.Beta(alpha, beta)
        expectations.append(distribution.mean)

    expectations_t = torch.tensor(expectations)
    return torch.mean(expectations_t), torch.std_mean(expectations_t)

In [88]:
classify(["word1", "word500", "word7"], global_lexicon)

(tensor(0.4462), (tensor(0.0931), tensor(0.4462)))

In [89]:
# Evaluate