In [1]:
import random
from collections import defaultdict
from typing import Optional

import pyro.distributions as dist
import pyro.distributions.constraints
import torch
from pandas import DataFrame
from pyro.distributions import constraints
from pyroapi import pyro
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Load/Gen Data

In [2]:
words = ["word" + str(i) for i in range(5000)]
words

['word0',
 'word1',
 'word2',
 'word3',
 'word4',
 'word5',
 'word6',
 'word7',
 'word8',
 'word9',
 'word10',
 'word11',
 'word12',
 'word13',
 'word14',
 'word15',
 'word16',
 'word17',
 'word18',
 'word19',
 'word20',
 'word21',
 'word22',
 'word23',
 'word24',
 'word25',
 'word26',
 'word27',
 'word28',
 'word29',
 'word30',
 'word31',
 'word32',
 'word33',
 'word34',
 'word35',
 'word36',
 'word37',
 'word38',
 'word39',
 'word40',
 'word41',
 'word42',
 'word43',
 'word44',
 'word45',
 'word46',
 'word47',
 'word48',
 'word49',
 'word50',
 'word51',
 'word52',
 'word53',
 'word54',
 'word55',
 'word56',
 'word57',
 'word58',
 'word59',
 'word60',
 'word61',
 'word62',
 'word63',
 'word64',
 'word65',
 'word66',
 'word67',
 'word68',
 'word69',
 'word70',
 'word71',
 'word72',
 'word73',
 'word74',
 'word75',
 'word76',
 'word77',
 'word78',
 'word79',
 'word80',
 'word81',
 'word82',
 'word83',
 'word84',
 'word85',
 'word86',
 'word87',
 'word88',
 'word89',
 'word90',
 'word91'

In [3]:
sentences = []

for _s in range(1000):
    length = torch.distributions.LogNormal(1., .5).sample().int().item() * 3
    sentences.append(random.choices(population=words, k=length))

sentences

[['word423', 'word2944', 'word4244', 'word3978', 'word677', 'word3398'],
 ['word1509',
  'word2300',
  'word2943',
  'word4150',
  'word1064',
  'word885',
  'word3192',
  'word3321',
  'word4526'],
 ['word4026', 'word1143', 'word717'],
 ['word2675', 'word2554', 'word1346'],
 ['word846', 'word2039', 'word2593'],
 ['word1731',
  'word2719',
  'word3661',
  'word2105',
  'word2760',
  'word2458',
  'word2030',
  'word225',
  'word4180',
  'word2561',
  'word4152',
  'word4238'],
 ['word1912', 'word2295', 'word3956'],
 ['word1478',
  'word3687',
  'word4890',
  'word2852',
  'word1753',
  'word1980',
  'word2704',
  'word1699',
  'word1682'],
 ['word1330',
  'word3419',
  'word723',
  'word1516',
  'word414',
  'word1788',
  'word4962',
  'word3203',
  'word3336',
  'word862',
  'word2436',
  'word4104'],
 ['word3857',
  'word4318',
  'word3254',
  'word3511',
  'word4704',
  'word1919',
  'word1032',
  'word2049',
  'word3688',
  'word1301',
  'word2086',
  'word1488',
  'word1896',
  'w

In [4]:
data = [(sentence, random.getrandbits(1)) for sentence in sentences]
df = DataFrame(data)
data_X, data_y = df[0], df[1]
data_X, data_y

(0      [word423, word2944, word4244, word3978, word67...
 1      [word1509, word2300, word2943, word4150, word1...
 2                          [word4026, word1143, word717]
 3                         [word2675, word2554, word1346]
 4                          [word846, word2039, word2593]
                              ...                        
 995    [word1177, word228, word4889, word2232, word20...
 996    [word1064, word1686, word3162, word1044, word1...
 997    [word1106, word2270, word973, word1905, word42...
 998                       [word1209, word4882, word1284]
 999                        [word3898, word2536, word972]
 Name: 0, Length: 1000, dtype: object,
 0      0
 1      1
 2      1
 3      0
 4      1
       ..
 995    0
 996    1
 997    0
 998    0
 999    0
 Name: 1, Length: 1000, dtype: int64)

# Split Data

In [5]:
X_learn, X_evaluate, y_learn, y_evaluate = train_test_split(data_X, data_y, test_size=0.1, random_state=0)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_learn, y_learn, test_size=0.2, random_state=0)
data_X.shape, X_train.shape, X_test.shape, X_evaluate.shape

((1000,), (720,), (180,), (100,))

# Build lexicon

In [7]:
def build_lexicon(X, y):
    lexicon = defaultdict(lambda: {0: 0, 1: 0})
    for i in X.index.values:
        sentence = X[i]
        label = y[i]
        for word in sentence:
            lexicon[word][label] += 1
    return lexicon

In [8]:
global_lexicon = build_lexicon(X_train, y_train)
global_lexicon

defaultdict(<function __main__.build_lexicon.<locals>.<lambda>()>,
            {'word4860': {0: 1, 1: 2},
             'word1182': {0: 1, 1: 1},
             'word2724': {0: 1, 1: 1},
             'word2742': {0: 0, 1: 1},
             'word1163': {0: 0, 1: 2},
             'word2345': {0: 2, 1: 1},
             'word4299': {0: 1, 1: 2},
             'word3574': {0: 0, 1: 2},
             'word2976': {0: 0, 1: 1},
             'word2494': {0: 0, 1: 1},
             'word1613': {0: 2, 1: 1},
             'word3813': {0: 0, 1: 1},
             'word2825': {0: 1, 1: 1},
             'word91': {0: 0, 1: 2},
             'word771': {0: 1, 1: 1},
             'word4427': {0: 2, 1: 3},
             'word2482': {0: 1, 1: 1},
             'word2934': {0: 1, 1: 1},
             'word3228': {0: 0, 1: 1},
             'word4284': {0: 0, 1: 3},
             'word2644': {0: 0, 1: 1},
             'word4294': {0: 1, 1: 1},
             'word1882': {0: 1, 1: 1},
             'word1143': {0: 1, 1: 2},


# Build params

In [9]:
def model(alpha, beta):
    latent_prediction = pyro.sample("latent_prediction",
                                    dist.Beta(concentration0=torch.as_tensor(alpha, dtype=torch.float),
                                              concentration1=torch.as_tensor(beta, dtype=torch.float)))
    data = torch.concat((torch.zeros(alpha), torch.ones(beta)))
    with pyro.plate("N", alpha + beta):
        # observe datapoint i using the bernoulli
        # likelihood Bernoulli(f)
        return pyro.sample("obs", dist.Bernoulli(latent_prediction), obs=data)

In [10]:
def guide(alpha, beta):
    alpha_posterior = pyro.param("alpha_posterior", torch.tensor(1.0), constraint=constraints.positive)
    beta_posterior = pyro.param("beta_posterior", torch.tensor(1.0), constraint=constraints.positive)
    return pyro.sample("latent_prediction", dist.Beta(concentration0=alpha_posterior,
                                                      concentration1=beta_posterior))


In [11]:
def bayesian_update(alpha,
                    beta,
                    max_opt_steps=100,
                    loss_threshold=None,
                    clear_param_store=True,
                    cuda=False,
                    pbar: Optional[tqdm] = None):
    if clear_param_store:
        pyro.clear_param_store()
    if cuda is not None:
        if cuda and torch.cuda.is_available():
            torch.device("cuda")
        elif not cuda:
            torch.device("cpu")
    adam = pyro.optim.Adam({"lr": 0.0025})
    elbo = pyro.infer.Trace_ELBO()
    svi = pyro.infer.SVI(model, guide, adam, elbo)
    steps = 0
    loss = (loss_threshold or 0) + 1

    while steps < max_opt_steps and loss > (loss_threshold or loss - 1):
        loss = svi.step(alpha, beta)
        steps += 1
        if pbar is not None:
            pbar.update()
    pbar.update(max_opt_steps - steps)
    return pyro.param("alpha_posterior"), pyro.param("beta_posterior")

In [12]:
for word in global_lexicon:
    global_lexicon[word]['alpha'] = global_lexicon[word][0] + 1
    global_lexicon[word]['beta'] = global_lexicon[word][1] + 1

global_lexicon = defaultdict(lambda: {
    0: 0,
    1: 0,
    'alpha': 1,
    'beta': 1}, global_lexicon)
global_lexicon

defaultdict(<function __main__.<lambda>()>,
            {'word4860': {0: 1, 1: 2, 'alpha': 2, 'beta': 3},
             'word1182': {0: 1, 1: 1, 'alpha': 2, 'beta': 2},
             'word2724': {0: 1, 1: 1, 'alpha': 2, 'beta': 2},
             'word2742': {0: 0, 1: 1, 'alpha': 1, 'beta': 2},
             'word1163': {0: 0, 1: 2, 'alpha': 1, 'beta': 3},
             'word2345': {0: 2, 1: 1, 'alpha': 3, 'beta': 2},
             'word4299': {0: 1, 1: 2, 'alpha': 2, 'beta': 3},
             'word3574': {0: 0, 1: 2, 'alpha': 1, 'beta': 3},
             'word2976': {0: 0, 1: 1, 'alpha': 1, 'beta': 2},
             'word2494': {0: 0, 1: 1, 'alpha': 1, 'beta': 2},
             'word1613': {0: 2, 1: 1, 'alpha': 3, 'beta': 2},
             'word3813': {0: 0, 1: 1, 'alpha': 1, 'beta': 2},
             'word2825': {0: 1, 1: 1, 'alpha': 2, 'beta': 2},
             'word91': {0: 0, 1: 2, 'alpha': 1, 'beta': 3},
             'word771': {0: 1, 1: 1, 'alpha': 2, 'beta': 2},
             'word4427': {0: 

In [13]:
with tqdm(total=len(global_lexicon) * 1000) as pbar:
    for word in global_lexicon:
        alpha_posterior, beta_posterior = bayesian_update(global_lexicon[word]['alpha'],
                                                          global_lexicon[word]['beta'],
                                                          max_opt_steps=1000,
                                                          cuda=True,
                                                          pbar=pbar)
        global_lexicon[word]['alpha_posterior'] = alpha_posterior
        global_lexicon[word]['beta_posterior'] = beta_posterior

global_lexicon = defaultdict(lambda: {
    0: 0,
    1: 0,
    'alpha': 1,
    'beta': 1,
    'alpha_posterior': torch.tensor(1.0),
    'beta_posterior': torch.tensor(1.0),
}, global_lexicon)
global_lexicon

100%|██████████| 3334000/3334000 [1:08:23<00:00, 812.42it/s]


defaultdict(<function __main__.<lambda>()>,
            {'word4860': {0: 1,
              1: 2,
              'alpha': 2,
              'beta': 3,
              'alpha_posterior': tensor(1.2517, grad_fn=<AddBackward0>),
              'beta_posterior': tensor(1.7849, grad_fn=<AddBackward0>)},
             'word1182': {0: 1,
              1: 1,
              'alpha': 2,
              'beta': 2,
              'alpha_posterior': tensor(1.5115, grad_fn=<AddBackward0>),
              'beta_posterior': tensor(1.5852, grad_fn=<AddBackward0>)},
             'word2724': {0: 1,
              1: 1,
              'alpha': 2,
              'beta': 2,
              'alpha_posterior': tensor(1.5016, grad_fn=<AddBackward0>),
              'beta_posterior': tensor(1.5880, grad_fn=<AddBackward0>)},
             'word2742': {0: 0,
              1: 1,
              'alpha': 1,
              'beta': 2,
              'alpha_posterior': tensor(1.0284, grad_fn=<AddBackward0>),
              'beta_posterior': t

# Classify

In [14]:
def classify(sentence, lexicon):
    expectations = []
    for word in sentence:
        alpha = lexicon[word]['alpha_posterior']
        beta = lexicon[word]['beta_posterior']
        distribution = dist.Beta(alpha, beta)
        expectations.append(distribution.mean)

    expectations_t = torch.tensor(expectations)
    return torch.std_mean(expectations_t)

In [15]:
classify(["word1", "word500", "word7"], global_lexicon)

(tensor(0.1438), tensor(0.4954))

In [16]:
# Evaluate