In [47]:
import random
from collections import defaultdict

import torch
from pandas import DataFrame
from sklearn.model_selection import train_test_split
from sklearn import datasets

# Load / Generate Data

In [48]:
# Synthetic vocabulary of 5000 placeholder tokens: "word0" ... "word4999".
words = [f"word{index}" for index in range(5000)]
words

['word0',
 'word1',
 'word2',
 'word3',
 'word4',
 'word5',
 'word6',
 'word7',
 'word8',
 'word9',
 'word10',
 'word11',
 'word12',
 'word13',
 'word14',
 'word15',
 'word16',
 'word17',
 'word18',
 'word19',
 'word20',
 'word21',
 'word22',
 'word23',
 'word24',
 'word25',
 'word26',
 'word27',
 'word28',
 'word29',
 'word30',
 'word31',
 'word32',
 'word33',
 'word34',
 'word35',
 'word36',
 'word37',
 'word38',
 'word39',
 'word40',
 'word41',
 'word42',
 'word43',
 'word44',
 'word45',
 'word46',
 'word47',
 'word48',
 'word49',
 'word50',
 'word51',
 'word52',
 'word53',
 'word54',
 'word55',
 'word56',
 'word57',
 'word58',
 'word59',
 'word60',
 'word61',
 'word62',
 'word63',
 'word64',
 'word65',
 'word66',
 'word67',
 'word68',
 'word69',
 'word70',
 'word71',
 'word72',
 'word73',
 'word74',
 'word75',
 'word76',
 'word77',
 'word78',
 'word79',
 'word80',
 'word81',
 'word82',
 'word83',
 'word84',
 'word85',
 'word86',
 'word87',
 'word88',
 'word89',
 'word90',
 'word91'

In [49]:
# Seed the global `random` generator so that "Restart Kernel -> Run All"
# regenerates the same sentences. The label cell further down draws from the
# same generator, so its labels become reproducible too when the cells are
# run in order.
random.seed(0)

sentences = []

for _s in range(1000):
    # Each synthetic sentence has 3 to 7 words (randint bounds are inclusive),
    # sampled from the vocabulary with replacement.
    length = random.randint(3,7)
    sentences.append(random.choices(population=words, k=length))

sentences

[['word606', 'word2990', 'word3780', 'word4792'],
 ['word3358', 'word4256', 'word3928'],
 ['word1498', 'word2380', 'word2581', 'word4906', 'word2777', 'word2298'],
 ['word3602',
  'word1135',
  'word1519',
  'word807',
  'word2822',
  'word240',
  'word2405'],
 ['word396',
  'word2868',
  'word2942',
  'word2694',
  'word928',
  'word2162',
  'word586'],
 ['word422', 'word2924', 'word197', 'word2081', 'word1356', 'word1164'],
 ['word4141', 'word1672', 'word4985', 'word672'],
 ['word2153',
  'word3830',
  'word4804',
  'word2366',
  'word4844',
  'word2137',
  'word2746'],
 ['word3348', 'word3470', 'word1137', 'word3434', 'word3884'],
 ['word4995', 'word2953', 'word4795'],
 ['word3595', 'word2214', 'word2932', 'word19', 'word2111', 'word4519'],
 ['word3952', 'word1591', 'word2536', 'word3443', 'word415', 'word3657'],
 ['word2402',
  'word767',
  'word1811',
  'word578',
  'word4033',
  'word1318',
  'word3963'],
 ['word1827', 'word1366', 'word4988'],
 ['word2290', 'word2706', 'word3614'

In [50]:
# Pair each sentence with a random binary label (one getrandbits(1) draw per
# sentence, in sentence order).
data = [(tokens, random.getrandbits(1)) for tokens in sentences]

# Two-column frame: column 0 holds the token lists, column 1 the labels.
df = DataFrame(data)
data_X = df[0]
data_y = df[1]
data_X, data_y

(0                [word606, word2990, word3780, word4792]
 1                         [word3358, word4256, word3928]
 2      [word1498, word2380, word2581, word4906, word2...
 3      [word3602, word1135, word1519, word807, word28...
 4      [word396, word2868, word2942, word2694, word92...
                              ...                        
 995    [word660, word2117, word4513, word3288, word35...
 996     [word2810, word1781, word311, word2769, word840]
 997                       [word2470, word1773, word1724]
 998                          [word428, word1038, word60]
 999    [word1543, word13, word84, word4964, word4526,...
 Name: 0, Length: 1000, dtype: object,
 0      0
 1      0
 2      1
 3      0
 4      1
       ..
 995    1
 996    0
 997    1
 998    1
 999    0
 Name: 1, Length: 1000, dtype: int64)

# Split Data

In [51]:
# First split: set aside 10% of the rows as the "evaluate" portion and keep
# the remaining 90% as the "learn" portion. random_state pins the shuffle.
X_learn, X_evaluate, y_learn, y_evaluate = train_test_split(
    data_X,
    data_y,
    test_size=0.1,
    random_state=0,
)

In [52]:
# Second split: divide the "learn" portion into train (80%) and test (20%).
X_train, X_test, y_train, y_test = train_test_split(
    X_learn,
    y_learn,
    test_size=0.2,
    random_state=0,
)
# Show the size of every split next to the full data set.
data_X.shape, X_train.shape, X_test.shape, X_evaluate.shape

((1000,), (720,), (180,), (100,))

# Build lexicon

In [53]:
def build_lexicon(X, y):
    """Count, per word, how often it occurs in sentences of each label.

    Args:
        X: pandas Series whose values are sentences (iterables of words).
        y: Label container indexable by the index labels of ``X``; every
            label must be ``0`` or ``1`` because only those two counters
            exist per word.

    Returns:
        A ``defaultdict`` mapping each word to ``{0: count, 1: count}``.
        A word occurring several times in one sentence is counted each time.
        Because it is a ``defaultdict``, indexing it with an unseen word
        creates (and stores) a zero-count entry for that word.
    """
    word_counts = defaultdict(lambda: {0:0, 1:0})
    for row_label in X.index.values:
        # Look both values up by index label so X and y stay aligned even
        # when the index is shuffled or non-contiguous.
        tokens, target = X[row_label], y[row_label]
        for token in tokens:
            word_counts[token][target] += 1
    return word_counts

In [54]:
# Build the word/label counts from the training split only.
global_lexicon = build_lexicon(X=X_train, y=y_train)
global_lexicon

defaultdict(<function __main__.build_lexicon.<locals>.<lambda>()>,
            {'word507': {0: 0, 1: 1},
             'word3818': {0: 1, 1: 1},
             'word1788': {0: 0, 1: 1},
             'word1705': {0: 0, 1: 2},
             'word3655': {0: 0, 1: 1},
             'word2259': {0: 0, 1: 1},
             'word1917': {0: 0, 1: 3},
             'word4843': {0: 2, 1: 1},
             'word1362': {0: 1, 1: 1},
             'word2440': {0: 1, 1: 0},
             'word1906': {0: 1, 1: 0},
             'word773': {0: 1, 1: 0},
             'word288': {0: 1, 1: 1},
             'word2913': {0: 1, 1: 0},
             'word949': {0: 0, 1: 1},
             'word4141': {0: 1, 1: 2},
             'word857': {0: 2, 1: 3},
             'word3668': {0: 1, 1: 1},
             'word352': {0: 1, 1: 0},
             'word3078': {0: 1, 1: 0},
             'word1464': {0: 1, 1: 1},
             'word4638': {0: 1, 1: 1},
             'word1811': {0: 0, 1: 3},
             'word863': {0: 0, 1: 2},
    

# Build parameters

In [55]:
# Derive add-one smoothed parameters from the raw counts of every known word:
# alpha comes from the label-0 count, beta from the label-1 count.
for word, counts in global_lexicon.items():
    counts['alpha'] = counts[0] + 1
    counts['beta'] = counts[1] + 1

# Re-wrap the lexicon so that words never seen in training fall back to zero
# counts with alpha = beta = 1.
global_lexicon = defaultdict(
    lambda: {0: 0, 1: 0, 'alpha': 1, 'beta': 1},
    global_lexicon,
)
global_lexicon

defaultdict(<function __main__.<lambda>()>,
            {'word507': {0: 0, 1: 1, 'alpha': 1, 'beta': 2},
             'word3818': {0: 1, 1: 1, 'alpha': 2, 'beta': 2},
             'word1788': {0: 0, 1: 1, 'alpha': 1, 'beta': 2},
             'word1705': {0: 0, 1: 2, 'alpha': 1, 'beta': 3},
             'word3655': {0: 0, 1: 1, 'alpha': 1, 'beta': 2},
             'word2259': {0: 0, 1: 1, 'alpha': 1, 'beta': 2},
             'word1917': {0: 0, 1: 3, 'alpha': 1, 'beta': 4},
             'word4843': {0: 2, 1: 1, 'alpha': 3, 'beta': 2},
             'word1362': {0: 1, 1: 1, 'alpha': 2, 'beta': 2},
             'word2440': {0: 1, 1: 0, 'alpha': 2, 'beta': 1},
             'word1906': {0: 1, 1: 0, 'alpha': 2, 'beta': 1},
             'word773': {0: 1, 1: 0, 'alpha': 2, 'beta': 1},
             'word288': {0: 1, 1: 1, 'alpha': 2, 'beta': 2},
             'word2913': {0: 1, 1: 0, 'alpha': 2, 'beta': 1},
             'word949': {0: 0, 1: 1, 'alpha': 1, 'beta': 2},
             'word4141': {0: 1

# Classify

In [56]:
def classify(sentence, lexicon):
    """Score a sentence by pooling the ``alpha``/``beta`` values of its words.

    The ``'alpha'`` and ``'beta'`` entries of every word are summed over the
    sentence and the function returns ``alpha / (alpha + beta)``. In this
    notebook ``alpha`` is derived from the label-0 count and ``beta`` from the
    label-1 count, so the score is the pooled share of label-0 evidence.

    Args:
        sentence: Iterable of words.
        lexicon: Mapping from word to a dict with ``'alpha'`` and ``'beta'``
            entries. Words missing from the mapping use the mapping's own
            ``default_factory`` when it has one (e.g. a ``defaultdict``),
            otherwise ``alpha = beta = 1``. The lexicon is never modified.

    Returns:
        A 0-dim ``torch.Tensor`` holding the score. An empty sentence carries
        no evidence and yields ``tensor(0.5)``.
    """
    alpha_beta = []
    for word in sentence:
        if word in lexicon:
            entry = lexicon[word]
        else:
            # Indexing a defaultdict with an unseen word would insert it, so
            # merely scoring test sentences would grow the lexicon. Build the
            # default entry without storing it instead.
            make_default = getattr(lexicon, 'default_factory', None)
            if make_default is not None:
                entry = make_default()
            else:
                entry = {'alpha': 1, 'beta': 1}
        alpha_beta.append((entry['alpha'], entry['beta']))

    if not alpha_beta:
        # torch.sum over an empty 1-D tensor is 0-dim and cannot be indexed;
        # return the neutral score instead of raising IndexError.
        return torch.tensor(0.5)

    totals = torch.sum(torch.tensor(alpha_beta), dim=0)
    alpha = totals[0]
    beta = totals[1]
    return alpha/(alpha+beta)

In [57]:
# Smoke-test the classifier on a hand-written three-word sentence.
classify(sentence=["word1", "word500", "word7"], lexicon=global_lexicon)

tensor(0.5000)

In [58]:
# Evaluate