In [1]:
import json
import numpy as np
import pandas as pd
import pickle
import random

from atm import *

In [2]:
# Fetch the NLTK stopword list and the punkt_tab tokenizer data (a no-op when they
# are already present locally).
# NOTE(review): `nltk` is never imported explicitly in this notebook; presumably it
# leaks in through `from atm import *` - confirm, or add an explicit `import nltk`.
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/b.kabongo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/b.kabongo/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
class Config:
    """Settings container for this notebook.

    An instance is handed to `process_data` and `gibbs_sampling_atm`; the
    size fields start out unset and are assigned on the instance once the
    dataset has been loaded.
    """

    # Fixed settings.
    seed = 42                          # seeds the RNGs and the train sampling
    train_size = 0.8                   # fraction of rows sampled for training
    vocabulary_size = 10_000
    gibbs_sampling_iterations = 1000

    # Placeholders filled in after the data is read.
    n_users = n_items = n_topics = n_aspects = None

In [4]:
# Seed the stdlib and NumPy global generators from the single configured seed.
for seed_rng in (random.seed, np.random.seed):
    seed_rng(Config.seed)

In [5]:
# Settings instance; its dataset-dependent fields are filled in at the end of this cell.
config = Config()

# Raw table; the code below relies on its "user_id" and "item_id" columns.
data_df = pd.read_csv("../../aspects_datasets/Beer/data.csv")

# Assign each distinct raw identifier a dense integer index, numbered in the
# order `unique()` yields them.
user_vocab = {
    raw_user: index
    for index, raw_user in enumerate(data_df["user_id"].unique())
}
item_vocab = {
    raw_item: index
    for index, raw_item in enumerate(data_df["item_id"].unique())
}

# Replace the raw identifiers in the frame with their integer indices.
data_df["user_id"] = data_df["user_id"].map(user_vocab)
data_df["item_id"] = data_df["item_id"].map(item_vocab)

# Sizes derived from the data, followed by the hand-picked model sizes.
config.n_users = len(user_vocab)
config.n_items = len(item_vocab)
config.n_aspects = 4
config.n_topics = 10

In [6]:
# Training subset: a random sample of `train_size` of the rows, made repeatable
# by passing the configured seed as `random_state`.
train_df = data_df.sample(
    random_state=config.seed,
    frac=config.train_size,
)

# `process_data` comes from the `atm` star import; it yields the processed records
# and the vocabulary used by the cells below.
data, vocabulary = process_data(config, train_df)

Processing Data: 100%|[36m██████████[0m| 231199/231199 [01:34<00:00, 2456.36it/s]


In [7]:
# Peek at the first five processed records.
print(f"Data: {data[:5]}")

Data: [(587, 898, [['spectacular', 'murki', 'golden', 'bodi', 'top', 'larg', 'sticki', 'rocki', 'white', 'head', 'last', 'forev', 'build', 'kind', 'funni', 'shape', 'pretti', 'phenol', 'aroma', 'certain', 'clovey', 'also', 'medicin', 'ester', 'come', 'well', 'shape', 'honey', 'melon', 'pure', 'banana', 'eventu', 'close', 'earthi', 'mildest', 'cinnamon', 'note', 'appear', 'alreadi', 'quit', 'origin', 'fascin', 'featur', 'mouthfeel', 'dri', 'weizen', 'first', 'also', 'enorm', 'full', 'bodi', 'immedi', 'expand', 'thick', 'chewi', 'meringu', 'foam', 'mouth', 'uniqu', 'textur', 'full', 'wheati', 'breadi', 'perhap', 'liveliest', 'tastiest', 'origin', 'weizen', 'mani', 'way', 'get', 'respect', 'despit', 'unclean']]), (7963, 61, [['bottl', 'pour', 'clear', 'rosi', 'dark', 'brown', 'massiv', 'light', 'tan', 'head', 'sweet', 'milki', 'nose', 'nutti', 'chocol', 'charact', 'sweet', 'nutti', 'malt', 'flavor', 'like', 'pumpernickel', 'bread', 'mix', 'chocol', 'pleasant', 'smooth', 'sweet', 'much', '

In [8]:
# Peek at the first hundred vocabulary keys.
# NOTE(review): the saved outputs show stemmed tokens in `data` (e.g. 'bottl') but
# unstemmed keys here (e.g. 'bottle') - confirm the two are meant to line up.
print(f"Vocabulary: {list(vocabulary.keys())[:100]}")

Vocabulary: ['head', 'aroma', 'sweet', 'light', 'flavor', 'malt', 'bottle', 'beer', 'white', 'finish', 'nice', 'hops', 'taste', 'dark', 'caramel', 'good', 'medium', 'pours', 'brown', 'body', 'bitter', 'color', 'bit', 'like', 'notes', 'chocolate', 'amber', 'malty', 'little', 'nose', 'dry', 'hop', 'carbonation', 'well', 'citrus', 'clear', 'orange', 'malts', 'thin', 'bitterness', 'fruity', 'one', 'golden', 'slightly', 'roasted', 'alcohol', 'coffee', 'small', 'fruit', 'hoppy', 'creamy', 'smooth', 'palate', 'quite', 'much', 'black', 'lacing', 'flavour', 'really', 'mouthfeel', 'hazy', 'sweetness', 'bodied', 'mild', 'tan', 'great', 'yellow', 'pretty', 'slight', 'full', 'yeast', 'pale', 'flavors', 'ale', 'decent', 'big', 'strong', 'floral', 'poured', 'thanks', 'pour', 'thick', 'touch', 'deep', 'spicy', 'brew', 'hint', 'overall', 'red', 'colour', 'almost', 'lightly', 'vanilla', 'wheat', 'balanced', 'lots', 'sour', 'copper', 'tap', 'bad']


In [None]:
# Hyperparameters handed to the sampler: every vector is all ones, sized by the
# vocabulary, the number of aspects, or the number of topics.
eta = (1, 1)
Beta_w = np.ones(config.vocabulary_size)
Gamma_u, Gamma_i = np.ones(config.n_aspects), np.ones(config.n_aspects)
Alpha_u, Alpha_i = np.ones(config.n_topics), np.ones(config.n_topics)

# Long-running step.
# NOTE(review): the saved output stops at 364/1000 iterations and the cell has no
# execution count - confirm `params` comes from a completed run.
params = gibbs_sampling_atm(
    config,
    data,
    vocabulary,
    Beta_w,
    Gamma_u,
    Gamma_i,
    Alpha_u,
    Alpha_i,
    eta,
)

Gibbs Sampling:  36%|[36m███▋      [0m| 364/1000 [52:10<1:31:49,  8.66s/it]

In [None]:
# Display the names of the parameter groups returned by the sampler.
params.keys()

dict_keys(['Phi', 'Lambda_u', 'Lambda_i', 'Theta_u', 'Psi_i', 'Pi_u'])

In [None]:
# Persist the sampled parameters to disk. The file is opened in a `with` block so
# the handle is flushed and closed deterministically, even if pickling raises;
# the original passed a bare `open(...)` whose handle was never closed.
# NOTE(review): the target lives under ALFM/ while the sampler used above is
# `gibbs_sampling_atm` - confirm this is the intended destination.
with open("../../ALFM/Beer/params.pkl", "wb") as params_file:
    pickle.dump(params, params_file)