# Seeded Poisson Factorization (SPF) example notebook

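This notebook walks through a minimal end-to-end example of the SPF model: it loads a sample of Amazon product reviews, defines seed keywords for six product categories, trains the model, and compares the estimated topics with the labelled categories.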

In [17]:
# Imports
from SPF.SPF_model import SPF
import numpy as np
import pandas as pd
import tensorflow as tf

# Reproducibility: fix TensorFlow's global random seed before any model work
SEED = 42
tf.random.set_seed(SEED)

# Load the Amazon review sample and preview its first rows
DATA_PATH = "./data/10k_amazon.csv"
df1 = pd.read_csv(DATA_PATH)
df1.head()

Unnamed: 0.1,Unnamed: 0,productId,Title,userId,Helpfulness,Score,Time,Text,Cat1,Cat2,Cat3,tokens
0,7290,B0002VAUGQ,Panasonic ES8152NC Rechargeable Pivot Action P...,A209EUVOA7XUE1,1/1,5.0,1162598400,For most of my adult life I have used electric...,health personal care,personal care,shaving hair removal,50
1,31407,B0002X7RYM,The First Years 5-in-1 Carry Me Near Sleep System,A3IGS8PE0SGXAC,0/1,2.0,1313280000,I bought this item because of all the good rev...,baby products,nursery,furniture,59
2,31992,B000BQSID4,DUSTPAN W/HOOD COPPER [Kitchen],A3FVPUDGN016BX,12/12,1.0,1315612800,This dust pan is probably the worst item I hav...,health personal care,household supplies,cleaning tools,94
3,30173,B000E3CG0K,Paraffin Wax Refill With Heat Retaining Capacity,A3OU3EMLLTMMT9,0/0,5.0,1308096000,I purchased this product for my Dr. Scholl's p...,beauty,skin care,hands nails,79
4,36061,B000LBTE1O,Gerber Training Pants (6-pairs),ALWLY040H253N,3/4,4.0,1329350400,My daughter is 19months old (34.5 inches long ...,baby products,bathing skin care,unknown,118


In [18]:
# Seed words for each topic: ten keywords per product category.
# NOTE: "son" seeds both the toys and the baby topic, and "product" seeds both
# the beauty and the health topic.
pets = [
    "dog", "cat", "litter", "cats", "dogs",
    "food", "box", "collar", "water", "pet",
]
toys = [
    "toy", "game", "play", "fun", "old",
    "son", "year", "loves", "kids", "daughter",
]
beauty = [
    "hair", "skin", "product", "color", "scent",
    "smell", "used", "dry", "using", "products",
]
baby = [
    "baby", "seat", "diaper", "diapers", "stroller",
    "bottles", "son", "pump", "gate", "months",
]
health = [
    "product", "like", "razor", "shave", "time",
    "day", "shaver", "better", "work", "years",
]
grocery = [
    "tea", "taste", "flavor", "coffee", "sauce",
    "chocolate", "sugar", "eat", "sweet", "delicious",
]

# Map each topic name to its seed words; the keys appear to mirror the labels
# found in the Cat1 column of the data.
keywords = {
    "pet supplies": pets,
    "toys games": toys,
    "beauty": beauty,
    "baby products": baby,
    "health personal care": health,
    "grocery gourmet food": grocery,
}

## SPF model

In [19]:
# Create the SPF model from the keyword dictionary, requesting zero residual topics;
# the bare name on the last line displays the model's summary
spf1 = SPF(keywords=keywords, residual_topics=0)
spf1

Seeded Poisson Factorization (SPF) model initialized with 6 keyword topics and 0 residual topics.

In [20]:
# Hand the review texts to the model so it can build the data it needs in the backend
review_texts = df1["Text"]
spf1.read_docs(review_texts)

DTM created with: 30000 documents and 23135 unique words!


In [21]:
# Fit the model: 150 epochs at learning rate 0.1, showing only the progress bar
# (no TensorBoard logging, no additional printed information).
# NOTE: the saved progress bar reports roughly 4.6 s per epoch, so a full run
# takes on the order of ten minutes.
spf1.model_train(
    lr=0.1,
    epochs=150,
    tensorboard=False,
    print_information=False,
    print_progressbar=True,
)

|**************--------------------------------------------------------------------------------------| 14.00% [4.5970/s per epoch | Negative ELBO: 7110841.5]

KeyboardInterrupt: 

In [None]:
# Plot the model loss after training
# (presumably the negative ELBO reported by the training progress bar — confirm against the SPF documentation)
spf1.plot_model_loss()

In [None]:
# Compute the topic-word distributions and display the result
# (the return type is not visible here — presumably one entry per topic and word; verify)
betas = spf1.calculate_topic_word_distributions()
betas

In [None]:
# Show, for every topic, the 50 words with the highest mean intensity
import pprint

top_words_per_topic = spf1.print_topics(num_words=50)
pprint.pprint(top_words_per_topic)

In [None]:
# Evaluate the model: compare its per-document topic estimates with the labelled
# top-level category (Cat1)
from sklearn.metrics import classification_report, confusion_matrix

# calculate_topics returns the estimated category per document together with
# E_theta (presumably the expected document-topic intensities — confirm in the SPF docs)
categories, E_theta = spf1.calculate_topics()
df1["SPF_estimates"] = categories
# Boolean flag per document: True where the estimate equals the labelled category
df1["Accuracy"] = df1["Cat1"] == df1["SPF_estimates"]

# classification_report returns a single preformatted multi-line string, so it
# must go through print(); pprint would display the string's repr with escaped
# newlines instead of the table.
print(classification_report(df1["Cat1"], df1["SPF_estimates"]))

In [None]:
# Inspect the seed keywords of one topic, here "grocery gourmet food"
spf1.plot_seeded_topic_distribution(topic="grocery gourmet food")

In [None]:
# Inspect the topic-word distribution of a single word within a topic: here the
# seed keyword "dog" in "pet supplies" (x_max presumably caps the x-axis — verify)
spf1.plot_word_distribution(word="dog", topic="pet supplies", x_max=45)