# Seeded Poisson Factorization (SPF) example notebook

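This notebook walks through a minimal example of the SPF model: it loads a sample of Amazon product reviews, defines a set of seed keywords for each topic, fits the model, and inspects the resulting topics and classification accuracy.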

In [1]:
# Imports
from seededPF.SPF_model import SPF
import numpy as np
import pandas as pd
import tensorflow as tf

# Fix the TensorFlow random seed so repeated runs start from the same state
SEED = 42
tf.random.set_seed(SEED)

# Load the Amazon review sample and preview its first rows
DATA_PATH = "./data/10k_amazon.csv"
df1 = pd.read_csv(DATA_PATH)
df1.head()

Unnamed: 0.1,Unnamed: 0,productId,Title,userId,Helpfulness,Score,Time,Text,Cat1,Cat2,Cat3,tokens
0,15123,B00032G1S0,"Tuscan Whole Milk, 1 Gallon, 128 fl oz",A3B4TGHT06VIV9,19/20,5.0,1220659200,"It is a Tuscan Milkman,And he stoppeth only me...",grocery gourmet food,dairy eggs,milk,106
1,29859,B000O1LEK0,Kid Galaxy Mega X Morphibian Komodo dragon,A29BL1ME0A4S0U,1/1,4.0,1306800000,This is a fun car. The only problem is the sho...,toys games,electronics for kids,unknown,52
2,25129,B000G7WU50,Boon Flo Water Deflector and Protective Faucet...,A1M2CJJ859SZV7,1/1,2.0,1285372800,This does indeed deflect the water for a softe...,baby products,safety,bathroom safety,284
3,20101,B000K50KVO,Playtex Baby Einstein 6oz. Spill-Proof Trainer...,A1YU39HHJTF0F0,0/0,1.0,1254700800,because my little girl loves to chew on her si...,baby products,feeding,solid feeding,46
4,18841,B0001E8BJ2,Pre~Seed Intimate Moisturizer with 2 Early Pre...,A17YQZH6BBY9O4,2/2,5.0,1244851200,We got pregnant after using this product only ...,health personal care,health care,family planning contraceptives,49


In [2]:
# Seed words for each topic (ten per topic).
# Note: "son" is listed for both toys and baby, and "product" for both
# beauty and health, so these seed words are shared between topics.
pets = [
    "dog", "cat", "litter", "cats", "dogs",
    "food", "box", "collar", "water", "pet",
]
toys = [
    "toy", "game", "play", "fun", "old",
    "son", "year", "loves", "kids", "daughter",
]
beauty = [
    "hair", "skin", "product", "color", "scent",
    "smell", "used", "dry", "using", "products",
]
baby = [
    "baby", "seat", "diaper", "diapers", "stroller",
    "bottles", "son", "pump", "gate", "months",
]
health = [
    "product", "like", "razor", "shave", "time",
    "day", "shaver", "better", "work", "years",
]
grocery = [
    "tea", "taste", "flavor", "coffee", "sauce",
    "chocolate", "sugar", "eat", "sweet", "delicious",
]

# Map each topic label to its seed words; the labels are the values that the
# accuracy cell later compares against the `Cat1` column.
keywords = {
    "pet supplies": pets,
    "toys games": toys,
    "beauty": beauty,
    "baby products": baby,
    "health personal care": health,
    "grocery gourmet food": grocery,
}

## SPF model

In [3]:
# Create the SPF model from the keyword dictionary, with zero residual topics,
# and display its summary
spf1 = SPF(
    keywords=keywords,
    residual_topics=0,
)
spf1

Seeded Poisson Factorization (SPF) model initialized with 6 keyword topics and 0 residual topics.

In [4]:
# Pass the review texts to the model, which builds the data it needs
# internally (the cell output reports a document-term matrix)
review_texts = df1["Text"]
spf1.read_docs(review_texts)

DTM created with: 10000 documents and 13636 unique words!


In [5]:
# Fit the model: 150 epochs at learning rate 0.1, showing a progress bar but
# without TensorBoard logging or additional printed information.
# NOTE(review): the saved output of this cell ends in a KeyboardInterrupt, so
# the cells below carry no stored results; rerun this cell to completion first.
spf1.model_train(
    lr=0.1,
    epochs=150,
    tensorboard=False,
    print_information=False,
    print_progressbar=True,
)

Instructions for updating:
The `interpolate_nondiscrete` flag is deprecated; instead use `force_probs_to_zero_outside_support` (with the opposite sense).


KeyboardInterrupt: 

In [None]:
# Inspect the fit by plotting the model loss
# (presumably the loss recorded during training; confirm in SPF.plot_model_loss)
spf1.plot_model_loss()

In [None]:
# Compute the topic-word distributions of the model and display them
# (the return type is defined by SPF.calculate_topic_word_distributions, not visible here)
betas = spf1.calculate_topic_word_distributions()
betas

In [None]:
# Show, for every topic, the 50 words with the highest mean intensity
import pprint

top_words = spf1.print_topics(num_words=50)
pprint.pprint(top_words)

In [None]:
# Calculate model accuracy
from sklearn.metrics import classification_report, confusion_matrix

# Topic assignments estimated by the model; E_theta is returned alongside them
# and kept available for later inspection.
categories, E_theta = spf1.return_topics()

# Store the estimated topic next to the true top-level category and flag
# row by row whether the two agree.
df1["SPF_estimates"] = categories
df1["Accuracy"] = df1.Cat1 == df1.SPF_estimates

# classification_report returns a pre-formatted multi-line string, so it must be
# shown with print(); pprint.pprint() would emit the string's repr (quoted
# chunks with literal "\n") instead of a readable table.
print(classification_report(df1.Cat1, df1.SPF_estimates))

In [None]:
# Plot the seeded-topic distribution for the "grocery gourmet food" topic
# to inspect how its keywords behave
spf1.plot_seeded_topic_distribution(topic="grocery gourmet food")

In [None]:
# Plot the topic-word distribution of a single word (here the keyword "dog")
# within the "pet supplies" topic, with x_max set to 45
spf1.plot_word_distribution(
    word="dog",
    topic="pet supplies",
    x_max=45,
)