## Testing Suite

In [29]:
# Set up the environment
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import LDA, LDAModel
import scipy as sp
import numpy as np
import pandas as pd
from scipy import io

# Input the data
data = sc.textFile("sample_data.txt")
parsedData = data.map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))
corpus = parsedData.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()

# Cluster the documents into three topics using LDA
ldaModel = LDA.train(corpus, k=3)

In [4]:
# Output topics. Each is a distribution over words (matching word count vectors)
print("Learned topics (as distributions over vocab of " + str(ldaModel.vocabSize()) + " words):")
topics = ldaModel.topicsMatrix()
for topic in range(3):
    print("Topic " + str(topic) + ":")
    for word in range(0, ldaModel.vocabSize()):
        print(" " + str(topics[word][topic]))

Learned topics (as distributions over vocab of 11 words):
Topic 0:
 5.62631004755
 18.8616474703
 4.28187623872
 5.01676737764
 5.28503421228
 4.01115444738
 9.33112263062
 1.39866720377
 4.54494186264
 17.1316831112
 11.2615159387
Topic 1:
 12.0176371072
 6.27438369428
 4.37603086404
 17.7011979581
 14.9105894751
 13.2031216267
 7.44542907552
 5.47148322893
 0.769780523621
 3.13133106182
 1.78253720154
Topic 2:
 8.35605284529
 3.86396883542
 3.34209289724
 17.2820346642
 4.80437631266
 4.78572392591
 14.2234482939
 3.1298495673
 2.68527761374
 3.73698582701
 19.9559468597


In [5]:
# Sample docs
documents = sp.io.mmread("sample_docs_sparse.obj")
N, V = documents.shape
print N, V

50 6812


## STM Class Definitions

In [32]:
import pandas.rpy.common as rcom
iris = rcom.load_data('iris')
iris.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


In [25]:
class STM(object):
    
    def __init__(self, sc):
        self.sc = sc
        self.status = 0
        self.theta = None
        self.n_partitions = None # Get this from Spark context
        self.lhood_bound = None
        self.seed = None
        self.description = None
    
    def __e_step__(self):
        self.stopits = True
        
    def __m_step__(self):
        self.stopits = True
        
    def __optimize_stm__(self, optimizer):
        # Run the EM algorithm
        if optimizer == 'em':
            print "Beginning EM"
            while self.stopits == False:
                self.__e_step__()
                self.__m_step__()
    
    def __theta_posterior_draw__(self):
        # TODO
        return None
    
    def __compute_lhood_bound__(self):
        # TODO
        return None
    
    def __init_random__(self):
        self.mu = np.zeros((self.K-1, 1))
        self.sigma = 20 * np.identity(self.K-1)
        self.beta = np.random.gamma(shape=0.1, size=(self.A, self.K, self.V))
        self.Lambda = np.zeros((self.N, self.K-1))
        
    def __jeffreys_prior__(self):
        return None
        
    def __kappa_init__(self):
        self.kappa = {}
        
        # Baseline log probabilities
        m = np.array(self.wcounts)[0].astype(float)/ 
                                 np.sum(np.array(self.wcounts)[0])
        m = np.log(m) - np.log(m)
        self.kappa['m'] = m; del m;
        
        # Parameter objects
        self.aspectmod = self.A > 1
        
        # Covariates
        kappa['covar'] = None
        
        
    def print_topics(self):
        # TODO
        return None
    
    def estimate_effect(self):
        # TODO
        self.N = 30
        return None
    
    def train(self, documents, vocab=None, K=10, prevalence=None, content=None, data=None,
              max_em_its=500, init_type="random", optimizer="em", 
              em_tol=1e-5, verbose=True, report_every=5, LDAbeta=True,
              interactions=True, ngroups=1, model=None):
        """Train a STM model.
        """
        # Basic setup
        self.N, self.V = documents.shape
        self.K = K
        self.verbose = verbose
        self.report_every = report_every
        self.LDAbeta = LDAbeta
        self.wcounts = documents.sum(0)
        self.ntokens = np.sum(self.wcounts)
        self.stopits = False
        self.A = len(content)
        self.P = len(prevalence)
        self.X = data[prevalence]
        self.Y = data[content]
        
        # Initialization
        print "Beginning Initialization"
        if init_type == 'random':
            self.__init_random__()
        self.__kappa_init__()
        
        # Run EM
        self.__optimize_stm__(optimizer)
        
        # Declare completion
        print "All done!"

In [33]:
stm = STM(sc)
stm.train(documents, prevalence=['Species'], content=['Species'], data=iris)

Beginning Initialization
Beginning EM
All done!


In [43]:
stm.wcounts.astype(float) / sum(stm.wcounts.astype(float))

matrix([[ nan,  nan,  nan, ...,  nan,  nan,   1.]])

In [66]:
a = np.log(np.array(stm.wcounts)[0].astype(float)/ 
                                 np.sum(np.array(stm.wcounts)[0]))

In [68]:
np.mean(a)

-inf