## Testing Suite

In [29]:
# Set up the environment
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import LDA, LDAModel
import scipy as sp
import numpy as np
import pandas as pd
from scipy import io

# Input the data
def _parse_count_vector(line):
    """Turn one space-separated line of numbers into a dense MLlib vector."""
    return Vectors.dense([float(token) for token in line.strip().split(' ')])


data = sc.textFile("sample_data.txt")
parsedData = data.map(_parse_count_vector)
# zipWithIndex yields (vector, index); LDA.train is fed [index, vector] pairs,
# so swap the two and cache the result.
corpus = parsedData.zipWithIndex().map(lambda indexed: [indexed[1], indexed[0]]).cache()

# Cluster the documents into three topics using LDA
ldaModel = LDA.train(corpus, k=3)

In [4]:
# Output topics. Each is a distribution over words (matching word count vectors)
vocab_size = ldaModel.vocabSize()  # read once instead of on every loop pass
print("Learned topics (as distributions over vocab of " + str(vocab_size) + " words):")
topics = ldaModel.topicsMatrix()
# `topics` is indexed as topics[word][topic], so its second axis is the topic
# axis; take the topic count from it rather than repeating the `k` used at
# training time.
for topic in range(topics.shape[1]):
    print("Topic " + str(topic) + ":")
    for word in range(vocab_size):
        print(" " + str(topics[word][topic]))

Learned topics (as distributions over vocab of 11 words):
Topic 0:
 5.62631004755
 18.8616474703
 4.28187623872
 5.01676737764
 5.28503421228
 4.01115444738
 9.33112263062
 1.39866720377
 4.54494186264
 17.1316831112
 11.2615159387
Topic 1:
 12.0176371072
 6.27438369428
 4.37603086404
 17.7011979581
 14.9105894751
 13.2031216267
 7.44542907552
 5.47148322893
 0.769780523621
 3.13133106182
 1.78253720154
Topic 2:
 8.35605284529
 3.86396883542
 3.34209289724
 17.2820346642
 4.80437631266
 4.78572392591
 14.2234482939
 3.1298495673
 2.68527761374
 3.73698582701
 19.9559468597


In [5]:
# Sample docs
# Read the matrix stored in Matrix Market format and report its dimensions
# as "<rows> <columns>".
documents = sp.io.mmread("sample_docs_sparse.obj")
N, V = documents.shape
print(str(N) + " " + str(V))

50 6812


## STM Class Definitions

In [32]:
# Load the `iris` data set through pandas' rpy bridge. The frame is passed
# further down as the `data` argument of STM.train, with its 'Species' column
# named as both the prevalence and the content covariate.
# NOTE(review): `pandas.rpy` is presumably unavailable in recent pandas
# releases - confirm the pandas version this notebook is pinned to.
import pandas.rpy.common as rcom
iris = rcom.load_data('iris')
# Last expression of the cell: displays the first rows of the frame.
iris.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


In [226]:
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

class STM(object):
    """Skeleton of a Structural Topic Model (STM) trainer.

    Only the parameter initialization is implemented. The E-step allocates
    its accumulators without updating anything, and the M-step just raises
    the stop flag, so ``train`` currently performs a single pass of the EM
    loop. All state lives on the instance; no method reads notebook-level
    globals.
    """

    def __init__(self, sc):
        """Remember the Spark context and reset the result placeholders.

        Args:
            sc: Spark context. It is stored on the instance; none of the
                methods below use it yet.
        """
        self.sc = sc
        self.status = 0
        self.theta = None
        self.n_partitions = None  # Get this from Spark context
        self.lhood_bound = None
        self.seed = None
        self.description = None

    def __e_step__(self, documents):
        """Allocate the E-step accumulators for ``documents`` (placeholder).

        Nothing is returned or stored yet. The per-document arrays are sized
        from the documents actually passed in, so the step also works on a
        subset of the corpus.

        Args:
            documents: Matrix-like object with a ``shape`` whose first entry
                is the number of documents.

        Raises:
            numpy.linalg.LinAlgError: If ``self.sigma`` cannot be inverted.
        """
        n_docs = documents.shape[0]
        # The locals below are scaffolding for the update that is still to be
        # written; none of them is consumed yet.
        mu_i = self.mu
        sigma_ss = np.zeros((self.K - 1, self.K - 1))
        beta_ss = np.zeros((self.A, self.K, self.V))
        bound = np.zeros(n_docs)
        Lambda = np.zeros(n_docs)
        siginv = np.linalg.inv(self.sigma)

    def __m_step__(self):
        """Placeholder M-step: only raises the stop flag that ends the EM loop."""
        self.stopits = True

    def __optimize_stm__(self, optimizer):
        """Run the optimizer; only ``'em'`` is handled, anything else is a no-op.

        Args:
            optimizer: Name of the optimizer to run.
        """
        if optimizer == 'em':
            # Split documents into groups: group j holds the document indices
            # congruent to j modulo ngroups.
            self.groups = [list(range(j, self.N, self.ngroups))
                           for j in range(self.ngroups)]

            # Run EM until a step requests a stop or the iteration budget is
            # used up.
            # TODO: run the E-step group by group instead of on all documents.
            print("Beginning EM")
            iteration = 0
            while not self.stopits and iteration < self.max_em_its:
                self.__e_step__(self.documents)
                self.__m_step__()
                iteration += 1

    def __theta_posterior_draw__(self):
        """Not implemented yet; always returns None."""
        # TODO
        return None

    def __compute_lhood_bound__(self):
        """Not implemented yet; always returns None."""
        # TODO
        return None

    def __init_random__(self):
        """Randomly initialize ``mu``, ``sigma``, ``beta`` and ``Lambda``.

        ``beta`` is drawn from the global NumPy random state; the other three
        are deterministic (zeros, or a scaled identity).
        """
        self.mu = np.zeros((self.K - 1, 1))
        self.sigma = 20 * np.identity(self.K - 1)
        self.beta = np.random.gamma(shape=0.1, size=(self.A, self.K, self.V))
        self.Lambda = np.zeros((self.N, self.K - 1))

    def __kappa_init__(self):
        """Fill in the baseline and bookkeeping entries of ``self.kappa``."""
        # Baseline log probabilities: the log of each word's corpus frequency,
        # centred on the log of the mean frequency. Words that never occur
        # have frequency 0 and therefore a baseline of -inf.
        counts = np.asarray(self.wcounts, dtype=float).ravel()
        freq = counts / counts.sum()
        self.kappa['m'] = np.log(freq) - np.log(freq.mean())

        # Parameter objects
        self.aspectmod = self.A > 1

        # Covariates
        self.kappa['covar'] = None

    def print_topics(self):
        """Print the document groups built by the EM optimizer.

        Requires ``train`` to have run with ``optimizer='em'``; otherwise
        ``self.groups`` does not exist.
        """
        print(self.groups)

    def estimate_effect(self):
        """Not implemented yet; returns None and leaves the model untouched."""
        # TODO
        return None

    def train(self, documents, vocab=None, K=10, prevalence=None, content=None, data=None,
              max_em_its=500, init_type="random", optimizer="em",
              em_tol=1e-5, verbose=True, report_every=5, LDAbeta=True,
              interactions=True, ngroups=10, model=None):
        """Train a STM model.

        Args:
            documents: Document-term matrix with ``shape == (N, V)``. SciPy
                sparse matrices are converted to LIL format; anything else is
                stored as given.
            vocab: Currently unused.
            K: Number of topics.
            prevalence: Column labels of ``data`` used as prevalence
                covariates, or None for none.
            content: Column labels of ``data`` used as content covariates, or
                None for none.
            data: Object indexable by ``prevalence`` / ``content`` (the
                notebook passes a pandas DataFrame). Required when either of
                them is given.
            max_em_its: Upper bound on the number of EM iterations.
            init_type: Only ``'random'`` triggers an initialization.
            optimizer: Only ``'em'`` is handled.
            em_tol: Stored on the instance; not used yet.
            verbose: Stored on the instance; not used yet.
            report_every: Stored on the instance; not used yet.
            LDAbeta: Stored in ``self.kappa``.
            interactions: Stored in ``self.kappa``.
            ngroups: Number of document groups built for EM.
            model: Currently unused.

        Raises:
            ValueError: If covariates are named but ``data`` is None.
        """
        # Input parsing: keep a row-sliceable copy of sparse input and accept
        # every other matrix-like object unchanged, so self.documents is
        # always defined.
        if sp.sparse.issparse(documents):
            self.documents = documents.tolil()
        else:
            self.documents = documents

        # TODO: distribute the documents over Spark (e.g. as a
        # CoordinateMatrix built from (row, col, value) MatrixEntry triples).

        if data is None and (prevalence is not None or content is not None):
            raise ValueError("`data` is required when `prevalence` or `content` is given")

        # Basic setup
        self.N, self.V = documents.shape
        self.K = K
        self.verbose = verbose
        self.report_every = report_every
        self.ngroups = ngroups
        self.max_em_its = max_em_its
        self.em_tol = em_tol
        self.gamma = {
            'mode' : 'L1',
            'prior' : None,
            'enet' : 1
        }
        self.kappa = {
            'LDAbeta' : LDAbeta,
            'interactions' : interactions,
            'fixedintercept' : True,
            'mstep' : {
                'tol' : .001,
                'maxit' : 3
            },
            'contrasts' : False
        }
        self.tau = {
            'mode' : 'L1',
            'nits' : 50,
            'burnin' : 25,
            'alpha' : 50.0 / self.K,
            'eta' : .01,
            's' : .05,
            'p' : 3000,
            'd_groups_size' : 2000
        }
        self.wcounts = documents.sum(0)
        self.ntokens = np.sum(self.wcounts)
        self.stopits = False
        # Without a content covariate a single aspect is used, which keeps
        # beta's leading dimension at 1 and self.aspectmod False.
        self.A = len(content) if content is not None else 1
        self.P = len(prevalence) if prevalence is not None else 0
        self.X = data[prevalence] if prevalence is not None else None
        self.Y = data[content] if content is not None else None

        # Initialization
        print("Beginning Initialization")
        if init_type == 'random':
            self.__init_random__()
        self.__kappa_init__()

        # Run EM
        self.__optimize_stm__(optimizer)

        # Declare completion
        print("All done!")

In [227]:
# Fit the model on the sample documents, naming the iris 'Species' column as
# both the prevalence and the content covariate.
stm = STM(sc)
stm.train(
    documents,
    prevalence=['Species'],
    content=['Species'],
    data=iris,
    ngroups=10
)

Beginning Initialization
Beginning EM
All done!


In [228]:
# LIL-format copy of `documents`, obtained through its `tolil()` method; the
# next cell densifies it before handing it to Spark.
documents_lil = documents.tolil()

In [229]:
# Densify the LIL matrix, distribute it with Spark and map `hi` over it. The
# mapped RDD is never collected, so the cell only displays the RDD itself.
# NOTE(review): `hi` is not defined in any cell visible in this notebook -
# presumably it came from a deleted cell; define it here or drop this cell so
# the notebook survives Restart & Run All.
sc.parallelize(documents_lil.todense()).map(hi)

PythonRDD[627] at RDD at PythonRDD.scala:43

In [230]:
# One (row, column, value) tuple per stored entry of the sparse matrix.
list(zip(documents.row, documents.col, documents.data))

[(41, 16, 2),
 (15, 22, 1),
 (26, 23, 1),
 (43, 24, 1),
 (20, 26, 1),
 (45, 28, 1),
 (15, 32, 1),
 (9, 33, 1),
 (20, 33, 1),
 (21, 33, 1),
 (24, 33, 1),
 (42, 33, 1),
 (15, 36, 1),
 (21, 43, 1),
 (7, 47, 1),
 (33, 47, 1),
 (45, 47, 1),
 (3, 54, 1),
 (6, 54, 1),
 (9, 54, 1),
 (10, 54, 1),
 (28, 54, 1),
 (31, 54, 1),
 (15, 55, 1),
 (5, 73, 2),
 (10, 73, 1),
 (0, 76, 1),
 (9, 76, 1),
 (17, 83, 1),
 (32, 98, 2),
 (1, 106, 1),
 (18, 106, 1),
 (30, 106, 1),
 (5, 107, 1),
 (7, 107, 1),
 (13, 107, 1),
 (19, 107, 1),
 (22, 107, 2),
 (25, 107, 1),
 (29, 107, 1),
 (38, 107, 1),
 (40, 107, 1),
 (49, 107, 1),
 (17, 123, 1),
 (23, 123, 2),
 (37, 123, 1),
 (49, 123, 2),
 (26, 135, 1),
 (29, 135, 1),
 (33, 135, 1),
 (34, 135, 1),
 (39, 135, 1),
 (42, 135, 1),
 (47, 135, 1),
 (49, 135, 1),
 (5, 141, 1),
 (6, 141, 1),
 (33, 141, 1),
 (39, 141, 1),
 (48, 148, 1),
 (49, 149, 1),
 (21, 151, 1),
 (3, 153, 1),
 (5, 153, 1),
 (9, 153, 1),
 (11, 153, 1),
 (32, 153, 1),
 (33, 153, 1),
 (34, 153, 2),
 (38, 153, 