'''
This program runs a dynamic topic model (DTM) with gensim.

Date: Mar 3, 2019
Author: Carly Knight
Reference: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/ldaseqmodel.ipynb
'''

### Import packages

# In [1]:
import pandas as pd
import os
import re
from gensim.test.utils import common_corpus, common_dictionary
from gensim.models.wrappers import DtmModel
from gensim.corpora import Dictionary, bleicorpus
import numpy
from gensim.matutils import hellinger
from gensim import corpora

# In [None]:
# Placeholder location of the DTM executable; replace with a real path before use.
# NOTE(review): presumably intended for the DtmModel wrapper imported above, but
# nothing in the visible code reads this variable -- confirm whether it is still needed.
path_to_dtm_binary = "/path/to/dtm/binary"

### Import data

# In [105]:
# Directory holding the metadata CSV and the per-report text files.
file_loc = "/Users/carlyknight/Documents/Data/Annual Report/report_paragraphs/future_texts/"

# Load the report metadata and keep only reports dated 1930 or later.
# .copy() makes the filtered frame independent of the one read from disk, so the
# column assignment below does not raise pandas' SettingWithCopyWarning.
metadata = pd.read_csv(file_loc + "metadata_futureperfect.csv")
metadata = metadata[metadata['Year'] >= 1930].copy()

# Derive each report's text filename from its XML filename.
# regex=False: ".xml" must be matched literally. Interpreted as a regular
# expression, the "." would match any character, and which interpretation
# pandas picks by default depends on the installed version.
metadata['text_filename'] = metadata['Filename'].str.replace(".xml", ".txt", regex=False)

# Names of the text files that belong to the selected reports.
textfiles = metadata['text_filename'].tolist()

# In [106]:
# Read every report's text file and store the contents in a new 'text' column.
# The texts are collected first and assigned in one step, rather than written
# cell by cell through .loc, so the column exists even when `metadata` is empty.
texts = []
for xml_name in metadata['Filename']:
    txtfile = xml_name.replace(".xml", ".txt")
    # The with-block closes the file on exit; no explicit close() is needed.
    with open(file_loc + txtfile, 'r') as f:
        texts.append(f.read())
metadata['text'] = texts

# In [101]:
def iter_documents(top_directory):
    """Iterate over the selected documents under ``top_directory``, one at a time.

    Walks the directory tree and, for every ``.txt`` file whose name appears in
    the module-level ``textfiles`` list, yields the tokens of that file as
    produced by ``gensim.utils.tokenize(..., lower=True)``.

    Args:
        top_directory: Root directory to search recursively.

    Yields:
        One token iterator per matching file.
    """
    # Imported locally: the file-level imports only pull names out of gensim
    # submodules, which does not bind the bare name `gensim`.
    from gensim.utils import tokenize

    # A set gives constant-time membership checks for every file visited.
    wanted = set(textfiles)
    for root, dirs, files in os.walk(top_directory):
        for file in files:
            if file.endswith('.txt') and file in wanted:
                # Read the whole document as one string and close the handle
                # right away instead of leaving it to the garbage collector.
                with open(os.path.join(root, file)) as handle:
                    document = handle.read()
                yield tokenize(document, lower=True)


# In [None]:

class MyCorpus(object):
    """Streaming bag-of-words corpus over the documents found under a directory.

    Creating an instance makes one full pass over the documents to build the
    dictionary; every iteration over the instance makes another pass and
    converts each document to a bag-of-words vector on the fly.
    """

    def __init__(self, top_dir):
        """Build the dictionary from the documents under ``top_dir``.

        Args:
            top_dir: Root directory handed to ``iter_documents``.
        """
        self.top_dir = top_dir
        # `corpora` is imported at file level; the bare name `gensim` is not,
        # so `gensim.corpora.Dictionary` would raise NameError here.
        self.dictionary = corpora.Dictionary(iter_documents(top_dir))
        # Prune the vocabulary: keep tokens seen in at least one document and
        # at most 30000 tokens overall (no_above is left at gensim's default).
        self.dictionary.filter_extremes(no_below=1, keep_n=30000)

    def __iter__(self):
        """Yield one bag-of-words vector per document."""
        for tokens in iter_documents(self.top_dir):
            yield self.dictionary.doc2bow(tokens)

# Instantiate the streaming corpus; the constructor builds its dictionary.
corpus = MyCorpus(top_dir='/path/to/files')

# Walk the corpus, printing the bag-of-words vector of each document.
for vector in corpus:
    print(vector)

### Clean + create corpus

# In [113]:
# Stop words dropped from every document (articles plus short OCR-like fragments).
stopwords = ["a", "the", "thc", "co", "cco", "r", "x", "y", "t", "w", "k", "d"]
# Set copy used for constant-time lookups while filtering tokens.
stopword_set = set(stopwords)

# Lowercase
metadata['text'] = metadata['text'].astype(str).str.lower()

# The three steps below use regular-expression patterns, so regex=True is passed
# explicitly: pandas >= 2.0 treats the pattern as a literal string by default,
# which would turn each step into a silent no-op.

# Remove punctuation
metadata['text'] = metadata['text'].str.replace(r'[^\w\s]+', '', regex=True)

# Remove numbers
metadata['text'] = metadata['text'].str.replace(r'[\d]+', '', regex=True)

# Collapse runs of whitespace into a single space
metadata['text'] = metadata['text'].str.replace(r'\s+', ' ', regex=True)

# One cleaned string per document
documents = metadata['text'].tolist()

# Tokenize on whitespace and drop stop words. The text was lowercased above,
# so the tokens need no further case folding.
processed_corpus = [
    [word for word in document.split() if word not in stopword_set]
    for document in documents
]

# Map every distinct token to an integer id
dictionary = corpora.Dictionary(processed_corpus)

# Bag of words: one list of (token_id, count) pairs per document
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]

### DTM

# In [108]:
# Time slices: the number of documents (non-null Filename entries) in each year.
# groupby sorts its keys, so the counts come out in ascending Year order.
# NOTE(review): a time-sliced model presumably needs the corpus documents in the
# same year order as these counts -- confirm `metadata` is sorted by Year.
year_and_file = metadata[['Year', "Filename"]]
counts = year_and_file.groupby(['Year']).agg(['count'])
timeslice = counts.iloc[:, 0].tolist()

# In [None]:
# DTM: fit a 10-topic sequential LDA model over the bag-of-words corpus, with
# `timeslice` giving the number of documents in each consecutive time slice.
# `ldaseqmodel` is imported here because the file-level imports never bind it.
from gensim.models import ldaseqmodel

ldaseq = ldaseqmodel.LdaSeqModel(corpus=bow_corpus, id2word=dictionary, time_slice=timeslice, num_topics=10)