''' This program runs a dynamic topic model

#Date: Mar 4, 2019

#Author: Carly Knight

#https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/dtm_example.ipynb
#https://stackoverflow.com/questions/50413059/dynamic-topic-modeling-with-gensim-which-code'''


## set up

# In [10]:
import logging
import os
from gensim import corpora, utils
from gensim.models.wrappers.dtmmodel import DtmModel
import numpy as np
import pandas as pd


### dtm

# In [11]:
# Path to the DTM executable. Set the DTM_PATH environment variable to point at
# your own binary; otherwise the default location below is used.
dtm_path = os.environ.get("DTM_PATH", "/usr/local/bin/dtm-darwin64")

# Open corpus

# In [12]:
# Directory holding the metadata CSV and the per-report .txt files
file_loc = "/Users/carlyknight/Documents/Data/Annual Report/report_paragraphs/future_texts/"

# Keep reports from 1930 onward. .copy() makes the filtered frame independent of
# the one read from disk, so the column assignments that follow do not trigger
# pandas' chained-assignment warning.
metadata = pd.read_csv(file_loc + "metadata_futureperfect.csv")
metadata = metadata[metadata['Year'] >= 1930].copy()

# regex=False: ".xml" is a literal extension. Treated as a regex, the "." would
# match any character, and the default for `regex` differs between pandas versions.
metadata['text_filename'] = metadata['Filename'].str.replace(".xml", ".txt", regex=False)

# Names of the text files that belong to the selected reports
textfiles = metadata['text_filename'].tolist()

# In [13]:
# Read every report's text file and store the contents in a new 'text' column.
# The texts are collected in row order and assigned in one step, which avoids
# growing the frame cell by cell and creates the column even when no rows remain.
# NOTE(review): no encoding is passed to open(), so the platform default is used;
# confirm the encoding of the .txt files.
texts = []
for xml_name in metadata['Filename']:
    txtfile = xml_name.replace(".xml", ".txt")
    # the with-block closes the file; no explicit close() is needed
    with open(file_loc + txtfile, 'r') as f:
        texts.append(f.read())
metadata['text'] = texts

# In [14]:
# The patterns below are regular expressions, so regex=True is passed explicitly:
# pandas >= 2.0 defaults to literal matching, which would leave the text unchanged.

# remove punctuation (anything that is neither a word character nor whitespace)
metadata['text'] = metadata['text'].str.replace(r'[^\w\s]+', '', regex=True)

# remove numbers
metadata['text'] = metadata['text'].str.replace(r'[\d]+', '', regex=True)

# collapse runs of whitespace into a single space
metadata['text'] = metadata['text'].str.replace(r'\s+', ' ', regex=True)

# one cleaned string per report, in metadata row order
documents = metadata['text'].tolist()



## Preprocess

# In [15]:
from gensim.parsing.preprocessing import preprocess_string

# DTM assigns documents to time slices by position: the first timeslice[0]
# documents form slice 0, the next timeslice[1] form slice 1, and so on. The
# per-year counts computed below are in ascending Year order, so the corpus has
# to be in that order as well. A stable sort keeps the original row order within
# each year and changes nothing when the metadata is already chronological.
# year_order[k] is the position in `documents` (and row position in `metadata`)
# of the k-th corpus document.
year_order = np.argsort(metadata['Year'].to_numpy(), kind='stable')

# processed corpus: one token list per report, in chronological order
processed_corpus = [preprocess_string(documents[i]) for i in year_order]

# dictionary mapping each token to an integer id
dictionary = corpora.Dictionary(processed_corpus)

# Bag of Words
bow_corpus = [dictionary.doc2bow(tokens) for tokens in processed_corpus]

# Time slice: number of reports per year, ascending by Year
counts = metadata[['Year', "Filename"]].groupby(['Year']).agg(['count'])
timeslice = [i[0] for i in counts.values.tolist()]

# DTM

# In [17]:
# Fit a 50-topic dynamic topic model on the bag-of-words corpus, using the DTM
# executable at dtm_path and the per-year document counts as time slices.
model = DtmModel(
    dtm_path,
    corpus=bow_corpus,
    time_slices=timeslice,
    num_topics=50,
    id2word=dictionary,
    initialize_lda=True,
)

# In [18]:
# Ten highest-weighted terms of topic 7 at time=10; the bare name on the last
# line displays the result when run as a notebook cell.
topics = model.show_topic(7, 10, topn=10)
topics

# Out[18]:
# [(0.05950451712676686, 'steel'),
#  (0.032056908359930684, 'equip'),
#  (0.030933853257497413, 'product'),
#  (0.029370913916933604, 'car'),
#  (0.02885436926693988, 'war'),
#  (0.021485055777403197, 'aircraft'),
#  (0.017369381565946744, 'contract'),
#  (0.010410676516983559, 'truck'),
#  (0.009852134250738635, 'ship'),
#  (0.008990229576077754, 'work')]

# In [19]:
#https://towardsdatascience.com/topic-modeling-with-gensim-a5609cefccc

# save

# In [21]:
import pickle

# In [22]:
# Persist the fitted model with pickle. The with-block closes the file even if
# pickle.dump raises part-way through. Only unpickle this file from a trusted
# location: loading a pickle executes code embedded in it.
with open('/Users/carlyknight/Documents/Data/FuturePerfect/dynamic_topic_models/topic50.pkl', 'wb') as f:
    pickle.dump(model, f)