''' This program runs a dynamic topic model on the future-only paragraphs

#Date: July 7, 2020

#Author: Carly Knight

#https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/dtm_example.ipynb
#https://stackoverflow.com/questions/50413059/dynamic-topic-modeling-with-gensim-which-code'''


## set up

In [1]:
import logging
import os
from gensim import corpora, utils
from gensim.models.wrappers.dtmmodel import DtmModel
import numpy as np
import pandas as pd
from gensim.corpora import Dictionary
from collections import defaultdict

### dtm

In [2]:
# Path to the DTM executable that is handed to gensim's DtmModel wrapper further down.
# Change this variable to the location of your own DTM executable before running.
dtm_path = "/usr/local/bin/dtm-darwin64"

# Open corpus

In [3]:
# Directory holding the metadata CSV and the per-paragraph .txt files.
# File names are appended with "+", so the trailing slash is required.
file_loc = "/Users/carlyknight/Documents/Data/Annual Report/report_paragraphs/future_texts_paragraphs_futureonly/"

In [4]:
#read data: one row per paragraph; later cells use the Year, Filename and
#Paragraph columns of this table
metadata = pd.read_csv(file_loc + "metadata_futureperfect_paragraphs_futureonly.csv") 

In [5]:
#limit to 1930 onwards; .copy() makes an independent frame so the column
#assignments below do not write into a slice of the original table
#(avoids pandas' SettingWithCopyWarning)
metadata = metadata[(metadata['Year'] >= 1930)].copy()

#order rows chronologically: the documents are later read in metadata row
#order while the time slices are per-Year counts in ascending Year order, so
#the two only line up if rows are grouped by ascending Year. The stable sort
#keeps the original order within each year and is a no-op if the CSV is
#already sorted.
metadata = metadata.sort_values('Year', kind='mergesort')

#create vars for filenames
#regex=False: ".xml" is a literal extension; as a regex the "." would match
#any character (the default on older pandas versions)
metadata['document_filename'] = metadata['Filename'].str.replace(".xml", ".txt", regex=False)
metadata['paragraph_filetime'] = (
    metadata['Filename'].str.replace(".xml", "", regex=False)
    + "_P" + metadata['Paragraph'].astype(str) + ".txt"
)

#(rows, columns) after filtering. NOTE(review): an earlier note here read
#"941727, 19", which does not match the output recorded below this cell.
metadata.shape 

(149145, 18)

In [6]:
#paragraph file names to read, in metadata row order
textfiles = metadata['paragraph_filetime'].tolist()

In [7]:
#sanity check: number of paragraph files to read (one per metadata row)
len(textfiles)

149145

In [8]:
# #open files (this step takes forever)
# for index, row in metadata.iterrows():
#     txtfile= row['paragraph_filetime']
#     #read in text file and turn into new variable
#     with open(file_loc + txtfile, 'r') as f:
#         metadata.loc[index,'text']= f.read()
#         f.close()

In [9]:
#empty list that will hold the raw text of each paragraph file
documentslist = []

#read every paragraph file, in the same order as textfiles, so that position i
#in documentslist corresponds to position i in textfiles
#NOTE(review): files are opened with the platform default encoding -- confirm
#this matches how the .txt files were written
for txtfile in textfiles:
    #the with-block closes the file on exit, so no explicit close() is needed
    with open(file_loc + txtfile, 'r') as f:
        documentslist.append(f.read())

In [10]:
#remove punctuation
#metadata['text'] = metadata['text'].str.replace(r'[^\w\s]+', '')

#remove numbers
#metadata['text'] = metadata['text'].str.replace(r'[\d]+', '')

#remove extraspaces
#metadata['text'] = metadata['text'].str.replace(r'\s+', ' ')

#turn to text
#documents = metadata['text'].tolist()

#turn dict to list
#documentslist = list(documentsdict.values())

## Preprocess

In [11]:
from gensim.parsing.preprocessing import preprocess_string

#processed corpus: gensim's default preprocess_string filters (per the
#original note: removes punctuation, whitespace, stopwords and numbers, and
#stems the remaining tokens); one token list per document
processed_corpus = [preprocess_string(document) for document in documentslist]

#corpus-wide count of every token
frequency = defaultdict(int)

for text in processed_corpus:
    for token in text:
        frequency[token] += 1

#remove low frequency terms: keep only tokens that occur MORE THAN 10 times in
#the whole corpus (the earlier comment said 20, but the threshold applied has
#always been 10). Author's recorded effect: 953,010 -> 35,244 tokens.
processed_corpus = [
    [token for token in text if frequency[token] > 10]
    for text in processed_corpus
]

#dictionary: token <-> integer id mapping built from the filtered corpus
dictionary = Dictionary(processed_corpus)

#filter extremes: drop tokens that appear in fewer than 5 documents. The other
#two arguments are gensim's defaults, written out because they also apply:
#tokens in more than 50% of documents are dropped and at most the 100,000
#most frequent tokens are kept.
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)
len(dictionary) #43,025 (author's recorded size)

#Bag of Words: each document as a list of (token_id, count) pairs; tokens
#removed from the dictionary above are skipped by doc2bow
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]

In [12]:
#Time slice
#number of paragraphs per Year; groupby returns the years in ascending order
counts = metadata[['Year', "paragraph_filetime"]].groupby(['Year']).agg(['count'])
#counts has a single column (the paragraph count); flatten it to a list of ints
#NOTE(review): these counts are consumed in order, so bow_corpus is assumed to
#be ordered by ascending Year -- confirm
timeslice = counts.iloc[:, 0].tolist()

In [13]:
#save
import pickle

#persist the bag-of-words corpus; the with-block closes the file so the data
#is flushed to disk even if the notebook keeps running
with open('/Users/carlyknight/Documents/Data/FuturePerfect/dynamic_topic_models/bow_corpus.pkl', 'wb') as f:
    pickle.dump(bow_corpus, f, protocol=4)

In [56]:
#persist the per-year document counts; the with-block closes the file
with open('/Users/carlyknight/Documents/Data/FuturePerfect/dynamic_topic_models/timeslice.pkl', 'wb') as f:
    pickle.dump(timeslice, f, protocol=4)

In [57]:
#persist the gensim dictionary; the with-block closes the file
with open('/Users/carlyknight/Documents/Data/FuturePerfect/dynamic_topic_models/dictionary.pkl', 'wb') as f:
    pickle.dump(dictionary, f, protocol=4)

# DTM

In [13]:
#fit a 70-topic dynamic topic model by running the external DTM executable on
#the bag-of-words corpus, split into the per-year time slices
model = DtmModel(
    dtm_path,
    corpus=bow_corpus,
    time_slices=timeslice,
    num_topics=70,
    id2word=dictionary,
    initialize_lda=True,
)

In [14]:
#inspect one topic: the 10 highest-weighted terms for topic id 9 at time 10;
#the recorded output below is a list of (weight, stemmed term) pairs
#NOTE(review): time=10 is presumably an index into timeslice, not a year -- confirm
topics = model.show_topic(topicid=9, time=10, topn=10)
topics

[(0.08473327479531916, 'increas'),
 (0.03555758414081014, 'oper'),
 (0.03515276830673887, 'cost'),
 (0.029695280703259205, 'revenu'),
 (0.029182418608139023, 'profit'),
 (0.027083386787228863, 'result'),
 (0.026758278614501085, 'reduc'),
 (0.02553544239174281, 'sale'),
 (0.023134761609755368, 'improv'),
 (0.022756247844058226, 'margin')]

In [19]:
#https://towardsdatascience.com/topic-modeling-with-gensim-a5609cefccc

# save

In [16]:
import pickle


#persist the fitted model; the with-block closes the file even if
#pickle.dump raises part-way through
with open('/Users/carlyknight/Documents/Data/FuturePerfect/dynamic_topic_models/topic_para_futonly_70.pkl', 'wb') as f:
    pickle.dump(model, f, protocol=4)