''' This program runs a dynamic topic model

#Date: Mar 4, 2019

#Author: Carly Knight

#https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/dtm_example.ipynb
#https://stackoverflow.com/questions/50413059/dynamic-topic-modeling-with-gensim-which-code'''


## set up

# In [1]:
import logging
import os
from gensim import corpora, utils
from gensim.models.wrappers.dtmmodel import DtmModel
import numpy as np
import pandas as pd
from gensim.corpora import Dictionary
from collections import defaultdict



### dtm

# In [2]:
# Path to the DTM executable handed to DtmModel.
# Set the DTM_PATH environment variable to point at your own binary; when it is
# unset, the original hard-coded location is used.
dtm_path = os.environ.get("DTM_PATH", "/usr/local/bin/dtm-darwin64")

# Open corpus

# In [3]:
file_loc = "/Users/carlyknight/Documents/Data/Annual Report/report_paragraphs/future_texts/"

# Load the report metadata and keep only reports from 1930 onwards.
# .copy() yields an independent frame, so the column assignments that follow do
# not write into a slice of the unfiltered frame (pandas SettingWithCopyWarning).
metadata = pd.read_csv(file_loc + "metadata_futureperfect.csv")
metadata = metadata[metadata['Year'] >= 1930].copy()

# The time slices built later are per-year document counts in ascending year
# order, so the documents must be in that same order. A stable sort keeps the
# CSV order within each year and changes nothing if the rows are already sorted.
metadata = metadata.sort_values('Year', kind='mergesort')

# Name of the plain-text file that accompanies each XML report.
# regex=False: ".xml" is a literal substring, not a pattern in which "." would
# match any character.
metadata['text_filename'] = metadata['Filename'].str.replace(".xml", ".txt", regex=False)

#pick relevant files
textfiles = metadata['text_filename'].tolist()

# In [4]:
#open files
# Read each report's text file and store the contents in a new 'text' column.
# Texts are collected in row order and assigned in one go instead of writing
# the frame cell by cell.
texts = []
for filename in metadata['Filename']:
    txtfile = filename.replace(".xml", ".txt")
    # the with-block closes the file on exit, so no explicit close() is needed
    with open(file_loc + txtfile, 'r') as f:
        texts.append(f.read())
metadata['text'] = texts

# In [5]:
#metadata.shape (36479,17)

# In [6]:
#remove punctuation
#metadata['text'] = metadata['text'].str.replace(r'[^\w\s]+', '')

#remove numbers
#metadata['text'] = metadata['text'].str.replace(r'[\d]+', '')

#remove extraspaces
#metadata['text'] = metadata['text'].str.replace(r'\s+', ' ')

# Raw document strings as a plain Python list, in metadata row order.
documents = list(metadata['text'])

## Preprocess

# In [7]:
from gensim.parsing.preprocessing import preprocess_string

# Tokenise every document with gensim's preprocess_string
# (removes punctuation, whitespace, stopwords and numbers, and stems the tokens).
processed_corpus = list(map(preprocess_string, documents))

# Count how often each token occurs across the whole corpus.
frequency = defaultdict(int)
for tok in (tok for tokens in processed_corpus for tok in tokens):
    frequency[tok] += 1

# Drop low-frequency terms: only tokens occurring more than 10 times are kept
# (brings it down to 35,244 from 953,010 tokens).
# NOTE(review): the earlier comment here gave the cutoff as 20, but the code
# uses 10 -- confirm which threshold is intended.
processed_corpus = [
    [tok for tok in tokens if frequency[tok] > 10]
    for tokens in processed_corpus
]

# Build the token <-> integer id mapping from the filtered corpus.
dictionary = Dictionary(processed_corpus)

# Prune the vocabulary: tokens that appear in fewer than 5 documents are removed.
# The other filter_extremes limits are not overridden, so gensim's defaults apply.
dictionary.filter_extremes(no_below=5)
len(dictionary)  # 43,025 on the original run

# Bag-of-words representation of each document.
bow_corpus = list(map(dictionary.doc2bow, processed_corpus))

# Time slices: number of documents per year, one entry per distinct Year.
# NOTE(review): this only lines up with bow_corpus if the metadata rows are
# ordered by Year -- confirm.
counts = metadata[['Year', "Filename"]].groupby(['Year']).agg(['count'])
timeslice = counts.iloc[:, 0].tolist()

# DTM

# In [8]:
# Fit a 30-topic dynamic topic model over the yearly time slices using the
# external DTM executable at dtm_path.
model = DtmModel(
    dtm_path,
    corpus=bow_corpus,
    time_slices=timeslice,
    num_topics=30,
    id2word=dictionary,
    initialize_lda=True,
)

# In [9]:
# Top 30 terms of topic 29 at time slice 10; the pasted output that follows
# lists them as (weight, term) pairs.
topics = model.show_topic(29, 10, topn=30)
topics

[(0.032291592697312604, 'energi'),
 (0.021481406511552857, 'power'),
 (0.013668430146890412, 'ga'),
 (0.01188373686774733, 'risk'),
 (0.011835282308570914, 'entergi'),
 (0.011781632127476377, 'electr'),
 (0.009634545540664088, 'natur'),
 (0.0070267761484145, 'util'),
 (0.006514366789415616, 'regul'),
 (0.006351815263064744, 'regulatori'),
 (0.005784179698029628, 'trade'),
 (0.005754278958368043, 'nuclear'),
 (0.00565475847365005, 'ferc'),
 (0.005328069091969176, 'commod'),
 (0.0051527677759150734, 'duke'),
 (0.005022061235678064, 'pseg'),
 (0.004841629418802497, 'liabil'),
 (0.004673568173901362, 'fuel'),
 (0.004085098330786342, 'debt'),
 (0.004014058490131408, 'competit'),
 (0.003942967431451223, 'file'),
 (0.003828331607758928, 'oblig'),
 (0.0037950797445392157, 'impact'),
 (0.003669180622016843, 'decommiss'),
 (0.0036675145294982297, 'deriv'),
 (0.0034807730211410494, 'hold'),
 (0.0034742208122424682, 'subject'),
 (0.00342423061869348, 'approv'),
 (0.0033728961150998513, 'transact')]

# In [19]:
#https://towardsdatascience.com/topic-modeling-with-gensim-a5609cefccc

# save

# In [10]:
import pickle

# In [11]:
# Persist the fitted model. The with-block guarantees the file is closed even
# if pickling raises part-way through.
with open('/Users/carlyknight/Documents/Data/FuturePerfect/dynamic_topic_models/topic30.pkl', 'wb') as f:
    pickle.dump(model, f, protocol=4)