In [1]:
from tika import parser
import re
import timeit
import os
import pandas as pd

In [4]:
def generate_raw_data():
    
    """
    This function generates raw text data from FOMC transcripts
    
    returns a list where each element is the full text within each FOMC meeting
    
    It will take about 4-5 minutes
    """
    
    cwd = os.getcwd()
    base_directory = './FOMC_pdf'
    raw_doc = os.listdir(base_directory)
    filelist = sorted(raw_doc)
    
    onlyfiles = [f for f in os.listdir(base_directory) if os.path.isfile(os.path.join(base_directory, f))]
    date = [f[4:12] for f in onlyfiles]

    
    start = timeit.default_timer()
    
    document = []
    raw_text = pd.DataFrame(columns = ['Date','Speaker', 'content'])

    for i,file in enumerate(filelist):
        parsed = parser.from_file(os.path.join(cwd, 'FOMC_pdf',file))
        interjections = re.split('MR. |MS. |CHAIRMAN |VICE CHAIRMAN ', parsed['content'])[1:]
        temp_df = pd.DataFrame(columns = ['Date','Speaker','content'])
        interjection_new = []
        for interjection in interjections:
            
            temp_temp_df = pd.DataFrame(columns = ['Date','Speaker','content'], index = [0])
            interjection = interjection.replace('\n',' ')
            temp_temp_df['Speaker'] = interjection.split('.')[0]
            temp_temp_df['content'] = '.'.join(interjection.split('.')[1:])
            temp_df = pd.concat([temp_df, temp_temp_df], ignore_index = True)
            interjection_new.append(interjection)
        temp_df['Date'] = date[i]
        document.append(interjection_new)
        raw_text = pd.concat([raw_text, temp_df], ignore_index = True)
    end = timeit.default_timer()
    raw_text.index = raw_text['Date']
    raw_text.to_excel('raw_text.xlsx')
    print("Documents processed. Time: {}".format(end - start))
    
    return raw_text



In [5]:
def separate_FOMC12():
    
    separation_rule = pd.read_excel('Separation.xlsx')

2019-04-02 23:10:00,878 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/1.19/tika-server-1.19.jar to C:\Users\barry\AppData\Local\Temp\tika-server.jar.
2019-04-02 23:10:09,335 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/1.19/tika-server-1.19.jar.md5 to C:\Users\barry\AppData\Local\Temp\tika-server.jar.md5.
2019-04-02 23:10:09,553 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...
2019-04-02 23:10:14,594 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...


Documents processed. Time: 355.90648895270465


In [121]:
import pandas as pd
import topicmodels
import numpy as np
from nltk.stem import PorterStemmer
import nltk
from nltk.collocations import *

def preprocess():
    
    
    '''
    main function for preprocessing
        
    This function writes the tokenized documents, which includes columns of 
    
    Date: date of the meeting
    Section: FOMC1 or FOMC2
    Speaker: speaker of the interjection
    content: list of tokens in the interjection
    
    '''
    
    
    text = pd.read_excel('raw_text_separated.xlsx')
    text = text.fillna('0')
    
    text_separated = separation(text)
    
    text_separated_col = find_collocation(text_separated)
    text_separated_col['content'] = tokenize(text_separated_col['content'].values)
    text_separated_col.to_excel('FOMC_token_separated_new.xlsx')
    
def tokenize(content):
    '''
    Code for tokenization:
        1. remove words with length of 1
        2. remove non-alphabetical words
        3. remove stop words
        4. stem all words
    '''
    FOMC_token = []
    for statement in content:
        statement = statement.lower()
        docsobj = topicmodels.RawDocs([statement], "long")
        docsobj.token_clean(1)
        docsobj.stopword_remove("tokens")
        docsobj.stem()
        docsobj.stopword_remove("stems")
        ps = PorterStemmer()
        FOMC_token.append(' '.join([ps.stem(word) for word in docsobj.tokens[0]]))
        
    return FOMC_token

def separation(text):
    
    separation_rule = pd.read_excel('Separation.xlsx')
    
    FOMC_separation = pd.DataFrame(columns = ['Date','Speaker','content','Section'])
    for i in separation_rule.index:

        temp1 = raw_text[raw_text["Date"].apply(lambda x: str(x)[:-2]) == str(i)].iloc[separation_rule['FOMC1_start'][i]:separation_rule['FOMC1_end'][i]]
        temp1['Section'] = 1
        if separation_rule['FOMC2_end'][i] == 'end':
            temp2 = raw_text[raw_text["Date"].apply(lambda x: str(x)[:-2]) == str(i)].iloc[separation_rule['FOMC2_start'][i]:]
        else:
            temp2 = raw_text[raw_text["Date"].apply(lambda x: str(x)[:-2]) == str(i)].iloc[separation_rule['FOMC2_start'][i]:separation_rule['FOMC2_end'][i]]
        temp2['Section'] = 2
        FOMC_separation = FOMC_separation.append(temp1, ignore_index=True)
        FOMC_separation = FOMC_separation.append(temp2, ignore_index = True)
        
    FOMC_separation.to_excel('raw_text_separated.xlsx')
    return FOMC_separation

def find_collocation(raw_text_separated):
    
    big_document = raw_text_separated['content'].apply(lambda x: x.split(' ')).values
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    trigram_measures = nltk.collocations.TrigramAssocMeasures()
    
    finder_2 = BigramCollocationFinder.from_documents(big_document)
    finder_2.apply_freq_filter(100)
    bi_collocation = [' '.join(x) for x in np.array(finder_2.score_ngrams(bigram_measures.chi_sq))[:,0]]
    bi_collocation_replace = [''.join(x) + 'xx' for x in np.array(finder_2.score_ngrams(bigram_measures.chi_sq))[:,0]]
    
    finder_3 = TrigramCollocationFinder.from_documents(big_document)
    finder_3.apply_freq_filter(50)
    tri_collocation = [' '.join(x) for x in np.array(finder_2.score_ngrams(trigram_measures.chi_sq))[:,0]]
    tri_collocation_replace = [''.join(x) + 'xxx' for x in np.array(finder_2.score_ngrams(trigram_measures.chi_sq))[:,0]]   
    
    interjection_string = []
    for meeting in raw_text_separated['content']:
        for (i,bicol) in emuerate(bi_collocation):
            meeting = meeting.replace(bicol, bi_collocation_replace[i])
        for (i,tricol) in emuerate(tri_collocation):
            meeting = meeting.replace(tricol, tri_collocation_replace[i])
    
        interjection_string.append(meeting)
    
    raw_text_separated['content'] = interjection_string
    raw_text_separated.to_excel('FOMC_separated_Collocation.xlsx')
    return raw_text_separated

In [122]:
%%time
preprocess()

TypeError: _contingency() missing 1 required positional argument: 'n_xxx'