### This script cleans raw FOMC statements for analysis. It is adapted from an open-source version by Miguel Acosta ([www.acostamiguel.com](http://www.acostamiguel.com)) and updated for Python 3.

---

### Input

* Raw FOMC statements from `statements/statements.raw`

---

### Output

1. **Cleaned statements**

   * Path: `statements/statements.clean`
   * Header, footer, and voting info removed
   * Used in the project

2. **Normalized statements**

   * Path: `statements/statements.clean.np`
   * Header/footer removed, stemmed, stopwords and numbers removed
   * Not used in the project


In [1]:
import os, csv, re
from os import listdir
from os.path import isfile, join
from nltk.stem.lancaster import LancasterStemmer
from textmining_withnumbers import TermDocumentMatrix as TDM

In [2]:
# Directory setup
datadir      = 'data'
statementdir = os.path.join('statements','statements.raw')
cleanDir     = os.path.join('statements','statements.clean')
cleanDirNP   = os.path.join('statements','statements.clean.np')
outputDir    = 'output'

In [3]:
def getReplacementList(list_name):
    allWords = [line.rstrip('\n') for line in open(list_name, 'r')]
    oldWords = [allWords[i] for i in range(len(allWords)) if i % 2 == 0]
    newWords = [allWords[i] for i in range(len(allWords)) if i % 2 == 1]
    return [oldWords, newWords]

In [4]:
def cleanStatement(statement, locationold, replacements, locationnew, stoplist, charsToKeep):
    original = open(os.path.join(locationold, statement), 'r').read().lower()
    clean = original
    for todelete in ['.', '\r\n', '\n', ',', '-', ';', ':']:
        clean = clean.replace(todelete, ' ')
    clean = re.sub(charsToKeep, '', clean)
    clean = clean.replace('  ', ' ')
    clean = clean.replace(' u s ', ' unitedstates ')

    deleteBefore = re.search("[Ff]or\s[Ii]mmediate\s[Rr]elease", clean).start() + len('for immediate release')
    clean = clean[deleteBefore:]

    intaking = re.search("in\staking\sthe\sdiscount\srate\saction", clean)
    votingfor = re.search("voting\sfor\sthe\sfomc", clean)
    if intaking is None and votingfor:
        deleteAfter = votingfor.start()
    elif votingfor is None and intaking:
        deleteAfter = intaking.start()
    elif votingfor is None and intaking is None:
        deleteAfter = len(clean)
    else:
        deleteAfter = min(votingfor.start(), intaking.start())
    clean = clean[:deleteAfter]

    for word in range(len(replacements[0])):
        clean = clean.replace(replacements[0][word], replacements[1][word])

    for word in stoplist:
        clean = clean.replace(' ' + word.lower() + ' ', ' ')

    with open(os.path.join(locationnew, statement), 'w') as new:
        new.write(clean)

  deleteBefore = re.search("[Ff]or\s[Ii]mmediate\s[Rr]elease", clean).start() + len('for immediate release')
  intaking = re.search("in\staking\sthe\sdiscount\srate\saction", clean)
  votingfor = re.search("voting\sfor\sthe\sfomc", clean)


In [5]:
def main():
    stoplist = [line.rstrip('\n') for line in open(os.path.join(datadir, "stoplist_mcdonald_comb.txt"), 'r')]
    stoplistNP = [line.rstrip('\n') for line in open(os.path.join(datadir, "emptystop.txt"), 'r')]
    replacements = getReplacementList(os.path.join(datadir, "wordlist.txt"))
    replacementsNP = getReplacementList(os.path.join(datadir, "wordlist.np.txt"))
    statementList = [f for f in listdir(statementdir) if isfile(join(statementdir, f))]
    for statement in statementList:
        cleanStatement(statement, statementdir, replacements, cleanDir, stoplist, '[^A-Za-z ]+')
        cleanStatement(statement, statementdir, replacementsNP, cleanDirNP, stoplistNP, '[^A-Za-z0-9 ]+')

In [6]:
if __name__ == "__main__":
    main()