# Department of Education Press Release Preprocessing

### Author: Doug Hummel-Price | Created: April 2020 | Updated: 06.09.20

This notebook contains the code to preprocess the U.S. Department of Education (ED) press release corpus.

In [None]:
## Import necessary libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
# Build the English stop-word collection once as a set; process() tests each
# word for membership in it.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to have been
# downloaded beforehand -- confirm for fresh environments.
stop_words = set(stopwords.words('english'))

In [None]:
## Load in data
data = pd.read_excel("CompleteCorpus_NOT_Preprocessed.xlsx")
# Remove the literal "amp;" residue left behind by "&amp;" HTML entities.
# str.strip("amp;") is NOT suitable here: it treats its argument as a set of
# characters and trims any of 'a', 'm', 'p', ';' from both ends of the text
# (so a release ending in "map" loses that word), while leaving every "amp;"
# inside the text untouched.
data["All_Text"] = [text.replace("amp;", "") for text in data.All_Text]
# Preview the first three rows
data.head(3)

In [None]:
## Creates several user-defined functions
def despace(string):
    '''
    A function to collapse every run of two or more consecutive whitespace
    characters (spaces, tabs, newlines) into a single space. A lone whitespace
    character is left exactly as it is.
    Takes: a string
    Returns: a string
    '''
    # Raw string: "\s" inside a plain literal is an invalid escape sequence
    # that newer Python versions warn about.
    return re.sub(r"\s{2,}", " ", string)

def dequote(string):
    '''
    A function to swap every double quotation mark (curly opening, curly
    closing, or straight) for the token " quotationmark ", padded with one
    space on each side
    Takes: a string
    Returns: a string
    '''
    token = " quotationmark "
    # One translation table handles all three quote characters in a single pass
    table = str.maketrans({'“': token, '”': token, '"': token})
    return string.translate(table)

def year(string):
    '''
    A function to replace four-digit year-numbers with a category word:
      1700-2009 -> "older_year_number"
      2010-2020 -> "recent_year_number"
      2021-2059 -> "future_year_number"
    Numbers outside those ranges, and digit runs longer than four digits
    (e.g. the ZIP code 20202), are left untouched.
    Takes: a string
    Returns: a string
    '''
    # The ranges are mutually exclusive. Previously the "recent" pattern also
    # covered 2021-2029, so those years could never be labelled as future.
    rules = (
        (r"1[7-9][0-9]{2}|200[0-9]", "older_year_number"),
        (r"201[0-9]|2020", "recent_year_number"),
        (r"202[1-9]|20[3-5][0-9]", "future_year_number"),
    )
    for pattern, label in rules:
        # The lookarounds stop a match from starting or ending in the middle
        # of a longer number.
        string = re.sub(r"(?<![0-9])(?:" + pattern + r")(?![0-9])", label, string)
    return string

def dollar(string):
    '''
    A function to replace instances of dollar-numbers with the word "dollar_amount"
    Handles plain amounts ($45, $1245), comma-grouped amounts ($1,245,000),
    decimals ($400.33, $.50) and an optional trailing unit word
    (million / billion / trillion, lower-case or capitalised).
    Takes: a string
    Returns: a string
    '''
    pattern = (
        # "$" followed by digits with optional thousands groups and an optional
        # decimal part, or a bare decimal such as "$.50". A trailing sentence
        # period is not consumed because the decimal part requires digits.
        r"\$(?:[0-9]+(?:,[0-9]{3})*(?:\.[0-9]+)?|\.[0-9]+)"
        # Optional unit word; \b keeps "$5 millionaires" from being cut mid-word.
        r"(?:\s+(?:[Mm]illion|[Bb]illion|[Tt]rillion)\b)?"
    )
    return re.sub(pattern, "dollar_amount", string)

def US(string):
    '''
    A function to replace references to the US ("U.S.", "US", "United States")
    with the word "United_States"
    Only stand-alone tokens are replaced: the letters "US" inside another word
    (USDA, CAMPUS) and longer abbreviations (U.S.A.) are left alone.
    Takes: a string
    Returns: a string
    '''
    # The dots are escaped so they match literal periods only; unescaped they
    # match any character, which rewrote strings such as "UCSD".
    string = re.sub(r"\bU\.S\.(?!\w)", "United_States", string)
    # Word boundaries keep "US" from matching inside longer upper-case words.
    string = re.sub(r"\bUS\b", "United_States", string)
    string = re.sub(r"United\sStates", "United_States", string)
    return string

def DEd(string):
    '''
    A function to replace references to specific people and the department with
    consistent single-token wording (e.g. "Secretary DeVos" -> "Betsy_DeVos",
    "Education Department" -> "Department_of_Education")
    Matching is case-sensitive, apart from the all-lower-case
    "department of education" variant that is listed explicitly.
    Takes: a string
    Returns: a string
    '''
    # (pattern, replacement) pairs, applied in order.
    replacements = (
        (r"Department\sof\sEducation", "Department_of_Education"),
        (r"department\sof\seducation", "Department_of_Education"),
        (r"Secretary\sof\sEducation", "Secretary_of_Education"),
        (r"Education\sDepartment", "Department_of_Education"),
        (r"(?:Betsy|Secretary)\sDeVos", "Betsy_DeVos"),
        (r"(?:Arne|Secretary)\sDuncan", "Arne_Duncan"),
        # Covers "President Trump", "President Donald Trump" and
        # "President Donald J. Trump". The former extra rule that mapped
        # "President Trump" to President_Obama was a copy-paste error and is gone.
        (r"President\s(?:Donald\s(?:J\.\s)?)?Trump", "President_Trump"),
        (r"President\s(?:Barack\s)?Obama", "President_Obama"),
    )
    for pattern, replacement in replacements:
        string = re.sub(pattern, replacement, string)
    return string

def process(string):
    '''
    A function to run an inputted string through the above functions all in one go,
    then lower-case it and drop English stop words
    Takes: a string
    Returns: a string of the remaining words joined by single spaces
             (no leading or trailing space; empty if nothing remains)
    '''
    # Order matters: dollar amounts are replaced before years so the digits of
    # an amount are not tagged as a year, and the case-sensitive entity
    # replacements run before the text is lower-cased.
    for step in (dollar, year, US, DEd, despace, dequote):
        string = step(string)
    kept = [word for word in string.lower().split() if word not in stop_words]
    # A single join avoids the leading space (and the repeated string
    # rebuilding) that came from joining onto an initially empty sentence.
    return " ".join(kept)

## Test the preprocessing with these lines
# Smoke-test input mixing quoted text, dollar amounts, year-numbers and
# person/department mentions. The trailing backslash continues the string
# literal onto the next line, so that line's leading spaces are part of the
# string itself.
teststring = 'President Trump "P" US  $1,245 $45 2012 1985 2045 2065 President Donald J. Trump "$400.33 million"\
                Department of Education Secretary of Education'
# As the last expression of the cell, the processed string is displayed by the notebook.
process(teststring)

In [None]:
## Run every press release in All_Text through process() and keep the result
## in a new Processed_text column
data["Processed_text"] = list(map(process, data["All_Text"]))
data

In [None]:
## Write the processed corpus to an Excel workbook, leaving out the row index
data.to_excel(excel_writer="Processed_Corpus.xlsx", index=False)