# Read data

In [1]:
import pandas as pd
import spacy
import udf

In [2]:
# Load obama_speech.csv into the working DataFrame; later cells read its
# 'title' and 'text' columns.
df = pd.read_csv(filepath_or_buffer="obama_speech.csv")

In [3]:
# Run every speech text through the project helper udf.clean.allowed.
df['text'] = df['text'].apply(udf.clean.allowed)

In [4]:
# Run every title through the project helper udf.clean.title.
df['title'] = df['title'].apply(udf.clean.title)

In [5]:
def _trim_boilerplate(text, start_marker, end_marker=None):
    """Cut leading (and optionally trailing) boilerplate off ``text``.

    Everything up to and including the first occurrence of ``start_marker``
    is removed. If ``end_marker`` is given, everything from its first
    occurrence onwards is removed as well. Surrounding whitespace is stripped
    from the result.

    ``str.find`` returns -1 when the marker is absent; using that value as a
    slice bound would silently chop characters off the text, so a missing
    marker leaves the corresponding side of the text untouched instead.
    """
    start = text.find(start_marker)
    begin = start + len(start_marker) if start != -1 else 0

    end = text.find(end_marker) if end_marker is not None else -1
    if end == -1:
        end = len(text)

    return text[begin:end].strip()


# (row position, marker that ends the leading boilerplate, optional marker
# that starts the trailing boilerplate) for the first four rows.
_BOILERPLATE_MARKERS = [
    (0, '[Chief Justice John G. \r\n\t\tRoberts administers the Oath of Office]', '(Drudge Report)'),
    (1, '[as prepared for delivery]', None),
    (2, '[AUTHENTICITY CERTIFIED: Text version below \r\ntranscribed directly from edited audio above]', None),
    (3, 'pdf', None),
]

# Column position 2 is addressed positionally, as in the original cell
# (presumably the 'text' column -- TODO confirm against the CSV layout).
for _row, _start_marker, _end_marker in _BOILERPLATE_MARKERS:
    df.iloc[_row, 2] = _trim_boilerplate(df.iloc[_row, 2], _start_marker, _end_marker)

# Named entity recognition

In [6]:
# Derive the 'text_removed' column by passing each text through the project
# helper udf.clean.remove; this column is the input to the spaCy pipeline.
df['text_removed'] = df['text'].apply(udf.clean.remove)

In [7]:
# Load the spaCy pipeline 'en_core_web_lg'; it is reused by later cells.
nlp = spacy.load('en_core_web_lg')

# One list of lower-cased lemmas per document, keeping only tokens that are
# alphabetic and not stop words.
tokens_sent = [
    [word.lemma_.lower() for word in doc if word.is_alpha and not word.is_stop]
    for doc in nlp.pipe(df['text_removed'])
]

In [8]:
# Store each document's lemmas as a single space-separated string.
df['processed_for_sentiment'] = [" ".join(lemmas) for lemmas in tokens_sent]

In [9]:
# Coarse part-of-speech tags whose tokens are excluded from the LDA input.
# 'DET' is listed twice; the duplicate has no effect on the membership test.
removal = [
    'ADV', 'PRON', 'CCONJ', 'PUNCT', 'PART', 'DET',
    'ADP', 'SPACE', 'NUM', 'SYM', 'ADJ', 'DET',
]  # Verb possible

# Re-parse the already lemmatised, stop-word-free text and keep the lemma of
# every token whose POS tag is not in `removal`.
tokens_lda = [
    [word.lemma_ for word in doc if word.pos_ not in removal]
    for doc in nlp.pipe(df['processed_for_sentiment'])
]

In [10]:
# Store each document's POS-filtered lemmas as a single space-separated string.
df['processed_for_lda'] = [" ".join(lemmas) for lemmas in tokens_lda]

In [11]:
# Summarise the LDA-ready text with the project helper udf.clean.most_occur
# and print the result for inspection.
most_noun = udf.clean.most_occur(
    df['processed_for_lda']
)
print(most_noun)

[('people', 7572), ('go', 5404), ('work', 4739), ('country', 4010), ('think', 3982), ('want', 3759), ('know', 3741), ('president', 3542), ('time', 3399), ('world', 3337), ('get', 3234), ('america', 3074), ('come', 3021), ('say', 2816), ('need', 2594), ('way', 2544), ('help', 2441), ('united', 2349), ('nation', 2310), ('thing', 2282), ('states', 2239), ('job', 2235), ('thank', 2174), ('government', 2115), ('question', 1859), ('family', 1834), ('change', 1781), ('life', 1761), ('let', 1746), ('look', 1739), ('americans', 1735), ('take', 1733), ('care', 1622), ('make', 1591), ('economy', 1582), ('health', 1573), ('business', 1546), ('see', 1484), ('believe', 1484), ('security', 1482), ('community', 1440), ('issue', 1435), ('place', 1434), ('support', 1423), ('continue', 1412), ('law', 1402), ('war', 1384), ('try', 1363), ('start', 1335), ('lot', 1332), ('child', 1318), ('state', 1311), ('mean', 1304), ('happen', 1272), ('man', 1236), ('woman', 1188), ('build', 1156), ('system', 1151), ('s

In [12]:
# Same summary for the sentiment text (before POS filtering), for comparison.
# NOTE: this rebinds `most_noun`, replacing the result of the previous cell.
most_noun = udf.clean.most_occur(
    df['processed_for_sentiment']
)
print(most_noun)

[('people', 7571), ('go', 5405), ('work', 4715), ('country', 4006), ('think', 3893), ('know', 3815), ('want', 3761), ('president', 3534), ('time', 3393), ('right', 3343), ('world', 3338), ('get', 3108), ('like', 3093), ('america', 3073), ('come', 3029), ('say', 2809), ('good', 2731), ('way', 2650), ('need', 2595), ('new', 2579), ('american', 2478), ('united', 2390), ('help', 2390), ('states', 2358), ('nation', 2288), ('thing', 2283), ('job', 2227), ('thank', 2194), ('government', 2115), ('question', 1859), ('family', 1830), ('americans', 1812), ('sure', 1810), ('change', 1780), ('life', 1761), ('let', 1745), ('look', 1737), ('take', 1717), ('great', 1698), ('care', 1618), ('economy', 1580), ('health', 1573), ('make', 1564), ('business', 1542), ('well', 1484), ('believe', 1484), ('see', 1483), ('security', 1482), ('community', 1438), ('place', 1434), ('issue', 1434), ('support', 1422), ('long', 1409), ('continue', 1403), ('law', 1400), ('young', 1395), ('mean', 1393), ('war', 1376), ('t

# Save dataframe

In [13]:
# Build the export frame without the 'text_removed' and 'text' columns;
# `df` itself is left untouched.
df_processed = df.drop(labels=['text_removed', 'text'], axis='columns')

In [14]:
# Write the preprocessed frame to CSV (UTF-8, without the index column).
file_name = "obama_speech_preprocessed.csv"
df_processed.to_csv(path_or_buf=file_name, index=False, encoding='utf-8')