In [1]:
#Operating system
from pathlib import Path

#Data processing
import pandas as pd
import numpy as np

#Print output
from pprint import pprint

#Read/write csv files
import csv 

import nltk

#Lemmatisation
from nltk import WordNetLemmatizer

#Stopwords
from nltk.corpus import stopwords

#Word tokenize
from nltk.tokenize import word_tokenize

In [2]:
#Import data
speech_df = pd.read_csv(Path.cwd() / "raw_speeches_df.txt")
speech_df.head()

Unnamed: 0,date,title,content
0,2010-02-01,2010_Budget_to_Congress,"Good morning, everybody. This morning, I sent..."
1,2016-09-11,911_After_15_Years,"Good morning. Scripture tells us, “Let not s..."
2,2009-09-11,911_Pentagon_Memorial_2009,"Secretary Gates, Admiral Mullen and members o..."
3,2010-09-11,911_Pentagon_Memorial_2010,"Secretary Gates, Admiral Mullen and members o..."
4,2011-03-18,Address_on_Libya,"Good afternoon, everybody.I want to take thi..."


In [3]:
#Define the Stopwords
StopWords = stopwords.words("english")

#Modify the Stopwords and add some extra stopwords
StopWords.extend(['from','u','obama','president'])

In [4]:
#Function of text processing
def clean_text(text):
    
    #Tokenize
    tokens = word_tokenize(text)
    
    #Remove the punctuations
    tokens = [word for word in tokens if word.isalpha()]
    
    #Lower the tokens
    tokens = [word.lower() for word in tokens]
    
    #Remove stopword
    tokens = [word for word in tokens if not word in StopWords]
    
    #Lemmatize
    lemma = WordNetLemmatizer()
    tokens = [lemma.lemmatize(word, pos="v") for word in tokens]
    tokens = [lemma.lemmatize(word, pos="n") for word in tokens]
    
    # list to string
    text = " ".join(tokens)

    return text

In [5]:
#Text processing
clean_content=[]
for content in speech_df['content'].values.tolist():
    clean_content.append(clean_text(content))
    
speech_df["content_clean"] = clean_content

In [6]:
#Visually check
speech_df.head()

Unnamed: 0,date,title,content,content_clean
0,2010-02-01,2010_Budget_to_Congress,"Good morning, everybody. This morning, I sent...",good morning everybody morning send budget con...
1,2016-09-11,911_After_15_Years,"Good morning. Scripture tells us, “Let not s...",good morning scripture tell u let steadfast lo...
2,2009-09-11,911_Pentagon_Memorial_2009,"Secretary Gates, Admiral Mullen and members o...",secretary gate admiral mullen member arm force...
3,2010-09-11,911_Pentagon_Memorial_2010,"Secretary Gates, Admiral Mullen and members o...",secretary gate admiral mullen member arm force...
4,2011-03-18,Address_on_Libya,"Good afternoon, everybody.I want to take thi...",good afternoon want take opportunity update am...


In [7]:
#Function of extracting nouns from contents of speeches
def nouns_extract(content):
    nouns = []
    content = list(content.split(" "))
    
    for word, pos in nltk.pos_tag(content):
        if (pos == "NN" or pos == "NNP" or pos == "NNS" or pos == "NNPS"):
            nouns.append(word)
            string_nouns = ' '.join(nouns)
    return string_nouns

In [8]:
#Extract nouns
content_nouns=[]
for content in clean_content:
    content_nouns.append(nouns_extract(content))
    
speech_df["content_nouns"] = content_nouns

In [9]:
#Visually check
speech_df.head()

Unnamed: 0,date,title,content,content_clean,content_nouns
0,2010-02-01,2010_Budget_to_Congress,"Good morning, everybody. This morning, I sent...",good morning everybody morning send budget con...,morning everybody morning budget congress year...
1,2016-09-11,911_After_15_Years,"Good morning. Scripture tells us, “Let not s...",good morning scripture tell u let steadfast lo...,morning scripture let tablet secretary carter ...
2,2009-09-11,911_Pentagon_Memorial_2009,"Secretary Gates, Admiral Mullen and members o...",secretary gate admiral mullen member arm force...,secretary gate member arm force family day sep...
3,2010-09-11,911_Pentagon_Memorial_2010,"Secretary Gates, Admiral Mullen and members o...",secretary gate admiral mullen member arm force...,secretary gate member arm force survivor trage...
4,2011-03-18,Address_on_Libya,"Good afternoon, everybody.I want to take thi...",good afternoon want take opportunity update am...,afternoon take opportunity people situation we...


In [10]:
#Write the data frame to a txt file named "clean_speeches_df.txt"
speech_df.to_csv(Path.cwd()/"clean_speeches_df.txt", header= True, index= False) 