In [20]:

import pandas as pd
import datetime
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
import string
from string import punctuation
from sklearn.model_selection import train_test_split

In [82]:
# stop = stopwords.words('english')
# stemmer = SnowballStemmer("dutch")

In [2]:
# Load the quotes dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('presidential_quotes.csv')

In [3]:
# Drop the 'Unnamed: 0' column (presumably a saved index from an earlier
# to_csv call -- confirm against the CSV).
# errors='ignore' keeps this cell safe to re-run: without it, a second
# execution raises KeyError because the column has already been removed.
df = df.drop('Unnamed: 0', axis=1, errors='ignore')

In [4]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,party,quote,said_by
0,D,Change will not come if we wait for some other...,Barack Obama
1,D,The best way to not feel hopeless is to get up...,Barack Obama
2,D,A change is brought about because ordinary peo...,Barack Obama
3,D,Yes We Can!,Barack Obama
4,D,We are the change we have been waiting for.,Barack Obama


In [5]:
# (rows, columns) of the frame.
df.shape

(2016, 3)

In [6]:
# Number of quotes per speaker.
df['said_by'].value_counts()

Barack Obama       570
Hillary Clinton    563
Abraham Lincoln    440
Donald Trump       227
George W. Bush     216
Name: said_by, dtype: int64

In [7]:
# Number of quotes per party label.
df['party'].value_counts()

D    1133
R     883
Name: party, dtype: int64

<h2>Tokenize</h2>

In [8]:
# Lower-case every quote (overwrites the original 'quote' column).
df['quote'] = df['quote'].str.lower()

In [9]:
# Split each quote into word and punctuation tokens.
# NLTK's `language` argument is the Punkt model name ('english'), not an ISO
# code: 'en' makes the sentence-tokenizer lookup fail with LookupError.
df['quote_tokenized'] = df['quote'].apply(lambda text: word_tokenize(text, language='english'))

In [10]:
# Check the lower-cased quotes and the new quote_tokenized column.
df.head()

Unnamed: 0,party,quote,said_by,quote_tokenized
0,D,change will not come if we wait for some other...,Barack Obama,"[change, will, not, come, if, we, wait, for, s..."
1,D,the best way to not feel hopeless is to get up...,Barack Obama,"[the, best, way, to, not, feel, hopeless, is, ..."
2,D,a change is brought about because ordinary peo...,Barack Obama,"[a, change, is, brought, about, because, ordin..."
3,D,yes we can!,Barack Obama,"[yes, we, can, !]"
4,D,we are the change we have been waiting for.,Barack Obama,"[we, are, the, change, we, have, been, waiting..."


<h2>Remove Stop Words</h2>

In [13]:
# Tokens to discard: NLTK's English stop words (de-duplicated via a set),
# followed by each character of string.punctuation.
stops = list(set(stopwords.words('english')))
stops.extend(punctuation)

In [14]:
#function to remove stop words
def remove_stops(text):
    """Filter stop words and single-character tokens out of a token sequence.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter, e.g. one entry of ``df['quote_tokenized']``.

    Returns
    -------
    list of str
        The tokens, in their original order, that are not in the
        module-level ``stops`` collection and whose length is not 1.
    """
    # The length check also discards one-character tokens that are not
    # listed in `stops`.
    return [token for token in text if token not in stops and len(token) != 1]

In [15]:
# Filter every row's token list with remove_stops; the function is passed
# directly, so no wrapper lambda is needed.
df['quote_no_stops'] = df['quote_tokenized'].apply(remove_stops)

In [16]:
# Inspect the new quote_no_stops column next to the tokenized quotes.
df.head()

Unnamed: 0,party,quote,said_by,quote_tokenized,quote_no_stops
0,D,change will not come if we wait for some other...,Barack Obama,"[change, will, not, come, if, we, wait, for, s...","[change, come, wait, person, wait, time, ones,..."
1,D,the best way to not feel hopeless is to get up...,Barack Obama,"[the, best, way, to, not, feel, hopeless, is, ...","[best, way, feel, hopeless, get, something, wa..."
2,D,a change is brought about because ordinary peo...,Barack Obama,"[a, change, is, brought, about, because, ordin...","[change, brought, ordinary, people, extraordin..."
3,D,yes we can!,Barack Obama,"[yes, we, can, !]",[yes]
4,D,we are the change we have been waiting for.,Barack Obama,"[we, are, the, change, we, have, been, waiting...","[change, waiting]"


<h2>Lemmatize</h2>

In [21]:
#initialize WordNetLemmatizer
# Single shared instance, reused for every word by lemmatize_text.
lemmatizer = WordNetLemmatizer()

In [22]:
#function to lemmatize text
def lemmatize_text(text):
    """Lemmatize every token in a sequence.

    Parameters
    ----------
    text : iterable of str
        Tokens to lemmatize.

    Returns
    -------
    list of str
        The result of ``lemmatizer.lemmatize`` for each token, in input
        order. ``lemmatizer`` is the module-level lemmatizer instance.
    """
    return [lemmatizer.lemmatize(token) for token in text]