<h1>Text Cleaning and Classification Models

In [40]:
import pandas as pd
import datetime
import numpy as np
import re

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer

import string
from string import punctuation

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report

Setting the `KMP_DUPLICATE_LIB_OK` environment variable keeps the kernel from crashing while XGBoost is running (it works around the duplicate OpenMP runtime error).

In [28]:
import os

# Without this flag the kernel stops when XGBoost runs.
os.environ.update({'KMP_DUPLICATE_LIB_OK': 'True'})

<h2>Read in CSV

In [2]:
# Load the quotes dataset; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer='presidential_quotes.csv')

In [3]:
# Remove the leftover 'Unnamed: 0' column (presumably an index written out by an
# earlier to_csv -- confirm). errors='ignore' makes the cell safe to re-run:
# without it a second execution raises KeyError because the column is gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [4]:
df.head()

Unnamed: 0,party,quote,said_by
0,D,Change will not come if we wait for some other...,Barack Obama
1,D,The best way to not feel hopeless is to get up...,Barack Obama
2,D,A change is brought about because ordinary peo...,Barack Obama
3,D,Yes We Can!,Barack Obama
4,D,We are the change we have been waiting for.,Barack Obama


In [5]:
df.shape

(2016, 3)

In [6]:
df.said_by.value_counts()

Barack Obama       570
Hillary Clinton    563
Abraham Lincoln    440
Donald Trump       227
George W. Bush     216
Name: said_by, dtype: int64

In [7]:
df.party.value_counts()

D    1133
R     883
Name: party, dtype: int64

<h2>Tokenize

In [8]:
# Normalise case before tokenizing; bracket access is used for the assignment
# so it is unambiguous that a column (not an attribute) is being replaced.
df['quote'] = df['quote'].str.lower()

In [9]:
# Split every quote into word/punctuation tokens.
# NLTK selects its Punkt sentence model by language *name*, so this must be
# 'english' -- an ISO code such as 'en' does not match any bundled model.
df['quote_tokenized'] = df['quote'].apply(
    lambda text: word_tokenize(text, language='english')
)

In [10]:
df.head()

Unnamed: 0,party,quote,said_by,quote_tokenized
0,D,change will not come if we wait for some other...,Barack Obama,"[change, will, not, come, if, we, wait, for, s..."
1,D,the best way to not feel hopeless is to get up...,Barack Obama,"[the, best, way, to, not, feel, hopeless, is, ..."
2,D,a change is brought about because ordinary peo...,Barack Obama,"[a, change, is, brought, about, because, ordin..."
3,D,yes we can!,Barack Obama,"[yes, we, can, !]"
4,D,we are the change we have been waiting for.,Barack Obama,"[we, are, the, change, we, have, been, waiting..."


<h2>Remove Stop Words

In [13]:
# Tokens to discard: NLTK's English stop words plus each character of
# string.punctuation. Stored as a set because remove_stops only tests
# membership, which is O(1) on a set versus a linear scan on a list.
stops = set(stopwords.words('english')) | set(punctuation)

In [14]:
#function to remove stop words
def remove_stops(text, stop_words=None):
    """Return the tokens of ``text`` that are neither stop words nor single characters.

    Parameters
    ----------
    text : iterable of str
        Tokens of one quote, in order.
    stop_words : iterable of str, optional
        Tokens to discard. When omitted, the notebook-level ``stops``
        collection is used, which is the original behaviour.

    Returns
    -------
    list of str
        The surviving tokens in their original order.
    """
    # Hash the stop words once per call so every token lookup is O(1),
    # regardless of whether the caller supplied a list or a set.
    blocked = frozenset(stops if stop_words is None else stop_words)
    # Tokens of exactly one character are dropped in addition to stop words.
    return [token for token in text if len(token) != 1 and token not in blocked]

In [15]:
# Filter each token list; the function is passed directly, no wrapper needed.
df['quote_no_stops'] = df['quote_tokenized'].apply(remove_stops)

In [16]:
df.head()

Unnamed: 0,party,quote,said_by,quote_tokenized,quote_no_stops
0,D,change will not come if we wait for some other...,Barack Obama,"[change, will, not, come, if, we, wait, for, s...","[change, come, wait, person, wait, time, ones,..."
1,D,the best way to not feel hopeless is to get up...,Barack Obama,"[the, best, way, to, not, feel, hopeless, is, ...","[best, way, feel, hopeless, get, something, wa..."
2,D,a change is brought about because ordinary peo...,Barack Obama,"[a, change, is, brought, about, because, ordin...","[change, brought, ordinary, people, extraordin..."
3,D,yes we can!,Barack Obama,"[yes, we, can, !]",[yes]
4,D,we are the change we have been waiting for.,Barack Obama,"[we, are, the, change, we, have, been, waiting...","[change, waiting]"


<h2>Lemmatize

In [21]:
# Single WordNet lemmatizer instance shared by lemmatize_text
lemmatizer = WordNetLemmatizer()

In [22]:
#function to lemmatize text
def lemmatize_text(text):
    """Lemmatize every token in ``text`` and return the results as a new list.

    Each token is passed to the notebook-level ``lemmatizer`` with its
    default arguments; token order is preserved.
    """
    return [lemmatizer.lemmatize(token) for token in text]

In [23]:
# Lemmatize the stop-word-free token lists, one quote at a time.
df['lemmatized_quote'] = df['quote_no_stops'].map(lemmatize_text)

In [25]:
df.head()

Unnamed: 0,party,quote,said_by,quote_tokenized,quote_no_stops,lemmatized_quote
0,D,change will not come if we wait for some other...,Barack Obama,"[change, will, not, come, if, we, wait, for, s...","[change, come, wait, person, wait, time, ones,...","[change, come, wait, person, wait, time, one, ..."
1,D,the best way to not feel hopeless is to get up...,Barack Obama,"[the, best, way, to, not, feel, hopeless, is, ...","[best, way, feel, hopeless, get, something, wa...","[best, way, feel, hopeless, get, something, wa..."
2,D,a change is brought about because ordinary peo...,Barack Obama,"[a, change, is, brought, about, because, ordin...","[change, brought, ordinary, people, extraordin...","[change, brought, ordinary, people, extraordin..."
3,D,yes we can!,Barack Obama,"[yes, we, can, !]",[yes],[yes]
4,D,we are the change we have been waiting for.,Barack Obama,"[we, are, the, change, we, have, been, waiting...","[change, waiting]","[change, waiting]"


In [26]:
# Re-join each list of lemmas into one space-separated string
df['lemmatized_quote_string'] = df['lemmatized_quote'].apply(' '.join)

In [27]:
df.head()

Unnamed: 0,party,quote,said_by,quote_tokenized,quote_no_stops,lemmatized_quote,lemmatized_quote_string
0,D,change will not come if we wait for some other...,Barack Obama,"[change, will, not, come, if, we, wait, for, s...","[change, come, wait, person, wait, time, ones,...","[change, come, wait, person, wait, time, one, ...",change come wait person wait time one 've wait...
1,D,the best way to not feel hopeless is to get up...,Barack Obama,"[the, best, way, to, not, feel, hopeless, is, ...","[best, way, feel, hopeless, get, something, wa...","[best, way, feel, hopeless, get, something, wa...",best way feel hopeless get something wait good...
2,D,a change is brought about because ordinary peo...,Barack Obama,"[a, change, is, brought, about, because, ordin...","[change, brought, ordinary, people, extraordin...","[change, brought, ordinary, people, extraordin...",change brought ordinary people extraordinary t...
3,D,yes we can!,Barack Obama,"[yes, we, can, !]",[yes],[yes],yes
4,D,we are the change we have been waiting for.,Barack Obama,"[we, are, the, change, we, have, been, waiting...","[change, waiting]","[change, waiting]",change waiting


<h2>Label Encoding - Author

In [57]:
# Create a label encoder object; it is fitted on df.said_by in a later cell
# and used to turn speaker names into integer class ids.
le = preprocessing.LabelEncoder()

In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2016 entries, 0 to 2015
Data columns (total 8 columns):
party                      2016 non-null object
quote                      2016 non-null object
said_by                    2016 non-null object
quote_tokenized            2016 non-null object
quote_no_stops             2016 non-null object
lemmatized_quote           2016 non-null object
lemmatized_quote_string    2016 non-null object
author_encoded             2016 non-null int64
dtypes: int64(1), object(7)
memory usage: 126.1+ KB


In [59]:
# Learn the set of distinct speakers from the said_by column
le.fit(df['said_by'])

LabelEncoder()

In [60]:
# Speaker names in encoded order: position i is the speaker encoded as i
targets = [*le.classes_]
print(targets)

['Abraham Lincoln', 'Barack Obama', 'Donald Trump', 'George W. Bush', 'Hillary Clinton']


In [61]:
# Store each quote's speaker as its integer class id
df['author_encoded'] = le.transform(df['said_by'])

In [65]:
#df.head()

<h2>Label Encoding - Party

In [52]:
# Second, independent encoder for the party column, so `le` keeps the
# speaker mapping it was fitted with.
le_2 = preprocessing.LabelEncoder()

In [55]:
# Learn the set of distinct party labels from the party column
le_2.fit(df['party'])

LabelEncoder()

In [56]:
# Party labels in encoded order: position i is the party encoded as i
targets_2 = [*le_2.classes_]
print(targets_2)

['D', 'R']


In [63]:
# Store each quote's party as its integer class id
df['party_encoded'] = le_2.transform(df['party'])

In [64]:
df.head()

Unnamed: 0,party,quote,said_by,quote_tokenized,quote_no_stops,lemmatized_quote,lemmatized_quote_string,author_encoded,party_encoded
0,D,change will not come if we wait for some other...,Barack Obama,"[change, will, not, come, if, we, wait, for, s...","[change, come, wait, person, wait, time, ones,...","[change, come, wait, person, wait, time, one, ...",change come wait person wait time one 've wait...,1,0
1,D,the best way to not feel hopeless is to get up...,Barack Obama,"[the, best, way, to, not, feel, hopeless, is, ...","[best, way, feel, hopeless, get, something, wa...","[best, way, feel, hopeless, get, something, wa...",best way feel hopeless get something wait good...,1,0
2,D,a change is brought about because ordinary peo...,Barack Obama,"[a, change, is, brought, about, because, ordin...","[change, brought, ordinary, people, extraordin...","[change, brought, ordinary, people, extraordin...",change brought ordinary people extraordinary t...,1,0
3,D,yes we can!,Barack Obama,"[yes, we, can, !]",[yes],[yes],yes,1,0
4,D,we are the change we have been waiting for.,Barack Obama,"[we, are, the, change, we, have, been, waiting...","[change, waiting]","[change, waiting]",change waiting,1,0


<h2>Train Test Split - Author

In [68]:
# Features are the cleaned quote strings; the label is the encoded speaker.
X = df['lemmatized_quote_string']
y = df['author_encoded']

# Hold out 30% of the quotes; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

<h2> Train Test Split - Party

In [69]:
# Same features as the author task; the label is the encoded party.
X_party = df['lemmatized_quote_string']
y_party = df['party_encoded']

# Hold out 30% of the quotes; the fixed seed makes the split repeatable.
X_train_party, X_test_party, y_train_party, y_test_party = train_test_split(
    X_party,
    y_party,
    test_size=0.3,
    random_state=42,
)

<h1>Author Classification

<h2>Naive Bayes Classifier

In [41]:
# Word counts -> TF-IDF weights -> multinomial naive Bayes (speaker labels).
nb = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
nb.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [47]:
%%time

y_pred_nb = nb.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred_nb, y_test))
#print(classification_report(y_test, y_pred,target_names = targets))

accuracy 0.6760330578512397
CPU times: user 26.7 ms, sys: 1.9 ms, total: 28.6 ms
Wall time: 27.9 ms


<h2>Logistic Regression Classifier

In [44]:
from sklearn.linear_model import LogisticRegression

# Word counts -> TF-IDF weights -> logistic regression (speaker labels).
# C is the inverse regularization strength, so C=1e5 means almost no penalty.
logreg = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(n_jobs=1, C=1e5)),
])
logreg.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...penalty='l2', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False))])

In [48]:
%%time

y_pred_lr = logreg.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred_lr, y_test))
#print(classification_report(y_test, y_pred,target_names=targets))

accuracy 0.7735537190082644
CPU times: user 21.7 ms, sys: 2.27 ms, total: 24 ms
Wall time: 24.5 ms


<h2>XGBoost

In [49]:
from xgboost import XGBClassifier

In [50]:
# Word counts -> TF-IDF weights -> XGBoost with default settings (speaker labels).
xgb = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', XGBClassifier()),
])
xgb.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))])

In [51]:
%%time

y_pred_xgb = xgb.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred_xgb, y_test))
#print(classification_report(y_test, y_pred,target_names=targets))

accuracy 0.7024793388429752
CPU times: user 37.7 ms, sys: 3.12 ms, total: 40.8 ms
Wall time: 47.1 ms


<h1>Party Classifier

<h2>Naive Bayes Classifier

In [70]:
# Word counts -> TF-IDF weights -> multinomial naive Bayes (party labels).
nb_party = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
nb_party.fit(X_train_party, y_train_party)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [71]:
%%time

y_pred_nb_party = nb_party.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred_nb_party, y_test_party))
#print(classification_report(y_test, y_pred,target_names = targets))

accuracy 0.7900826446280992
CPU times: user 20.8 ms, sys: 2.01 ms, total: 22.8 ms
Wall time: 22 ms


<h2>Logistic Regression Classifier

In [72]:
# Word counts -> TF-IDF weights -> logistic regression (party labels).
# C is the inverse regularization strength, so C=1e5 means almost no penalty.
logreg_party = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(n_jobs=1, C=1e5)),
])
logreg_party.fit(X_train_party, y_train_party)



Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...penalty='l2', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False))])

In [73]:
# Predict parties for the held-out quotes.
y_pred_lr_party = logreg_party.predict(X_test_party)

# scikit-learn metrics take (y_true, y_pred) in that order.
print('accuracy %s' % accuracy_score(y_test_party, y_pred_lr_party))
#print(classification_report(y_test_party, y_pred_lr_party, target_names=targets_2))

accuracy 0.8644628099173554


In [74]:
# Word counts -> TF-IDF weights -> XGBoost with default settings (party labels).
xgb_party = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', XGBClassifier()),
])
xgb_party.fit(X_train_party, y_train_party)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))])

In [75]:
# Predict parties for the held-out quotes.
y_pred_xgb_party = xgb_party.predict(X_test_party)

# scikit-learn metrics take (y_true, y_pred) in that order.
print('accuracy %s' % accuracy_score(y_test_party, y_pred_xgb_party))
#print(classification_report(y_test_party, y_pred_xgb_party, target_names=targets_2))

accuracy 0.7966942148760331
