In [97]:

import pandas as pd
import datetime
import numpy as np
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
import string
from sklearn.model_selection import train_test_split

In [82]:
stop = stopwords.words('english')
stemmer = SnowballStemmer("dutch")

In [83]:
df = pd.read_csv('presidential_quotes.csv')

In [84]:
df = df.drop('Unnamed: 0', axis = 1)

In [85]:
df.head()

Unnamed: 0,party,quote,said_by
0,D,Change will not come if we wait for some other...,Barack Obama
1,D,The best way to not feel hopeless is to get up...,Barack Obama
2,D,A change is brought about because ordinary peo...,Barack Obama
3,D,Yes We Can!,Barack Obama
4,D,We are the change we have been waiting for.,Barack Obama


In [86]:
df.shape

(2016, 3)

In [87]:
df.said_by.value_counts()

Barack Obama       570
Hillary Clinton    563
Abraham Lincoln    440
Donald Trump       227
George W. Bush     216
Name: said_by, dtype: int64

In [88]:
df.party.value_counts()

D    1133
R     883
Name: party, dtype: int64

In [89]:
df['tokenized_quote'] = df['quote'].apply(word_tokenize) 

In [90]:
df.head()

Unnamed: 0,party,quote,said_by,tokenized_quote
0,D,Change will not come if we wait for some other...,Barack Obama,"[Change, will, not, come, if, we, wait, for, s..."
1,D,The best way to not feel hopeless is to get up...,Barack Obama,"[The, best, way, to, not, feel, hopeless, is, ..."
2,D,A change is brought about because ordinary peo...,Barack Obama,"[A, change, is, brought, about, because, ordin..."
3,D,Yes We Can!,Barack Obama,"[Yes, We, Can, !]"
4,D,We are the change we have been waiting for.,Barack Obama,"[We, are, the, change, we, have, been, waiting..."


In [91]:
df['quote_without_stop_words'] = df['tokenized_quote'].apply(lambda x: [t.lower() for t in x if t not in stop and t not in string.punctuation])



In [92]:
df.head()

Unnamed: 0,party,quote,said_by,tokenized_quote,quote_without_stop_words
0,D,Change will not come if we wait for some other...,Barack Obama,"[Change, will, not, come, if, we, wait, for, s...","[change, come, wait, person, wait, time, we, o..."
1,D,The best way to not feel hopeless is to get up...,Barack Obama,"[The, best, way, to, not, feel, hopeless, is, ...","[the, best, way, feel, hopeless, get, somethin..."
2,D,A change is brought about because ordinary peo...,Barack Obama,"[A, change, is, brought, about, because, ordin...","[a, change, brought, ordinary, people, extraor..."
3,D,Yes We Can!,Barack Obama,"[Yes, We, Can, !]","[yes, we, can]"
4,D,We are the change we have been waiting for.,Barack Obama,"[We, are, the, change, we, have, been, waiting...","[we, change, waiting]"


In [93]:
df['stemmed'] = df["quote_without_stop_words"].apply(lambda x: [stemmer.stem(y) for y in x])

In [94]:
df.head()

Unnamed: 0,party,quote,said_by,tokenized_quote,quote_without_stop_words,stemmed
0,D,Change will not come if we wait for some other...,Barack Obama,"[Change, will, not, come, if, we, wait, for, s...","[change, come, wait, person, wait, time, we, o...","[chang, com, wait, person, wait, tim, we, ones..."
1,D,The best way to not feel hopeless is to get up...,Barack Obama,"[The, best, way, to, not, feel, hopeless, is, ...","[the, best, way, feel, hopeless, get, somethin...","[the, best, way, fel, hopeles, get, someth, do..."
2,D,A change is brought about because ordinary peo...,Barack Obama,"[A, change, is, brought, about, because, ordin...","[a, change, brought, ordinary, people, extraor...","[a, chang, brought, ordinary, peopl, extraordi..."
3,D,Yes We Can!,Barack Obama,"[Yes, We, Can, !]","[yes, we, can]","[yes, we, can]"
4,D,We are the change we have been waiting for.,Barack Obama,"[We, are, the, change, we, have, been, waiting...","[we, change, waiting]","[we, chang, waiting]"


In [98]:
X = df.stemmed
y = df.said_by

In [99]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [100]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

model = make_pipeline(TfidfVectorizer(), MultinomialNB())

In [101]:
model.fit(X_train, y_train)


AttributeError: 'list' object has no attribute 'lower'