In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Read the Simpsons script lines (one row per spoken line) from the local CSV file
df = pd.read_csv(filepath_or_buffer="simpsons_dataset.csv")
# Preview the first five rows
df.head(n=5)

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [3]:
# Keep only the lines whose speaker is exactly Lisa or Bart
is_bart_or_lisa = df.raw_character_text.isin(['Lisa Simpson', "Bart Simpson"])
df_BL = df[is_bart_or_lisa]
df_BL.head()

Unnamed: 0,raw_character_text,spoken_words
1,Lisa Simpson,Where's Mr. Bergstrom?
3,Lisa Simpson,That life is worth living.
7,Bart Simpson,Victory party under the slide!
9,Lisa Simpson,Mr. Bergstrom! Mr. Bergstrom!
11,Lisa Simpson,Do you know where I could find him?


In [4]:
# Take the spoken lines out of the frame as a unicode array.
# Missing lines are replaced by '' first: astype('U') would otherwise turn a
# NaN into the literal string 'nan', which would then be counted as a word.
text = df_BL['spoken_words'].fillna('').values.astype('U')
vect = CountVectorizer(stop_words='english') #Create the CV object, with English stop words
vect = vect.fit(text) #Learn the vocabulary from the spoken lines

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back for older installations.
# The result is kept as a plain list so slicing/printing and the later use as
# column labels behave as before.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out().tolist()
else:
    feature_names = vect.get_feature_names()
print(f"There are {len(feature_names)} unique (!) words in the vocabulary. A selection: {feature_names[500:520]}")

There are 14258 unique (!) words in the vocabulary. A selection: ['anguished', 'angus', 'anima', 'animal', 'animals', 'animated', 'animation', 'animators', 'anka', 'ankle', 'ann', 'annapolis', 'anne', 'annie', 'anniversary', 'annnnd', 'announce', 'announcement', 'announcements', 'announcer']


In [5]:
# Encode every spoken line against the fitted vocabulary: rows are lines,
# columns are vocabulary words, values are word counts (sparse matrix).
matrix = vect.transform(text)
# Show the non-zero entries in the top-left 500 x 500 corner
print(matrix[:500, :500])

  (24, 424)	1
  (40, 325)	1
  (45, 266)	1
  (63, 269)	1
  (74, 356)	1
  (80, 264)	1
  (82, 304)	1
  (98, 192)	1
  (100, 396)	1
  (151, 328)	1
  (156, 325)	1
  (157, 451)	1
  (163, 325)	1
  (164, 325)	1
  (186, 461)	1
  (207, 325)	1
  (210, 397)	1
  (231, 270)	1
  (237, 404)	1
  (259, 325)	1
  (287, 325)	1
  (294, 493)	1
  (295, 163)	1
  (318, 300)	1
  (321, 281)	1
  (356, 450)	1
  (358, 397)	1
  (362, 449)	1
  (366, 24)	1
  (366, 449)	1
  (386, 129)	1
  (387, 325)	1
  (388, 70)	1
  (394, 38)	1
  (394, 91)	1
  (396, 446)	1
  (398, 126)	1
  (410, 52)	1
  (410, 319)	1
  (410, 343)	1
  (413, 449)	1
  (419, 196)	1
  (428, 360)	1
  (464, 304)	1


In [6]:
# Densify the count matrix and wrap it in a DataFrame in a single step:
# rows are labelled with the spoken line, columns with the vocabulary word.
# NOTE: the dense int64 frame is large (df_docu.info() below reports ~2.7 GB).
docu_feat = pd.DataFrame(
    matrix.toarray(),
    index=df_BL['spoken_words'],
    columns=feature_names,
)

In [7]:
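# Jonas' example (kept for reference only; not executed).
# NOTE(review): as written this calls .toarray() on `docu_feat`, which is a DataFrame in this
# notebook and has no such method - presumably the sparse `matrix` was meant. Confirm before enabling.
# df_words = pd.concat([df_BL, pd.DataFrame(docu_feat.toarray())],axis=1)
# df_words.head(5)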

In [8]:
# Peek at the first four lines for 15 vocabulary columns (positions 1000-1014)
docu_feat.iloc[:4, 1000:1015]

Unnamed: 0_level_0,bartholemew,bartholomew,bartish,bartman,barto,bartrand,bartron,barts,barty,bas,base,baseball,based,basement,basements
spoken_words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Where's Mr. Bergstrom?,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
That life is worth living.,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Victory party under the slide!,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Mr. Bergstrom! Mr. Bergstrom!,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [9]:
# Wrap the document-term frame under a second name and preview its first 15 rows
df_docu = pd.DataFrame(data=docu_feat)
df_docu.head(n=15)

Unnamed: 0_level_0,000,007,10,1000,10201,108,1094,11,12,120,...,zork,zorrinid,zuckerberg,zuh,zumanity,zur,zz,zzzapp,ãªtre,ãºna
spoken_words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Where's Mr. Bergstrom?,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
That life is worth living.,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Victory party under the slide!,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Mr. Bergstrom! Mr. Bergstrom!,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Do you know where I could find him?,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"The train, how like him... traditional, yet environmentally sound.",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"I see he touched you, too.",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Hey, thanks for your vote, man.",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Well, you got that right. Thanks for your vote, girls.",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Well, don't sweat it. Just so long as a couple of people did... right, Milhouse?",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


At this point, the Python process is using about 1.39 GB of memory.

In [10]:
# Summarise the dense document-term frame: index range, column count, dtypes and memory usage
df_docu.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25248 entries, Where's Mr. Bergstrom? to Mr. Bergstrom, we request the pleasure of your company... no... Mr. Bergstrom, if you're not doing anything this Friday... no... Mr. Bergstrom, do you like pork chops... oh no, of course you wouldn't...
Columns: 14258 entries, 000 to ãºna
dtypes: int64(14258)
memory usage: 2.7+ GB


## Naive Bayes

In [11]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

In [12]:
# Set up the data for the model
X = docu_feat  # feature matrix: one row per spoken line, one column per vocabulary word
y = df_BL['raw_character_text']  # target vector: the speaker, 'Lisa Simpson' or 'Bart Simpson'

# Split the data: test_size=0.3 keeps 70% for training and holds out 30% for testing.
# random_state is fixed so the split - and therefore the reported accuracy - is the
# same on every Restart & Run All instead of changing from run to run.
# The _k suffix is only a naming convention for the split used by the model cells below.
X_train_k, X_test_k, y_train_k, y_test_k = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
# Fit a multinomial Naive Bayes classifier on the training split.
# (A stray `MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)` statement
# used to sit here; it was a pasted output repr that built and discarded an
# unused estimator, so it has been removed.)
abc = MultinomialNB()
MNB = abc.fit(X_train_k, y_train_k)  # fit() returns the fitted estimator itself

# Predict the speaker for the held-out lines only. Predicting on the full X would
# include the rows the model was trained on and push the entire dense matrix
# through the model again for no benefit.
MNB.predict(X_test_k)

In [17]:
# Mean accuracy of the fitted classifier on the held-out 30% of the lines
Accuracy = MNB.score(X=X_test_k, y=y_test_k)
Accuracy

0.643960396039604