In [1]:
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split


In [2]:
# Read the Simpsons script lines and discard every row that contains a missing value.
df = pd.read_csv('simpsons_dataset.csv').dropna(axis=0, how='any')
# Preview the first five remaining rows.
df.head(5)

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


Elke waarde is nieuw (?). Geen enkele is hetzelfde.

In [3]:
# Keep only the lines spoken by Lisa Simpson. Bart's lines are selected in a
# separate cell and the two selections are concatenated afterwards.
lisa = df[df['raw_character_text'].eq("Lisa Simpson")]

In [4]:
# Keep only the lines spoken by Bart Simpson.
bart = df[df['raw_character_text'].eq("Bart Simpson")]

In [5]:
# Stack the two selections row-wise: all of Lisa's lines first, then Bart's.
# The original row labels from `df` are kept.
simpsons = pd.concat([lisa, bart], axis=0)
simpsons.head(5)

Unnamed: 0,raw_character_text,spoken_words
1,Lisa Simpson,Where's Mr. Bergstrom?
3,Lisa Simpson,That life is worth living.
9,Lisa Simpson,Mr. Bergstrom! Mr. Bergstrom!
11,Lisa Simpson,Do you know where I could find him?
13,Lisa Simpson,"The train, how like him... traditional, yet en..."


In [6]:
# Pull the spoken lines out as a NumPy array of unicode strings for the vectorizer.
text = simpsons.loc[:, 'spoken_words'].values.astype(str)

In [7]:
# Build a bag-of-words vocabulary from the spoken lines, dropping
# scikit-learn's built-in English stop words.
vect = CountVectorizer(stop_words='english')
vect = vect.fit(text)  # learn the vocabulary from every selected line

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use get_feature_names_out() when it exists and fall back to the old method on
# older installations. tolist() keeps `feature_names` a plain list of str, so
# the printed selection and later uses look the same on every version.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out().tolist()
else:
    feature_names = vect.get_feature_names()
print(f"There are {len(feature_names)} words in the vocabulary. A selection: {feature_names[500:520]}")

There are 14257 words in the vocabulary. A selection: ['anguished', 'angus', 'anima', 'animal', 'animals', 'animated', 'animation', 'animators', 'anka', 'ankle', 'ann', 'annapolis', 'anne', 'annie', 'anniversary', 'annnnd', 'announce', 'announcement', 'announcements', 'announcer']


In [8]:
# Encode every line as a row of word counts using the fitted vocabulary.
matrix = vect.transform(text)

# Show the top-left 500 x 500 corner. The matrix is sparse, so printing lists
# only the non-zero cells as "(row, column) count"; the zeros are not shown.
print(matrix[:500, :500])

  (14, 424)	1
  (28, 266)	1
  (31, 269)	1
  (45, 192)	1
  (46, 396)	1
  (55, 325)	1
  (59, 270)	1
  (65, 325)	1
  (81, 300)	1
  (83, 281)	1
  (92, 450)	1
  (93, 397)	1
  (97, 449)	1
  (101, 24)	1
  (101, 449)	1
  (109, 129)	1
  (110, 325)	1
  (116, 38)	1
  (116, 91)	1
  (118, 446)	1
  (120, 126)	1
  (128, 52)	1
  (128, 319)	1
  (128, 343)	1
  (130, 449)	1
  :	:
  (187, 479)	1
  (188, 479)	1
  (190, 450)	1
  (235, 264)	1
  (236, 264)	1
  (248, 471)	1
  (250, 207)	1
  (312, 80)	1
  (322, 319)	1
  (323, 162)	1
  (330, 384)	1
  (332, 161)	1
  (354, 427)	1
  (358, 325)	1
  (387, 147)	1
  (387, 288)	1
  (397, 425)	1
  (415, 397)	1
  (442, 316)	1
  (447, 170)	1
  (448, 307)	1
  (452, 170)	1
  (454, 170)	1
  (465, 222)	1
  (488, 219)	1


In [9]:
# Dense document-term table: one row per spoken line (labelled with the
# speaker), one column per vocabulary word. toarray() materialises every zero,
# so this frame is far larger in memory than the sparse matrix it comes from.
simpsons_mat = pd.DataFrame(
    matrix.toarray(),
    index=simpsons['raw_character_text'],
    columns=feature_names,
)

In [10]:
#simpsons_mat#.iloc[0:4, 1000:1015]

In [11]:
# tada = pd.concat([simpsons, simpsons_mat], sort=False)
# tada

__Exercise 2__

In [19]:
# Features: the word-count rows, each scaled to unit L2 norm.
X = normalize(simpsons_mat, norm='l2')
# Labels: the character who speaks each line.
y = simpsons['raw_character_text']
# Hold out 30% of the lines for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.3
)


In [20]:
# Train a multinomial Naive Bayes classifier on the training split.
# fit() returns the estimator itself, so the calls can be chained.
model = MultinomialNB().fit(X_train, y_train)
model

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [21]:
# Predicted speaker for every held-out line.
model.predict(X=X_test)

array(['Lisa Simpson', 'Lisa Simpson', 'Bart Simpson', ...,
       'Bart Simpson', 'Bart Simpson', 'Bart Simpson'], dtype='<U12')

In [22]:
# Mean accuracy of the classifier on the held-out split.
accuracy = model.score(X=X_test, y=y_test)

In [23]:
# Report the held-out accuracy computed in the previous cell.
print(f'The accuracy is: {accuracy}' )

The accuracy is: 0.6385068762278978


In [24]:
# Attach the model's prediction for every line as a new 'class' column.
# X contains the training rows as well as the test rows, so agreement in this
# column is not a held-out accuracy measure.
simpsons = simpsons.assign(**{'class': model.predict(X)})

In [27]:
# Inspect the first 500 lines next to their predicted speaker.
simpsons.iloc[:500]

Unnamed: 0,raw_character_text,spoken_words,class
1,Lisa Simpson,Where's Mr. Bergstrom?,Lisa Simpson
3,Lisa Simpson,That life is worth living.,Bart Simpson
9,Lisa Simpson,Mr. Bergstrom! Mr. Bergstrom!,Lisa Simpson
11,Lisa Simpson,Do you know where I could find him?,Bart Simpson
13,Lisa Simpson,"The train, how like him... traditional, yet en...",Lisa Simpson
15,Lisa Simpson,"I see he touched you, too.",Lisa Simpson
37,Lisa Simpson,"Mr. Bergstrom! Hey, Mr. Bergstrom!",Lisa Simpson
39,Lisa Simpson,"Hey, Lisa, indeed.",Bart Simpson
41,Lisa Simpson,"Oh, I mean, were you just going to leave, just...",Bart Simpson
43,Lisa Simpson,You can't go! You're the best teacher I'll eve...,Bart Simpson
