In [1]:
import pandas as pd
import csv
import io

In [2]:
ep_iv = pd.read_csv('SW_EpisodeIV.txt', delim_whitespace=True, header=0, escapechar='\\')
ep_v = pd.read_csv('SW_EpisodeV.txt', delim_whitespace=True, header=0, escapechar='\\')
ep_vi = pd.read_csv('SW_EpisodeVI.txt', delim_whitespace=True, header=0, escapechar='\\')

In [3]:
all_eps = (ep_iv.append(ep_v, ignore_index=True)).append(ep_vi, ignore_index=True)

In [4]:
sw_highs = all_eps[all_eps.groupby('character').character.transform(len) >= 49]

In [5]:
sw_highs['character'].value_counts()

LUKE        494
HAN         459
THREEPIO    301
LEIA        227
VADER       140
BEN         115
LANDO       101
YODA         49
Name: character, dtype: int64

In [6]:
sw_highs.shape

(1886, 2)

In [24]:
sw_highs = sw_highs.sample(frac=1).reset_index(drop=True)
sw_train = sw_highs.loc[:1600]
sw_test = sw_highs.loc[1601:]

In [25]:
sw_train['character'].value_counts()

LUKE        409
HAN         400
THREEPIO    261
LEIA        193
VADER       117
BEN          97
LANDO        84
YODA         40
Name: character, dtype: int64

In [26]:
sw_test['character'].value_counts()

LUKE        85
HAN         59
THREEPIO    40
LEIA        34
VADER       23
BEN         18
LANDO       17
YODA         9
Name: character, dtype: int64

In [27]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(sw_train.dialogue)
X_train_counts.shape

(1601, 2116)

In [28]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(1601, 2116)

In [29]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, sw_train.character)

In [30]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB()),])
text_clf = text_clf.fit(sw_train.dialogue, sw_train.character)

In [31]:
import numpy as np
predicted = text_clf.predict(sw_test.dialogue)
np.mean(predicted == sw_test.character)

0.3929824561403509

In [32]:
text_clf_no_stop_words = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB()), ])
predicted_no_stop_words = text_clf_no_stop_words.fit(sw_train.dialogue, sw_train.character)
predicted_no_stop_words = text_clf_no_stop_words.predict(sw_test.dialogue)
np.mean(predicted_no_stop_words == sw_test.character)

0.40350877192982454

In [33]:
# Aggregate each line of dialogue to one character
sw_grouped_quotes_train = sw_train.groupby(['character'])['dialogue'].apply(lambda text: ''.join(text.to_string(index=False))).str.replace('(\\n)', '').reset_index()
sw_grouped_quotes_test = sw_test.groupby(['character'])['dialogue'].apply(lambda text: ''.join(text.to_string(index=False))).str.replace('(\\n)', '').reset_index()

In [34]:
# Nsw = no stop words; using Multinomial NB 
sw_grouped_nsw_pipeline = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB()), ])
sw_grouped_nsw = sw_grouped_nsw_pipeline.fit(sw_grouped_quotes_train.dialogue, sw_grouped_quotes_train.character)
sw_grouped_nsw = sw_grouped_nsw_pipeline.predict(sw_grouped_quotes_test.dialogue)
np.mean(sw_grouped_nsw == sw_grouped_quotes_test.character)

0.875

In [35]:
sw_grouped_nsw

array(['BEN', 'HAN', 'LUKE', 'LEIA', 'LUKE', 'THREEPIO', 'VADER', 'YODA'],
      dtype='<U8')

In [36]:
vader = all_eps.loc[all_eps['character'].isin(['VADER'])]

In [41]:
# Using SGD Classifier grouped character dialogue (shifted before compilation)
from sklearn.linear_model import SGDClassifier
sw_grouped_nsw_pipeline_sgd = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)),])
sw_grouped_nsw_sgd = sw_grouped_nsw_pipeline_sgd.fit(sw_grouped_quotes_train.dialogue, sw_grouped_quotes_train.character)
sw_grouped_nsw_sgd = sw_grouped_nsw_pipeline_sgd.predict(sw_grouped_quotes_test.dialogue)
np.mean(sw_grouped_nsw_sgd == sw_grouped_quotes_test.character)

0.875

In [39]:
print(sw_grouped_nsw_sgd)

['BEN' 'HAN' 'LUKE' 'LEIA' 'LUKE' 'THREEPIO' 'VADER' 'YODA']


In [40]:
sw_grouped_quotes_test

Unnamed: 0,character,dialogue
0,BEN,Plug in. He should be able to interpret the ...
1,HAN,And hope they don't have blast...
2,LANDO,"All right, old buddy. You know, I know what s..."
3,LEIA,I don't know what you're talking about. I'm ...
4,LUKE,"They're going to execute her. Look, a few mi..."
5,THREEPIO,We're com...
6,VADER,If you only knew the power of the dark side. ...
7,YODA,"Yes, yes. To Obi-Wan you listen. The cave. Re..."
