In [2]:
import pandas as pd
import csv
import io

In [3]:
ep_iv = pd.read_csv('SW_EpisodeIV.txt', delim_whitespace=True, header=0, escapechar='\\')
ep_v = pd.read_csv('SW_EpisodeV.txt', delim_whitespace=True, header=0, escapechar='\\')
ep_vi = pd.read_csv('SW_EpisodeVI.txt', delim_whitespace=True, header=0, escapechar='\\')

In [4]:
all_eps = (ep_iv.append(ep_v, ignore_index=True)).append(ep_vi, ignore_index=True)

In [5]:
sw_highs = all_eps[all_eps.groupby('character').character.transform(len) >= 49]

In [6]:
sw_highs['character'].value_counts()

LUKE        494
HAN         459
THREEPIO    301
LEIA        227
VADER       140
BEN         115
LANDO       101
YODA         49
Name: character, dtype: int64

In [7]:
sw_highs.shape

(1886, 2)

In [8]:
sw_highs = sw_highs.sample(frac=1).reset_index(drop=True)
sw_train = sw_highs.loc[:1600]
sw_test = sw_highs.loc[1601:]

In [9]:
sw_train['character'].value_counts()

LUKE        430
HAN         380
THREEPIO    252
LEIA        194
VADER       115
BEN         104
LANDO        85
YODA         41
Name: character, dtype: int64

In [10]:
sw_test['character'].value_counts()

HAN         79
LUKE        64
THREEPIO    49
LEIA        33
VADER       25
LANDO       16
BEN         11
YODA         8
Name: character, dtype: int64

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(sw_train.dialogue)
X_train_counts.shape

(1601, 2089)

In [12]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(1601, 2089)

In [13]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, sw_train.character)

In [14]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB()),])
text_clf = text_clf.fit(sw_train.dialogue, sw_train.character)

In [15]:
import numpy as np
predicted = text_clf.predict(sw_test.dialogue)
np.mean(predicted == sw_test.character)

0.3192982456140351

In [16]:
text_clf_no_stop_words = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB()), ])
predicted_no_stop_words = text_clf_no_stop_words.fit(sw_train.dialogue, sw_train.character)
predicted_no_stop_words = text_clf_no_stop_words.predict(sw_test.dialogue)
np.mean(predicted_no_stop_words == sw_test.character)

0.3368421052631579

In [17]:
# Aggregate each line of dialogue to one character
sw_grouped_quotes_train = sw_train.groupby(['character'])['dialogue'].apply(lambda text: ''.join(text.to_string(index=False))).str.replace('(\\n)', '').reset_index()
sw_grouped_quotes_test = sw_test.groupby(['character'])['dialogue'].apply(lambda text: ''.join(text.to_string(index=False))).str.replace('(\\n)', '').reset_index()

In [18]:
# Nsw = no stop words; using Multinomial NB 
sw_grouped_nsw_pipeline = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB()), ])
sw_grouped_nsw = sw_grouped_nsw_pipeline.fit(sw_grouped_quotes_train.dialogue, sw_grouped_quotes_train.character)
sw_grouped_nsw = sw_grouped_nsw_pipeline.predict(sw_grouped_quotes_test.dialogue)
np.mean(sw_grouped_nsw == sw_grouped_quotes_test.character)

0.875

In [19]:
print(sw_grouped_nsw)

['BEN' 'HAN' 'LEIA' 'LEIA' 'LUKE' 'THREEPIO' 'VADER' 'YODA']


In [24]:
# Using SGD Classifier grouped character dialogue (shifted before compilation), currently best algorithm
from sklearn.linear_model import SGDClassifier
sw_grouped_nsw_pipeline_sgd = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)),])
sw_grouped_nsw_sgd = sw_grouped_nsw_pipeline_sgd.fit(sw_grouped_quotes_train.dialogue, sw_grouped_quotes_train.character)
sw_grouped_nsw_sgd = sw_grouped_nsw_pipeline_sgd.predict(sw_grouped_quotes_test.dialogue)
np.mean(sw_grouped_nsw_sgd == sw_grouped_quotes_test.character)

0.875

In [34]:
# Predictions
for x in sw_grouped_nsw_sgd: print(x)

BEN
HAN
LANDO
THREEPIO
LUKE
THREEPIO
VADER
YODA


In [35]:
# Actual values
sw_grouped_quotes_test

Unnamed: 0,character,dialogue
0,BEN,"Well, most of the best freighter pilots can b..."
1,HAN,"Charges! Come on, come..."
2,LANDO,"Look, would you get going, you pir..."
3,LEIA,No! Alderaan is peaceful. We have no weapon...
4,LUKE,"All right, I'll be there in a few minutes. B..."
5,THREEPIO,Where am I? I must have taken a bad ste...
6,VADER,"This facility is crude, but it should be adeq..."
7,YODA,Strong is Vader. Mind what you have learned. ...
