In [78]:
import pandas as pd
import csv
import io

In [79]:
ep_iv = pd.read_csv('SW_EpisodeIV.txt', delim_whitespace=True, header=0, escapechar='\\')
ep_v = pd.read_csv('SW_EpisodeV.txt', delim_whitespace=True, header=0, escapechar='\\')
ep_vi = pd.read_csv('SW_EpisodeVI.txt', delim_whitespace=True, header=0, escapechar='\\')

In [80]:
all_eps = (ep_iv.append(ep_v, ignore_index=True)).append(ep_vi, ignore_index=True)

In [81]:
sw_highs = all_eps[all_eps.groupby('character').character.transform(len) >= 49]

In [82]:
sw_highs['character'].value_counts()

LUKE        494
HAN         459
THREEPIO    301
LEIA        227
VADER       140
BEN         115
LANDO       101
YODA         49
Name: character, dtype: int64

In [83]:
sw_highs.shape

(1886, 2)

In [84]:
sw_train = sw_highs.loc[:1600]
sw_test = sw_highs.loc[1601:]

In [85]:
sw_train['character'].value_counts()

LUKE        352
HAN         307
THREEPIO    177
LEIA        136
BEN          88
VADER        64
YODA         28
LANDO         1
Name: character, dtype: int64

In [86]:
sw_test['character'].value_counts()

HAN         152
LUKE        142
THREEPIO    124
LANDO       100
LEIA         91
VADER        76
BEN          27
YODA         21
Name: character, dtype: int64

In [87]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(sw_train.dialogue)
X_train_counts.shape

(1153, 1804)

In [88]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(1153, 1804)

In [89]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, sw_train.character)

In [90]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB()),])
text_clf = text_clf.fit(sw_train.dialogue, sw_train.character)

In [91]:
import numpy as np
predicted = text_clf.predict(sw_test.dialogue)
np.mean(predicted == sw_test.character)

0.24965893587994542

In [92]:
text_clf_no_stop_words = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB()), ])
predicted_no_stop_words = text_clf_no_stop_words.fit(sw_train.dialogue, sw_train.character)
predicted_no_stop_words = text_clf_no_stop_words.predict(sw_test.dialogue)
np.mean(predicted_no_stop_words == sw_test.character)

0.2592087312414734

In [103]:
# Aggregate each line of dialogue to one character
sw_grouped_quotes_train = sw_train.groupby(['character'])['dialogue'].apply(lambda text: ''.join(text.to_string(index=False))).str.replace('(\\n)', '').reset_index()
sw_grouped_quotes_test = sw_test.groupby(['character'])['dialogue'].apply(lambda text: ''.join(text.to_string(index=False))).str.replace('(\\n)', '').reset_index()

In [121]:
# Nsw = no stop words; using Multinomial NB 
sw_grouped_nsw_pipeline = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB()), ])
sw_grouped_nsw = sw_grouped_nsw_pipeline.fit(sw_grouped_quotes_train.dialogue, sw_grouped_quotes_train.character)
sw_grouped_nsw = sw_grouped_nsw_pipeline.predict(sw_grouped_quotes_test.dialogue)
np.mean(sw_grouped_nsw == sw_grouped_quotes_test.character)

0.625

In [107]:
sw_grouped_nsw

array(['BEN', 'HAN', 'LUKE', 'HAN', 'LUKE', 'THREEPIO', 'BEN', 'YODA'],
      dtype='<U8')

In [118]:
vader = all_eps.loc[all_eps['character'].isin(['VADER'])]

In [123]:
from sklearn.linear_model import SGDClassifier
sw_grouped_nsw_pipeline_sgd = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)),])
sw_grouped_nsw_sgd = sw_grouped_nsw_pipeline_sgd.fit(sw_grouped_quotes_train.dialogue, sw_grouped_quotes_train.character)
sw_grouped_nsw_sgd = sw_grouped_nsw_pipeline_sgd.predict(sw_grouped_quotes_test.dialogue)
np.mean(sw_grouped_nsw_sgd == sw_grouped_quotes_test.character)

0.625