In [363]:
import pandas as pd
import csv
import io

In [364]:
ep_iv = pd.read_csv('SW_EpisodeIV.txt', delim_whitespace=True, header=0, escapechar='\\')
ep_v = pd.read_csv('SW_EpisodeV.txt', delim_whitespace=True, header=0, escapechar='\\')
ep_vi = pd.read_csv('SW_EpisodeVI.txt', delim_whitespace=True, header=0, escapechar='\\')

In [365]:
all_eps = (ep_iv.append(ep_v, ignore_index=True)).append(ep_vi, ignore_index=True)

In [366]:
# Create Dataframe with dialogue from just characters that have more than 48 lines
sw_highs = all_eps[all_eps.groupby('character').character.transform(len) >= 49]

In [367]:
sw_highs['character'].value_counts()

LUKE        494
HAN         459
THREEPIO    301
LEIA        227
VADER       140
BEN         115
LANDO       101
YODA         49
Name: character, dtype: int64

In [368]:
sw_highs.shape

(1886, 2)

In [439]:
sw_highs = sw_highs.sample(frac=1).reset_index(drop=True)
sw_train = sw_highs.loc[:1600]
sw_test = sw_highs.loc[1601:]

In [426]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB()),])
text_clf = text_clf.fit(sw_train.dialogue, sw_train.character)

In [427]:
import numpy as np
predicted = text_clf.predict(sw_test.dialogue)
np.mean(predicted == sw_test.character)

0.37894736842105264

In [461]:
text_clf_no_stop_words = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB()), ])
predicted_no_stop_words = text_clf_no_stop_words.fit(sw_train.dialogue, sw_train.character)
predicted_no_stop_words = text_clf_no_stop_words.predict(sw_test.dialogue)
np.mean(predicted_no_stop_words == sw_test.character)

0.3824561403508772

In [462]:
# Aggregate each line of dialogue to one character
sw_grouped_quotes_train = sw_train.groupby(['character'])['dialogue'].apply(lambda text: ''.join(text.to_string(index=False))).str.replace('(\\n)', '').reset_index()
sw_grouped_quotes_test = sw_test.groupby(['character'])['dialogue'].apply(lambda text: ''.join(text.to_string(index=False))).str.replace('(\\n)', '').reset_index()

In [441]:
# Nsw = no stop words; using Multinomial NB 
sw_grouped_nsw_pipeline = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB()), ])
sw_grouped_nsw = sw_grouped_nsw_pipeline.fit(sw_grouped_quotes_train.dialogue, sw_grouped_quotes_train.character)
sw_grouped_nsw = sw_grouped_nsw_pipeline.predict(sw_grouped_quotes_test.dialogue)
np.mean(sw_grouped_nsw == sw_grouped_quotes_test.character)

0.75

In [380]:
print(sw_grouped_nsw)

['BEN' 'HAN' 'HAN' 'HAN' 'LUKE' 'THREEPIO' 'VADER' 'BEN']


In [401]:
# Using SGD Classifier grouped character dialogue (shifted before compilation), currently best algorithm
from sklearn.linear_model import SGDClassifier
sw_grouped_nsw_pipeline_sgd = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)),])
sw_grouped_nsw_sgd = sw_grouped_nsw_pipeline_sgd.fit(sw_grouped_quotes_train.dialogue, sw_grouped_quotes_train.character)
sw_grouped_nsw_sgd = sw_grouped_nsw_pipeline_sgd.predict(sw_grouped_quotes_test.dialogue)
print('SGD Classifier Accuracy:', np.mean(sw_grouped_nsw_sgd == sw_grouped_quotes_test.character))

SGD Classifier Accuracy: 0.625


In [382]:
# Predictions
predictions = sw_grouped_nsw_sgd
actual_values = sw_grouped_quotes_test.character.tolist()
print("{0:12s} {1}".format("Prediction", "True Value"))
for x, y in zip(predictions, actual_values): 
    print("{0:12s} {1}".format(x, y))
print('SGD Classifier Accuracy:', np.mean(sw_grouped_nsw_sgd == sw_grouped_quotes_test.character))

Prediction   True Value
YODA         BEN
HAN          HAN
HAN          LANDO
HAN          LEIA
LUKE         LUKE
THREEPIO     THREEPIO
VADER        VADER
YODA         YODA
SGD Classifier Accuracy: 0.625


In [459]:
# Actual values
sw_grouped_quotes_test

sw_grouped_nsw_pipeline_sgd = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)),])
sw_grouped_nsw_sgd = sw_grouped_nsw_pipeline_sgd.fit(sw_grouped_quotes_train.dialogue, sw_grouped_quotes_train.character)
sw_grouped_nsw_sgd = sw_grouped_nsw_pipeline_sgd.predict(sw_grouped_quotes_test.dialogue)
print('SGD Classifier Accuracy:', np.mean(sw_grouped_nsw_sgd == sw_grouped_quotes_test.character))

SGD Classifier Accuracy: 0.875


In [460]:
from sklearn.linear_model import SGDClassifier
accuracy = 0
sw = all_eps[all_eps.groupby('character').character.transform(len) >= 49]
# while accuracy <= 0.9:
    
# Construct data
new_df = sw.sample(frac=1).reset_index(drop=True)
train_df = new_df.loc[:1600]
test_df = new_df.loc[1601:]
train_df_grouped = train_df.groupby(['character'])['dialogue'].apply(lambda text: ''.join(text.to_string(index=False))).str.replace('(\\n)', '').reset_index()
test_df_grouped = test_df.groupby(['character'])['dialogue'].apply(lambda text: ''.join(text.to_string(index=False))).str.replace('(\\n)', '').reset_index()

pipeline = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)),])
sgd = pipeline.fit(train_df_grouped.dialogue, train_df_grouped.character)
sgd = pipeline.predict(test_df_grouped.dialogue)
accuracy = np.mean(sw_grouped_nsw_sgd == sw_grouped_quotes_test.character)
print(accuracy)

0.875
