# Star Wars ML Model using SGD Classifier

In [12]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import pandas as pd
import numpy as np

In [13]:
# Load in data
data = pd.read_csv('sw_ml_optimal_data.csv')

In [14]:
# Print first 15 entries
data.head(n=15)

Unnamed: 0,character,dialogue
0,YODA,"I do, yes, I do! Sick have I become. Old and ..."
1,LANDO,Han! Chewie?
2,LUKE,Stay tight and low.
3,LANDO,"We're on our way. Red Group, Gold Group, all f..."
4,HAN,"I do, I really do. You could be a little nicer..."
5,HAN,Wonderful girl! Either I'm going to kill her ...
6,YODA,No! Try not. Do. Or do not. There is no try.
7,LUKE,Ask me again sometime.
8,LANDO,Leia! Go!
9,BEN,"Oh, he's not dead, no... not yet."


In [15]:
# Organize data
train_data = data.loc[:1600]
test_data = data.loc[1601:]
train = train_data.groupby(['character'])['dialogue'].apply(lambda text: ''.join(text.to_string(index=False))).str.replace('(\\n)', '').reset_index()
test = test_data.groupby(['character'])['dialogue'].apply(lambda text: ''.join(text.to_string(index=False))).str.replace('(\\n)', '').reset_index()

In [16]:
# Print training data
train

Unnamed: 0,character,dialogue
0,BEN,"Oh, he's not dead, no... not ..."
1,HAN,"I do, I really do. You could be a little nice..."
2,LANDO,Han! Che...
3,LEIA,"And then you're as good as gone, aren't ..."
4,LUKE,Stay tight and ...
5,THREEPIO,"Oh, uh, yes, uh, I am here, Your Worshipfulne..."
6,VADER,No. I am your fat...
7,YODA,"I do, yes, I do! Sick have I become. Old and..."


In [17]:
# Print test data
test

Unnamed: 0,character,dialogue
0,BEN,Should I h...
1,HAN,"Well, there's a price on my head. If I don't ..."
2,LANDO,Hold it! W...
3,LEIA,And what precisely am I supposed to k...
4,LUKE,"Hey, easy with thos..."
5,THREEPIO,"Oh, I'm terribly sorry. I...I didn't mean to ..."
6,VADER,I sense something... a presence I haven't fel...
7,YODA,"Unexpected this is, and unfortuna..."


In [18]:
# Define model pipeline and fit data
pipeline = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)),])
sgd = pipeline.fit(train.dialogue, train.character)

In [19]:
# Predict test data
sgd = pipeline.predict(test.dialogue)
accuracy = np.mean(sgd == test.character)

In [20]:
# Predictions
predictions = sgd
actual_values = test.character
print("{0:20s} {1}".format("Prediction", "True Value"))
for x, y in zip(predictions, actual_values): 
    print("{0:20s} {1}".format(x, y))
print('\nSGD Classifier Accuracy: {x}%'.format(x = accuracy*100))

Prediction           True Value
BEN                  BEN
HAN                  HAN
LANDO                LANDO
LEIA                 LEIA
LUKE                 LUKE
THREEPIO             THREEPIO
VADER                VADER
YODA                 YODA

SGD Classifier Accuracy: 100.0%


In [21]:
train.to_csv('sw_model_train_data.csv')
test.to_csv('sw_model_test_data.csv')