In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import sklearn as sl
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
# Read the raw Shakespeare lines dataset and preview its first rows.
DATA_PATH = './data/Shakespeare_data.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"


Immediately, we can see that the Dataline column is useless to us. Also, since we are predicting which player speaks a given line, we can remove all non-speech lines (act and scene headings and stage directions).

In [2]:
# Drop the row-counter column; errors='ignore' keeps this cell re-runnable
# after the column has already been removed.
df = df.drop(columns=['Dataline'], errors='ignore')
# Keep only rows that have an ActSceneLine value. The rows without one in the
# preview above are act/scene headings and stage directions, not speech.
# .copy() yields an independent frame so the column assignment below does not
# write into a slice of the original (SettingWithCopyWarning).
df = df[df['ActSceneLine'].notna()].copy()
df['ActSceneLine'] = df['ActSceneLine'].astype(str)
# Show a preview rather than the whole frame.
df.head(10)

Unnamed: 0,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
3,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"
5,Henry IV,1.0,1.1.3,KING HENRY IV,And breathe short-winded accents of new broils
6,Henry IV,1.0,1.1.4,KING HENRY IV,To be commenced in strands afar remote.
7,Henry IV,1.0,1.1.5,KING HENRY IV,No more the thirsty entrance of this soil
8,Henry IV,1.0,1.1.6,KING HENRY IV,Shall daub her lips with her own children's bl...
9,Henry IV,1.0,1.1.7,KING HENRY IV,"Nor more shall trenching war channel her fields,"
10,Henry IV,1.0,1.1.8,KING HENRY IV,Nor bruise her flowerets with the armed hoofs
11,Henry IV,1.0,1.1.9,KING HENRY IV,"Of hostile paces: those opposed eyes,"
12,Henry IV,1.0,1.1.10,KING HENRY IV,"Which, like the meteors of a troubled heaven,"


In [3]:
# Encode the Play and Player columns as integer ids with label encoders.
# The fitted encoders are kept so encoded values can be mapped back to names.
le_play = LabelEncoder()
df['Play'] = le_play.fit_transform(df['Play'])

le_player = LabelEncoder()
# Cast to str first so the encoder only ever sees string values.
df['Player'] = le_player.fit_transform(df['Player'].astype(str))

# Split "act.scene.line" into three separate features to enable easier use by
# the classification model. The pieces come back as strings, so cast them to
# int explicitly instead of relying on the estimator to coerce object columns.
actsceneline = df['ActSceneLine'].str.split(pat='.', n=-1, expand=True)
df['Act'] = actsceneline[0].astype(int)
df['Scene'] = actsceneline[1].astype(int)
df['Line'] = actsceneline[2].astype(int)
del df['ActSceneLine']

df['PlayerLinenumber'] = df['PlayerLinenumber'].astype(int)

# Remove PlayerLine: determining the player from their words would be
# computationally expensive, so the raw text is not used as a feature.
del df['PlayerLine']

# Show a preview rather than the whole frame.
df.head(10)

Unnamed: 0,Play,PlayerLinenumber,Player,Act,Scene,Line
3,9,1,457,1,1,1
4,9,1,457,1,1,2
5,9,1,457,1,1,3
6,9,1,457,1,1,4
7,9,1,457,1,1,5
8,9,1,457,1,1,6
9,9,1,457,1,1,7
10,9,1,457,1,1,8
11,9,1,457,1,1,9
12,9,1,457,1,1,10


In [4]:
# Creating test and train split
labels = df['Player']
features = df[['PlayerLinenumber', 'Play', 'Act', 'Scene', 'Line']]
# Hold out 10% of the rows for testing. A fixed random_state makes the split,
# and therefore every score computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.10, random_state=42
)

First I will try a Random Forest Classifier

In [5]:
# Fit a small random forest on all five features. random_state is fixed so
# the fitted trees (and the reported accuracy) are the same on every run.
model = RandomForestClassifier(n_estimators=10, random_state=42)
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
# Map the encoded predictions back to player names for readability.
labeled_prediction = le_player.inverse_transform(y_predict)
labeled_prediction

array(['First Senator', 'TRANIO', 'OTHELLO', ..., 'QUEEN GERTRUDE',
       'OPHELIA', 'Surveyor'], dtype=object)

We can then test the accuracy of this prediction

In [6]:
# Decode the held-out labels to player names so they are compared against the
# decoded predictions in the same label space.
actual_players = le_player.inverse_transform(y_test)
accuracy_score(actual_players, labeled_prediction)

0.8150437428680106

This yielded over 80 percent accuracy. This is extremely good and better than we expected. We will try to use a less computationally intensive algorithm and see if we can achieve similar results.

In [7]:
# Fit a single decision tree on the same train/test split as the forest.
# random_state is fixed so the fitted tree is the same on every run.
model2 = DecisionTreeClassifier(random_state=42)
model2.fit(X_train, y_train)
y_predict2 = model2.predict(X_test)
# Decode predictions and true labels to player names before scoring.
labeled_prediction2 = le_player.inverse_transform(y_predict2)
accuracy_score(le_player.inverse_transform(y_test), labeled_prediction2)

0.7906998858881704

We can see this almost matched the accuracy of the random forest. In practice this would be a better model, as it would scale a lot better and provide almost the same accuracy. In other similar Shakespeare datasets, we might not be given the "PlayerLinenumber". I will make a couple more models to see how impactful this feature is.

In [8]:
# Ablation: drop PlayerLinenumber and keep only Play, Act, Scene and Line.
less_features = df[['Play', 'Act', 'Scene', 'Line']]
# Fixed random_state on both the split and the forest so this score is
# reproducible and not an artifact of one particular random draw.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    less_features, labels, test_size=0.10, random_state=42
)
model3 = RandomForestClassifier(n_estimators=10, random_state=42)
model3.fit(X_train2, y_train2)
y_predict3 = model3.predict(X_test2)
# Decode predictions and true labels to player names before scoring.
labeled_prediction3 = le_player.inverse_transform(y_predict3)
accuracy_score(le_player.inverse_transform(y_test2), labeled_prediction3)

0.7163370102700647

In [9]:
# Ablation: keep only Play and PlayerLinenumber (no Act, Scene or Line).
less_features2 = df[['Play', 'PlayerLinenumber']]
# Fixed random_state on both the split and the forest so this score is
# reproducible and not an artifact of one particular random draw.
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    less_features2, labels, test_size=0.10, random_state=42
)
model4 = RandomForestClassifier(n_estimators=10, random_state=42)
model4.fit(X_train3, y_train3)
y_predict4 = model4.predict(X_test3)
# Decode predictions and true labels to player names before scoring.
labeled_prediction4 = le_player.inverse_transform(y_predict4)
accuracy_score(le_player.inverse_transform(y_test3), labeled_prediction4)

0.4100418410041841

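This has shown that PlayerLinenumber helps but is not essential: dropping it lowered the random forest's accuracy from about 82% to about 72%, so we can still determine the player with over 70% accuracy given just the Play, Act, Scene, and Line. In contrast, using only the Play and PlayerLinenumber gave about 41% accuracy, so the Act, Scene, and Line features carry most of the signal. Overall this was an extremely insightful project to learn about classification algorithms.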