In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords

In [2]:
full_ws_df = pd.read_csv("./data/Shakespeare_data.csv")

In [3]:
#Drop rows that are not dialogue
full_ws_df = full_ws_df.loc[full_ws_df["ActSceneLine"].notna()]

In [4]:
full_ws_df.drop(axis=1, labels=["Dataline", "PlayerLinenumber", "Player", "ActSceneLine"], inplace=True)

In [5]:
full_ws_df["Shakespeare"] = 1

In [6]:
full_ws_df = full_ws_df.rename(columns={"PlayerLine":"Line"})

In [7]:
s_df = full_ws_df.loc[full_ws_df["Play"].isin(
    ["Othello", "King Lear", "Antony and Cleopatra", "Coriolanus"])]

In [8]:
timon_df = full_ws_df.loc[full_ws_df["Play"] == "Timon of Athens"]

In [9]:
s_df.drop(axis=1, labels="Play", inplace=True)
timon_df.drop(axis=1, labels="Play", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [10]:
m1 = pd.read_csv("./data/a_trick_to_catch_the_old_one.csv")
m2 = pd.read_csv("./data/the_phoenix.csv")
m3 = pd.read_csv("./data/the_puritan.csv")
m4 = pd.read_csv("./data/the_revengers_tragedy.csv")
m5 = pd.read_csv("./data/your_five_gallants.csv")

In [11]:
m_df = pd.concat([m1, m2, m3, m4, m5])

In [12]:
m_df["Shakespeare"] = 0

In [13]:
m_df

Unnamed: 0,Line,Shakespeare
0,"All's gone! Still thou'rt a gentleman, that's ...",0
1,He that doth his youth expose,0
2,"To brothel, drink, and danger,",0
3,Let him that is his nearest kin,0
4,Cheat him before a stranger.,0
...,...,...
1943,"To them I give my thanks; myself to thee,",0
1944,Thrice-worthy Fitsgrave!,0
1945,I have all my wishes.,0
1946,"And I presume there's none but those can frown,",0


In [14]:
df = pd.concat([s_df, m_df])

In [15]:
df

Unnamed: 0,Line,Shakespeare
18568,"Nay, but this dotage of our general's",1
18569,"O'erflows the measure: those his goodly eyes,",1
18570,That o'er the files and musters of the war,1
18571,"Have glow'd like plated Mars, now bend, now turn,",1
18572,The office and devotion of their view,1
...,...,...
1943,"To them I give my thanks; myself to thee,",0
1944,Thrice-worthy Fitsgrave!,0
1945,I have all my wishes.,0
1946,"And I presume there's none but those can frown,",0


In [16]:
df

Unnamed: 0,Line,Shakespeare
18568,"Nay, but this dotage of our general's",1
18569,"O'erflows the measure: those his goodly eyes,",1
18570,That o'er the files and musters of the war,1
18571,"Have glow'd like plated Mars, now bend, now turn,",1
18572,The office and devotion of their view,1
...,...,...
1943,"To them I give my thanks; myself to thee,",0
1944,Thrice-worthy Fitsgrave!,0
1945,I have all my wishes.,0
1946,"And I presume there's none but those can frown,",0


In [17]:
X_train, X_test, y_train, y_test = train_test_split(df["Line"], df["Shakespeare"], train_size=.9, random_state=96)

In [18]:
stoplist = stopwords.words('english')

## Modeling

In [24]:
tfidf_baseline = TfidfVectorizer(token_pattern=r"(?u)\b[A-Za-z]+\b",stop_words = stoplist)
baseline_vectorized = tfidf_baseline.fit_transform(X_train)

In [25]:
nb = MultinomialNB()

In [61]:
baseline_cv = cross_val_score(nb, baseline_vectorized, y_train)
baseline_cv.mean()

0.7662551212249893

In [62]:
nb.fit(baseline_vectorized, y_train)
baseline_train_score = nb.score(baseline_vectorized, y_train)
print(baseline_train_score)

0.8444253203289348


#### Iteration 1: Overfit Decision Tree

In [23]:
dt = DecisionTreeClassifier()

In [25]:
cv1 = cross_val_score(dt, baseline_vectorized, y_train)
cv1.mean()

0.7128030167631983

In [26]:
dt.fit(baseline_vectorized, y_train)
dt1_score = dt.score(baseline_vectorized, y_train)
print(dt1_score)

0.9812583668005355


#### Iteration 2: Random Forest

In [22]:
rf = RandomForestClassifier()

In [34]:
cv2 = cross_val_score(rf, baseline_vectorized, y_train)
cv2.mean()

0.7510992435465595

In [35]:
rf.fit(baseline_vectorized, y_train)
rf1_score = rf.score(baseline_vectorized, y_train)
print(rf1_score)

0.9812583668005355


#### Iteration 3: Modifying Vectorizer

In [20]:
tfidf_2 = TfidfVectorizer(token_pattern = r"(?u)\b[A-Za-z]+\b",
                          stop_words = stoplist,
                          ngram_range=(1, 2),
                          min_df=5,
                         )
vectorized_2 = tfidf_2.fit_transform(X_train)

In [64]:
cv3 = cross_val_score(rf, vectorized_2, y_train)
cv3.mean()

0.7404853890531145

In [65]:
rf.fit(vectorized_2, y_train)
rf2_score = rf.score(vectorized_2, y_train)
print(rf2_score)

0.9653853509275196


In [66]:
nb_cv2 = cross_val_score(nb, vectorized_2, y_train)
nb_cv2.mean()

0.7605658590790639

In [67]:
nb.fit(vectorized_2, y_train)
nb_train_score2 = nb.score(vectorized_2, y_train)
print(nb_train_score2)

0.805698986421878


#### Iteration 4: Random Forest Hyperparameters

In [26]:
grid_rf1 = {"n_estimators":[100, 200, 300],
           "criterion": ["gini", "entropy", "log_loss"],
           "max_features": ["sqrt", "auto"],
           "max_depth" : [10,20,30,None],
            "min_samples_leaf" : [1,2,4],
            "min_samples_split": [2,5,10]
            }
GS_rf1 = GridSearchCV(
    estimator=rf,
    param_grid=grid_rf1,
    verbose=2,
    n_jobs=-1)

In [27]:
# GS_rf1.fit(vectorized_2, y_train)

Fitting 5 folds for each of 648 candidates, totalling 3240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   23.0s
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:   54.9s
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 989 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 1434 tasks      | elapsed: 11.2min
[Parallel(n_jobs=-1)]: Done 1961 tasks      | elapsed: 15.8min
[Parallel(n_jobs=-1)]: Done 2568 tasks      | elapsed: 21.4min
[Parallel(n_jobs=-1)]: Done 3240 out of 3240 | elapsed: 21.6min finished


GridSearchCV(estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy', 'log_loss'],
                         'max_depth': [10, 20, 30, None],
                         'max_features': ['sqrt', 'auto'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [100, 200, 300]},
             verbose=2)

In [28]:
# GS_rf1.best_params_

{'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'n_estimators': 300}

{'criterion': 'entropy',

 'max_depth': None,
 
 'max_features': 'auto',
 
 'min_samples_leaf': 2,
 
 'min_samples_split': 5,
 
 'n_estimators': 300}

In [29]:
# GS_rf1.best_score_

0.74598373229712

0.74598373229712

In [34]:
grid_rf2 = {"n_estimators":[250, 300, 400],
           "criterion": ["entropy"],
           "max_features": ["auto"],
           "max_depth" : [None],
            "min_samples_leaf" : [2, 3],
            "min_samples_split": [3, 5, 7]
            }
GS_rf2 = GridSearchCV(
    estimator=rf,
    param_grid=grid_rf2,
    verbose=2,
    n_jobs=-1)

In [35]:
# GS_rf2.fit(vectorized_2, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   57.8s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  4.2min finished


GridSearchCV(estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['entropy'], 'max_depth': [None],
                         'max_features': ['auto'], 'min_samples_leaf': [2, 3],
                         'min_samples_split': [3, 5, 7],
                         'n_estimators': [250, 300, 400]},
             verbose=2)

In [36]:
# GS_rf2.best_params_

{'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 7,
 'n_estimators': 400}

{'criterion': 'entropy',

 'max_depth': None,
 
 'max_features': 'auto',
 
 'min_samples_leaf': 2,
 
 'min_samples_split': 7,
 
 'n_estimators': 400}

In [37]:
# GS_rf2.best_score_

0.7456011745620648

0.7456011745620648


In [38]:
grid_rf3 = {"n_estimators":[400, 700, 1000],
           "criterion": ["entropy"],
           "max_features": ["auto"],
           "max_depth" : [None],
            "min_samples_leaf" : [2],
            "min_samples_split": [6, 7, 8]
            }
GS_rf3 = GridSearchCV(
    estimator=rf,
    param_grid=grid_rf3,
    verbose=2,
    n_jobs=-1)

In [39]:
# GS_rf3.fit(vectorized_2, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  4.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  4.6min finished


GridSearchCV(estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['entropy'], 'max_depth': [None],
                         'max_features': ['auto'], 'min_samples_leaf': [2],
                         'min_samples_split': [6, 7, 8],
                         'n_estimators': [400, 700, 1000]},
             verbose=2)

In [40]:
# GS_rf3.best_params_

{'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 6,
 'n_estimators': 1000}

{'criterion': 'entropy',

 'max_depth': None,
 
 'max_features': 'auto',
 
 'min_samples_leaf': 2,
 
 'min_samples_split': 6,
 
 'n_estimators': 1000}

In [41]:
# GS_rf3.best_score_

0.7454577939753413

0.7454577939753413

After these gridsearches, our accuracy score does not seem to be improving. The