In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.dummy import DummyClassifier

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.backend import clear_session
from tensorflow.keras.layers import Dropout
from tensorflow.keras.regularizers import l2

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.decomposition import PCA

from skopt.space import Integer, Real, Categorical
from skopt import BayesSearchCV
from scipy.stats import uniform, loguniform, randint

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.preprocessing import StandardScaler
import pickle

In [2]:
char_df = pd.read_csv('../data/csv/ShakespeareCharacterLines_character_corpus.csv', index_col = ['play', 'name'])

In [3]:
type_df = pd.read_csv('../data/csv/ShakespeareCharacterLines_play_corpus.csv', index_col = ['play'])

In [4]:
colon = slice(None)

- tragedy/comedy column for predictions, MNB, look for other things that work well on language data.

In [7]:
char_xlist = char_df.columns.tolist()
char_xlist.remove('character_dies')

char_X = char_df[char_xlist]

char_y = char_df['character_dies']

char_X_train, char_X_test, char_y_train, char_y_test = train_test_split(char_X, char_y, random_state = 42, stratify = char_y)

sc = StandardScaler()

char_Xs_train = sc.fit_transform(char_X_train)
char_Xs_test = sc.transform(char_X_test)
char_Xs = sc.transform(char_X)

MemoryError: Unable to allocate 14.1 GiB for an array with shape (22134, 85721) and data type int64

- Move PCA here with images.

- MNBchar

In [None]:
char_mnb_pipe = MultinomialNB(fit_prior = False)

char_mnb_params = {
    'alpha': uniform(0, 1)
}

char_mnb_rs = RandomizedSearchCV(estimator = char_mnb_pipe,
                     param_distributions = char_mnb_params,
                     scoring = 'roc_auc',
                     n_iter = 50,
                     n_jobs = 8,
                     cv = 5,
                     refit = True,
                     random_state=42)

In [None]:
%%time
char_mnb_rs.fit(char_X_train, char_y_train)

In [None]:
train_preds = char_mnb_rs.best_estimator_.predict(char_X_train)
test_preds = char_mnb_rs.best_estimator_.predict(char_X_test)

metrics.roc_auc_score(char_y_train, train_preds), metrics.roc_auc_score(char_y_test, test_preds)

In [None]:
weights = {key: value for key, value in zip(char_X.columns.tolist(), char_mnb_rs.best_estimator_.feature_log_prob_[0])}
weights = {k: v for k, v in sorted(weights.items(), key=lambda item: item[1], reverse = True)}

In [None]:
weights

In [None]:
plot_confusion_matrix(char_mnb_rs.best_estimator_, char_X_train, char_y_train)

- Baseline accuracy.

In [None]:
char_df['character_dies'].value_counts(normalize = True)

- Char logreg pipe.

In [None]:
logreg_pipe = Pipeline([
    ('sc', StandardScaler()),
    ('logreg', LogisticRegression(random_state = 42, solver = 'liblinear', penalty = 'l1'))
])

logreg_params = {
    'logreg__tol': uniform(0, .1),
    'logreg__C': loguniform(0.0001, 100),
    'logreg__class_weight': Categorical(['balanced', None]),
    'logreg__max_iter': randint(1, 1000),
    'logreg__l1_ratio': uniform(0, 1)
}

logreg_rs_rocauc = RandomizedSearchCV(estimator = logreg_pipe,
                     param_distributions = logreg_params,
                     scoring = 'roc_auc',
                     n_iter = 50,
                     n_jobs = 8,
                     cv = 5,
                     refit = True,
                     random_state=42)

In [None]:
%%time
logreg_rs_rocauc.fit(X_train, y_train)

In [None]:
train_preds = logreg_rs_rocauc.best_estimator_['logreg'].predict(Xs_train)
test_preds = logreg_rs_rocauc.best_estimator_['logreg'].predict(Xs_test)

metrics.roc_auc_score(y_train, train_preds), metrics.roc_auc_score(y_test, test_preds)

In [None]:
weights = {key: value for key, value in zip(x_list, logreg_rs_rocauc.best_estimator_['logreg'].coef_[0])}
weights = {k: v for k, v in sorted(weights.items(), key=lambda item: item[1], reverse = True)}

In [None]:
weights

In [None]:
plot_confusion_matrix(logreg_rs_rocauc.best_estimator_['logreg'], Xs_train, y_train)

- Char NN with PCA.

In [None]:
pca = PCA(n_components = 1000, random_state = 42)
char_Z_train = pca.fit_transform(char_Xs_train)
char_Z_test = pca.transform(char_Xs_test)
char_Z = pca.transform(char_Xs)

In [None]:
%%time
clear_session()
model = Sequential()

model.add(Dense(30, activation = 'relu', input_shape = (1000,)))
model.add(Dense(30, activation = 'relu'))
model.add(Dense(1, activation = 'sigmoid'))

model.compile(loss = 'binary_crossentropy', optimizer = 'adam')

history = model.fit(char_Z_train, char_y_train, validation_data = (char_Z_test, char_y_test), batch_size = 512, 
                   epochs = 20, verbose = 0)

In [None]:
plt.plot(history.epoch, history.history['loss'], c='g');      # green - training loss # Loss
plt.plot(history.epoch, history.history['val_loss'], c='b');  # blue - test loss # Val loss

- char_df predictions block

In [None]:
char_pred_df = pd.DataFrame(index = char_df.index)

In [None]:
char_pred_df['prob'] = [val[1] for val in char_mnb_rs.best_estimator_.predict_proba(char_X)]

In [None]:
char_pred_df['pred'] = char_pred_df['prob'].map(lambda x: 1 if x >= .5 else 0)

In [None]:
for play, name in index_without_number:
    char_pred_df.loc[(play, name), 'character_dies'] = char_df.loc[(play, name), 'character_dies']

In [None]:
char_pred_df.sort_values(by = 'prob', inplace = True)

In [None]:
plot_confusion_matrix(char_mnb_rs.best_estimator_, char_X, char_y)

In [None]:
char_pred_df.loc[('titus-andronicus', colon), :]