In [None]:
# try:
#     %tensorflow_version 2.x
# except Exception:
#     pass

In [23]:
SETUP = True

In [24]:
if SETUP:
    !pip install -q -U toai
    !pip install -q -U nb_black
    !pip install -q -U tensorflow-datasets
    !pip install -q -U --no-deps tensorflow-addons~=0.6
    !pip install -q -U tensorflow_hub
    !pip install -q -U git+https://github.com/huggingface/transformers
    print(__import__("toai").__version__)
    print(__import__("tensorflow").__version__)

0.3.7
2.0.0


In [25]:
# %load_ext nb_black

In [26]:
import os

# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [27]:
from toai.imports import *
from toai.data import DataBundle, DataParams, DataContainer
from toai.metrics import sparse_top_2_categorical_accuracy
from toai.utils import save_file, load_file
from toai.models import save_keras_model, load_keras_model
import tensorflow as tf
from tensorflow import keras
import tensorflow_addons as tfa
import tensorflow_datasets as tfds
import tensorflow_hub as hub
import transformers

In [28]:
# Directory the script CSV is read from.
# NOTE(review): the path looks like a Kaggle input mount; there the directory
# presumably already exists and mkdir is a no-op. If it is absent, mkdir creates
# an empty directory and the later read_csv fails instead. Confirm this is intended.
DATA_DIR = Path("/kaggle/input/game-of-thrones-script-all-seasons")
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Working directory (relative to the current working directory) for artefacts
# produced by this notebook: the pickled label map and the TensorBoard logs.
TEMP_DIR = Path("game-of-thrones-script-all-seasons")
TEMP_DIR.mkdir(parents=True, exist_ok=True)

In [29]:
BATCH_SIZE = 16
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [30]:
game_of_thrones_data = pd.read_csv(DATA_DIR / "Game_of_Thrones_Script.csv", low_memory=False)

In [31]:
# kaggle datasets download -d albenft/game-of-thrones-script-all-seasons

In [32]:
game_of_thrones_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23911 entries, 0 to 23910
Data columns (total 6 columns):
Release Date     23911 non-null object
Season           23911 non-null object
Episode          23911 non-null object
Episode Title    23911 non-null object
Name             23908 non-null object
Sentence         23911 non-null object
dtypes: object(6)
memory usage: 1.1+ MB


In [33]:
game_of_thrones_data.describe(include="all")

Unnamed: 0,Release Date,Season,Episode,Episode Title,Name,Sentence
count,23911,23911,23911,23911,23908,23911
unique,73,8,10,73,564,22300
top,2017-08-13,Season 2,Episode 5,Eastwatch,tyrion lannister,No.
freq,505,3914,3083,505,1760,103


In [34]:
game_of_thrones_data.head()

Unnamed: 0,Release Date,Season,Episode,Episode Title,Name,Sentence
0,2011-04-17,Season 1,Episode 1,Winter is Coming,waymar royce,What do you expect? They're savages. One lot s...
1,2011-04-17,Season 1,Episode 1,Winter is Coming,will,I've never seen wildlings do a thing like this...
2,2011-04-17,Season 1,Episode 1,Winter is Coming,waymar royce,How close did you get?
3,2011-04-17,Season 1,Episode 1,Winter is Coming,will,Close as any man would.
4,2011-04-17,Season 1,Episode 1,Winter is Coming,gared,We should head back to the wall.


In [35]:
name_slice = slice(10, 15)

In [36]:
game_of_thrones_data["Name"].value_counts()[name_slice]

sam                399
bran stark         399
bronn              393
man                381
tywin lannister    381
Name: Name, dtype: int64

In [37]:
# tyrion lannister      1760
# jon snow              1133
# daenerys targaryen    1048
# cersei lannister      1005
# jaime lannister        945
# Name: Name, dtype: int64

In [38]:
def keep_values(df, col_name, values):
    """Return the rows of ``df`` whose ``col_name`` entry is one of ``values``.

    Args:
        df: DataFrame to filter; it is not modified.
        col_name: Name of the column to test.
        values: Collection of accepted values for that column.

    Returns:
        A new DataFrame holding only the matching rows, re-indexed from 0.
    """
    row_mask = df[col_name].isin(values)
    kept_rows = df[row_mask]
    return kept_rows.reset_index(drop=True)

In [39]:
# Keep only the lines spoken by the characters whose rank (by number of lines)
# falls inside `name_slice`. Reusing `name_slice` instead of repeating the
# literal 10:15 keeps this selection in sync with the ranking preview above,
# and `.iloc` makes it explicit that the slice is positional.
df = keep_values(
    game_of_thrones_data,
    "Name",
    game_of_thrones_data["Name"].value_counts().iloc[name_slice].index,
)

In [40]:
# df = game_of_thrones_data

In [41]:
df["Name"].value_counts()

bran stark         399
sam                399
bronn              393
man                381
tywin lannister    381
Name: Name, dtype: int64

In [42]:
data_bundle = DataBundle.from_dataframe(dataframe=df, x_col="Sentence", y_col="Name")

In [43]:
train_bundle, val_bundle, test_bundle = DataBundle.split(
    data_bundle=data_bundle, fracs=[0.8, 0.1, 0.1], random=True
)

In [44]:
len(train_bundle), len(val_bundle), len(test_bundle)

(1563, 196, 194)

In [45]:
def init_label_map(filename, data_bundle):
    """Load the label map stored at ``filename``, creating it on first use.

    Args:
        filename: Path of the persisted label map.
        data_bundle: Bundle used to build the label map when loading fails;
            only its ``make_label_map()`` method is called.

    Returns:
        The loaded label map, or a freshly built one (which is also saved to
        ``filename``) when loading raised an error.
    """
    try:
        return load_file(filename)
    except Exception:
        # Deliberately broad: any load failure (missing or unreadable file)
        # falls back to rebuilding. Unlike a bare `except:`, this still lets
        # KeyboardInterrupt and SystemExit propagate.
        label_map = data_bundle.make_label_map()
        save_file(label_map, filename)
        return label_map

In [46]:
# Build the label map from the full bundle rather than the validation split:
# the validation split is the smallest one and is not guaranteed to contain
# every class, which would leave some train/test labels without an id.
# NOTE(review): an already-cached pickle at this path is loaded as-is; bump the
# file name (or delete the file) if the set of classes changes.
label_map = init_label_map(TEMP_DIR / "label_map_v11.pickle", data_bundle)

In [47]:
label_map

{'bran stark': 0, 'bronn': 1, 'man': 2, 'sam': 3, 'tywin lannister': 4}

In [48]:
train_bundle.apply_label_map(label_map)

In [49]:
val_bundle

<toai.data.DataBundle.DataBundle at 0x7fadf1cd4438>

In [50]:
val_bundle.apply_label_map(label_map)

In [51]:
test_bundle.apply_label_map(label_map)

In [52]:
# "Balanced" (inverse-frequency) weight for every encoded class present in the
# training labels.
# * `classes` / `y` are passed by keyword because newer scikit-learn releases
#   reject them as positional arguments; older releases accept the keywords too.
# * Weights are keyed by the actual class id (not by enumeration position), so
#   the mapping stays correct even if some id is absent from the training split.
# NOTE(review): the training bundle is also resampled to equal class sizes in a
# later cell; applying these weights on top may over-correct. Confirm intent.
_train_classes = np.unique(train_bundle.y)
class_weights = {
    int(class_id): float(weight)
    for class_id, weight in zip(
        _train_classes,
        sk.utils.class_weight.compute_class_weight(
            class_weight="balanced", classes=_train_classes, y=train_bundle.y
        ),
    )
}

In [53]:
class_weights

{0: 1.0316831683168317,
 1: 1.005144694533762,
 2: 0.9799373040752352,
 3: 0.9387387387387387,
 4: 1.0525252525252524}

In [54]:
# Rethink this
train_bundle = DataBundle.from_unbalanced(
    train_bundle, 400, train_bundle.value_counts()
)

In [55]:
train_bundle.value_counts()

{0: 400, 1: 400, 2: 400, 3: 400, 4: 400}

In [56]:
tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-cased")

100%|██████████| 213450/213450 [00:00<00:00, 1000594.80B/s]


In [57]:
def tokenize_and_pad(arr, tokenizer, sequence_length):
    """Encode every text in ``arr`` into exactly ``sequence_length`` token ids.

    Each text is encoded with special tokens added, right-padded with the
    tokenizer's pad id and then cut to ``sequence_length``. Longer encodings
    are therefore truncated on the right; shorter ones are padded.

    Args:
        arr: Iterable of texts to encode.
        tokenizer: Object exposing ``encode(text, add_special_tokens=...)`` and
            ``pad_token_id``.
        sequence_length: Number of ids kept per text.

    Returns:
        A NumPy array with one fixed-length row of token ids per input text.
    """

    def _encode_fixed(text):
        token_ids = tokenizer.encode(text, add_special_tokens=True)
        # Over-pad by a full `sequence_length`, then slice: the result always
        # has the target length without branching on the encoded length.
        padded = np.pad(
            token_ids,
            (0, sequence_length),
            "constant",
            constant_values=tokenizer.pad_token_id,
        )
        return padded[:sequence_length]

    return np.array([_encode_fixed(text) for text in arr])

In [58]:
# Fixed number of token ids kept per sentence; shared by all three splits so
# the model always sees inputs of the same width.
SEQUENCE_LENGTH = 64

# NOTE: each line overwrites `.x` in place (raw sentences -> token-id arrays),
# so this cell is not idempotent: re-running it would feed token ids back into
# the tokenizer. Re-create the bundles before running it again.
train_bundle.x = tokenize_and_pad(train_bundle.x, tokenizer, SEQUENCE_LENGTH)
val_bundle.x = tokenize_and_pad(val_bundle.x, tokenizer, SEQUENCE_LENGTH)
test_bundle.x = tokenize_and_pad(test_bundle.x, tokenizer, SEQUENCE_LENGTH)

In [59]:
def make_bert_dataset(data_bundle, tokenizer):
    """Build an unbatched ``tf.data.Dataset`` of BERT inputs and labels.

    Args:
        data_bundle: Object whose ``x`` holds the token-id rows and whose ``y``
            holds the matching labels.
        tokenizer: Tokenizer used to produce ``x``; only ``pad_token_id`` is
            read, to tell real tokens from padding.

    Returns:
        A dataset yielding ``(features, label)`` pairs, where ``features`` is a
        dict with ``input_ids``, ``attention_mask`` (1 for real tokens, 0 for
        padding, int32) and ``token_type_ids`` (all zeros).
    """
    pad_token_id = tokenizer.pad_token_id

    def _to_bert_inputs(input_ids, label):
        # Explicit TF ops instead of Python `int(...)`: the builtin only works
        # on symbolic tensors when AutoGraph rewrites the call, so the mapping
        # breaks as soon as AutoGraph cannot convert it.
        attention_mask = tf.cast(tf.not_equal(input_ids, pad_token_id), tf.int32)
        return (
            {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "token_type_ids": tf.zeros_like(input_ids),
            },
            label,
        )

    features = tf.data.Dataset.from_tensor_slices(data_bundle.x)
    labels = tf.data.Dataset.from_tensor_slices(data_bundle.y)
    return tf.data.Dataset.zip((features, labels)).map(_to_bert_inputs)

In [60]:
base_bert_dataset = make_bert_dataset(train_bundle, tokenizer)

In [61]:
# Training pipeline: cache the mapped examples, shuffle, repeat, batch.
# Shuffling happens *before* repeat() so that every pass over the data is a
# full permutation of the training set; with repeat() first, the shuffle buffer
# mixes examples from neighbouring epochs and a single epoch can contain some
# examples twice and others not at all.
train_bert_dataset = (
    base_bert_dataset.cache()
    .shuffle(len(train_bundle))
    .repeat()
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
)

In [62]:
validation_bert_dataset = (
    make_bert_dataset(val_bundle, tokenizer).batch(BATCH_SIZE).prefetch(AUTOTUNE)
)

In [63]:
test_bert_dataset = (
    make_bert_dataset(test_bundle, tokenizer).batch(BATCH_SIZE).prefetch(AUTOTUNE)
)

In [64]:
data_container = DataContainer(
    base=base_bert_dataset,
    train=train_bert_dataset,
    train_steps=len(train_bundle) // BATCH_SIZE,
    validation=validation_bert_dataset,
    test=test_bert_dataset,
    label_map=label_map,
)

In [65]:
data_container.n_classes

5

In [66]:
def _fit_phase(
    model, data_container, optimizer, epochs, callbacks, class_weights, verbose
):
    """Compile ``model`` with ``optimizer`` and fit it for one training phase."""
    model.compile(
        # The model emits raw logits, hence from_logits=True.
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=optimizer,
        metrics=[
            keras.metrics.sparse_categorical_accuracy,
            sparse_top_2_categorical_accuracy,
        ],
    )
    model.fit(
        data_container.train,
        steps_per_epoch=data_container.train_steps,
        validation_data=data_container.validation,
        epochs=epochs,
        callbacks=callbacks,
        class_weight=class_weights,
        verbose=verbose,
    )


def train_model(
    model,
    data_container,
    epochs,
    lrs=None,
    optimizers=None,
    patience=5,
    verbose=1,
    class_weights=None,
    log_dir=str(TEMP_DIR / "logs"),
):
    """Train ``model`` in two phases: frozen first layer, then fully trainable.

    Phase 1 freezes ``model.layers[0]`` and trains the remaining layers for
    ``epochs[0]`` epochs with ``optimizers[0]``. Phase 2 unfreezes that layer
    and trains everything for ``epochs[1]`` epochs with ``optimizers[1]``,
    additionally logging to TensorBoard. The model is recompiled before each
    phase so the changed ``trainable`` flag takes effect.

    Args:
        model: Keras model to train in place.
        data_container: Provides ``train``, ``train_steps`` and ``validation``.
        epochs: Two-element sequence with the epoch budget of each phase.
        lrs: Two learning rates used to create Adam optimizers when
            ``optimizers`` is not given.
        optimizers: Two ready-made optimizers, one per phase; takes precedence
            over ``lrs``.
        patience: Early-stopping patience of phase 2; its learning-rate
            reduction uses ``patience // 2``. Phase 1 uses fixed patiences of
            2 (early stopping) and 1 (learning-rate reduction).
        verbose: Verbosity forwarded to ``model.fit``.
        class_weights: Optional class-weight mapping forwarded to ``model.fit``.
        log_dir: TensorBoard log directory for phase 2.

    Raises:
        ValueError: If neither ``lrs`` nor ``optimizers`` is given, or if
            ``epochs`` / the optimizers do not cover both phases.
    """
    # Validate up front so a bad call fails before any training time is spent.
    if optimizers is None:
        if lrs is None:
            raise ValueError("train_model requires either `lrs` or `optimizers`.")
        optimizers = [keras.optimizers.Adam(lr) for lr in lrs]
    if len(epochs) < 2 or len(optimizers) < 2:
        raise ValueError(
            "train_model needs two entries (one per phase) in `epochs` and in "
            "`lrs`/`optimizers`."
        )

    # Phase 1: train only the layers after the frozen first layer.
    model.layers[0].trainable = False
    _fit_phase(
        model,
        data_container,
        optimizer=optimizers[0],
        epochs=epochs[0],
        callbacks=[
            keras.callbacks.ReduceLROnPlateau(patience=1, factor=0.3),
            keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True),
        ],
        class_weights=class_weights,
        verbose=verbose,
    )

    # Phase 2: fine-tune the whole model.
    model.layers[0].trainable = True
    _fit_phase(
        model,
        data_container,
        optimizer=optimizers[1],
        epochs=epochs[1],
        callbacks=[
            keras.callbacks.ReduceLROnPlateau(patience=patience // 2, factor=0.3),
            keras.callbacks.EarlyStopping(patience=patience, restore_best_weights=True),
            keras.callbacks.TensorBoard(log_dir=log_dir),
        ],
        class_weights=class_weights,
        verbose=verbose,
    )

In [67]:
config = transformers.BertConfig.from_pretrained(
    "bert-base-cased", num_labels=data_container.n_classes
)

100%|██████████| 313/313 [00:00<00:00, 145255.27B/s]


In [68]:
class TFBertForSequenceClassification(transformers.TFBertPreTrainedModel):
    """BERT encoder with a one-hidden-layer classification head.

    The head takes the second element of the encoder outputs (named the pooled
    output here) and applies dropout, a 1024-unit ReLU dense layer, dropout
    again, and a final dense layer producing ``config.num_labels`` logits.
    """

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        keras_layers = tf.keras.layers
        dropout_rate = config.hidden_dropout_prob
        classifier_init = transformers.modeling_tf_utils.get_initializer(
            config.initializer_range
        )

        # Layers are created in the same order as they are applied in `call`.
        self.bert = transformers.TFBertMainLayer(config, name="bert")
        self.dropout1 = keras_layers.Dropout(dropout_rate)
        self.hidden1 = keras_layers.Dense(1024, activation=tf.keras.activations.relu)
        self.dropout2 = keras_layers.Dropout(dropout_rate)
        self.classifier = keras_layers.Dense(
            config.num_labels,
            kernel_initializer=classifier_init,
            name="classifier",
        )

    def call(self, inputs, **kwargs):
        """Return a one-element tuple holding the classification logits."""
        # Dropout is only active when the caller passes training=True.
        is_training = kwargs.get("training", False)

        encoder_outputs = self.bert(inputs, **kwargs)

        features = self.dropout1(encoder_outputs[1], training=is_training)
        features = self.hidden1(features)
        features = self.dropout2(features, training=is_training)

        return (self.classifier(features),)

In [69]:
model = transformers.TFBertForSequenceClassification.from_pretrained(
    "bert-base-cased", config=config
)

100%|██████████| 526681800/526681800 [00:17<00:00, 29707282.37B/s]


In [70]:
# model = TFBertForSequenceClassification.from_pretrained(
#     "bert-base-cased", config=config
# )

In [71]:
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  108310272 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  3845      
Total params: 108,314,117
Trainable params: 108,314,117
Non-trainable params: 0
_________________________________________________________________


In [72]:
# Model: "tf_bert_for_sequence_classification_1"
# _________________________________________________________________
# Layer (type)                 Output Shape              Param #   
# =================================================================
# bert (TFBertMainLayer)       multiple                  108310272 
# _________________________________________________________________
# dropout_76 (Dropout)         multiple                  0         
# _________________________________________________________________
# dense_1 (Dense)              multiple                  787456    
# _________________________________________________________________
# dropout_77 (Dropout)         multiple                  0         
# _________________________________________________________________
# classifier (Dense)           multiple                  5125      
# =================================================================
# Total params: 109,102,853
# Trainable params: 109,102,853
# Non-trainable params: 0
# _____________________________

In [73]:
# Train for 281 steps, validate for 37 steps
# Train for 281 steps, validate for 37 steps
# Epoch 1/24
#   1/281 [..............................] - ETA: 1:40:53 - loss: 1.5747 - sparse_categorical_accuracy: 0.1250 - sparse_top_2_categorical_accuracy: 0.5625WARNING:tensorflow:Method (on_train_batch_end) is slow compared to the batch update (8.315654). Check your callbacks.
# 281/281 [==============================] - 217s 772ms/step - loss: 1.6096 - sparse_categorical_accuracy: 0.2186 - sparse_top_2_categorical_accuracy: 0.4433 - val_loss: 1.6321 - val_sparse_categorical_accuracy: 0.2169 - val_sparse_top_2_categorical_accuracy: 0.3881
# Epoch 2/24
# 281/281 [==============================] - 175s 622ms/step - loss: 1.6024 - sparse_categorical_accuracy: 0.2304 - sparse_top_2_categorical_accuracy: 0.4577 - val_loss: 1.6255 - val_sparse_categorical_accuracy: 0.2186 - val_sparse_top_2_categorical_accuracy: 0.3831
# Epoch 3/24
# 281/281 [==============================] - 176s 625ms/step - loss: 1.5975 - sparse_categorical_accuracy: 0.2453 - sparse_top_2_categorical_accuracy: 0.4655 - val_loss: 1.6088 - val_sparse_categorical_accuracy: 0.2305 - val_sparse_top_2_categorical_accuracy: 0.4203
# Epoch 4/24
# 281/281 [==============================] - 175s 624ms/step - loss: 1.5865 - sparse_categorical_accuracy: 0.2636 - sparse_top_2_categorical_accuracy: 0.4940 - val_loss: 1.5901 - val_sparse_categorical_accuracy: 0.2254 - val_sparse_top_2_categorical_accuracy: 0.4814
# Epoch 5/24
# 281/281 [==============================] - 175s 624ms/step - loss: 1.5609 - sparse_categorical_accuracy: 0.2854 - sparse_top_2_categorical_accuracy: 0.5316 - val_loss: 1.5770 - val_sparse_categorical_accuracy: 0.2373 - val_sparse_top_2_categorical_accuracy: 0.4814
# Epoch 6/24
# 281/281 [==============================] - 176s 625ms/step - loss: 1.5318 - sparse_categorical_accuracy: 0.3116 - sparse_top_2_categorical_accuracy: 0.5661 - val_loss: 1.5736 - val_sparse_categorical_accuracy: 0.2390 - val_sparse_top_2_categorical_accuracy: 0.4695
# Epoch 7/24
# 281/281 [==============================] - 175s 623ms/step - loss: 1.4957 - sparse_categorical_accuracy: 0.3421 - sparse_top_2_categorical_accuracy: 0.6052 - val_loss: 1.5676 - val_sparse_categorical_accuracy: 0.2441 - val_sparse_top_2_categorical_accuracy: 0.4729
# Epoch 8/24
# 281/281 [==============================] - 176s 627ms/step - loss: 1.4562 - sparse_categorical_accuracy: 0.3563 - sparse_top_2_categorical_accuracy: 0.6323 - val_loss: 1.5624 - val_sparse_categorical_accuracy: 0.2661 - val_sparse_top_2_categorical_accuracy: 0.5153
# Epoch 9/24
# 281/281 [==============================] - 175s 622ms/step - loss: 1.4208 - sparse_categorical_accuracy: 0.3734 - sparse_top_2_categorical_accuracy: 0.6492 - val_loss: 1.5703 - val_sparse_categorical_accuracy: 0.2847 - val_sparse_top_2_categorical_accuracy: 0.5034
# Epoch 10/24
# 281/281 [==============================] - 175s 623ms/step - loss: 1.3966 - sparse_categorical_accuracy: 0.3848 - sparse_top_2_categorical_accuracy: 0.6797 - val_loss: 1.5663 - val_sparse_categorical_accuracy: 0.2898 - val_sparse_top_2_categorical_accuracy: 0.5119

In [74]:
#               precision    recall  f1-score   support

#            0       0.19      0.06      0.10        93
#            1       0.30      0.47      0.37       110
#            2       0.22      0.24      0.23       110
#            3       0.24      0.50      0.33       101
#            4       0.40      0.12      0.19       176

#     accuracy                           0.27       590
#    macro avg       0.27      0.28      0.24       590
# weighted avg       0.29      0.27      0.24       590

In [75]:
train_model(
    model,
    data_container,
    [0, 24],
    [3e-6, 1e-7],
    class_weights=class_weights,
    patience=3,
)

Train for 125 steps, validate for 13 steps
Train for 125 steps, validate for 13 steps
Epoch 1/24
Epoch 2/24
Epoch 3/24
Epoch 4/24
Epoch 5/24
Epoch 6/24
Epoch 7/24
Epoch 8/24
Epoch 9/24
Epoch 10/24
Epoch 11/24
Epoch 12/24
Epoch 13/24
Epoch 14/24
Epoch 15/24
Epoch 16/24
Epoch 17/24
Epoch 18/24
Epoch 19/24

KeyboardInterrupt: 

In [76]:
print(
    classification_report(
        [label.numpy() for _, label in data_container.validation.unbatch()],
        model.predict(data_container.validation).argmax(axis=1),
    )
)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        42
           1       0.27      0.38      0.32        47
           2       0.25      0.48      0.33        29
           3       0.21      0.42      0.28        36
           4       0.50      0.02      0.05        42

    accuracy                           0.24       196
   macro avg       0.25      0.26      0.19       196
weighted avg       0.25      0.24      0.19       196



  'precision', 'predicted', average, warn_for)


In [77]:
train_model(
    model,
    data_container,
    [24, 24],
    [3e-6, 3e-6],
    class_weights=class_weights,
    patience=3,
)

Train for 125 steps, validate for 13 steps
Epoch 1/24
Epoch 2/24
Epoch 3/24
Epoch 4/24
Epoch 5/24
Epoch 6/24
Epoch 7/24
Epoch 8/24
Epoch 9/24
Epoch 10/24
Epoch 11/24
Epoch 12/24
Epoch 13/24
Train for 125 steps, validate for 13 steps
Epoch 1/24
Epoch 2/24
Epoch 3/24
Epoch 4/24
Epoch 5/24
Epoch 6/24
Epoch 7/24
Epoch 8/24
Epoch 9/24
Epoch 10/24
Epoch 11/24


In [78]:
print(
    classification_report(
        [label.numpy() for _, label in data_container.validation.unbatch()],
        model.predict(data_container.validation).argmax(axis=1),
    )
)

              precision    recall  f1-score   support

           0       0.59      0.62      0.60        42
           1       0.60      0.55      0.58        47
           2       0.58      0.52      0.55        29
           3       0.28      0.36      0.32        36
           4       0.65      0.57      0.61        42

    accuracy                           0.53       196
   macro avg       0.54      0.52      0.53       196
weighted avg       0.55      0.53      0.54       196



In [None]:
train_model(
    model,
    data_container,
    [12, 12],
    [3e-5, 3e-6],
    class_weights=class_weights,
    patience=3,
)

In [None]:
print(
    classification_report(
        [label.numpy() for _, label in data_container.validation.unbatch()],
        model.predict(data_container.validation).argmax(axis=1),
    )
)

In [None]:
#               precision    recall  f1-score   support

#            0       0.48      0.42      0.45       128
#            1       0.27      0.23      0.25        66
#            2       0.80      0.59      0.68       116
#            3       1.00      1.00      1.00        40
#            4       0.35      0.54      0.42        50
#            5       0.51      0.42      0.46       109
#            6       0.46      0.64      0.53        42
#            7       0.51      0.24      0.32       101
#            8       0.29      0.46      0.36        52
#            9       0.19      0.68      0.30        22

#     accuracy                           0.47       726
#    macro avg       0.49      0.52      0.48       726
# weighted avg       0.52      0.47      0.47       726