In [0]:
%tensorflow_version 2.x

import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime
from tensorflow import keras as k

from tensorflow.keras import layers as l

from sklearn.model_selection import train_test_split

In [0]:
df = pd.read_csv('/content/drive/My Drive/procSelectCharLines.csv')

In [0]:
df.head()

Unnamed: 0,Season,Episode,Character,Line,Processed Line
0,10,1,Stan,"You guys, you guys! Chef is going away.",you guys you guys chef is going away
1,10,1,Kyle,Going away? For how long?,going away for how long
2,10,1,Stan,"Chef said he's been bored, so he joining a gro...",chef said hes been bored so he joining a group...
3,10,1,Cartman,I'm gonna miss him. I'm gonna miss Chef and I...,im gonna miss him im gonna miss chef and i an...
4,10,1,Stan,"Dude, how are we gonna go on? Chef was our fuh...",dude how are we gonna go on chef was our fuh f...


In [0]:
def map_char(char):
  for i, chars in enumerate(df['Character'].unique()):
    if char == chars:
      return i
  
  return None

In [0]:
df['Target'] = df['Character'].map(map_char)

In [0]:
df.head()

Unnamed: 0,Season,Episode,Character,Line,Processed Line,Target
0,10,1,Stan,"You guys, you guys! Chef is going away.",you guys you guys chef is going away,0
1,10,1,Kyle,Going away? For how long?,going away for how long,1
2,10,1,Stan,"Chef said he's been bored, so he joining a gro...",chef said hes been bored so he joining a group...,0
3,10,1,Cartman,I'm gonna miss him. I'm gonna miss Chef and I...,im gonna miss him im gonna miss chef and i an...,2
4,10,1,Stan,"Dude, how are we gonna go on? Chef was our fuh...",dude how are we gonna go on chef was our fuh f...,0


In [0]:
#!wget http://nlp.stanford.edu/data/glove.6B.zip

In [0]:
# download pretrained GloVe embeddings
#!wget "https://nlp.stanford.edu/data/glove.840B.300d.zip"

--2020-04-06 13:39:51--  https://nlp.stanford.edu/data/glove.840B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
Unable to establish SSL connection.


In [0]:
RANDOM = 1
lines, targets = [], []

for line, targ in zip(df['Processed Line'], df['Target']):
  lines.append(line)
  targets.append(targ)

x_train, x_test, y_train,  y_test = train_test_split(lines, targets, 
                                                    test_size=0.1, 
                                                    random_state=RANDOM)

y_train = np.array(y_train)
y_test = np.array(y_test)

## Basic Model - Initial baseline

Just a basic model, built from either LSTMs, GRUs or temporal Convolutions. This is to establish a weak baseline for prediction.

In [0]:
words = []

for line in df['Processed Line']:
  tokens = line.split()
  words.extend([word for word in tokens])

vocab = sorted(set(words))

In [0]:
len(vocab)

14636

In [0]:
vocab_size = 14500
embed_dim = 16
max_len = 50
trunc = 'post'
pad = 'post'
oov = '<OOV>'

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words = vocab_size,
                                                          oov_token = oov)

tokenizer.fit_on_texts(x_train)

word2idx = tokenizer.word_index

train_seq = tokenizer.texts_to_sequences(x_train)
test_seq = tokenizer.texts_to_sequences(x_test)

train_pad = tf.keras.preprocessing.sequence.pad_sequences(
    train_seq, padding=pad, maxlen=max_len, truncating=trunc
)
test_pad = tf.keras.preprocessing.sequence.pad_sequences(
    test_seq, maxlen=max_len
)


In [0]:
idx2word = dict([(v,k) for (k,v) in word2idx.items()])

In [0]:
def build_model(units = 32, layer_type = 'lstm', bidirectional = False, rec_count = 1):
  
  # units - units in recurrent layers
  # layer_type - one of {'lstm', 'gru', 'conv'}
  # bidirectional - bidirectional recurrent layers
  # rec_count - number of recurrent layers

  layer_types = ['lstm', 'gru', 'conv']

  if layer_type not in layer_types:
    raise ValueError("Invalid layer type. Expected one of: {}".format(repr(layer_types)))
  
  model = k.Sequential()
  # Text modelling begins with embeddings
  model.add(k.layers.Embedding(vocab_size, embed_dim, input_length=max_len))

  if layer_type == 'lstm':

    if bidirectional:
      for _ in range(rec_count-1):
        # Recurrent layers which pass their outputs to another recurrent layer 
        # need the `return_sequences` argument set to True.
        model.add(k.layers.Bidirectional(k.layers.LSTM(units, return_sequences=True, activation='relu')))
      # Final recurrent layer does not need to return sequences
      model.add(k.layers.Bidirectional(k.layers.LSTM(units, activation='relu')))

    else:
      for _ in range(rec_count-1):
        model.add(k.layers.LSTM(units, activation='relu', return_sequences=True))
      model.add(k.layers.LSTM(units, activation='relu'))

  elif layer_type == 'gru':

    if bidirectional:
      for _ in range(rec_count-1):
        model.add(k.layers.Bidirectional(k.layers.GRU(units, activation='relu', return_sequences=True)))
      model.add(k.layers.Bidirectional(k.layers.GRU(units, activation='relu')))

    else:
      for _ in range(rec_count-1):
        model.add(k.layers.GRU(units, activation='relu', return_sequences=True))
      model.add(k.layers.GRU(units, activation='relu'))

  else:

    model.add(k.layers.Conv1D(128, 5, activation='relu'))
    
    # Either average or max pooling may be used - you could even use both
    # if using the functional API
    model.add(k.layers.GlobalAveragePooling1D())
    #model.add(k.layers.GlobalMaxPooling1D())

  # Dense layers form the head of our network as usual
  model.add(k.layers.Dense(32, activation='relu'))
  model.add(k.layers.Dense(6, activation='sigmoid'))

  return model

The convolution variant resulted in the best validation accuracy out of the tested options.

In [0]:
model = build_model(32, 'conv', True, 2)

In [0]:
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [0]:
epochs = 10
model.fit(train_pad, y_train, epochs=epochs, validation_data=(test_pad, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fd8290b31d0>

## BERT

In [0]:
!pip install bert-for-tf2 
!pip install sentencepiece

Collecting bert-for-tf2
[?25l  Downloading https://files.pythonhosted.org/packages/ff/84/1bea6c34d38f3e726830d3adeca76e6e901b98cf5babd635883dbedd7ecc/bert-for-tf2-0.14.1.tar.gz (40kB)
[K     |████████                        | 10kB 24.1MB/s eta 0:00:01[K     |████████████████▏               | 20kB 823kB/s eta 0:00:01[K     |████████████████████████▎       | 30kB 1.1MB/s eta 0:00:01[K     |████████████████████████████████| 40kB 1.1MB/s 
[?25hCollecting py-params>=0.9.6
  Downloading https://files.pythonhosted.org/packages/a4/bf/c1c70d5315a8677310ea10a41cfc41c5970d9b37c31f9c90d4ab98021fd1/py-params-0.9.7.tar.gz
Collecting params-flow>=0.8.0
  Downloading https://files.pythonhosted.org/packages/ac/0d/615c0d4aea541b4f47c761263809a02e160e7a2babd175f0ddd804776cf4/params-flow-0.8.0.tar.gz
Building wheels for collected packages: bert-for-tf2, py-params, params-flow
  Building wheel for bert-for-tf2 (setup.py) ... [?25l[?25hdone
  Created wheel for bert-for-tf2: filename=bert_for_tf2

In [0]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
import bert

In [0]:
def bert_encode(texts, tokenizer, max_len=512):
    all_tokens = []
    all_masks = []
    all_segments = []
    
    for text in texts:
        text = tokenizer.tokenize(text)
            
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence)
        tokens += [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
    
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [0]:
BertTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=True)
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

In [0]:
tokenizer.tokenize("Don't be so judgmental")

['don', "'", 't', 'be', 'so', 'judgment', '##al']

In [0]:
RANDOM = 1
lines, targets = [], []

for line, targ in zip(df['Line'], df['Target']):
  line.lower()
  lines.append(line)
  targets.append(targ)

x_train, x_test, y_train,  y_test = train_test_split(lines, targets, 
                                                    test_size=0.1, 
                                                    random_state=RANDOM)

y_train = np.array(y_train)
y_test = np.array(y_test)

In [0]:
train_input = bert_encode(x_train, tokenizer, max_len=50)
test_input = bert_encode(x_test, tokenizer, max_len=50)

In [0]:
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint

In [0]:
def build_model(bert_layer, max_len=512):
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    clf_output = sequence_output[:, 0, :]
    hidden1 = Dense(100, activation='relu')(clf_output)
    hidden2 = Dense(50, activation='relu')(hidden1)
    out = Dense(6, activation='sigmoid')(hidden2)
    
    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(Adam(lr=2e-6), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return model

In [0]:
model = build_model(bert_layer, max_len=50)


In [0]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 50)]         0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 50)]         0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 50)]         0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

In [0]:
train_history = model.fit(
    train_input, y_train,
    validation_data=(test_input, y_test),
    epochs=10,
    batch_size=16
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10

KeyboardInterrupt: ignored

In [0]:
train_input

(array([[ 101, 2017, 4995, ...,    0,    0,    0],
        [ 101, 2057, 2074, ..., 2342, 2149,  102],
        [ 101, 1045, 2052, ...,    0,    0,    0],
        ...,
        [ 101, 4441, 4828, ...,    0,    0,    0],
        [ 101, 2821, 1010, ...,    0,    0,    0],
        [ 101, 2092, 2054, ...,    0,    0,    0]]),
 array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]]),
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]))

In [0]:
tests = ['Screw you guys, i\'m going home.',
                          'Fuck you Cartman!',
                          'Oh gee I\'m sorry fellas.']

In [0]:
test_input = bert_encode(['Screw you guys, i\'m going home.',
                          'Fuck you Cartman!',
                          'Oh gee I\'m sorry fellas.'], tokenizer, max_len=50)

In [0]:
preds = model.predict(test_input)

In [0]:
preds

array([[0.03856492, 0.0137706 , 0.6433693 , 0.00087328, 0.00162739,
        0.0065754 ],
       [0.06894154, 0.43999866, 0.00863979, 0.00977661, 0.00385578,
        0.00557859],
       [0.01207385, 0.0161553 , 0.00196231, 0.03866629, 0.86405784,
        0.03760148]], dtype=float32)

In [0]:
df['Character'].unique()

array(['Stan', 'Kyle', 'Cartman', 'Randy', 'Butters', 'Mr. Garrison'],
      dtype=object)

In [0]:
for line, pred in zip(tests, preds):
  print('Line: '+ '\'' + line + '\'')
  char_pred = df['Character'].unique()[np.argmax(pred)]
  print('AI predicts... ' + char_pred + ' ... said that.')
  print()


Line: 'Screw you guys, i'm going home.'
AI predicts... Cartman ... said that.

Line: 'Fuck you Cartman!'
AI predicts... Kyle ... said that.

Line: 'Oh gee I'm sorry fellas.'
AI predicts... Butters ... said that.

