<a href="https://colab.research.google.com/github/tyleretheridge/DS-API/blob/master/notebooks/Modeling_for_BW4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ALBERT IMPLEMENTATION

In [None]:
%%capture
!pip install bert-for-tf2
!pip install sentencepiece

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.models import Model

import bert
from bert.tokenization.bert_tokenization import FullTokenizer

## Data Loading

`train.tsv` and `dev.tsv` are the training and validation sets with 4 columns of data each.  
- `guid`: a generic identifier for each observation derived from the URL the reddit post was retrieved from  
- `label`: an encoded label of each subreddit, from 0 to 1012. Label encoding is mapped in a separate file.  
- `text_b`: an optional field that is currently populated with "a" for all observations. Only used for sequential text classification/prediction.  
- `text_a`: The untokenized text that is to be used for multi-class classification training.   

`test.tsv` is formatted similarly to `train.tsv` and `dev.tsv`; however, it contains only the `guid` and `text_a` columns.  

`encoding_maps.csv` is a CSV file that maps each encoded `label` in the training datasets back to its subreddit name. It was generated using sklearn's `LabelEncoder` and has two columns: `label` and `subreddit`.


In [None]:
# The labelled splits (train/dev) share one headerless column layout
labelled_columns = ['guid', 'label', 'text_b', 'text_a']

# Read the tab-separated splits; test.tsv only carries the id and the text
train = pd.read_csv('train.tsv', sep='\t', names=labelled_columns)
dev = pd.read_csv('dev.tsv', sep='\t', names=labelled_columns)
test = pd.read_csv('test.tsv', sep='\t', names=['guid', 'text_a'])

# Lookup table between encoded labels and subreddit names
encoding_maps = pd.read_csv('encoding_maps.csv', engine='python')

In [None]:
# Preview the first five training rows
train.head(n=5)

## Tokenization

In [None]:
import os

import sentencepiece as spm

# Resolve the ALBERT checkpoint directory here so this cell works on a fresh
# kernel; the fetch reuses the copy under .models when it is already unpacked.
model_dir = bert.fetch_tfhub_albert_model("albert_base", ".models")

# Load the SentencePiece vocabulary shipped inside the checkpoint's assets folder
spm_model = os.path.join(model_dir, "assets", "30k-clean.model")
sp = spm.SentencePieceProcessor()
sp.load(spm_model)
do_lower_case = True

# Smoke-test the tokenizer on a sample sentence
processed_text = bert.albert_tokenization.preprocess_text("Hello, World!", lower=do_lower_case)
token_ids = bert.albert_tokenization.encode_ids(sp, processed_text)

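**Note:** The commented-out code below relies on `bert.run_classifier.InputExample`, which appears to belong to a different BERT implementation than the `bert-for-tf2` package used in this notebook. It is kept for reference only and is not part of the current pipeline.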

In [None]:
# GUID = 'guid'
# DATA_COLUMN = 'text_a'
# LABEL_COLUMN = 'label'

# train_InputExamples = train.apply(lambda x: bert.run_classifier.InputExample(guid=GUID,
#                                                                    text_a = x[DATA_COLUMN], 
#                                                                    text_b = None, 
#                                                                    label = x[LABEL_COLUMN]), axis = 1)
# val_InputExamples = val.apply(lambda x: bert.run_classifier.InputExample(guid=GUID, 
#                                                                    text_a = x[DATA_COLUMN], 
#                                                                    text_b = None, 
#                                                                    label = x[LABEL_COLUMN]), axis = 1)

## ALBERT Modeling

In [None]:
# Write a function to load ALBERT
def load_bert(albert_name):
  """
  Fetches a pretrained ALBERT model from TFHub and loads its params.

  Input: albert_name (str),
    name of ALBERT model to load (e.g. 'albert_base')
  Returns: albert_params,
    loaded params to be used to build the model
    (e.g. via bert.BertModelLayer.from_params)

  Side effects: stores the fetched model under the local ".models"
    directory and prints the model name and directory.
  """
  # Fetch ALBERT from TFHub; the returned path is where the model was unpacked
  albert_dir = bert.fetch_tfhub_albert_model(albert_name, ".models")
  # Load ALBERT params
  albert_params = bert.albert_params(albert_name)

  # Print status of loaded model (report the directory returned by the fetch,
  # not a global that may not exist on a fresh kernel)
  print("Model Name:", albert_name)
  print("Model Directory", albert_dir)
  return albert_params

albert_params = load_bert('albert_base')

Fetching ALBERT model: albert_base version: 2
Already  fetched:  albert_base.tar.gz
already unpacked at: .models/albert_base
Model Name: albert_base
Model Directory .models/albert_base


In [None]:
# Write a function to build ALBERT model
def build_bert(albert_params, max_seq_len=128, num_classes=1013):
  """
  Builds and compiles a Keras classifier on top of an ALBERT encoder layer.

  Input:
    albert_params: ALBERT params, as returned by bert.albert_params
    max_seq_len (int): length of the input_ids / token_type_ids sequences
      (default 128, the previously hard-coded value)
    num_classes (int): width of the final logits layer. Defaults to 1013 to
      match the subreddit labels (0 to 1012) used in this notebook.
  Returns: (model, bert_layer),
    the compiled keras Model and the ALBERT layer it wraps
  """
  keras = tf.keras  # `keras` is not imported on its own in this notebook

  bert_layer = bert.BertModelLayer.from_params(albert_params, name="albert")
  l_input_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="input_ids")
  l_token_type_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="token_type_ids")
  output = bert_layer([l_input_ids, l_token_type_ids])
  # Keep only the first position of the sequence output as the pooled representation
  output = keras.layers.Lambda(lambda x: x[:, 0, :])(output)
  # Raw logits (no activation): the loss below is configured with from_logits=True
  output = keras.layers.Dense(num_classes)(output)
  model = keras.Model(inputs=[l_input_ids, l_token_type_ids], outputs=output)

  # One shape per model input
  model.build(input_shape=[(None, max_seq_len), (None, max_seq_len)])
  model.compile(optimizer=keras.optimizers.Adam(),
                loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")])

  # List the encoder's weights for inspection
  for weight in bert_layer.weights:
      print(weight.name)

  model.summary()
  return model, bert_layer

In [None]:
#huggingface

## Classification using a Feed-Forward Neural Network
Adapted from https://www.tensorflow.org/hub/tutorials/tf2_text_classification


In [None]:
import tensorflow as tf

# TFHub NNLM text-embedding module (128-dim output), trained together with the classifier
nnlm_url = "https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1"
hub_layer = hub.KerasLayer(
    nnlm_url,
    output_shape=[128],
    input_shape=[],
    dtype=tf.string,
    trainable=True,
)

# Embedding -> two 256-unit ReLU hidden layers -> 1013-way softmax output
model = tf.keras.Sequential([
    hub_layer,
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(1013, activation='softmax'),
])

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer_2 (KerasLayer)   (None, 128)               124642688 
_________________________________________________________________
dense_4 (Dense)              (None, 256)               33024     
_________________________________________________________________
dense_5 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_6 (Dense)              (None, 1013)              260341    
Total params: 125,001,845
Trainable params: 125,001,845
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Top-5 accuracy: counts a prediction as correct when the true label is among
# the five highest-scoring classes
top_k = tf.keras.metrics.SparseTopKCategoricalAccuracy(
    k=5,
    name='sparse_top_k_categorical_accuracy',
    dtype=None,
)

In [None]:
# The model's final Dense layer already applies softmax, so its outputs are
# probabilities. The loss must therefore use from_logits=False; from_logits=True
# would apply softmax a second time inside the loss.
model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="acc"), top_k])

In [None]:
# Evaluate on the dev split every epoch and stop once validation loss has not
# improved for a few epochs, instead of relying on a manual interrupt of a
# 1000-epoch run. The best weights seen are restored when training stops.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                  patience=3,
                                                  restore_best_weights=True)

history = model.fit(train['text_a'].values, train['label'].values,
                    validation_data=(dev['text_a'].values, dev['label'].values),
                    batch_size=1024,
                    epochs=1000,  # upper bound; early stopping normally ends the run sooner
                    callbacks=[early_stopping])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

KeyboardInterrupt: ignored

In [None]:
# TODO: tune the batch size
# TODO: persist the trained model with model.save(...)