In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import random
import gc

from tqdm import tqdm
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import AutoModel, AutoTokenizer, AutoConfig, AdamW

gc.collect()
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Seed every random number generator the notebook relies on so that a
# Restart & Run All reproduces the same shuffle order and weight initialisation.
SEED = 42

random.seed(SEED)        # Python's built-in RNG
np.random.seed(SEED)     # NumPy's legacy global RNG
torch.manual_seed(SEED)  # PyTorch's global RNG

# The data pipeline and the model further down are built with TensorFlow/Keras,
# so TensorFlow's global seed has to be set as well; without it
# dataset.shuffle() and the Dense layer initialisers differ from run to run.
import tensorflow as tf
tf.random.set_seed(SEED)

***Prepare data***

In [3]:
!apt-get install unzip
!unzip ../input/sentiment-analysis-on-movie-reviews/test.tsv.zip test.tsv
!unzip ../input/sentiment-analysis-on-movie-reviews/train.tsv.zip train.tsv

In [4]:
# List the current working directory (e.g. to check that test.tsv and train.tsv were extracted).
os.listdir()

In [5]:
import pandas as pd

In [6]:
## preprocessing
# Read the tab-separated training file from the working directory into a DataFrame.
train = pd.read_csv("train.tsv",sep="\t")
# Preview the first rows.
train.head()

In [7]:
# Class balance of the target column. sort_index() puts the bars in label
# order instead of frequency order, and the plot gets a title and axis labels
# so it can be read on its own. The trailing ';' suppresses the repr output.
ax = train["Sentiment"].value_counts().sort_index().plot(kind="bar")
ax.set(title="Training samples per sentiment class",
       xlabel="Sentiment", ylabel="Number of phrases");

In [8]:
# Fixed token length that every phrase is padded/truncated to by the tokenizer call below.
# NOTE(review): the Keras Input layers and the hard-coded element_spec later in
# the notebook also use the literal 512 — keep them in sync if this changes.
seq_length = 512

# Number of rows in the training DataFrame.
num_samples = len(train)

# Display both values.
num_samples,seq_length

In [9]:
from transformers import BertTokenizer

In [10]:
# Load the tokenizer for the "bert-base-cased" checkpoint (the same checkpoint name the model is loaded from later).
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [11]:
# Encode every phrase into fixed-length NumPy arrays: special tokens are added,
# longer inputs are truncated and shorter ones padded to seq_length tokens.
tokens = tokenizer(
    train["Phrase"].tolist(),
    add_special_tokens=True,
    padding="max_length",
    truncation=True,
    max_length=seq_length,
    return_tensors="np",
)

In [12]:
# Show which arrays the tokenizer returned.
tokens.keys()

In [13]:
# Inspect the token-id array.
tokens['input_ids']

In [14]:
import numpy as np 

# Persist the token ids and attention masks as .npy files so they can be
# reloaded without re-running the tokenizer.
np.save("movie-xids.npy", tokens['input_ids'])
np.save("movie-xmask.npy", tokens['attention_mask'])

In [15]:
# Target column; its values are used below as class indices for one-hot encoding.
arr = train['Sentiment']

In [16]:
# Number of distinct sentiment values (used as the number of classes).
arr.nunique()

In [17]:
# Zero-filled matrix that will hold the one-hot labels:
# one row per sample, one column per distinct sentiment value.
labels = np.zeros(shape=(num_samples, arr.nunique()))
# Show the resulting shape.
labels.shape

In [18]:
# One-hot encode: for each row i, set the column given by that row's sentiment
# value to 1 (integer-array indexing pairs every row index with its label).
labels[np.arange(num_samples),arr] = 1

In [19]:
# Persist the one-hot label matrix next to the token arrays.
np.save("movie-labels.npy", labels)

In [20]:
# Reload the three arrays from disk. They are plain numeric arrays, so pickle
# support is not needed; leaving allow_pickle at its default (False) means
# np.load cannot execute code embedded in a tampered .npy file.
Xids = np.load("movie-xids.npy")
Xmask = np.load("movie-xmask.npy")
labels = np.load("movie-labels.npy")

In [21]:
import tensorflow as tf

In [22]:
# Build a tf.data pipeline that yields one (input_ids, attention_mask, label)
# tuple per sample by slicing the three arrays along their first axis.
dataset = tf.data.Dataset.from_tensor_slices(
(Xids,Xmask,labels))

In [23]:
def map_function(input_ids, masks, labels):
    """Repack one dataset element into a ``(features, target)`` pair.

    Args:
        input_ids: token ids of the element.
        masks: attention mask of the element.
        labels: target of the element.

    Returns:
        A 2-tuple ``(features, labels)`` where ``features`` is a dict with the
        keys ``"input_ids"`` and ``"attention_mask"``; ``labels`` is passed
        through unchanged.
    """
    features = {}
    features["input_ids"] = input_ids
    features["attention_mask"] = masks
    return features, labels

In [24]:
# Apply map_function to every element so the dataset yields (feature dict, label) pairs.
# Note: `dataset` is rebound here, so running this cell a second time would feed
# the already-mapped two-element tuples into the three-argument map_function.
dataset = dataset.map(map_function)

In [25]:
# Samples per batch. The hard-coded element_spec used later to reload the
# saved datasets also assumes a leading dimension of 16.
batch_size = 16

In [26]:
# Shuffle (buffer of 10000 elements), then batch.
# reshuffle_each_iteration=False is essential here: the dataset is split into
# train/validation with take()/skip() further down, and each split is iterated
# separately. With the default (True) every iteration draws a new shuffle
# order, so the "validation" batches would contain samples that were also
# served as training batches. A fixed order keeps the two splits disjoint.
# drop_remainder=True discards the final partial batch so every batch has
# exactly batch_size rows.
dataset = dataset.shuffle(10000, reshuffle_each_iteration=False).batch(
    batch_size, drop_remainder=True)

In [27]:
# Fraction of the batches used for training; the remainder is held out for validation.
split = 0.9

In [28]:
# Number of training batches: (samples / batch_size) scaled by `split`, truncated to an int.
size = int((Xids.shape[0] / batch_size) * split)

In [29]:
# The first `size` batches form the training set.
train_ds = dataset.take(size)

In [30]:
# Everything after the first `size` batches forms the validation set.
val_ds= dataset.skip(size)

In [31]:
# Write both splits to disk under the paths "train" and "val" so they can be reloaded later.
# NOTE(review): tf.data.experimental.save/load are deprecated in newer TensorFlow
# releases in favour of Dataset.save / Dataset.load — confirm against the installed version.
tf.data.experimental.save(train_ds,"train")
tf.data.experimental.save(val_ds,"val")

In [32]:
# Show the structure of one element; the same spec is passed to load() when reading the saved dataset back.
train_ds.element_spec

In [33]:
# Reload the saved training split, reusing the in-memory dataset's element_spec.
# NOTE(review): `ds` is not referenced again in the visible part of the notebook.
ds = tf.data.experimental.load("train",
            element_spec = train_ds.element_spec)

In [34]:
from transformers import TFAutoModel

In [35]:
# Load the pretrained TensorFlow model for the "bert-base-cased" checkpoint.
bert = TFAutoModel.from_pretrained("bert-base-cased")

In [36]:
# Print the Keras summary of the loaded model.
bert.summary()

In [37]:
import tensorflow as tf

# Two input layers. Their names must match the dictionary keys produced by
# map_function so Keras can route each dataset feature to the right input.
# The length comes from seq_length (the value the tokenizer padded to) instead
# of a second hard-coded 512.
input_ids = tf.keras.layers.Input(shape=(seq_length,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(seq_length,), name='attention_mask', dtype='int32')

# Call the underlying transformer layer (bert.bert) rather than the wrapper model.
# Index [1] of its output is the pooled output: the final hidden state of the
# first ([CLS]) token passed through BERT's pooler (dense + tanh). It is not a
# max-pool over the sequence.
embeddings = bert.bert(input_ids, attention_mask=mask)[1]
# Classification head: map the pooled embedding to the 5 output classes.
x = tf.keras.layers.Dense(1024, activation='relu')(embeddings)
y = tf.keras.layers.Dense(5, activation='softmax', name='outputs')(x)

In [38]:
# Assemble the functional model from the two inputs and the softmax output.
model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

# (optional) Freeze BERT so that only the dense head is trained. The layer is
# referenced directly (bert.bert is the layer that was called when the graph
# was built) instead of through model.layers[2], which would silently freeze
# a different layer if the layer ordering ever changed.
bert.bert.trainable = False

# Print the model summary.
model.summary()

In [40]:
# Training configuration: Adam with a small learning rate (plus the `decay`
# argument), categorical cross-entropy for the one-hot targets, and
# categorical accuracy reported under the name "accuracy".
acc = tf.keras.metrics.CategoricalAccuracy(name='accuracy')
loss = tf.keras.losses.CategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5, decay=1e-6)

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[acc],
)

In [41]:
 element_spec = ({'attention_mask': tf.TensorSpec(shape=(16, 512),
                                                  dtype=tf.int64, name=None),
                  'input_ids': tf.TensorSpec(shape=(16, 512), dtype=tf.int64, name=None)},
                 tf.TensorSpec(shape=(16, 5), dtype=tf.float64, name=None))  


# load the training and validation sets
train_ds = tf.data.experimental.load('train', element_spec=element_spec)
val_ds = tf.data.experimental.load('val', element_spec=element_spec)

# view the input format
train_ds.take(1)

In [42]:
# Train for three epochs on the training batches, evaluating on the validation
# batches after each epoch; the returned History object is kept in `history`.
history = model.fit(train_ds, validation_data=val_ds, epochs=3)