In [1]:
import tokenizers
import transformers
import torch
from torch import nn
from sklearn import metrics, model_selection

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')
import logging
logging.basicConfig(level=logging.ERROR)

In [3]:
user = !whoami
user

['root']

In [29]:
class Config:
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    MAX_LEN = 140
    TRAIN_BATCH_SIZE = 8
    VALID_BATCH_SIZE=4
    EPOCHS = 5
    LR=0.2*3e-5
    BERT_PATH=f"../input/bert-uncased-multilingual-pytorch/"
    TOKENIZER = transformers.BertTokenizer.from_pretrained(BERT_PATH, do_lower_case=True)
    CLASS_MAPPING = {'pad':0, 'en':1, 'hi':2, 'ne':3, 'univ':4, 'acro':5}
#     CLASS_MAPPING = {'en':0, 'hi':1}
    MAPPING2CLASS = {i:j for j,i in CLASS_MAPPING.items()}

In [5]:
f = open('./tweets_train.conll', 'r', encoding="utf-8")
    
tokens = []
LIDs = []
  
sentence_tokens = []
sentence_lids = []

for line in f:  # token level
#        print(line)

        if line != '\n':
            columns = line.split('\t')  # isolate columns
            
        if line == '\n':
            # add sentences to tokens/LIDs
            tokens.append(sentence_tokens)
            LIDs.append(sentence_lids)
            
            # reset lists for next sentence
            sentence_tokens = []
            sentence_lids = []
                
#            print(columns)
        
        # add romanised words to sent_tokens, LIDs to sent_lids
        else:
            sentence_tokens.append(columns[1])
            sentence_lids.append(columns[-2])

    # create big list of [romanised words] and [LIDs]


In [7]:
LID_mapped = []
taggs=[]
for i in LIDs:
    LID_mapped.append([Config.CLASS_MAPPING.get(i, 0) for i in i])

In [8]:
train_tokens, valid_tokens, train_labels, valid_labels=model_selection.train_test_split(tokens, LID_mapped, test_size=0.2)

In [9]:
class BERTDatasetTraining:
    def __init__(self, query, targets, tokenizer, max_length):
        '''
        Dataset function used for training the notebooks. This is used for training pipelines.
        Args:
            query: pd.Series: The columnar series of the dataset that you want your model to train upon.
            targets: pd.Series: The columnar series of the targets associated with your inputs.
            tokenizer: Here we are experimenting with only two tokenizers. 1. WordSplitTokenization, 2. BytePairtokenization
            max_length: int: What is the maximum width of sentence we want to work on. 
        '''
        self.comment_text = query
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.targets = targets

    def __len__(self):
        '''
        This should be particularly be based on the number of datapoints in the training set.
        '''
        return len(self.comment_text)

    def __getitem__(self, item):
        comment_text = " ".join(self.comment_text[item])

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_length,

        )

        ids = inputs["input_ids"]
        token_type_ids = inputs["token_type_ids"]
        mask = inputs["attention_mask"]
        
        padding_length = self.max_length - len(ids)
        
        ids = ids + ([0] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        targets = self.targets[item] + ([0] * (Config.MAX_LEN - len(self.targets[item])))
#         print(ids, mask, token_type_ids, self.targets[item])
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'tokens': self.comment_text[item],
            'targets': torch.tensor(targets, dtype=torch.float)
        }

In [10]:
dataset = BERTDatasetTraining(train_tokens, train_labels, Config.TOKENIZER, Config.MAX_LEN)
train = next(iter(torch.utils.data.DataLoader(dataset, batch_size=len(train_tokens))))
dataset = BERTDatasetTraining(valid_tokens, valid_labels, Config.TOKENIZER, Config.MAX_LEN)
valid = next(iter(torch.utils.data.DataLoader(dataset, batch_size=len(valid_tokens))))

In [11]:
' '.join(Config.TOKENIZER.convert_ids_to_tokens(dataset[0]['ids']))

'[CLS] not gonna eat anything today cu ##z i have to wear saari ra ##at ko . [UNK] [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

---

In [12]:
from transformers import BertConfig, TFBertModel
import tensorflow as tf

In [13]:
ids = tf.keras.layers.Input((Config.MAX_LEN,), dtype=tf.int32)
att = tf.keras.layers.Input((Config.MAX_LEN,), dtype=tf.int32)
tok = tf.keras.layers.Input((Config.MAX_LEN,), dtype=tf.int32)

config = BertConfig.from_pretrained('../input/bert-multilingual-uncased-tensorflow/config.json')
bert_model = TFBertModel.from_pretrained('../input/bert-multilingual-uncased-tensorflow/tf_model.h5',config=config)
x = bert_model(ids)
x1 = tf.keras.layers.Dropout(0.1)(x[0])
x1 = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(len(Config.CLASS_MAPPING), activation='softmax'), name='TimeDistLabel')(x1)
print(x1.shape)
# x2 = tf.keras.layers.Dropout(0.1)(x[0]) 
# x2 = tf.keras.layers.Conv1D(1,1)(x2)
# x2 = tf.keras.layers.Flatten()(x2)
# x2 = tf.keras.layers.Activation('softmax')(x2)

model = tf.keras.models.Model(inputs=ids, outputs=x1)
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy', tf.keras.metrics.AUC()])
model.summary()

(None, 140, 6)
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 140)]             0         
_________________________________________________________________
tf_bert_model (TFBertModel)  ((None, 140, 768), (None, 167356416 
_________________________________________________________________
dropout_37 (Dropout)         (None, 140, 768)          0         
_________________________________________________________________
TimeDistLabel (TimeDistribut (None, 140, 6)            4614      
Total params: 167,361,030
Trainable params: 167,361,030
Non-trainable params: 0
_________________________________________________________________


In [14]:
train_labels_oh = tf.keras.utils.to_categorical(train['targets'].numpy(), len(Config.CLASS_MAPPING))
valid_labels_oh = tf.keras.utils.to_categorical(valid['targets'].numpy(), len(Config.CLASS_MAPPING))

In [15]:
model.fit(x=train['ids'].numpy(), y=train_labels_oh, 
          validation_data=(valid['ids'].numpy(), valid_labels_oh),
         batch_size=Config.TRAIN_BATCH_SIZE,
         epochs=5,
         )

Train on 1158 samples, validate on 290 samples
Epoch 1/5








Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f7d80b5fc18>

In [31]:
sentence = 'Ab na jaane kab i\'ll be able to reach there'
tokens = Config.TOKENIZER.encode_plus(sentence,
                                      None,
                                    add_special_tokens=True,
                                    max_length=Config.MAX_LEN,
                                    )
padding_length = Config.MAX_LEN - len(tokens['input_ids'])
ids = tokens['input_ids'] + ([0] * padding_length)
inp = np.array([ids])
res = model.predict(inp)
print([Config.MAPPING2CLASS[i] for i in res.argmax(-1)[0]])

['hi', 'hi', 'hi', 'hi', 'en', 'en', 'en', 'en', 'en', 'en', 'univ', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad', 'pad']
