In [4]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


#### Imports and TPU setting

In [1]:
! pip uninstall kaggle -q
! pip install kaggle -q
! pip install transformers -q

Proceed (y/n)? y


In [2]:
import os
import re
import time
import tokenizers
import numpy as np
import pandas as pd
import transformers
from tqdm import tqdm
import tensorflow as tf
from google.colab import files
import tensorflow_datasets as tfds
import tensorflow.keras.backend as K
from transformers import BertTokenizer
from transformers import TFRobertaModel
from tensorflow.keras.models import Model
from keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import StratifiedKFold

import matplotlib.pyplot as plt
%matplotlib inline

tf.get_logger().setLevel('ERROR')

Using TensorFlow backend.


#### Load the data

In [7]:
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle competitions download -c 'tweet-sentiment-extraction'

Saving kaggle.json to kaggle.json
Downloading tweet-sentiment-extraction.zip to /content
  0% 0.00/1.39M [00:00<?, ?B/s]
100% 1.39M/1.39M [00:00<00:00, 84.9MB/s]


In [8]:
!unzip '/content/tweet-sentiment-extraction.zip'

Archive:  /content/tweet-sentiment-extraction.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [3]:
train = pd.read_csv('/content/train.csv')
train['text'] = train['text'].astype(str)
train['selected_text'] = train['selected_text'].astype(str)

test = pd.read_csv('/content/test.csv')
test['text'] = test['text'].astype(str)

In [4]:
train.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [None]:
from text import  *

In [None]:
train.text.values[:4]

array([' I`d have responded, if I were going',
       ' Sooo SAD I will miss you here in San Diego!!!',
       'my boss is bullying me...', ' what interview! leave me alone'],
      dtype=object)

In [None]:
get_translation(get_translation(list(train.text.values[:4]), dest_lang='de'), dest_lang='en')

['I would have answered if I left',
 'Sooo sad, I will miss you here in San Diego !!!',
 'My boss is harassing me ...',
 'what an interview! leave me alone']

#### Preprocess

In [5]:
print(f"train shape: {train.shape} \ntest shape: {test.shape}")
print(f"ratio between lables in test: {test.sentiment.value_counts()[1] / test.sentiment.value_counts()[0]}")
print("-"*4)
mean_word_len = train.text.apply(lambda x: len(x.split(" "))).mean()
print(f"Dataset with shape of {train.shape[0]} samples. \nMean number of words is: {mean_word_len}. \nDistribution of lables is: \n{train.sentiment.value_counts()}")

train shape: (27481, 4) 
test shape: (3534, 3)
ratio between lables in test: 0.7713286713286713
----
Dataset with shape of 27481 samples. 
Mean number of words is: 13.7794476183545. 
Distribution of lables is: 
neutral     11118
positive     8582
negative     7781
Name: sentiment, dtype: int64


#### Modelling

In [6]:
MAX_LEN = 96

In [7]:
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base", lowercase = True)

In [75]:
# path = "/content/drive/My Drive/projects/tweet sentiment extraction"

# tokenizer = tokenizers.ByteLevelBPETokenizer(
#     vocab_file=path + os.sep + 'vocab.json', 
#     merges_file=path + os.sep + 'merges.txt', 
#     lowercase=True,
#     add_prefix_space=True
# )

In [8]:
sentiment_id = {'positive': tokenizer.encode_plus('positive', add_prefix_space=True, max_length = MAX_LEN, pad_to_max_length = True, return_attention_mask = True, return_token_type_ids = True, truncation=True).input_ids[1], 
                'negative': tokenizer.encode_plus('negative', add_prefix_space=True, max_length = MAX_LEN, pad_to_max_length = True, return_attention_mask = True, return_token_type_ids = True, truncation=True).input_ids[1], 
                'neutral':  tokenizer.encode_plus('neutral', add_prefix_space=True, max_length = MAX_LEN, pad_to_max_length = True, return_attention_mask = True, return_token_type_ids = True, truncation=True).input_ids[1]}
sentiment_id

{'negative': 2430, 'neutral': 7974, 'positive': 1313}

In [9]:
for i in range(4):
  print(f"Special token {i} -> {tokenizer.decode([i])}")

Special token 0 -> <s>
Special token 1 -> <pad>
Special token 2 -> </s>
Special token 3 -> <unk>


In [11]:
def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

# def jaccard(str1, str2): 
#     a = set(str1.lower().split()) 
#     b = set(str2.lower().split())
#     if (len(a)==0) & (len(b)==0): return 0.5
#     c = a.intersection(b)
#     return float(len(c)) / (len(a) + len(b) - len(c))

In [12]:
ct = train.shape[0]

input_ids = np.ones((ct,MAX_LEN),dtype='int32')

attention_mask = np.zeros((ct,MAX_LEN),dtype='int32')
token_type_ids = np.zeros((ct,MAX_LEN),dtype='int32')

start_tokens = np.zeros((ct,MAX_LEN),dtype='int32')
end_tokens = np.zeros((ct,MAX_LEN),dtype='int32')

for k in tqdm(range(train.shape[0])):
    text1 = " " + " ".join(train.loc[k,'text'].split())
    text2 = " ".join(train.loc[k,'selected_text'].split())

    enc = tokenizer.encode_plus(text1.lower(), 
                       add_prefix_space=True,
                       max_length = MAX_LEN, # max length of the text that can go to BERT
                       pad_to_max_length = True, # add [PAD] tokens
                       return_attention_mask = True, # add attention mask to not focus on pad tokens
                       return_token_type_ids = True,
                       truncation=True)
    
    input_ids[k] = np.array(enc.input_ids)
    input_ids[k, np.where(input_ids[k] == 2)[0][0]+1 : np.where(input_ids[k] == 2)[0][0]+4] = 2
    input_ids[k, np.where(input_ids[k] == 2)[0][0]+2] = sentiment_id[train.loc[k,'sentiment']]

    attention_mask[k] = np.array(enc.attention_mask)
    attention_mask[k, np.where(attention_mask[k] == 1)[0][-1] + 1 : np.where(attention_mask[k] == 1)[0][-1] + 4] = 1
   
    token_type_ids[k] = np.array(enc.token_type_ids)

    idx = text1.find(text2)
    chars = np.zeros((len(text1)))
    chars[idx:idx+len(text2)] = 1

    if text1[idx-1] == ' ': 
        chars[idx-1] = 1 
        
    idx=0
    toks = []
    for i, t in enumerate(enc.input_ids[1:]):
          w = tokenizer.decode([t])
          if np.sum(chars[idx:idx+len(w)]) > 0: # if we are in overlapp location append the token
              toks.append(i)
          idx += len(w)
    
    if len(toks) > 0:
        start_tokens[k,toks[0]+1] = 1
        end_tokens[k,toks[-1]+1] = 1

100%|██████████| 27481/27481 [00:57<00:00, 480.46it/s]


In [13]:
ct = test.shape[0]
input_ids_t = np.ones((ct,MAX_LEN),dtype='int32')
attention_mask_t = np.zeros((ct,MAX_LEN),dtype='int32')
token_type_ids_t = np.zeros((ct,MAX_LEN),dtype='int32')

for k in tqdm(range(test.shape[0])):
        
    text1 = " " + " ".join(test.loc[k,'text'].split())

    enc = tokenizer.encode_plus(text1.lower(), 
                       add_prefix_space=True,
                       max_length = MAX_LEN, # max length of the text that can go to BERT
                       pad_to_max_length = True, # add [PAD] tokens
                       return_attention_mask = True, # add attention mask to not focus on pad tokens
                       return_token_type_ids = True,
                       truncation=True)
    
    input_ids_t[k] = np.array(enc.input_ids)
    input_ids_t[k, np.where(input_ids_t[k] == 2)[0][0]+1 : np.where(input_ids_t[k] == 2)[0][0]+4] = 2
    input_ids_t[k, np.where(input_ids_t[k] == 2)[0][0]+2] = sentiment_id[test.loc[k,'sentiment']]

    attention_mask_t[k] = np.array(enc.attention_mask)
    attention_mask_t[k, np.where(attention_mask_t[k] == 1)[0][-1] + 1 : np.where(attention_mask_t[k] == 1)[0][-1] + 4] = 1
   
    token_type_ids_t[k] = np.array(enc.token_type_ids)

100%|██████████| 3534/3534 [00:01<00:00, 1855.13it/s]


###### Build model inputs

###### Build model

In [14]:
def build_model():
    ids = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    att = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    tok = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)

    roberta_model = TFRobertaModel.from_pretrained('roberta-base')
    x = roberta_model(ids,attention_mask=att,token_type_ids=tok)
    
    x1 = tf.keras.layers.Dropout(0.1)(x[0]) 
    x1 = tf.keras.layers.Conv1D(1,1)(x1)
    x1 = tf.keras.layers.Flatten()(x1)
    x1 = tf.keras.layers.Activation('softmax')(x1)
    
    x2 = tf.keras.layers.Dropout(0.1)(x[0]) 
    x2 = tf.keras.layers.Conv1D(1,1)(x2)
    x2 = tf.keras.layers.Flatten()(x2)
    x2 = tf.keras.layers.Activation('softmax')(x2)

    model = tf.keras.models.Model(inputs=[ids, att, tok], outputs=[x1,x2])
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)

    return model

In [15]:
K.clear_session()
model = build_model()
model.summary()

Some weights of the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 96)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 96)]         0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 96)]         0                                            
__________________________________________________________________________________________________
tf_roberta_model (TFRobertaMode ((None, 96, 768), (N 124645632   input_1[0][0]                    
______________________________________________________________________________________________

In [16]:
jac = []; VER='v0'; DISPLAY=1 # USE display=1 FOR INTERACTIVE
oof_start = np.zeros((input_ids.shape[0],MAX_LEN))
oof_end = np.zeros((input_ids.shape[0],MAX_LEN))
preds_start = np.zeros((input_ids_t.shape[0],MAX_LEN))
preds_end = np.zeros((input_ids_t.shape[0],MAX_LEN))

skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)
for fold,(idxT,idxV) in enumerate(skf.split(input_ids,train.sentiment.values)):

    print('#'*25)
    print(f"### FOLD {fold+1}")
    print('#'*25)
    
    K.clear_session()
    model = build_model()
        
    sv = tf.keras.callbacks.ModelCheckpoint(
        '%s-roberta-%i.h5'%(VER,fold), monitor='val_loss', verbose=1, save_best_only=True,
        save_weights_only=True, mode='auto', save_freq='epoch')
        
    model.fit([input_ids[idxT,], attention_mask[idxT,], token_type_ids[idxT,]], [start_tokens[idxT,], end_tokens[idxT,]], 
        epochs=3, batch_size=32, verbose=DISPLAY, callbacks=[sv],
        validation_data=([input_ids[idxV,],attention_mask[idxV,],token_type_ids[idxV,]], 
        [start_tokens[idxV,], end_tokens[idxV,]]))
    
    model.load_weights('%s-roberta-%i.h5'%(VER,fold))
    
    print('#'*5)
    print("Predicting Validation")
    print('#'*5)
    oof_start[idxV,],oof_end[idxV,] = model.predict([input_ids[idxV,],attention_mask[idxV,],token_type_ids[idxV,]],verbose=DISPLAY)
    
    print('#'*5)
    print("Predicting Test")
    print('#'*5)
    preds = model.predict([input_ids_t,attention_mask_t,token_type_ids_t],verbose=DISPLAY)
    preds_start += preds[0]/skf.n_splits
    preds_end += preds[1]/skf.n_splits
    
    # DISPLAY FOLD JACCARD
    all = []
    for k in idxV:
        a = np.argmax(oof_start[k,])
        b = np.argmax(oof_end[k,])
        if a>b: 
            st = train.loc[k,'text'] # IMPROVE CV/LB with better choice here
        else:            
            st = tokenizer.decode(input_ids[k][a:b+1])
        all.append(jaccard(st,train.loc[k,'selected_text']))
    jac.append(np.mean(all))
    print(f"FOLD {fold+1} Jaccard {np.mean(all)}")
    print()

#########################
### FOLD 1
#########################


Some weights of the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Epoch 1/3
Epoch 00001: val_loss improved from inf to 1.83191, saving model to v0-roberta-0.h5
Epoch 2/3
Epoch 00002: val_loss improved from 1.83191 to 1.68792, saving model to v0-roberta-0.h5
Epoch 3/3
Epoch 00003: val_loss improved from 1.68792 to 1.67941, saving model to v0-roberta-0.h5
#####
Predicting Validation
#####
#####
Predicting Test
#####
FOLD 1 Jaccard 0.6855969923921674

#########################
### FOLD 2
#########################


Some weights of the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Epoch 1/3
Epoch 00001: val_loss improved from inf to 1.78218, saving model to v0-roberta-1.h5
Epoch 2/3
Epoch 00002: val_loss improved from 1.78218 to 1.70279, saving model to v0-roberta-1.h5
Epoch 3/3
Epoch 00003: val_loss improved from 1.70279 to 1.67826, saving model to v0-roberta-1.h5
#####
Predicting Validation
#####
#####
Predicting Test
#####
FOLD 2 Jaccard 0.6906972244821378

#########################
### FOLD 3
#########################


Some weights of the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Epoch 1/3
Epoch 00001: val_loss improved from inf to 1.70524, saving model to v0-roberta-2.h5
Epoch 2/3
Epoch 00002: val_loss improved from 1.70524 to 1.62728, saving model to v0-roberta-2.h5
Epoch 3/3
Epoch 00003: val_loss did not improve from 1.62728
#####
Predicting Validation
#####
#####
Predicting Test
#####
FOLD 3 Jaccard 0.6961756159437377

#########################
### FOLD 4
#########################


Some weights of the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Epoch 1/3
Epoch 00001: val_loss improved from inf to 1.67201, saving model to v0-roberta-3.h5
Epoch 2/3
Epoch 00002: val_loss did not improve from 1.67201
Epoch 3/3
Epoch 00003: val_loss improved from 1.67201 to 1.65794, saving model to v0-roberta-3.h5
#####
Predicting Validation
#####
#####
Predicting Test
#####
FOLD 4 Jaccard 0.6904126036150283

#########################
### FOLD 5
#########################


Some weights of the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Epoch 1/3
Epoch 00001: val_loss improved from inf to 1.75699, saving model to v0-roberta-4.h5
Epoch 2/3
Epoch 00002: val_loss improved from 1.75699 to 1.66032, saving model to v0-roberta-4.h5
Epoch 3/3
Epoch 00003: val_loss did not improve from 1.66032
#####
Predicting Validation
#####
#####
Predicting Test
#####
FOLD 5 Jaccard 0.6901100121668927



In [17]:
print('>>>> OVERALL 5Fold CV Jaccard =',np.mean(jac))

>>>> OVERALL 5Fold CV Jaccard = 0.6905984897199928


###### Training

###### Load model

###### 2nd phase training