# Info
## kaggle
[current kaggle kernel](https://www.kaggle.com/al0kharba/tensorflow-roberta-0-712)

- conv and dense layer added
- lr scheduler added

[original kaggle kernel](https://www.kaggle.com/cdeotte/tensorflow-roberta-0-705)

## model
[hugging_face_Roberta](https://huggingface.co/transformers/model_doc/roberta.html#)

- 문제: `TFRobertaModel`은 TensorFlow 2.0 이상에서만 사용 가능 (현재 환경은 TF 1.14.0이라 `NameError: name 'TFRobertaModel' is not defined` 발생)

In [42]:
import pandas as pd, numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.model_selection import StratifiedKFold
from transformers import *
import tokenizers
print('TF version',tf.__version__)
# `from transformers import *` does not export the TF* model classes when the
# installed TensorFlow is older than 2.0, so TFRobertaModel would only fail
# much later with a NameError inside build_model(). Surface the cause here.
if int(tf.__version__.split('.')[0]) < 2:
    print('WARNING: TensorFlow >= 2.0 is required for TFRobertaModel; '
          'build_model() will fail with a NameError on this version.')

TF version 1.14.0


# 01. DATA

In [3]:
def read_train(path='train.csv'):
    """Load the training set with `text` and `selected_text` as plain strings.

    Missing values in either column are replaced by the empty string before
    the string cast, so a missing tweet does not become the literal word
    ``'nan'`` (which is what ``astype(str)`` alone produces).

    Parameters
    ----------
    path : str, optional
        CSV file to read. Defaults to ``'train.csv'``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents; `text` and `selected_text` contain only `str` values.
    """
    train = pd.read_csv(path)
    for col in ('text', 'selected_text'):
        train[col] = train[col].fillna('').astype(str)
    return train
def read_test(path='test.csv'):
    """Load the test set with `text` as plain strings.

    Missing values are replaced by the empty string before the string cast,
    so a missing tweet does not become the literal word ``'nan'``.

    Parameters
    ----------
    path : str, optional
        CSV file to read. Defaults to ``'test.csv'``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents; `text` contains only `str` values.
    """
    test = pd.read_csv(path)
    test['text'] = test['text'].fillna('').astype(str)
    return test

# Load both competition splits once; the preprocessing cells iterate over their rows.
df_train, df_test = read_train(), read_test()

def jaccard(str1, str2):
    """Word-level Jaccard similarity between two strings.

    Both inputs are converted with `str`, lower-cased and split on whitespace;
    the score is |A ∩ B| / |A ∪ B| over the resulting word sets.

    Parameters
    ----------
    str1, str2 : object
        Values to compare (anything `str()` accepts).

    Returns
    -------
    float
        Similarity in [0, 1]. Two inputs that both contain no words are
        treated as identical and score 1.0 (previously this case raised
        ZeroDivisionError).
    """
    a = set(str(str1).lower().split())
    b = set(str(str2).lower().split())
    if not a and not b:
        # Empty union: avoid 0/0 and treat two empty word sets as equal.
        return 1.0
    c = a & b
    return float(len(c)) / (len(a) + len(b) - len(c))

# 02. PREPROCESSING (using `numpy`)

In [5]:
# Fixed sequence length of every model input row.
MAX_LEN = 96
# Directory holding the RoBERTa vocab / merges / config / weight files.
PATH = './roberta/'

# Byte-level BPE tokenizer built from the local RoBERTa vocabulary files.
_tokenizer_files = dict(
    vocab_file=PATH + 'vocab-roberta-base.json',
    merges_file=PATH + 'merges-roberta-base.txt',
)
tokenizer = tokenizers.ByteLevelBPETokenizer(
    lowercase=True,
    add_prefix_space=True,
    **_tokenizer_files
)

# Token id inserted into `input_ids` for each sentiment label
# (presumably the vocabulary ids of the three sentiment words -- confirm).
sentiment_id = dict(positive=1313, negative=2430, neutral=7974)

### df_train
- `input_ids` = [0 + TEXT_IDS + 2 + 2 + SENTIMENT_ID + 2] (`0` == CLS, `2` == SEP), 나머지 위치는 1로 채움
- `attention_mask` = 앞의 (len(TEXT_IDS) + 5)개 위치가 1, 나머지는 0
- `token_type_ids` = 전부 0
- `start_tokens` = selected_text가 시작하는 토큰 위치(CLS 때문에 +1)만 1인 one-hot
- `end_tokens` = selected_text가 끝나는 토큰 위치(CLS 때문에 +1)만 1인 one-hot

In [19]:
# Build the fixed-size training arrays: model inputs plus one-hot start/end targets.
ct = len(df_train)
# Every position starts as 1 (presumably RoBERTa's padding id -- confirm).
input_ids = np.full((ct, MAX_LEN), 1, dtype='int32')
attention_mask, token_type_ids, start_tokens, end_tokens = (
    np.zeros((ct, MAX_LEN), dtype='int32') for _ in range(4)
)

for row in range(ct):

    # Normalise whitespace; the tweet gets a leading space before tokenisation.
    tweet = " " + " ".join(df_train.loc[row, 'text'].split())
    selection = " ".join(df_train.loc[row, 'selected_text'].split())

    # Per-character flags: 1 where the tweet character belongs to selected_text.
    sel_start = tweet.find(selection)
    in_selection = np.zeros(len(tweet))
    in_selection[sel_start:sel_start + len(selection)] = 1
    if tweet[sel_start - 1] == ' ':
        # Also flag the space just before the selection.
        in_selection[sel_start - 1] = 1

    token_ids = tokenizer.encode(tweet).ids

    # Character span (begin, end) of every token, from the decoded token lengths.
    token_spans = []
    cursor = 0
    for token_id in token_ids:
        piece_len = len(tokenizer.decode([token_id]))
        token_spans.append((cursor, cursor + piece_len))
        cursor += piece_len

    # Positions of the tokens that overlap at least one selected character.
    selected_tokens = [
        pos for pos, (lo, hi) in enumerate(token_spans)
        if np.sum(in_selection[lo:hi]) > 0
    ]

    sentiment_token = sentiment_id[df_train.loc[row, 'sentiment']]

    # Fill ids and mask; targets are shifted by one for the leading 0 (CLS) token.
    used = len(token_ids) + 5
    input_ids[row, :used] = [0] + token_ids + [2, 2, sentiment_token, 2]
    attention_mask[row, :used] = 1
    if selected_tokens:
        start_tokens[row, selected_tokens[0] + 1] = 1
        end_tokens[row, selected_tokens[-1] + 1] = 1
          

### df_test

In [20]:
# Build the fixed-size test arrays (same input layout as training, no targets).
ct = len(df_test)
# Every position starts as 1 (presumably RoBERTa's padding id -- confirm).
input_ids_t = np.full((ct, MAX_LEN), 1, dtype='int32')
attention_mask_t, token_type_ids_t = (
    np.zeros((ct, MAX_LEN), dtype='int32') for _ in range(2)
)

for row in range(ct):

    # Normalise whitespace; the tweet gets a leading space before tokenisation.
    tweet = " " + " ".join(df_test.loc[row, 'text'].split())
    token_ids = tokenizer.encode(tweet).ids
    sentiment_token = sentiment_id[df_test.loc[row, 'sentiment']]

    # Layout: 0 + text ids + 2 + 2 + sentiment id + 2, i.e. five extra tokens.
    used = len(token_ids) + 5
    input_ids_t[row, :used] = [0] + token_ids + [2, 2, sentiment_token, 2]
    attention_mask_t[row, :used] = 1

# 03. Model 

In [33]:
# NOTE: these learning-rate values still need to be tested!
def scheduler(epoch):
    """Return the learning rate for `epoch`: 3e-5 multiplied by 0.2 once per epoch."""
    initial_lr = 3e-5
    decay_per_epoch = 0.2
    return initial_lr * decay_per_epoch ** epoch

In [44]:
def build_model():
    """Build and compile the RoBERTa model with two per-position softmax heads.

    Three int32 inputs of length MAX_LEN (token ids, attention mask, token
    type ids) are passed to a TFRobertaModel loaded from PATH. Two heads with
    the same layer stack (Dropout -> Conv1D(128) -> LeakyReLU -> Conv1D(64)
    -> Dense(1) -> Flatten -> softmax) are applied to the first RoBERTa
    output (presumably the per-token hidden states -- confirm).

    Returns
    -------
    tf.keras.models.Model
        Model with inputs [ids, att, tok] and outputs [head 1, head 2]; the
        prediction cell reads them as start and end probabilities. Compiled
        with Adam(learning_rate=3e-5) and 'binary_crossentropy'.
    """
    layers = tf.keras.layers

    def position_head(roberta_outputs):
        # One head; layers are created and applied in the order listed, so
        # calling this twice keeps the layer creation order of the two heads.
        h = layers.Dropout(0.1)(roberta_outputs[0])
        h = layers.Conv1D(128, 2, padding='same')(h)
        h = layers.LeakyReLU()(h)
        h = layers.Conv1D(64, 2, padding='same')(h)
        h = layers.Dense(1)(h)
        h = layers.Flatten()(h)
        return layers.Activation('softmax')(h)

    ids, att, tok = [
        layers.Input((MAX_LEN,), dtype=tf.int32) for _ in range(3)
    ]

    config = RobertaConfig.from_pretrained(PATH + 'config-roberta-base.json')
    bert_model = TFRobertaModel.from_pretrained(
        PATH + 'pretrained-roberta-base.h5', config=config
    )
    roberta_outputs = bert_model(ids, attention_mask=att, token_type_ids=tok)

    start_probs = position_head(roberta_outputs)
    end_probs = position_head(roberta_outputs)

    model = tf.keras.models.Model(
        inputs=[ids, att, tok], outputs=[start_probs, end_probs]
    )
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
    # NOTE(review): outputs are softmax over positions but the loss is
    # binary_crossentropy -- confirm this pairing is intended.
    model.compile(loss='binary_crossentropy', optimizer=optimizer)

    return model

In [45]:
# Average the test-set start/end probabilities over the saved fold models.
# `n_splits` was previously used in the averaging below without being defined
# anywhere in the notebook; it is the number of fold checkpoints
# (v4-roberta-0.h5 .. v4-roberta-4.h5) and now also drives the loop.
n_splits = 5
DISPLAY = 1  # verbosity passed to model.predict
preds_start = np.zeros((input_ids_t.shape[0],MAX_LEN))
preds_end = np.zeros((input_ids_t.shape[0],MAX_LEN))
for i in range(n_splits):
    print('#'*25)
    print('### MODEL %i'%(i+1))
    print('#'*25)

    # Clear the previous fold's Keras session before building a fresh model.
    K.clear_session()
    model = build_model()
    model.load_weights('../input/model4/v4-roberta-%i.h5'%i)

    print('Predicting Test...')
    preds = model.predict([input_ids_t,attention_mask_t,token_type_ids_t],verbose=DISPLAY)
    # Each fold contributes 1/n_splits, so the sums end up as the mean prediction.
    preds_start += preds[0]/n_splits
    preds_end += preds[1]/n_splits

#########################
### MODEL 1
#########################


NameError: name 'TFRobertaModel' is not defined