In [None]:
!pip install transformers
!pip install datasets
!pip install huggingface_hub

In [None]:
import tensorflow as tf
from transformers import AutoTokenizer
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from itertools import chain
import datasets
from transformers.keras_callbacks import PushToHubCallback
from transformers import TFAutoModelForCausalLM, DataCollatorForLanguageModeling, create_optimizer, AdamWeightDecay

import os

import huggingface_hub

# Read the Hugging Face access token from the HF_TOKEN environment variable
# instead of embedding it in the notebook. When the variable is unset the
# token is None and login() falls back to asking for it interactively.
huggingface_hub.login(token=os.environ.get("HF_TOKEN"))

# Hub repository id passed to the from_pretrained() calls for the tokenizer and the model.
modelName = "Eliac11/tinkNLP"
# CSV file that the training data is read from.
datafilename = "data.csv"

In [7]:
def getData(d, chunk=64):
    """Concatenate a batch of per-example lists and re-split it into fixed-size rows.

    Args:
        d: Mapping from column name to a list of per-example sequences (the
            batch handed over by ``Dataset.map(..., batched=True)``). It must
            contain an ``"input_ids"`` column.
        chunk: Number of items per output row. Defaults to 64.

    Returns:
        A dict with the same keys as ``d``. Each value is the concatenated
        column cut into consecutive slices of ``chunk`` items. When the
        concatenated length is at least ``chunk``, the trailing remainder that
        does not fill a whole row is dropped; a shorter batch is returned as a
        single short row. A ``"labels"`` key holding a shallow copy of the
        ``"input_ids"`` rows is added.

    Raises:
        IndexError: if ``d`` has no columns.
        KeyError: if ``d`` has no ``"input_ids"`` column.
    """
    # chain.from_iterable flattens in linear time, whereas sum(lists, [])
    # copies the accumulated list on every addition.
    concat = {k: list(chain.from_iterable(d[k])) for k in d.keys()}

    # The length of the first column decides how many rows are produced.
    total = len(concat[list(concat)[0]])
    if total >= chunk:
        total = (total // chunk) * chunk

    data = {
        k: [t[i : i + chunk] for i in range(0, total, chunk)]
        for k, t in concat.items()
    }
    data["labels"] = data["input_ids"].copy()
    return data

In [11]:
# Load the CSV and drop every row that has a missing value.
df = pd.read_csv(datafilename)
df = df.dropna()

# Hold out 10% of the first 5000 rows for validation. The fixed random_state
# makes the split identical on every run.
trnDf, valDf = train_test_split(df[:5000], test_size=0.1, random_state=42)

trainDs = Dataset.from_pandas(trnDf, split="train")
testDs = Dataset.from_pandas(valDf, split="test")

tokenizer = AutoTokenizer.from_pretrained(modelName)

def converting(e):
    """Tokenize one row as a single space-joined string.

    The fields are joined in the order context_3, context_2, context_1,
    response, and the joined text is passed to the module-level ``tokenizer``.
    Returns whatever the tokenizer call returns.
    """
    fields = ("context_3", "context_2", "context_1", "response")
    return tokenizer(" ".join(e[name] for name in fields))

In [None]:
# Run converting() over each split, removing every original column from the result.
tokenTrn = trainDs.map(
    converting,
    remove_columns=trainDs.column_names,
)
tokenTest = testDs.map(
    converting,
    remove_columns=testDs.column_names,
)

In [None]:

# Re-chunk the tokenized splits with getData(), batch-wise and with num_proc=4.
lmTrn = tokenTrn.map(
    getData,
    batched=True,
    num_proc=4,
)
lmTest = tokenTest.map(
    getData,
    batched=True,
    num_proc=4,
)

# Language-modeling collator with masked-LM mode switched off, producing TensorFlow tensors.
dataCollator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    return_tensors="tf",
)
# from_pt=True: the TensorFlow model is built from PyTorch weights.
model = TFAutoModelForCausalLM.from_pretrained(
    modelName,
    from_pt=True,
)


In [None]:
# Wrap both splits as batched datasets for Keras. Only the training split is shuffled.
tfTrain = model.prepare_tf_dataset(
    lmTrn,
    shuffle=True,
    batch_size=16,
    collate_fn=dataCollator,
)
tfTest = model.prepare_tf_dataset(
    lmTest,
    shuffle=False,
    batch_size=16,
    collate_fn=dataCollator,
)

In [15]:
# Optimizer with weight decay. compile() receives only the optimizer:
# no loss and no metrics are passed here.
optmzr = AdamWeightDecay(
    learning_rate=2e-5,
    weight_decay_rate=0.005,
)
model.compile(optimizer=optmzr)

In [None]:
# Keras callback that pushes to the Hugging Face Hub, using "FitModel" as its output directory.
hub = PushToHubCallback(
    output_dir="FitModel",
    tokenizer=tokenizer,
)

In [33]:
# Target path of the checkpoint callback that saves the full model.
modelSavePath = "./data/"
# Target path of the checkpoint callback that keeps only the best weights.
bestModelSavePath = "./databest/"

In [34]:
# Save the full model (not only the weights) to modelSavePath after every epoch.
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    modelSavePath, verbose=1, save_weights_only=False)

# model.compile() is called without metrics, so the training logs never
# contain "val_accuracy"; monitoring it would leave early stopping and the
# best-model checkpoint permanently inactive. Validation loss is always
# logged when validation_data is passed to fit(), so monitor that instead
# (lower is better, hence mode='min').
es_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', mode='min', patience=5, verbose=1)

# Keep only the weights of the epoch with the lowest validation loss.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=bestModelSavePath,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True)

# update_freq=1 writes TensorBoard scalars after every batch.
tb_callback = tf.keras.callbacks.TensorBoard('./logs', update_freq=1)

In [None]:
# Train for at most 100 epochs, validating on tfTest, with all callbacks attached.
model.fit(
    x=tfTrain,
    validation_data=tfTest,
    epochs=100,
    callbacks=[
        cp_callback,
        es_callback,
        tb_callback,
        model_checkpoint_callback,
        hub,
    ],
)

Epoch 1/100
Epoch 1: saving model to ./data/




Epoch 2/100
Epoch 2: saving model to ./data/




Epoch 3/100
Epoch 3: saving model to ./data/




Epoch 4/100
Epoch 4: saving model to ./data/




Epoch 5/100
Epoch 5: saving model to ./data/




Epoch 6/100
Epoch 6: saving model to ./data/




Epoch 7/100
Epoch 7: saving model to ./data/




Epoch 8/100
Epoch 8: saving model to ./data/




Epoch 9/100
Epoch 9: saving model to ./data/




Epoch 10/100
Epoch 10: saving model to ./data/




Epoch 11/100
Epoch 11: saving model to ./data/




Epoch 12/100
Epoch 12: saving model to ./data/




Epoch 13/100
Epoch 13: saving model to ./data/




Epoch 14/100
Epoch 14: saving model to ./data/


