In [None]:
!pip install transformers datasets

In [None]:
import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel
import pandas as pd
import datasets

In [None]:
# Mount Google Drive at /content/drive; the dataset, checkpoints and saved model
# used by the cells below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the encoded tag/plot corpus. The file has no header row and is parsed
# with a tab separator; its single column is then labelled "text".
df = pd.read_csv(
    "/content/drive/MyDrive/AI/plot2tag_encoded_6500.csv",
    sep="\t",
    header=None,
)
df.columns = ["text"]
df

Unnamed: 0,text
0,~^monster^frankenstein's monster^mirror^alchem...
1,~^monster^frankenstein's monster^mirror^alchem...
2,~^monster^frankenstein's monster^mirror^alchem...
3,~^monster^frankenstein's monster^mirror^alchem...
4,~^monster^frankenstein's monster^mirror^alchem...
...,...
6536,~^parenthood^psychodrama^human animal hybrid^b...
6537,~^snow adventure^~@The sequel on the big scree...
6538,~^witch^sorceress^magic^supernatural killer^us...
6539,~^pokemon^~@A story of young adults who are on...


In [None]:
# Load the GPT-2 tokenizer identified by the "gpt2" checkpoint name.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [None]:
# Reuse the end-of-sequence token as the padding token, so the tokenizer has a
# pad id available when `tokenize_function` asks for padded sequences.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Wrap the DataFrame in a Hugging Face `datasets.Dataset` so it can be
# tokenized with `.map` and split with `.train_test_split` further down.
data = datasets.Dataset.from_pandas(df)
data

Dataset({
    features: ['text'],
    num_rows: 6541
})

In [None]:
# Fetch TensorFlow's current distribution strategy. It is used later to scale
# the batch size by the replica count and to scope model creation.
strategy = tf.distribute.get_strategy()

In [None]:
%%time

def tokenize_function(examples, tokenizer=tokenizer):
    """Tokenize a batch of texts into next-token-prediction features.

    Every text is truncated or padded to 1024 tokens and then split into
    model inputs (all tokens but the last) and labels (all tokens but the
    first), so the label at position i is the token following input i.

    Args:
        examples: Batch dict as passed by `Dataset.map(batched=True)`; only
            its "text" column is read.
        tokenizer: Tokenizer used for encoding. Defaults to the notebook-level
            `tokenizer`, bound at the time this function is defined.

    Returns:
        The tokenizer output with "input_ids" and "attention_mask" trimmed to
        1023 positions per example, plus a "labels" entry of the same length
        in which every pad-token id is replaced by -100.
    """
    texts = list(examples["text"])

    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=1024,
        truncation=True,
        # Replaces the deprecated `pad_to_max_length=True`.
        padding="max_length",
    )

    pad_id = tokenizer.pad_token_id
    # Labels are the input ids shifted left by one, with pad ids set to -100
    # (presumably the value the loss ignores — confirm against hf_compute_loss).
    # NOTE(review): the pad token is the EOS token in this notebook, so a
    # genuine EOS id inside a text would be masked as well.
    encoded["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in ids[1:]]
        for ids in encoded["input_ids"]
    ]

    # Drop the final position so inputs and mask line up with the shifted labels.
    encoded["input_ids"] = [ids[:-1] for ids in encoded["input_ids"]]
    encoded["attention_mask"] = [mask[:-1] for mask in encoded["attention_mask"]]
    return encoded

# Tokenize the whole dataset in batches and drop the raw "text" column, leaving
# only the features produced by `tokenize_function`.
# NOTE(review): `data` is rebound to the tokenized dataset, so re-running this
# cell without first re-creating `data` fails on the missing "text" column.
data = data.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
    load_from_cache_file=True,
)
print(data)

Map:   0%|          | 0/6541 [00:00<?, ? examples/s]



Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 6541
})
CPU times: user 25 s, sys: 580 ms, total: 25.6 s
Wall time: 36.9 s


In [None]:
# Return plain Python objects for the three feature columns, then hold out 10%
# of the rows as the "test" split. Shuffling uses a fixed seed (1) so the split
# is repeatable.
data.set_format(type="python", columns=["input_ids", "attention_mask", "labels"])
data = data.train_test_split(
    test_size=0.1, shuffle=True, seed=1, load_from_cache_file=True
)

In [None]:
# data.save_to_disk("content/drive/MyDrive/gpt2_v6500_dataset_shuffle_train_test")

In [None]:
def _to_tf_dataset(split):
    """Convert one tokenized split into a `tf.data.Dataset`.

    Args:
        split: A dataset split exposing "input_ids", "attention_mask" and
            "labels" columns.

    Returns:
        A dataset whose elements are `(features, labels)` pairs, where
        `features` is a dict with "input_ids" and "attention_mask" tensors.
    """
    features = {
        "input_ids": tf.convert_to_tensor(split["input_ids"]),
        "attention_mask": tf.convert_to_tensor(split["attention_mask"]),
    }
    labels = tf.convert_to_tensor(split["labels"])
    return tf.data.Dataset.from_tensor_slices((features, labels))


# Both splits go through the same conversion so they cannot drift apart.
train = _to_tf_dataset(data["train"])
test = _to_tf_dataset(data["test"])

CPU times: user 13.9 s, sys: 917 ms, total: 14.9 s
Wall time: 14.9 s


In [None]:
# Model params
BATCH_SIZE_PER_REPLICA = 2  # examples per replica; multiplied by the replica count when batching
EPOCHS = 5  # value passed as `epochs` to `model.fit`
INITIAL_LEARNING_RATE = 0.001  # starting value of the learning-rate schedule
# The original, misspelled name is kept as an alias because other cells refer to it.
INITAL_LEARNING_RATE = INITIAL_LEARNING_RATE

In [None]:

# Global batch size = per-replica batch size x number of replicas in the strategy.
try:
    BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync
except NameError:
    # `strategy` was never created in this session: fall back to a single replica.
    BATCH_SIZE = BATCH_SIZE_PER_REPLICA
# The shuffle buffer spans the whole training set.
BUFFER_SIZE = len(train)

# Incomplete final batches are dropped from both datasets.
train_ds = train.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
test_ds = test.batch(BATCH_SIZE, drop_remainder=True)

In [None]:
# Learning-rate decay: the rate is multiplied by 0.7 every 500 steps, applied
# in discrete jumps (staircase=True).
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    INITAL_LEARNING_RATE,
    decay_steps=500,
    decay_rate=0.7,
    staircase=True)

# Model initialisation
with strategy.scope():
    model = TFGPT2LMHeadModel.from_pretrained(
        "gpt2",
        use_cache=False,
        # Dropout has to be configured while the model is being constructed:
        # the layers take their rates from the config at build time, so values
        # assigned to `model.config` afterwards do not reach them. The 0.2
        # matches what the notebook assigns to the config after loading.
        embd_pdrop=0.2,
        resid_pdrop=0.2,
        attn_pdrop=0.2,
    )
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
    # The model's own loss function is used as the Keras loss.
    model.compile(optimizer=optimizer, loss=model.hf_compute_loss)

    model.summary()

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Model: "tfgpt2lm_head_model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 transformer (TFGPT2MainLaye  multiple                 124439808 
 r)                                                              
                                                                 
Total params: 124,439,808
Trainable params: 124,439,808
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Set the embedding, residual and attention dropout probabilities on the model
# config to 0.2.
# NOTE(review): `model` is already constructed at this point. Config values
# assigned afterwards presumably do not reach dropout layers that were built
# from the original config — confirm; dropout meant to apply during training
# should be passed to `from_pretrained(...)`.
model.config.embd_pdrop = model.config.resid_pdrop = model.config.attn_pdrop = 0.2

In [None]:
# Training callbacks:
#   * EarlyStopping stops as soon as `val_loss` fails to improve for one epoch
#     (patience=1) and restores the best weights seen so far.
#   * ModelCheckpoint writes a weights-only checkpoint whenever `val_loss`
#     reaches a new minimum (save_best_only=True), not after every epoch.
from datetime import datetime

# Timestamp prefix that keeps checkpoint files from different runs apart.
now = datetime.now().strftime("%Y-%m-%d_%H%M")
# Create the callbacks
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor="val_loss", verbose=1, patience=1, restore_best_weights=True
    ),

    tf.keras.callbacks.ModelCheckpoint(
        # The {epoch} and {val_loss} placeholders are filled in at save time.
        filepath= "/content/drive/MyDrive/AI/model_checkpoints/gpt2tens_v6500/" + now + "_GPT2-Model_{epoch:02d}_{val_loss:.4f}.ckpt",
        save_weights_only=True,
        monitor='val_loss',
        mode='min',
        save_best_only=True,
        verbose=1
    )
]

In [None]:
# Resume from the most recent checkpoint recorded in the checkpoint directory.
checkpoint_dir = '/content/drive/MyDrive/AI/model_checkpoints/gpt2tens_v6500'
latest_checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint_path is None:
    # `latest_checkpoint` returns None when no checkpoint is found; fail with a
    # clear message instead of handing None to `load_weights`.
    raise FileNotFoundError(
        f"No TensorFlow checkpoint found in {checkpoint_dir!r}; nothing to resume from."
    )
model.load_weights(latest_checkpoint_path)

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7fc7d05affd0>

In [None]:
# Train the model
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE
print(
    f"Model Params:\nbatch_size: {BATCH_SIZE}\nEpochs: {EPOCHS}\n"
    f"Step p. Epoch: {steps_per_epoch}\n"
    f"Initial Learning rate: {INITAL_LEARNING_RATE}"
)
# `train_ds` and `test_ds` are already batched tf.data datasets, so `batch_size`
# is not passed to `fit`; the batch size comes from the datasets themselves.
hist = model.fit(
    train_ds,
    validation_data=test_ds,
    epochs=EPOCHS,
    callbacks=callbacks,
    verbose=1,
    # Epoch numbering starts after two epochs, so the first epoch run is
    # reported as 3/EPOCHS.
    # NOTE(review): hard-coded; presumably matches the checkpoint loaded
    # before training — adjust when resuming from a different point.
    initial_epoch=2
)

Model Params:
batch_size: 2
Epochs: 5
Step p. Epoch: 2943
Initial Learning rate: 0.001
Epoch 3/5
Epoch 3: val_loss improved from inf to 3.35710, saving model to /content/drive/MyDrive/AI/model_checkpoints/gpt2tens_v6500/2023-05-22_1422_GPT2-Model_03_3.3571.ckpt
Epoch 4/5

Epoch 4: val_loss did not improve from 3.35710
Epoch 4: early stopping


In [None]:
# checkpoint_dir = '/content/drive/MyDrive/AI/model_checkpoints/gpt2tens_v6500'
# latest_checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
# model.load_weights(latest_checkpoint_path)

In [None]:
# Path of one specific earlier checkpoint. It is only referenced by the
# disabled loading code below, so nothing is loaded from it here.
filepath= "/content/drive/MyDrive/AI/model_checkpoints/gpt2tens_v6500/2023-05-20_1613_GPT2-Model_02_3.3107.ckpt"
# model_check = tf.train.load_checkpoint(filepath)
# model.load_weights(model_check)

In [None]:
# Example prompt in the dataset's encoding: "~", then "^"-separated tags, then "~@".
# The visible cells never read `inp`; the generation call repeats the literal.
# NOTE(review): the dataset rows displayed earlier end the tag list with "^~@"
# (e.g. "~^pokemon^~@..."), while this prompt ends with "~@" — confirm which
# form the model was trained on.
inp = "~^devil^forest^time travel^secret organization~@"

In [None]:
# Save the fine-tuned model in Hugging Face format to Drive; the inference cell
# reloads it from this same directory. The disabled line below is an earlier
# Keras-style save.
# model.save("/content/drive/MyDrive/AI/gpt2tens_model/model.h5", save_format="tf")
model.save_pretrained("/content/drive/MyDrive/AI/GPT2-Fantasy/")

In [None]:
# Reload the fine-tuned model from the directory written by `save_pretrained`
# and wrap it in a text-generation pipeline.
# NOTE(review): the model/tokenizer imports repeat the ones at the top of the
# notebook, presumably so this cell can run on its own in a fresh session — confirm.
from transformers import pipeline
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer

model_test = TFGPT2LMHeadModel.from_pretrained('/content/drive/MyDrive/AI/GPT2-Fantasy/')
# The stock "gpt2" tokenizer is loaded again rather than reusing `tokenizer`.
gpt_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

story = pipeline(
    "text-generation",
    model=model_test,
    tokenizer=gpt_tokenizer,
    device=0  # device index 0; NOTE(review): presumably the first GPU — confirm a GPU runtime is attached
)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at /content/drive/MyDrive/AI/gpt2tens_model2/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [None]:
# Generate a plot for the given tag prompt: at most 128 tokens, 4 beams and a
# repetition penalty of 7.0.
# NOTE(review): no `do_sample=True` is passed, so `temperature` and `seed`
# presumably have no effect on this beam-search call — confirm.
# NOTE(review): the dataset rows displayed earlier end the tag list with "^~@",
# while this prompt ends with "~@" — confirm the intended prompt format.
story("~^devil^forest^time travel^secret organization~@", temperature=1.0,
                                max_length=128,
                                repetition_penalty=7.0,
                                num_beams=4, seed=0)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "~^devil^forest^time travel^secret organization~@A young man and his wife are killed in a car accident while driving across the country. Their father, who has lived for thousands of years, is convinced that he's connected to an ancient Indian deity known as Ganja. While trying to save their daughter from Ganja, they encounter many supernatural beings which have appeared over the course of centuries. One such being may be Ganja himself but it will take more than just one person to save them. In order to escape Ganja you must fight back against Ganja with your own life-or-death struggles. Can"}]