# Model Training

## 1.Import Dependancies

In [11]:
import tensorflow as tf
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TFAutoModelForCausalLM
)
import pandas as pd
from datasets import Dataset

## 2. Load the model and tokenizer

In [12]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = TFAutoModelForCausalLM.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


## Load and Preprocess Dataset

### helper functions

In [13]:
# Tokenization function

def tokenize_function(example):
    return tokenizer(
        example["context"],
        text_pair=example["response"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )

In [14]:
# Convert to TensorFlow dataset

def tf_format(example):
    return (
        {
            "input_ids": tf.convert_to_tensor(example["input_ids"]),
            "attention_mask": tf.convert_to_tensor(example["attention_mask"]),
        },
        tf.convert_to_tensor(example["input_ids"]),
    )

In [15]:
dataset_path = "dataset.csv"

# Load dataset
df = pd.read_csv(dataset_path)

# Ensure all values are strings and fill NaN with an empty string
df["context"] = df["context"].astype(str).fillna("")
df["response"] = df["response"].astype(str).fillna("")


# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Tokenize dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

train_dataset = tf.data.Dataset.from_generator(
    lambda: (tf_format(sample) for sample in tokenized_dataset),
    output_signature=(
        {
            "input_ids": tf.TensorSpec(shape=(512,), dtype=tf.int32),
            "attention_mask": tf.TensorSpec(shape=(512,), dtype=tf.int32),
        },
        tf.TensorSpec(shape=(512,), dtype=tf.int32),
    ),
)


# Shuffle and batch
BATCH_SIZE = 8
train_dataset = train_dataset.shuffle(10000).batch(BATCH_SIZE)

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

## Compile And Train Model

In [16]:
# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer=optimizer, loss=loss_fn)

# Train the model
EPOCHS = 5
model.fit(train_dataset, epochs=EPOCHS)

print("Training complete!")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Training complete!


In [17]:
model.save_pretrained("model_tf")
tokenizer.save_pretrained("model_tf")

('model_tf/tokenizer_config.json',
 'model_tf/special_tokens_map.json',
 'model_tf/vocab.json',
 'model_tf/merges.txt',
 'model_tf/added_tokens.json',
 'model_tf/tokenizer.json')

In [None]:
model.evaluate(train_dataset)



7.290469511644915e-05

In [None]:
test = 'I have been having troubles sleeping'

input_ids = tokenizer.encode(test, return_tensors="tf")

# Generate text
generated_output = model.generate(
    input_ids,
    max_length=100,           # maximum length of the generated sequence
    num_beams=5,             # use beam search for more coherent outputs
    no_repeat_ngram_size=2,  # avoid repetition
    early_stopping=True
)

# Decode and print the output
generated_text = tokenizer.decode(generated_output[0], skip_special_tokens=True)
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


I have been having troubles sleeping sleeping asleep asleep slept slept asleep sleep sleep Sleep SleepSleepSleepsleepsleepspacespaceSpaceSpacespace space space Space SpaceSpace Space space spaces spaces spac spac Spac Spac spac cos cos Cos CoscoscosCosCoscos CosCos Cos Cosmos Cosmos Cosmic Cosmic cosmic cosmic cosmos cosmos universe universe world world worlds worlds lives lives lived lived living living live live Live Live Living Living Live LIVE LIVE LiveLiveLive Live live LIVELivelivelivelifelifeLifeLifeLL L L l l L
