In [31]:
# Hugging Face components used below: text dataset wrapper, LM batch collator,
# GPT-2 tokenizer/model, and the Trainer with its arguments container.
from transformers import (TextDataset, DataCollatorForLanguageModeling,GPT2Tokenizer,
                          GPT2LMHeadModel,Trainer, TrainingArguments)
import pandas as pd
import matplotlib.pyplot as plt
import wandb
# Initialise Weights & Biases in "disabled" mode for this notebook session.
wandb.init(mode="disabled")
import warnings
# NOTE(review): this blanket filter hides every warning (including library
# deprecation notices) — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [32]:
# Read the question/response CSV into a DataFrame and preview five random rows.
data=pd.read_csv('../data/sample_q_and_a/response_2.csv')
data.sample(5)

Unnamed: 0,Question,Response
6248,\nI feel like I was born in the wrong body I f...,Hi. Do you have any opportunity to work with a...
3438,My friend is abusing her prescription medicine...,Your friend needs to admit they have a problem...
6476,\nI have so many issues to address. I have a h...,Hello! You may have heard the saying that coun...
6556,\nHow does a counselor decide when to end coun...,"For a therapist, deciding to end counseling se..."
5862,\nI love my girlfriend so much. I get an erect...,I'm sorry to hear of your problem.First step a...


In [33]:
# Print column dtypes and non-null counts to spot missing values.
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7115 entries, 0 to 7114
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Question  7115 non-null   object
 1   Response  7111 non-null   object
dtypes: object(2)
memory usage: 111.3+ KB


In [34]:
# Export the frame to the file that is later handed to TextDataset as the
# training corpus. That file is consumed as raw text, so index=False keeps
# pandas' row numbers from being written into (and learned from) the corpus.
# NOTE(review): data.info() reports 7111 non-null responses for 7115 rows, so a
# few rows are exported with an empty Response — consider dropping them first.
data.to_csv('../data/sample_q_and_a/train_response2.csv', index=False)


In [35]:
def load_dataset(file_path, tokenizer, block_size = 1024):
    """Wrap a text file in a ``TextDataset`` for language-model training.

    Args:
        file_path: Path forwarded to ``TextDataset`` as ``file_path``.
        tokenizer: Tokenizer forwarded to ``TextDataset`` as ``tokenizer``.
        block_size: Value forwarded to ``TextDataset`` as ``block_size``
            (defaults to 1024).

    Returns:
        The constructed ``TextDataset`` instance.
    """
    dataset_kwargs = {
        "tokenizer": tokenizer,
        "file_path": file_path,
        "block_size": block_size,
    }
    return TextDataset(**dataset_kwargs)

In [36]:
def load_data_collator(tokenizer, mlm = False):
    """Build the ``DataCollatorForLanguageModeling`` used to assemble batches.

    Args:
        tokenizer: Tokenizer forwarded to the collator.
        mlm: Forwarded to the collator's ``mlm`` flag; ``False`` by default.

    Returns:
        The constructed ``DataCollatorForLanguageModeling`` instance.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)

In [37]:
def train(train_file_path, model_name, output_dir, overwrite_output_dir,
          per_device_train_batch_size, num_train_epochs,
          block_size=1024, learning_rate=1e-4,
          gradient_accumulation_steps=1, fp16=False):
    """Fine-tune a pretrained GPT-2 checkpoint on a text file.

    Args:
        train_file_path: Path of the training text file.
        model_name: Name or path passed to ``from_pretrained`` for both the
            tokenizer and the model.
        output_dir: Directory that receives the tokenizer, checkpoints and
            the final model.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        block_size: Forwarded to ``load_dataset``. Lowering it is the most
            direct way to shrink per-step GPU memory when the default of
            1024 runs out of memory.
        learning_rate: Forwarded to ``TrainingArguments`` (default 1e-4,
            the value that used to be hard-coded).
        gradient_accumulation_steps: Forwarded to ``TrainingArguments``;
            lets a smaller per-device batch keep the same effective batch.
        fp16: Forwarded to ``TrainingArguments`` to enable mixed precision.

    Returns:
        tuple: ``(trainer, train_output)`` where ``train_output`` is the
        value returned by ``trainer.train()``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer, block_size=block_size)
    data_collator = load_data_collator(tokenizer)

    # Store the tokenizer in output_dir so it sits next to the saved model.
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)
    # The base weights are intentionally NOT saved here: trainer.save_model()
    # below writes the fine-tuned model, and an up-front save would leave an
    # untrained copy in output_dir whenever training fails part-way.

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        fp16=fp16,
        logging_dir="./logs",
        logging_steps=100,  # Log every 100 steps
        save_steps=500,  # Save checkpoint every 500 steps
        logging_first_step=True,
        save_total_limit=2,
        learning_rate=learning_rate,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )
    hist = trainer.train()
    trainer.save_model()
    return trainer, hist

In [45]:
# --- Run configuration ---
# Pretrained checkpoint to start from.
model_name = "gpt2"

# Training text file and the folder that receives the fine-tuned artifacts.
train_file_path = "../data/sample_q_and_a/train_response2.csv"
output_dir = "../data/sample_q_and_a/custom_model"
overwrite_output_dir = True

# Training schedule.
num_train_epochs = 30
per_device_train_batch_size = 2

In [46]:
# Load the tokenizer for `model_name` and display its configuration.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer

GPT2Tokenizer(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

In [47]:
# Load the pretrained GPT-2 LM-head model once and display its layer layout.
# A single from_pretrained call is enough; loading it again under another
# name would only hold a second full copy of the weights in memory.
model = GPT2LMHeadModel.from_pretrained(model_name)
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [48]:
# Display the underlying GPT2Model, i.e. the network without the `lm_head` layer.
model.base_model


GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

In [49]:
# Launch fine-tuning with the settings from the configuration cell.
# NOTE(review): train() returns a (trainer, train_output) pair, so `hist`
# holds that whole tuple rather than only the training output.
hist = train(
    train_file_path, model_name, output_dir, overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
)

OutOfMemoryError: CUDA out of memory. Tried to allocate 148.00 MiB. GPU 0 has a total capacty of 7.43 GiB of which 50.44 MiB is free. Including non-PyTorch memory, this process has 7.38 GiB memory in use. Of the allocated memory 7.25 GiB is allocated by PyTorch, and 54.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF