In [1]:
!pip install transformers

from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoConfig

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Load the tokenizer and model
context_length = 1024

import torch
print(torch.cuda.is_available())
torch.cuda.empty_cache()
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', max_length=context_length)

# Add question and answer tokens.
q_token, a_token, pad_token = '<|question|>', '<|answer|>', '<|pad|>'
tokenizer.add_tokens([q_token, a_token])
tokenizer.add_special_tokens({'pad_token': pad_token})

config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id
)
model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

True
GPT-2 size: 124.4M parameters


In [3]:
# Prepare the dataset.
import pandas as pd
# Load the FIQA dataset.
DATASET_DIRNAME = "./drive/MyDrive/FiQA_train_task2/"
QUESTIONS_FILENAME = "FiQA_train_question_final.tsv"
ANSWERS_FILENAME = "FiQA_train_doc_final.tsv"
QNA_PAIRS_FILENAME = "FiQA_train_question_doc_final.tsv"

q_df = pd.read_csv(DATASET_DIRNAME + QUESTIONS_FILENAME, delimiter='\t')
a_df = pd.read_csv(DATASET_DIRNAME + ANSWERS_FILENAME, delimiter='\t')
qna_df = pd.read_csv(DATASET_DIRNAME + QNA_PAIRS_FILENAME, delimiter='\t')

# Merge dataframes based on the qna pairs
merged_df1 = qna_df.merge(q_df, left_on='qid', right_on='qid')
df = merged_df1.merge(a_df, left_on='docid', right_on='docid')
df.drop(['Unnamed: 0', 'Unnamed: 0_x','Unnamed: 0_y', 'qid', 'docid', 'timestamp_x', 'timestamp_y'], axis=1, inplace=True)
df.head(6)

Unnamed: 0,question,doc
0,What is considered a business expense on a bus...,The IRS Guidance pertaining to the subject. I...
1,Claiming business expenses for a business with...,Yes you can claim your business deductions if ...
2,Transferring money from One business checking ...,You should have separate files for each of the...
3,Having a separate bank account for business/in...,Having a separate checking account for the bus...
4,Having a separate bank account for business/in...,"You don't specify which country you are in, so..."
5,Having a separate bank account for business/in...,"If it makes your finances easier, why not? My ..."


In [4]:
nan_rows = df[df.isnull().any(axis=1)]
print(nan_rows)

before_drop = df.shape[0]

dropped_rows = df[df.isnull().any(axis=1)]

df = df.dropna(how='any')

after_drop = df.shape[0]

print("Number of dropped rows: ", before_drop-after_drop)

                                                question  doc
215           I have $100,000 in play money… what to do?  NaN
1855   Optimal way to use a credit card to build bett...  NaN
2307   Why do some online stores not ask for the 3-di...  NaN
3347   Why do US retirement funds typically have way ...  NaN
3391   Is there any way to buy a new car directly fro...  NaN
3758                         Emptying a Roth IRA account  NaN
3765   How to evaluate stocks? e.g. Whether some stoc...  NaN
3822   What are the reasons to get more than one cred...  NaN
3829   What are the reasons to get more than one cred...  NaN
4007        How to share income after marriage and kids?  NaN
4077   Opening American credit cards while residing i...  NaN
4152      What should I be aware of as a young investor?  NaN
4414   Is there any instrument with real-estate-like ...  NaN
5158           Why are credit cards preferred in the US?  NaN
5168   Confused about employee stock options: How do ...  NaN
5416    

In [5]:
!pip install datasets
from datasets import Dataset

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
# remove inputs that are longer than 1024 when encoded
remove_indices = []

for i, row in df.iterrows():
  s = "{} {} {} {}".format(q_token, df.loc[i, 'question'], a_token, df.loc[i, 'doc'])
  df.loc[i, 'content'] = s
  tokenized_s = tokenizer.encode(s, add_special_tokens=False, return_tensors='pt')
  l = len(tokenized_s[0])
  if l > 1024:
    remove_indices.append(i)

df.drop(remove_indices, inplace=True)

Token indices sequence length is longer than the specified maximum sequence length for this model (1071 > 1024). Running this sequence through the model will result in indexing errors


In [7]:
raw_datasets = Dataset.from_pandas(df)
train_ds, valid_ds = raw_datasets.train_test_split(.2).values()

In [8]:
from datasets import DatasetDict
raw_datasets = DatasetDict(
    {
        "train": train_ds,
        "valid": valid_ds,
    }
)

In [9]:
def tokenize(element):
    outputs = tokenizer(element["content"], add_special_tokens=False, max_length=context_length, truncation=True, padding='max_length')
    input_batch = []
    for input_ids in outputs["input_ids"]:
        input_batch.append(input_ids)
    return {"input_ids": input_batch}

tokenized_datasets = raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
)
tokenized_datasets

  0%|          | 0/14 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 13534
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 3384
    })
})

In [10]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [11]:
out = data_collator([tokenized_datasets["train"][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

input_ids shape: torch.Size([5, 1024])
attention_mask shape: torch.Size([5, 1024])
labels shape: torch.Size([5, 1024])


In [12]:
model.resize_token_embeddings(len(tokenizer)) 

Embedding(50260, 768)

In [13]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="codeparrot-ds",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    gradient_accumulation_steps=2,
    num_train_epochs=10,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,
    fp16=True,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
)

Using cuda_amp half precision backend


In [14]:
len(tokenized_datasets['train'][1]['input_ids'])
lens = []
for i in tokenized_datasets['train']:
  lens.append(len(i['input_ids']))

max(lens)

1024

In [15]:
import os

trainer.train()
trainer.save_model()

***** Running training *****
  Num examples = 13534
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 2
  Total optimization steps = 16920
  Number of trainable parameters = 124442112


Step,Training Loss,Validation Loss
5000,1.1612,1.06629
10000,0.9908,1.021535
15000,0.8942,0.98713


***** Running Evaluation *****
  Num examples = 3384
  Batch size = 4
Saving model checkpoint to codeparrot-ds/checkpoint-5000
Configuration saved in codeparrot-ds/checkpoint-5000/config.json
Configuration saved in codeparrot-ds/checkpoint-5000/generation_config.json
Model weights saved in codeparrot-ds/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in codeparrot-ds/checkpoint-5000/tokenizer_config.json
Special tokens file saved in codeparrot-ds/checkpoint-5000/special_tokens_map.json
added tokens file saved in codeparrot-ds/checkpoint-5000/added_tokens.json
***** Running Evaluation *****
  Num examples = 3384
  Batch size = 4
Saving model checkpoint to codeparrot-ds/checkpoint-10000
Configuration saved in codeparrot-ds/checkpoint-10000/config.json
Configuration saved in codeparrot-ds/checkpoint-10000/generation_config.json
Model weights saved in codeparrot-ds/checkpoint-10000/pytorch_model.bin
tokenizer config file saved in codeparrot-ds/checkpoint-10000/tokenizer_confi

In [16]:
# Load the model
saved_model_dir = 'codeparrot-ds'

loaded_model = GPT2LMHeadModel.from_pretrained(saved_model_dir)

loading configuration file codeparrot-ds/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": 50259,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.26.0",
  "use_cache": true,
  "vocab_size":

In [35]:
test_input = "{} {} {}".format(
    q_token, 
    'How to deposit a cheque issued to an associate in my business into my business account?',
    a_token
    )
encoded_input = tokenizer.encode(test_input, return_tensors='pt')
print(encoded_input)

tensor([[50257,  2437,   284, 14667,   257,  1125,  4188,  4884,   284,   281,
         11602,   287,   616,  1597,   656,   616,  1597,  1848,    30, 50258]])


In [36]:
result = loaded_model.generate(encoded_input, top_k=1, top_p=1, max_length=1024)
decoded_result = tokenizer.decode(result[0], skip_special_tokens=True)
print(decoded_result)

Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "pad_token_id": 50259,
  "transformers_version": "4.26.0"
}



<|question|> How to deposit a cheque issued to an associate in my business into my business account? <|answer|> I'm not sure what you're asking for a business, but I'm not sure what you're asking for a business.  I'm not sure what you're asking for a business, but I'd suggest you do, and you're asking about the LLC, and you're asking for a business.  You can't do this, but you can't do this.  You can't do this, and you're asking for a business, and you can't get a lawyer.  You can't do this.  You can't do this.  You can't get a lawyer.  You can't get a lawyer, and you can't get a lawyer, and you can't get a lawyer.  You can't get a lawyer.  You can't get a lawyer, and you can't get a lawyer.  You can't get a lawyer, and you can't get a lawyer, and you can't get a lawyer and get a lawyer.  You can't get a lawyer.  You can't get a lawyer, but you can't get a lawyer, and you can't get a lawyer, and you can't get a lawyer.  You can't get a lawyer and get a lawyer.  You can't get a lawyer. 