In [5]:
!pip install -q transformers datasets accelerate sentencepiece[sentencepiece] evaluate

[0m

In [6]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [7]:
import os
from pathlib import Path
Data = 'stories.csv'
Output = 'distilgpt2-finetuned'
os.makedirs(Output, exist_ok = True)

print('Using data:', Data)

Using data: stories.csv


In [8]:
import pandas as pd
df = pd.read_csv(Data)
print(df.shape)
print(df.columns)
display(df.head(5))


(98, 2)
Index(['Unnamed: 0', 'story'], dtype='object')


Unnamed: 0.1,Unnamed: 0,story
0,0,All in due time.\n\nWhen our plane landed and ...
1,1,Millionaire.\n\nWhen my checking account regis...
2,2,"LIVE ON THE SCENE!\n\n""Thats right, Kelly, if ..."
3,3,The New Kid\n\nThere was something off about t...
4,4,Seven\n\nSeven. \n\nHow can such a boring numb...


In [9]:
Text = 'story'
texts = df[Text].astype(str).tolist()
print("Samples :", len(texts))

Samples : 98


In [10]:
out_txt = 'corpus.txt'
with open(out_txt, 'w', encoding = 'utf-8') as f:
  for t in texts:
    line = ' '.join(t.split())
    if line.strip():
      f.write(line + '\n')
print('Wrote corpus lines to', out_txt)

Wrote corpus lines to corpus.txt


In [11]:
import math
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling
from transformers import TrainingArguments, Trainer

In [12]:
model_name = 'distilgpt2'
block_size = 128
per_device_train_batch_size = 8
per_device_eval_batch_size = 8
num_train_epochs = 10
learnig_rate = 5e-5
weight_decay = 0.01
fp16 = True

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast = True)

if tokenizer.pad_token is None:
  tokenizer.add_special_tokens({'pad_token': '<|pad|>'})


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [13]:
model = AutoModelForCausalLM.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

dataset = load_dataset('text', data_files = {'train' : out_txt, 'validation' : out_txt}, split=None)

train_ds = dataset['train'].train_test_split(test_size = 0.05, seed = 42)['train']
val_ds = dataset['train'].train_test_split(test_size = 0.05, seed = 42)['test']

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [14]:
def tokenize_function(examples):
  return tokenizer(examples['text'], return_attention_mask = False)

tokenizer_train = train_ds.map(tokenize_function, batched = True, remove_columns = ['text'])
tokenizer_val = val_ds.map(tokenize_function, batched = True, remove_columns = ['text'])

def group_texts(examples):
  concatenated = sum(examples['input_ids'], [])
  total_length = len(concatenated)
  total_length = (total_length // block_size) * block_size
  result = {'input_ids' : [concatenated[i : i + block_size] for i in range(0, total_length, block_size)]}
  result['labels'] = result['input_ids'].copy()
  return result

Map:   0%|          | 0/93 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [15]:
lm_train = tokenizer_train.map(group_texts, batched = True, batch_size = 1000, remove_columns = tokenizer_train.column_names)
lm_val = tokenizer_val.map(group_texts, batched = True, batch_size = 1000, remove_columns = tokenizer_val.column_names)

data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm = False)

Map:   0%|          | 0/93 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [16]:
training_args = TrainingArguments(
    output_dir = Output,
    overwrite_output_dir = True,
    num_train_epochs = num_train_epochs,
    per_device_train_batch_size = per_device_train_batch_size,
    per_device_eval_batch_size = per_device_eval_batch_size,
    eval_strategy = 'steps',
    eval_steps = 500,
    save_steps = 500,
    save_total_limit = 3,
    learning_rate = learnig_rate,
    weight_decay = weight_decay,
    fp16 = fp16,
    logging_steps = 100,
    load_best_model_at_end = True,
    metric_for_best_model = 'eval_loss',
    greater_is_better = False,
    push_to_hub = False,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [17]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = lm_train,
    eval_dataset = lm_val,
    data_collator = data_collator,
)

trainer.train()
trainer.save_model(Output)
tokenizer.save_pretrained(Output)
print("Saved model & tokenizer to", Output)

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss


Saved model & tokenizer to distilgpt2-finetuned


In [19]:
from transformers import pipeline
model.eval()
generator = pipeline("text-generation", model=Output, tokenizer=Output, device=0 if torch.cuda.is_available() else -1)

prompt = "In a forest"
out = generator(prompt, max_new_tokens=60, do_sample=True, temperature=0.9, top_k=50, num_return_sequences=1)
print(out[0]["generated_text"])


Device set to use cuda:0


In a forest in a remote part of the world, I saw the strange creature at the bottom of a pond, it had dark fur, an odd nose, blood gushing out of its mouth like a sick rat and its stomach churning blood. As I scoured carefully for the strange creature's name, I
