pip install torch transformers sklearn pandas

For Mac M1:

curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh

In [None]:
from transformers import pipeline, set_seed
generator = pipeline('text-generation', model='gpt2')
set_seed(42)
generator("Hello, I'm Elon Musk,", max_length=30, num_return_sequences=5)

The next step is to use all tweets to build a TextDataset. The TextDataset is a custom implementation of the Pytroch Dataset class implemented by the transformers library. 

First, we split the tweets into a train and test section then write them into a train_dataset.txt and test_dataset.txt



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.tensorboard import SummaryWriter

data = pd.read_csv("dataset/train_cleaned.csv")['content'].to_numpy()
train, test = train_test_split(data,test_size=0.15)
traindata = ''
testdata = ''
for i in train:
    traindata += i.replace("&amp", "") +'\n'
f = open('train_dataset.txt','w')
f.write(traindata)
for i in test:
    testdata += i.replace("&amp","") +'\n'
f = open('test_dataset.txt','w')
f.write(testdata)

The next step is to download the tokenizer. We use the tokenizer from the german-gpt2 model.

In [1]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
#if tokenizer.pad_token is None:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
train_path = 'train_dataset.txt'
test_path = 'test_dataset.txt'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import TextDataset,DataCollatorForLanguageModeling,LineByLineTextDataset

def load_dataset(train_path,test_path,tokenizer):
    train_dataset_LineByLine = LineByLineTextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=128)

    test_dataset_LineByLine = LineByLineTextDataset(
          tokenizer=tokenizer,
          file_path=test_path,
          block_size=128)

    train_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=128)

    test_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=test_path,
          block_size=128)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False
    )
    return train_dataset_LineByLine, test_dataset_LineByLine, train_dataset,test_dataset,data_collator

train_dataset_LineByLine, test_dataset_LineByLine, train_dataset,test_dataset,data_collator = load_dataset(train_path,test_path,tokenizer)




In [13]:
from transformers import LineByLineTextDataset
from transformers import Trainer, TrainingArguments, AutoModelWithLMHead

device = "cuda"
model = AutoModelWithLMHead.from_pretrained("gpt2").to(device)


from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir="./gpt2-musk", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=1, # number of training epochs
    per_device_train_batch_size=2,# batch size for training
    per_device_eval_batch_size=1,  # batch size for evaluation
    eval_steps = 1000, # Number of update steps between two evaluations.
    #gradient_accumulation_steps=2,
    #eval_accumulation_steps = 1,
    #save_steps=800, # after # steps model is saved
    warmup_steps=5000,# number of warmup steps for learning rate scheduler
    report_to="tensorboard"
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset_LineByLine,
    eval_dataset=test_dataset_LineByLine,
    #prediction_loss_only=True,
)

loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at /home/jae00yzha/.cache/huggingface/transformers/fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_p

In [8]:
for id, tensor  in enumerate(train_dataset):
    print(type(tensor))
    #print(tokenizer.convert_ids_to_tokens(tensor))
    print(tensor)
    if id == 1:
        break

<class 'torch.Tensor'>
tensor([ 9690,   329,  2263,   257,  2863,   319,   523,   867,   812,  2084,
            0, 15616,  2651,   284,  1262,   262,   649,  5686, 43523, 38712,
           13,   198, 18467,  3929,   572,   262,  1410,   981,  8680,   284,
          262, 21751,   422,   402,  1381,  1525,    13, 37614,  5035,   986,
          198, 23067,  5668,   329,  6079, 10701,   736,   284,  4219, 15636,
         1262,   691,  5636, 13654,    13,   575,   323,    13, 17330,   374,
          655,  2636,  3463,   287,  2272,    13,   198, 10995,   198,  9690,
           13, 40172,    11,   543,   355, 17833,   284,   307,  3938,  2162,
         3393, 42339,   287,  2208,  4334,  1398,    11,   318,   257,  4988,
          764,  3412,   996,  1846, 37970,   287, 40172,    14,    49,  2373,
          273,  1486,  2162,  1382,  1528,    14, 10464,    11,   340,   991,
        46701,  1254,  1103,    13,   198,    45,  3008,    13,  6930,     0,
          198, 17821,   198, 24428,   283

In [10]:
for id, tensor  in enumerate(train_dataset_LineByLine):
    print(type(tensor))
    print(tokenizer.convert_ids_to_tokens(tensor["input_ids"]))
    print(tensor["input_ids"])
    if id == 2:
        break

<class 'dict'>
['Thanks', 'Ġfor', 'Ġtaking', 'Ġa', 'Ġchance', 'Ġon', 'Ġso', 'Ġmany', 'Ġyears', 'Ġago', '!', 'ĠLooking', 'Ġforward', 'Ġto', 'Ġusing', 'Ġthe', 'Ġnew', 'ĠIr', 'idium', 'Ġconstellation', '.']
tensor([ 9690,   329,  2263,   257,  2863,   319,   523,   867,   812,  2084,
            0, 15616,  2651,   284,  1262,   262,   649,  5686, 43523, 38712,
           13])
<class 'dict'>
['Fin', 'ishing', 'Ġoff', 'Ġthe', 'Ġplan', 'Ġwhile', 'Ġlistening', 'Ġto', 'Ġthe', 'Ġsoundtrack', 'Ġfrom', 'ĠG', 'ats', 'by', '.', 'ĠSeems', 'Ġappropriate', '...']
tensor([18467,  3929,   572,   262,  1410,   981,  8680,   284,   262, 21751,
          422,   402,  1381,  1525,    13, 37614,  5035,   986])
<class 'dict'>
['Design', 'Ġcompleted', 'Ġfor', 'Ġbringing', 'Ġrocket', 'Ġback', 'Ġto', 'Ġlaunch', 'pad', 'Ġusing', 'Ġonly', 'Ġthr', 'usters', '.', 'ĠY', 'ay', '.', 'ĠWings', 'Ġr', 'Ġjust', 'Ġdead', 'Ġweight', 'Ġin', 'Ġspace', '.']
tensor([23067,  5668,   329,  6079, 10701,   736,   284,  4219, 15636, 

In [14]:
for i in trainer.get_train_dataloader():
    print(i)

{'input_ids': tensor([[ 2061,   815,   307,  4166,   319, 20313,    30, 50257, 50257, 50257,
         50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
         50257, 50257, 50257, 50257],
        [ 5703,   550,  3936, 32083,   329,  9965,   287, 13241, 11819,    13,
          7994,   284,  1182,   284,  2097,  4606,  4854,   284, 12811,   329,
         11938,   287, 15326,    13]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([[ 2061,   815,   307,  4166,   319, 20313,    30,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100],
        [ 5703,   550,  3936, 32083,   329,  9965,   287, 13241, 11819,    13,
          7994,   284,  1182,   284,  2097,  4606,  4854,   284, 12811,   329,
         11938,   287, 15326,    13]])}
{'in

In [6]:
trainer.train()

***** Running training *****
  Num examples = 14262
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 14262
  0%|          | 0/14262 [00:00<?, ?it/s]

ValueError: The model did not return a loss from the inputs, only the following keys: logits,past_key_values. For reference, the inputs it received are input_ids.

In [8]:
trainer.save_model()

Saving model checkpoint to ./gpt2-musk
Configuration saved in ./gpt2-musk/config.json
Model weights saved in ./gpt2-musk/pytorch_model.bin


In [9]:
from transformers import pipeline

tweet = pipeline('text-generation',model='gpt2-musk', tokenizer=tokenizer )

loading configuration file gpt2-musk/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2-musk",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.21.0",
  "use_cache": true,
  "vocab_size": 50257
}

loading config

In [10]:
#generator = pipeline('text-generation', model='gpt2')
from transformers import pipeline, set_seed
set_seed(42)
tweet("With steel membrane wings like a Dragon,", max_length=50, num_return_sequences=5)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'With steel membrane wings like a Dragon, with massive wings and wings which take half of the air, they are powered by the powerful H1B engine. It is powered from a steam engine at 40,000 rpm. The wings, made of aluminum'},
 {'generated_text': "With steel membrane wings like a Dragon, which have wingtips, are an indication of a human being's intelligence. This means that the wings are not a robotic construct, but is a physical thing. They are essentially a self-conscious mechanical mechanism."},
 {'generated_text': "With steel membrane wings like a Dragon, the wings are much bigger than what is shown in this picture. They cover more distance with more feathers on the wing, and the wings are also very short compared to most birds. But it's amazing. Some"},
 {'generated_text': 'With steel membrane wings like a Dragon, a humanoid figure would not easily fly or land in cold waters with the wind in its sails, and he was able to avoid direct damage from water.\nThe flying