pip install torch transformers scikit-learn pandas tensorboard

For Mac M1:

curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh

In [None]:
from transformers import pipeline, set_seed
# Baseline: sample from the pretrained 'gpt2' checkpoint.
generator = pipeline(task='text-generation', model='gpt2')
# Seed the random number generators right before sampling.
set_seed(seed=42)
generator(
    "Hello, I'm Elon Musk,",
    num_return_sequences=5,
    max_length=30,
)

The next step is to use all tweets to build a TextDataset. TextDataset is a custom implementation of the PyTorch Dataset class provided by the transformers library.

First, we split the tweets into a train set and a test set, then write them to train_dataset.txt and test_dataset.txt.



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.tensorboard import SummaryWriter

# Load the cleaned CSV; only its 'content' column is used.
data = pd.read_csv("dataset/train_cleaned.csv")['content'].to_numpy()
# Hold out 15% of the rows as the test split.
train, test = train_test_split(data, test_size=0.15)

# One entry per line, with every "&amp" occurrence stripped. str.join avoids
# the quadratic cost of growing a string with += inside a loop.
traindata = ''.join(text.replace("&amp", "") + '\n' for text in train)
testdata = ''.join(text.replace("&amp", "") + '\n' for text in test)

# The context managers close (and therefore flush) each file, so code that
# reads these paths afterwards sees the complete text. UTF-8 is requested
# explicitly so the output does not depend on the platform default encoding.
with open('train_dataset.txt', 'w', encoding='utf-8') as f:
    f.write(traindata)
with open('test_dataset.txt', 'w', encoding='utf-8') as f:
    f.write(testdata)

The next step is to download the tokenizer. We use the tokenizer of the gpt2 model.

In [None]:
from transformers import AutoTokenizer

# Paths of the text files holding the train and test splits.
train_path, test_path = 'train_dataset.txt', 'test_dataset.txt'

# Load the pretrained "gpt2" tokenizer and register '[PAD]' as its pad token.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens(dict(pad_token='[PAD]'))

In [None]:
from transformers import TextDataset,DataCollatorForLanguageModeling,LineByLineTextDataset

def load_dataset(train_path, test_path, tokenizer, block_size=128):
    """Build the train/test datasets and the data collator.

    Each text file is loaded twice: once as a ``LineByLineTextDataset`` and
    once as a ``TextDataset``.

    Args:
        train_path: Path of the training text file.
        test_path: Path of the test text file.
        tokenizer: Tokenizer handed to every dataset and to the collator.
        block_size: ``block_size`` forwarded to every dataset. Defaults to
            128, the value that used to be hard-coded.

    Returns:
        The 5-tuple ``(train_dataset_LineByLine, test_dataset_LineByLine,
        train_dataset, test_dataset, data_collator)``.
    """

    def build(dataset_cls, file_path):
        # Every dataset shares the tokenizer and block size; only the class
        # and the source file differ.
        return dataset_cls(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=block_size,
        )

    train_dataset_LineByLine = build(LineByLineTextDataset, train_path)
    test_dataset_LineByLine = build(LineByLineTextDataset, test_path)
    train_dataset = build(TextDataset, train_path)
    test_dataset = build(TextDataset, test_path)

    # mlm=False turns off masked-language-model masking in the collator.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False
    )
    return (
        train_dataset_LineByLine,
        test_dataset_LineByLine,
        train_dataset,
        test_dataset,
        data_collator,
    )

# Build every dataset variant plus the collator from the two text files.
(
    train_dataset_LineByLine,
    test_dataset_LineByLine,
    train_dataset,
    test_dataset,
    data_collator,
) = load_dataset(train_path, test_path, tokenizer)


In [None]:
from transformers import LineByLineTextDataset
from transformers import Trainer, TrainingArguments, AutoModelWithLMHead

from transformers import AutoModelForCausalLM, Trainer, TrainingArguments

# AutoModelWithLMHead is deprecated; AutoModelForCausalLM is the replacement
# for causal language models such as GPT-2.
model = AutoModelForCausalLM.from_pretrained("gpt2")
# The tokenizer was extended with a '[PAD]' token, so the embedding matrix
# must grow to match; otherwise the new token id has no embedding row.
model.resize_token_embeddings(len(tokenizer))

training_args = TrainingArguments(
    output_dir="./gpt2-musk",  # the output directory
    overwrite_output_dir=True,  # overwrite the content of the output directory
    num_train_epochs=1,  # number of training epochs
    per_device_train_batch_size=64,  # batch size for training
    per_device_eval_batch_size=32,  # batch size for evaluation
    # NOTE(review): eval_steps is only consulted when the evaluation strategy
    # is "steps"; no strategy is set here, so confirm whether periodic
    # evaluation is actually intended.
    eval_steps=1000,  # number of update steps between two evaluations
    #save_steps=800, # after # steps model is saved
    #warmup_steps=500,# number of warmup steps for learning rate scheduler
    report_to="tensorboard",
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    #prediction_loss_only=True,
)

In [None]:
from itertools import islice

# Peek at the first two examples of train_dataset. islice replaces the manual
# counter, which was named `id` and shadowed the builtin of the same name.
for tensor in islice(train_dataset, 2):
    print(type(tensor))
    print(tokenizer.convert_ids_to_tokens(tensor))

In [None]:
from itertools import islice

# Peek at the first three examples of train_dataset_LineByLine. islice
# replaces the manual counter, which was named `id` and shadowed the builtin.
for tensor in islice(train_dataset_LineByLine, 3):
    print(type(tensor))
    # Each example is indexed by key; the token ids live under "input_ids".
    print(tokenizer.convert_ids_to_tokens(tensor["input_ids"]))

In [None]:
# Run training with the model, arguments and datasets given to the Trainer.
trainer.train()

In [None]:
# Save the trained model. No path is passed, so presumably it is written to
# the Trainer's configured output_dir ("./gpt2-musk") — confirm against the
# Trainer.save_model documentation.
trainer.save_model()

In [None]:
from transformers import pipeline

# Text-generation pipeline that loads the model from 'gpt2-musk' and reuses
# the existing tokenizer object.
tweet = pipeline(
    task='text-generation',
    model='gpt2-musk',
    tokenizer=tokenizer,
)

In [None]:
#generator = pipeline('text-generation', model='gpt2')
from transformers import pipeline, set_seed
# Seed the random number generators right before sampling.
set_seed(seed=42)
tweet(
    "With steel membrane wings like a Dragon,",
    num_return_sequences=5,
    max_length=50,
)