# Pretrained GPT2 Model to generate Elon Musk Tweets

**Training Data:**   
Source: https://www.kaggle.com/datasets/aryansingh0909/elon-musk-tweets-updated-daily?select=elonmusk.csv   
License: CC0: Public Domain   

Enviroment: tf-gpu

In [1]:
import pandas as pd
import re

from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset
import torch
import tensorflow as tf
import nltk
from nltk.corpus import stopwords

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [3]:
print(torch.cuda.is_available())

False


## Import Data

In [4]:
tweets_df = pd.read_csv('./data/elonmusk.csv')
tweets_df

Unnamed: 0,Datetime,Tweet Id,Text,Username
0,2023-06-29 16:20:19+00:00,1674452749378002945,@mwseibel Yup,elonmusk
1,2023-06-29 15:53:52+00:00,1674446089179766789,@TitterDaily True,elonmusk
2,2023-06-29 03:46:37+00:00,1674263071337111552,@paulg Generational trauma. An example of why ...,elonmusk
3,2023-06-29 01:19:59+00:00,1674226170488057856,Improved longform posts,elonmusk
4,2023-06-28 18:05:58+00:00,1674116945808068608,@BillyM2k Best protip ever,elonmusk
...,...,...,...,...
24445,2011-12-03 08:22:07+00:00,142881284019060736,That was a total non sequitur btw,elonmusk
24446,2011-12-03 08:20:28+00:00,142880871391838208,"Great Voltaire quote, arguably better than Twa...",elonmusk
24447,2011-12-01 10:29:04+00:00,142188458125963264,I made the volume on the Model S http://t.co/w...,elonmusk
24448,2011-12-01 09:55:11+00:00,142179928203460608,Went to Iceland on Sat to ride bumper cars on ...,elonmusk


## Preprocessing

In [5]:
# sort out short tweets <7 words
min_word_count = 7
tweets_df['word_count'] = tweets_df['Text'].apply(lambda x: len(x.split()))
df_filtered = tweets_df[tweets_df['word_count'] >= min_word_count]

# extract text column
texts = df_filtered['Text'].tolist()

# extract word_count column
df_filtered = df_filtered.drop(columns=['word_count'])

print(df_filtered.head())# sort out short tweets <7 words

                     Datetime             Tweet Id  \
2   2023-06-29 03:46:37+00:00  1674263071337111552   
10  2023-06-28 02:53:54+00:00  1673887418184007680   
17  2023-06-27 23:12:47+00:00  1673831774038949888   
19  2023-06-27 16:14:14+00:00  1673726440284495872   
21  2023-06-27 13:27:15+00:00  1673684418974609410   

                                                 Text  Username  
2   @paulg Generational trauma. An example of why ...  elonmusk  
10  @cb_doge @TuckerCarlson @TheBabylonBee @ZubyMu...  elonmusk  
17  Watch the entire first episode on this platfor...  elonmusk  
19  @RobertMSterling @WholeMarsBlog @nytimes Prett...  elonmusk  
21  @InfographicTony @MarcusHouse @FelixSchlang @c...  elonmusk  


In [6]:
# create a list of tweet texts
texts = df_filtered['Text'].tolist()

for text in texts[:5]:
    print(text)

@paulg Generational trauma. An example of why forgiveness of those who harmed you (if it stops further harm) is so important.
@cb_doge @TuckerCarlson @TheBabylonBee @ZubyMusic @AppleTV Very exciting!
Watch the entire first episode on this platform. Great move by Apple!

Note, you can Airplay from your iPhone to TV to watch on a big screen.
@RobertMSterling @WholeMarsBlog @nytimes Pretty much only time I see NYT articles is when they’re mentioned here. Their readership, especially user-minutes per day, is tiny compared to this platform.
@InfographicTony @MarcusHouse @FelixSchlang @considercosmos @GregScott_photo @LunarCaveman @SpaceX @SpacesFuture @TJ_Cooney @LabPadre @Erdayastronaut Three center Raptors of Booster will fire at ~50% thrust during hot staging


In [7]:
# cleaning and preprocessing of the tweet texts
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = ' '.join([word for word in text.split() if word not in stop_words])
    text = re.sub(r'@\w+', '', text)  # Removal of mentions
    text = re.sub(r'http\S+', '', text)  # Removal of URLs
    text = re.sub(r'[^\w\s.!?]', '', text)  # Removal of only certain punctuation marks
    return text.strip().lower() #Removal of spaces at the beginning and end of strings and conversion of the entire string to lower case letters

cleaned_texts = [clean_text(text) for text in texts]
cleaned_texts[0:3]

['generational trauma. an example forgiveness harmed if stops harm important.',
 'very exciting!',
 'watch entire first episode platform. great move apple! note airplay iphone tv watch big screen.']

## Model Training

In [None]:
# Create a dataset from the tweets
dataset = Dataset.from_dict({'text': cleaned_texts})

# Load model und tokenizer
model_name = 'distilgpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained(model_name)

# Specify pad token
tokenizer.pad_token = tokenizer.eos_token

# Tokenization function
def tokenize_function(examples):
    tokenized_output = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=50)
    tokenized_output['labels'] = tokenized_output['input_ids'].copy()
    return tokenized_output

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=2,  # Number of training epochs
    per_device_train_batch_size=8, # Batch Size
    gradient_accumulation_steps=4,  # Accumulate gradients over 4 steps
    save_steps=10_000, # Save model every 10.000 steps
    save_total_limit=2,  
    learning_rate=5e-5, # Low learning rate to find the optimum parameters
)

# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
)

# Model training
trainer.train()

  return torch.load(checkpoint_file, map_location="cpu")
100%|██████████| 14/14 [00:01<00:00,  7.52ba/s]
The following columns in the training set don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: text. If text are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 13960
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 872


Step,Training Loss


## Tweet Generation

In [None]:
def generate_tweet(prompt, max_length=50):
    inputs = tokenizer(prompt, return_tensors='pt')
    outputs = model.generate(inputs['input_ids'], max_length=max_length, num_return_sequences=1)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example: Generation of a tweet based on a start word or phrase
prompt = "In the future Tesla will"
generated_tweet = generate_tweet(prompt)
print(generated_tweet)

## Save the Model

In [None]:
# Save model and tokenizer
model.save_pretrained("./saved_model")
tokenizer.save_pretrained("./saved_model")

## Using the Model

In [None]:
# Load model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./saved_model")
tokenizer = GPT2Tokenizer.from_pretrained("./saved_model")

# Example of the use of the model
inputs = tokenizer("With the help of ai we will", return_tensors="pt")
outputs = model.generate(**inputs)
print(tokenizer.decode(outputs[0]))