In [2]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import numpy as np
from torch.utils.data import Dataset
from torch.utils.data import Dataset, DataLoader
import os
import csv
import pandas as pd

import logging
logging.getLogger().setLevel(logging.CRITICAL)

import warnings
warnings.filterwarnings('ignore')

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

In [4]:
model_path = '/BotPython/gpt2_model'

tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)
model = model.to(device)

In [5]:
with open('data_tweets_full.csv', encoding="utf8") as csv_file:
            data_twq = csv.reader(csv_file, delimiter=';')

In [6]:
data_twq = pd.read_csv('data_tweets_full.csv',sep=';').drop(['Unnamed: 0'],axis=1)

In [7]:
data_twq

Unnamed: 0,0
0,Never allow anyone to tell you that you're not...
1,"Always keep pushing, you never know how close ..."
2,Become more conscious about your inputs. Read ...
3,Never stop learning.\nNever stop living. \nNev...
4,The things we fear doing are the things we NEE...
...,...
35191,"‚ÄúWhatever our souls are made of, his and mine ..."
35192,‚ÄúLove is or it ain‚Äôt. Thin love ain‚Äôt love at ...
35193,‚ÄúI took a deep breath and listened to the old ...
35194,‚ÄúThe only way out of the labyrinth of sufferin...


In [9]:
def return_first_word(tweet):
    return str(tweet[0].split(' ')[0])

In [10]:
first_words = data_twq.apply(return_first_word, axis=1).copy()

In [11]:
pd.DataFrame(pd.DataFrame(first_words).value_counts())

Unnamed: 0_level_0,0
0,Unnamed: 1_level_1
If,2514
The,2319
You,1563
A,1559
Women,961
...,...
Lift\nFight\nMeet,1
"Lift,",1
Lift.\nBuild.\nFight.\nWork.\nProtect.\nProvide.\n\nWhen,1
1.5,1


In [12]:
sumInstances = pd.DataFrame(first_words).value_counts().sum()

In [13]:
words = pd.DataFrame(pd.DataFrame(first_words).value_counts().index.tolist())

In [14]:
propability = pd.DataFrame(pd.DataFrame(first_words).value_counts().values / sumInstances)

In [15]:
word_prob = words.join(propability,how='left', lsuffix='_left')
word_prob.columns = ['word', 'prob']

In [16]:
word_prob

Unnamed: 0,word,prob
0,If,0.071429
1,The,0.065888
2,You,0.044408
3,A,0.044295
4,Women,0.027304
...,...,...
3966,Lift\nFight\nMeet,0.000028
3967,"Lift,",0.000028
3968,Lift.\nBuild.\nFight.\nWork.\nProtect.\nProvid...,0.000028
3969,1.5,0.000028


In [3]:

def choose_from_top(probs, n=5):
    ind = np.argpartition(probs, -n)[-n:]
    top_prob = probs[ind]
    top_prob = top_prob / np.sum(top_prob) # Normalize
    choice = np.random.choice(n, 1, p = top_prob)
    token_id = ind[choice][0]
    return int(token_id)

In [17]:
def generate_content(random = True,start_with='',output_file='generated_content.txt', size=100):
    MODEL_EPOCH = 4

    output_file_path = f'GENERATED_CONTENT/{output_file}'

    model.eval()
    if os.path.exists(output_file_path):
        os.remove(output_file_path)
    
    tweet_num = 0
    with torch.no_grad():
   
        for tweet_idx in range(size):
        
            tweet_finished = False
            first_word = ''
            if random: 
                first_word = word_prob['word'][np.random.choice(np.arange(len(word_prob)),p=word_prob['prob'])]
            else:
                first_word = start_with
            cur_ids = torch.tensor(tokenizer.encode(first_word)).unsqueeze(0).to(device)

            for i in range(100):
                outputs = model(cur_ids, labels=cur_ids)
                loss, logits = outputs[:2]
                softmax_logits = torch.softmax(logits[0,-1], dim=0) #Take the first(from only one in this case) batch and the last predicted embedding
                if i < 3:
                    n = 20
                else:
                    n = 3
                next_token_id = choose_from_top(softmax_logits.to('cpu').numpy(), n=n) #Randomly(from the topN probability distribution) select the next word
                cur_ids = torch.cat([cur_ids, torch.ones((1,1)).long().to(device) * next_token_id], dim = 1) # Add the last word to the running sequence

                if next_token_id in tokenizer.encode('<|endoftext|>'):
                    tweet_finished = True
                    break

            
            if tweet_finished:
                
                tweet_num = tweet_num + 1
                
                output_list = list(cur_ids.squeeze().to('cpu').numpy())
                output_text = tokenizer.decode(output_list)

                with open(output_file_path, 'a', encoding='utf-8') as f:
                    f.write(f"{output_text} \n\n")

In [25]:
generate_content(output_file = 'random5_generated.txt', size=10)

In [105]:
#Start tweets with "Driving" 
generate_content(False, 'Driving ',output_file = 'Driving_generated.txt', size=50)