In [54]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import numpy as np
import os
import csv
import pandas as pd
import tweepy
from auth_tw import get_key
import tweepy

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [55]:
def load_model(MODEL_EPOCH=4):
    """
    Load a fine-tuned GPT-2 checkpoint saved at a given epoch.

    The base 'gpt2' model and tokenizer are instantiated with `from_pretrained`,
    then the model weights are replaced by the state dict stored in
    `../trained_models/gpt2_xl_manbot_<MODEL_EPOCH>.pt`.

    Args:
    - MODEL_EPOCH (int): The epoch of the checkpoint to load (default is 4).

    Returns:
    - model (GPT2LMHeadModel): The model with the loaded weights, moved to the
      module-level `device` and switched to evaluation mode.
    - tokenizer (GPT2Tokenizer): The base 'gpt2' tokenizer.

    Raises:
    - FileNotFoundError: If the checkpoint file for the requested epoch does not exist.
    """

    # Define the folder where trained models are stored
    models_folder = "../trained_models"

    # Construct the path to the checkpoint file for the specified epoch
    model_path = os.path.join(models_folder, f"gpt2_xl_manbot_{MODEL_EPOCH}.pt")

    # Fail fast with a clear message before the base weights are fetched/instantiated
    if not os.path.isfile(model_path):
        raise FileNotFoundError(f"Model checkpoint not found: {model_path}")

    # Initialize the tokenizer and the language-modeling head model from base GPT-2
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')

    # Remap the stored tensors onto the active device while loading; without
    # map_location a checkpoint saved from CUDA cannot be loaded on a CPU-only machine
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)

    # Move the model to the selected device (GPU if available, otherwise CPU)
    model = model.to(device)

    # Set the model in evaluation mode
    model.eval()

    # Return the loaded model and tokenizer
    return model, tokenizer


In [56]:
# Load the fine-tuned model and its tokenizer (default epoch)
model, tokenizer = load_model()

# Read the semicolon-separated tweet data and discard the 'Unnamed: 0' column
data_twq = pd.read_csv('data_twq.csv', sep=';').drop(columns=['Unnamed: 0'])

In [None]:
def return_first_word(tweet):
    """
    Return the first space-delimited word of a row's first field.

    Args:
    - tweet: A DataFrame row (pandas Series, as passed by `DataFrame.apply(..., axis=1)`)
      or any plain sequence whose first element is the tweet text.

    Returns:
    - str: The text before the first space (the whole text if it contains no space).
    """
    # A row Series is indexed by column labels, so `tweet[0]` only works through pandas'
    # deprecated positional fallback; `.iloc[0]` selects the first field by position.
    first_field = tweet.iloc[0] if hasattr(tweet, 'iloc') else tweet[0]
    return str(first_field.split(' ')[0])

In [None]:
# Extract the first word of every row of 'data_twq' into the 'first_words' Series
first_words = data_twq.apply(return_first_word, axis=1).copy()

# Count how often each distinct first word occurs (computed once, reused below)
first_word_counts = pd.DataFrame(first_words).value_counts()

# Total number of counted first-word occurrences
sumInstances = first_word_counts.sum()

# One row per distinct first word, in the order produced by value_counts()
words = pd.DataFrame(first_word_counts.index.tolist())

# Relative frequency of each distinct first word, in the same order
propability = pd.DataFrame(first_word_counts.values / sumInstances)

# Align words and frequencies on the index; the suffix avoids a column-name clash
word_prob = words.join(propability, how='left', lsuffix='_left')

# Give the two columns their final names
word_prob.columns = ['word', 'prob']

In [None]:
def choose_from_top(probs, n=5):
    """
    Sample one index from the `n` largest entries of `probs`.

    The `n` largest values are renormalized so they sum to 1, and one of them is
    drawn at random according to those renormalized weights.

    Args:
    - probs (array-like): One-dimensional sequence of non-negative weights.
    - n (int): Size of the candidate pool (default is 5). Values larger than
      `len(probs)` are clamped to `len(probs)`.

    Returns:
    - int: The position in `probs` of the sampled entry.

    Raises:
    - ValueError: If `probs` is empty or `n` is smaller than 1.
    """
    probs = np.asarray(probs)

    # Clamp the pool size: argpartition raises when asked for more elements than exist
    n = min(n, len(probs))
    if n < 1:
        raise ValueError("choose_from_top needs a non-empty 'probs' and n >= 1")

    # Find the indices of the top n elements in probs (unordered within the top n)
    ind = np.argpartition(probs, -n)[-n:]

    # Get the top n probabilities
    top_prob = probs[ind]

    # Normalize the probabilities to make sure they sum up to 1
    top_prob = top_prob / np.sum(top_prob)

    # Randomly choose a position within the pool based on the normalized probabilities
    choice = np.random.choice(n, 1, p=top_prob)

    # Map the pool position back to the index in the original array
    token_id = ind[choice][0]
    return int(token_id)

In [None]:
def post_on_twitter(tweet_text = ""):
    """
    Function to post a tweet on Twitter.

    Builds a `tweepy.Client` from the credentials returned by `get_key` and calls
    `create_tweet`. Posting is best-effort: any error raised while creating the
    tweet is printed instead of propagated.

    Args:
    - tweet_text (str): The text content of the tweet (default is an empty string).

    Returns:
    None
    """

    # Look each credential up once. The client below is the only object used for
    # posting, so no separate OAuth handler needs to be constructed.
    api_key = get_key("api_key")
    api_key_secret = get_key("api_key_secret")
    access_token = get_key("access_token")
    access_token_secret = get_key("access_token_secret")

    # Create a client using tweepy.Client
    client = tweepy.Client(
        bearer_token=get_key("bearer_token"),
        consumer_key=api_key,
        consumer_secret=api_key_secret,
        access_token=access_token,
        access_token_secret=access_token_secret,
    )

    try:
        # Attempt to create a tweet with the provided text
        response = client.create_tweet(text=tweet_text)
        tweet_id = response.data['id']
        print('Tweet posted successfully! Tweet ID:', tweet_id)
    except Exception as e:
        # Deliberately broad: a failed post is reported but must not abort the caller
        print('Error occurred while posting the tweet:', e)

In [None]:
def generate_content(random = True,start_with='',output_file='generated_content.txt', size=5, post_on_twitter = False):
    """
    Function to generate content (tweets) based on a trained language model.

    Uses the module-level `model`, `tokenizer`, `device`, `word_prob` and
    `choose_from_top`. Each tweet is grown token by token (at most 100 tokens) with
    top-N sampling until the end-of-text token is produced; attempts that do not
    reach end-of-text are discarded, so fewer than `size` tweets may be written.
    Finished tweets are printed and appended to `output_file`.

    Args:
    - random (bool): Flag indicating whether to start with a random word (default is True).
    - start_with (str): The starting word for generation if 'random' is set to False (default is an empty string).
    - output_file (str): The file path to save the generated content (default is 'generated_content.txt').
      An existing file at this path is removed first.
    - size (int): The number of generation attempts (default is 5).
    - post_on_twitter (bool): Flag indicating whether to post generated content on Twitter (default is False).

    Returns:
    None

    Raises:
    - ValueError: If the starting text encodes to no tokens (e.g. an empty string).
    - NameError: If posting is requested but no module-level `post_on_twitter` function exists.
    """

    # The boolean parameter `post_on_twitter` shadows the module-level function of the
    # same name inside this body, so calling `post_on_twitter(text)` would call a bool.
    # Resolve the function explicitly from the module namespace instead.
    publish_tweet = None
    if post_on_twitter:
        publish_tweet = globals().get('post_on_twitter')
        if not callable(publish_tweet):
            raise NameError("module-level function 'post_on_twitter' is not defined")

    # Sampling settings: wide candidate pool for the first tokens, narrow afterwards
    max_new_tokens = 100
    wide_top_n, wide_steps, narrow_top_n = 20, 3, 3

    # Define the file path to save the generated content
    output_file_path = f'{output_file}'

    # Set the model in evaluation mode
    model.eval()

    # Remove the existing file if it already exists
    if os.path.exists(output_file_path):
        os.remove(output_file_path)

    # Token ids that mark the end of a tweet (encoded once instead of on every step)
    end_token_ids = set(tokenizer.encode('<|endoftext|>'))

    # Perform generation without gradient computation
    with torch.no_grad():

        for _ in range(size):

            # Choose the first word randomly from the word probability distribution
            if random:
                row = np.random.choice(np.arange(len(word_prob)), p=word_prob['prob'])
                first_word = word_prob['word'][row]
            else:
                first_word = start_with

            # Encode the first word; an empty prompt cannot be fed to the model
            prompt_ids = tokenizer.encode(first_word)
            if len(prompt_ids) == 0:
                raise ValueError("Cannot generate from an empty starting text")
            cur_ids = torch.tensor(prompt_ids).unsqueeze(0).to(device)

            tweet_finished = False

            # Generate tokens to complete the tweet
            for step in range(max_new_tokens):
                # Without labels the first output is the logits; no loss is needed here
                logits = model(cur_ids)[0]

                # Distribution over the vocabulary for the last position of the only batch item
                softmax_logits = torch.softmax(logits[0, -1], dim=0)

                # Adjust the top-N sampling based on the iteration
                n = wide_top_n if step < wide_steps else narrow_top_n

                # Select the next token using top-N sampling
                next_token_id = choose_from_top(softmax_logits.to('cpu').numpy(), n=n)

                # Append the sampled token to the running sequence
                next_token = torch.tensor([[next_token_id]], dtype=torch.long, device=device)
                cur_ids = torch.cat([cur_ids, next_token], dim=1)

                # Check if tweet generation is complete
                if next_token_id in end_token_ids:
                    tweet_finished = True
                    break

            # Only tweets that reached the end-of-text token are kept
            if tweet_finished:
                output_text = tokenizer.decode(cur_ids.squeeze().tolist())

                # Optionally post the generated content on Twitter
                if publish_tweet is not None:
                    publish_tweet(output_text)
                print(output_text)

                # Write the generated tweet to the output file
                with open(output_file_path, 'a', encoding='utf-8') as f:
                    f.write(f"{output_text} \n\n")

In [48]:
# Generate a single tweet, write it to random5_generated.txt and post it on Twitter
generate_content(
    size=1,
    post_on_twitter=True,
    output_file='random5_generated.txt',
)

Corporations profit from social media.

The average person has no idea what they're doing.

The average person doesn't care.

The average person thinks they're the best thing they're doing.

The average person doesn't care.<|endoftext|>
Help them love what you are doing.<|endoftext|>


In [None]:
# Generate 50 tweets that all start with "Driving" and save them to Driving_generated.txt
generate_content(
    random=False,
    start_with='Driving ',
    output_file='Driving_generated.txt',
    size=50,
)