In [22]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, RobertaForSequenceClassification, RobertaTokenizer
import torch

# Seed torch's global RNG before anything random happens. Two things in this
# notebook draw from it: the freshly initialised RoBERTa classification head
# below, and GPT-2 sampling (do_sample=True) during generation. Seeding here is
# intended to make a fresh "Restart Kernel -> Run All" repeatable.
SEED = 42
torch.manual_seed(SEED)

# Load GPT-2 model and tokenizer (proposes candidate continuations)
model_name_gpt2 = "gpt2"
model_gpt2 = GPT2LMHeadModel.from_pretrained(model_name_gpt2)
tokenizer_gpt2 = GPT2Tokenizer.from_pretrained(model_name_gpt2)

# Load RoBERTa model and tokenizer for sequence classification (used for coherence scoring).
# NOTE: the `roberta-base` checkpoint has no trained classification head; transformers
# reports on load that the `classifier.*` weights are newly initialised. Until the model
# is fine-tuned on a coherence task, its class probabilities are effectively arbitrary
# and only act as a (seed-dependent) tie-breaker between candidates.
model_name_roberta = 'roberta-base'
model_roberta = RobertaForSequenceClassification.from_pretrained(model_name_roberta)
tokenizer_roberta = RobertaTokenizer.from_pretrained(model_name_roberta)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
def score_coherence_with_roberta(sentence1, sentence2):
    """
    Score a sentence pair with the notebook's RoBERTa sequence classifier.

    The two sentences are encoded together as one paired input, passed through
    `model_roberta`, and the softmax probability of class index 0 is returned.

    Note: this notebook loads the plain `roberta-base` checkpoint, whose
    classification head is randomly initialised (transformers warns about this
    on load). Class 0 therefore has no trained meaning yet; the value is only a
    real coherence measure once the model has been fine-tuned so that index 0
    corresponds to "coherent".

    Args:
        sentence1 (str): The first sentence.
        sentence2 (str): The second sentence.

    Returns:
        float: Probability assigned to class 0, in [0, 1]; higher is treated
        as more coherent by the callers.
    """
    # Encode the pair as a single input. truncation=True keeps over-long pairs
    # within the model's maximum sequence length instead of failing in the
    # position embeddings.
    inputs = tokenizer_roberta(
        sentence1,
        sentence2,
        return_tensors='pt',
        add_special_tokens=True,
        truncation=True,
    )
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = model_roberta(**inputs).logits
    # Single example in the batch -> row 0; class index 0 is the score.
    coherence_score = torch.softmax(logits, dim=1)[0][0].item()
    return coherence_score

In [24]:
def _first_word_starting_with(continuation, prompt, letter):
    """Return the first whole word in `continuation` starting with `letter` (lowercase), or None."""
    words = continuation.split()
    # If the continuation does not begin with whitespace (and the prompt does not
    # end with it), its first chunk is glued onto the prompt's last word, so it is
    # a word fragment rather than a new word. Skip it.
    glued_to_prompt = (
        bool(continuation)
        and not continuation[0].isspace()
        and bool(prompt)
        and not prompt[-1].isspace()
    )
    if glued_to_prompt:
        words = words[1:]
    for word in words:
        if word.lower().startswith(letter):
            return word
    return None


def _pick_most_coherent(prompt, candidate_words):
    """Return the candidate whose appended sentence scores highest, or None if there are none."""
    best_word = None
    best_score = float('-inf')
    for word in candidate_words:
        score = score_coherence_with_roberta(prompt, f"{prompt} {word}")
        # Strict '>' keeps the earliest candidate on ties.
        if score > best_score:
            best_score = score
            best_word = word
    return best_word


def generate_sentence_with_sequence_and_roberta(
    start_word,
    sequence,
    max_attempts=100,
    num_candidates=5,
    max_new_tokens=10,
):
    """
    Extend `start_word` with one word per entry of `sequence`, each starting with that letter.

    For every letter, GPT-2 samples `num_candidates` continuations of the text built
    so far. From each continuation the first whole word beginning with the letter
    (case-insensitive) becomes a candidate. The candidates are ranked with
    `score_coherence_with_roberta` and the best one is appended. If no continuation
    yields a candidate, sampling is retried up to `max_attempts` times. When a letter
    still cannot be satisfied, a message is printed and generation stops early.

    Args:
        start_word (str): Text the sentence starts with.
        sequence (Iterable[str]): Required initial letter for each following word.
        max_attempts (int): Maximum sampling rounds per letter.
        num_candidates (int): Continuations sampled per round.
        max_new_tokens (int): Tokens generated beyond the prompt per continuation.

    Returns:
        str: The sentence built so far (complete, or partial if a letter failed).
    """
    generated = start_word
    for letter in sequence:
        # Candidate words are compared lowercased, so normalise the letter as well;
        # otherwise an uppercase letter could never match.
        target = letter.lower()

        # The prompt is the same for every retry of this letter: encode it once.
        input_ids = tokenizer_gpt2.encode(generated, return_tensors='pt')
        prompt_length = input_ids.shape[1]

        best_word = None
        for _ in range(max_attempts):
            outputs = model_gpt2.generate(
                input_ids,
                max_length=prompt_length + max_new_tokens,
                temperature=0.8,
                num_return_sequences=num_candidates,
                do_sample=True,
                top_k=50,
                top_p=0.95,
                repetition_penalty=1.2,
                pad_token_id=tokenizer_gpt2.eos_token_id,
            )

            # A dict is used as an insertion-ordered set so that ties between
            # equally scored candidates are broken the same way on every run.
            candidate_words = {}
            for output in outputs:
                # Decode only the newly generated token ids. Slicing the decoded
                # string by len(prompt) breaks whenever decoding does not
                # reproduce the prompt text exactly.
                continuation = tokenizer_gpt2.decode(
                    output[prompt_length:], skip_special_tokens=True
                )
                word = _first_word_starting_with(continuation, generated, target)
                if word is not None:
                    candidate_words.setdefault(word, None)

            best_word = _pick_most_coherent(generated, candidate_words)
            if best_word is not None:
                break

        if best_word is None:
            print(f"Could not find a suitable word starting with '{letter}'.")
            break
        generated += ' ' + best_word
    return generated

In [25]:
# Example usage: after the start word, request one word per letter of "nception".
start_word = "Important"
sequence = list("nception")
generated_sentence = generate_sentence_with_sequence_and_roberta(start_word, sequence)
print(generated_sentence)

Important note, content Eric prior the its our new
