# Standard Q & A

In [None]:
import torch
from transformers import GPTJForCausalLM, GPTJTokenizer

# Load the GPT-J tokenizer and model
tokenizer = GPTJTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")

# Define the prompt or question
prompt = "What is the capital of France?"

# Tokenize the prompt
input_ids = tokenizer(prompt, return_tensors='pt').input_ids

# Generate the answer
sample_outputs = model.generate(
    input_ids,
    do_sample=True,
    max_length=100,
    top_k=50,
    top_p=0.95,
    temperature=1.0,
)

# Decode the answer
answer = tokenizer.decode(sample_outputs[0], skip_special_tokens=True)

# Print the answer
print(answer)

# text-to-QA

In [3]:
import torch
from transformers import GPTJForCausalLM, GPTJTokenizer

ModuleNotFoundError: No module named 'torch'

In [None]:
# Load the GPT-J tokenizer and model
tokenizer = GPTJTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")

# Define the input text
text = "The capital of France is Paris. France is located in Western Europe."


# Tokenize the input text
inputs = tokenizer.encode(text, return_tensors="pt")

# Generate the questions and answers
questions = []
answers = []
for i in range(5):
    prompt = "Question: "
    answer = "Answer: "
    prompt_input = tokenizer.encode(prompt, return_tensors="pt")
    answer_input = tokenizer.encode(answer, return_tensors="pt")
    prompt_length = len(tokenizer.decode(prompt_input[0], skip_special_tokens=True))
    max_length = 100 - prompt_length
    outputs = model.generate(inputs, do_sample=True, max_length=max_length, top_p=0.95, top_k=50)
    qa = tokenizer.decode(outputs[0], skip_special_tokens=True).split(".", 1)
    if len(qa) == 2:
        question, answer = qa
        questions.append(prompt + question.strip())
        answers.append(answer_input.decode() + answer.strip())

# Print the generated questions and answers
for i in range(len(questions)):
    print(questions[i])
    print(answers[i])
    print()

# Text&Q to answer

needed for MultiDocQA

In [None]:
import torch
from transformers import GPTJForCausalLM, GPTJTokenizer

In [None]:

# Load the GPT-J tokenizer and model
tokenizer = GPTJTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")

# Define the input text and question
text = "The capital of France is Paris. France is located in Western Europe."
question = "What is the capital of France?"

# Tokenize the input text and question
input_text = text + "\nQuestion: " + question + "\nAnswer:"
input_ids = tokenizer.encode(input_text, return_tensors="pt")

# Generate the answer
output = model.generate(input_ids, do_sample=True, max_length=100, top_p=0.95, top_k=50)
answer = tokenizer.decode(output[0], skip_special_tokens=True)

# Print the answer
print(answer)

# Similarity check:

## Bert

In [None]:
import torch
from transformers import BertModel, BertTokenizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')


In [None]:
# Define the input strings
input_text1 = "This is the first input text."
input_text2 = "This is the second input text."

# Tokenize the input texts and convert to tensors
tokens1 = tokenizer.encode(input_text1, add_special_tokens=True)
input_ids1 = torch.tensor(tokens1).unsqueeze(0)
tokens2 = tokenizer.encode(input_text2, add_special_tokens=True)
input_ids2 = torch.tensor(tokens2).unsqueeze(0)

# Pass the input ids through the BERT model to generate the embeddings
with torch.no_grad():
    outputs1 = model(input_ids1)
    embeddings1 = outputs1.last_hidden_state.mean(dim=1)
    outputs2 = model(input_ids2)
    embeddings2 = outputs2.last_hidden_state.mean(dim=1)

# Compute the cosine similarity between the embeddings
similarity = cosine_similarity(embeddings1, embeddings2)[0][0]

# Print the similarity score
print("Similarity score:", similarity)

In [14]:
# del BertModel, BertTokenizer

NameError: name 'BertModel' is not defined

## GPTJ

In [1]:
import torch
from transformers import GPTJModel, GPT2Tokenizer

In [16]:
# Load the GPT-J model and tokenizer
model = GPTJModel.from_pretrained('EleutherAI/gpt-j-6B')
tokenizer = GPT2Tokenizer.from_pretrained('EleutherAI/gpt-j-6B')

# Define the input text
input_text = "This is an example input text to generate embeddings."

# Tokenize the input text and convert to tensors
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# Generate the text embeddings
with torch.no_grad():
    embeddings = model(input_ids).last_hidden_state.mean(dim=1)

# Print the text embeddings
print(embeddings)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/24.2G [00:00<?, ?B/s]

OSError: [Errno 28] No space left on device

# Text Completion

In [None]:
import torch
from transformers import GPTJForCausalLM, GPT2Tokenizer

In [10]:
# Load the GPT-J model and tokenizer
model = GPTJForCausalLM.from_pretrained('EleutherAI/gpt-j-6B')
tokenizer = GPT2Tokenizer.from_pretrained('EleutherAI/gpt-j-6B')

# Define the input text
input_text = "This is an example sentence to be completed with GPT-J:"

# Tokenize the input text and convert to tensors
input_ids = tokenizer.encode(input_text, return_tensors='pt')

# Generate the text completion
output = model.generate(input_ids, max_length=100, do_sample=True)

# Decode the output tokens to get the completed text
completed_text = tokenizer.decode(output[0], skip_special_tokens=True)

# Print the completed text
print(completed_text)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/24.2G [00:00<?, ?B/s]

OSError: [Errno 28] No space left on device

# Text Summerizer

In [None]:
import torch
from transformers import GPTJTokenizer, GPTJForCausalLM


In [None]:

# Load the GPT-J tokenizer and model
tokenizer = GPTJTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")

# Define the input text
input_text = "Put your input text here."

# Tokenize the input text
inputs = tokenizer.encode(input_text, return_tensors="pt")

# Generate the summary
summary_ids = model.generate(inputs, max_length=100, num_beams=4, early_stopping=True)

# Decode the summary
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Print the summary
print(summary)

# Notes:

GPTJModel is a standard implementation of the GPT-J model that can be used for a variety of tasks, including text generation, text classification, and text embedding. It takes as input a sequence of tokens and outputs a tensor representing the hidden states of the model. This tensor can be used for various downstream tasks, such as text classification, text similarity, or text clustering. GPTJModel does not include any special handling of padding or masking, so the input sequences should be padded to a fixed length before being passed to the model.

GPTJForCausalLM, on the other hand, is a pre-configured version of the GPT-J model that is specifically designed for language modeling tasks, such as text generation or text completion. It includes special handling of padding and masking, which is necessary for text generation tasks where the model needs to predict the next token in a sequence based on the previous tokens. GPTJForCausalLM takes as input a sequence of tokens and generates a sequence of output tokens, with each token being predicted based on the previous tokens. GPTJForCausalLM is therefore more specialized than GPTJModel, and is best suited for language modeling tasks.