In [1]:
import nltk
from transformers import DPRQuestionEncoder, DPRContextEncoder, DPRQuestionEncoderTokenizer, DPRContextEncoderTokenizer
from transformers import BartForConditionalGeneration, BartTokenizer
import torch

from  chunking import Chunking

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = "cuda:1"
# Download NLTK data
# I had to export as an env var where the data were downloaded : export NLTK_DATA=/home/hay4hi/nltk_data
nltk.set_proxy('http://rb-proxy-de.bosch.com:8080')
nltk.download('punkt')
nltk.download('punkt_tab')

# Load retriever models and tokenizers
question_encoder = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base').to(device)
context_encoder = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base').to(device)
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')


# Bart does not work so great for question answering
# Load generator model and tokenizer
#llm_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large').to(device)
#llm_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')

# GPT2 does not work so great for question answering
#from transformers import GPT2LMHeadModel, GPT2Tokenizer
#llm_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
#llm_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/hay4hi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/hay4hi/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model

In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load pre-trained T5 model and tokenizer
llm_model = T5ForConditionalGeneration.from_pretrained('t5-11B')  # You can also use 't5-base' or 't5-large'
llm_tokenizer = T5Tokenizer.from_pretrained('t5-11B')

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
torch.cuda.empty_cache()

In [16]:
from bs4 import BeautifulSoup
import requests

# URL of the webpage
url = 'https://fangj.github.io/friends/season/0804.html'

# Headers to mimic a browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

response = requests.get(url, headers=headers)

# Check if the request was successful
if response.status_code == 200:
    webpage_content = response.text
    soup = BeautifulSoup(webpage_content, 'html.parser')
    
    # Extract the text content
    text_content = soup.get_text(separator=' ', strip=True)  # Use space as a separator instead of newlines
    cleaned_text = ' '.join(text_content.split())  # Remove excessive spaces and newlines
    print("Webpage downloaded and processed successfully.")
    
    # Save the cleaned text content to a file
    with open('friends_S8_E4.txt', 'w') as file:
        file.write(text_content)
    print("Cleaned webpage content saved to 'friends_S8_E4.txt'.")
else:
    print(f"Failed to download webpage. Status code: {response.status_code}")

Webpage downloaded and processed successfully.
Cleaned webpage content saved to 'friends_S8_E4.txt'.


In [3]:
# Read the text document
with open('friends_S8_E4.txt', 'r') as file:
    text_data = file.read()



In [4]:
chunker = Chunking()

# Step 1: Divide the text into chunks (e.g., sentences)
#chunker.chunking_into_sentences(large_document)
chunker.chunking_sliding_window(text_data, window_size=10, stride=1)
chunks = chunker.chunks

# Step 2: Encode the chunks using the context encoder
chunk_embeddings = [context_encoder(**context_tokenizer(chunk, return_tensors='pt').to(device)).pooler_output for chunk in chunks]

In [5]:
# Input query
query = "what is the backpacking story ?"

# Step 3: Encode the query using the question encoder
query_embedding = question_encoder(**question_tokenizer(query, return_tensors='pt').to(device)).pooler_output

In [6]:
similarities = [torch.cosine_similarity(query_embedding, chunk_embedding) for chunk_embedding in chunk_embeddings]
retrieved_chunk = chunks[torch.argmax(torch.tensor(similarities))]
print("the retrieved chunk is : ", retrieved_chunk)

the retrieved chunk is :  It’s this story I
came up with, very romantic. I swear any woman that hears it; they’re like putty. Ross: Really? Well then tell it to me. Joey: Okay. Now you’re gonna want to have sex with me when you hear it, but
you have to remember it is just the story. Ross: (sarcastic) I’ll try to control myself. Joey: Okay. (Clears throat) Years ago, when I was backpacking across Western
Europe… Ross: (laughs) You were backpacking across Western Europe? Joey: Have a nice six more months Ross!


In [None]:
## FOR GTP2

#query = "who is joey in friends"
the_prompt = f"Q : {query}, knowing that {retrieved_chunk}, A:"
prompt_tokens = llm_tokenizer.encode(the_prompt, return_tensors='pt').to(device)
answer_tokens = llm_model.generate(
        prompt_tokens,
        max_length=200,  # Adjust as needed
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        temperature=1,
        top_k=10,
        top_p=0.95,
        do_sample=True
    )
response = llm_tokenizer.decode(answer_tokens[0], skip_special_tokens=True)

print(response)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Q : what is the backpacking story ?, knowing that It’s this story I
came up with, very romantic. I swear any woman that hears it; they’re like putty. Ross: Really? Well then tell it to me. Joey: Okay. Now you’re gonna want to have sex with me when you hear it, but
you have to remember it is just the story. Ross: (sarcastic) I’ll try to control myself. Joey: Okay. (Clears throat) Years ago, when I was backpacking across Western
Europe… Ross: (laughs) You were backpacking across Western Europe? Joey: Have a nice six more months Ross!, A: "It" is a song about a young girl who is
Ross: I don't know why I said it but I know you like it! A young guy: What do you mean? Ross : (giggle) Ross, if


In [None]:
#query = "can you summarize the backpacking story ?"
input_text = f"question: {query}  context: "
input_text += retrieved_chunk
input_tokens = llm_tokenizer.encode(input_text, return_tensors='pt')

# Generate the answer
output_tokens = llm_model.generate(input_tokens, 
                               max_length=200,  # Adjust as needed
                                num_return_sequences=1,
                                no_repeat_ngram_size=2,
                                temperature=2.0,
                                top_k=10,
                                top_p=0.95,
                                do_sample=True)
answer = llm_tokenizer.decode(output_tokens[0], skip_special_tokens=True)
print(answer)

Years ago, when I was backpacking across Western Europe
