In [2]:
# Use %pip (not !pip) so the packages are installed into the running kernel's environment.
# Versions are pinned to the ones resolved in the recorded run of this notebook;
# transformers does not appear in the captured install log, so it is left unpinned.
%pip install datasets==2.20.0 sentence-transformers==3.0.1 faiss-cpu==1.8.0.post1 transformers

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->da

In [3]:
# Import necessary libraries
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Configuration: every name/path a reader might want to change lives here
DATASET_NAME = 'rotten_tomatoes'
DATASET_SPLIT = 'train[:1000]'  # Using only first 1000 samples
EMBEDDING_MODEL_NAME = 'paraphrase-MiniLM-L3-v2'
GENERATOR_MODEL_NAME = 'sshleifer/distilbart-cnn-6-6'
FAISS_INDEX_PATH = 'faiss_index.index'
DOCUMENT_TEXTS_PATH = 'document_texts.npy'

# Load the dataset
print("Loading dataset...")
dataset = load_dataset(DATASET_NAME, split=DATASET_SPLIT)

# Initialize the sentence transformer model
print("Initializing the sentence transformer model...")
sentence_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Prepare documents and their embeddings
print("Preparing document embeddings...")
# list() guarantees a plain Python list, whatever container the column
# accessor returns, so integer lookups and np.save below behave predictably.
documents = list(dataset['text'])
embeddings = sentence_model.encode(documents, convert_to_numpy=True)

# Create and populate Faiss index
print("Creating Faiss index...")
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)
# generate_answer() looks documents up by the id returned from index.search,
# so the index must hold exactly one vector per document, in the same order.
assert index.ntotal == len(documents), "Faiss index size does not match number of documents"

# Save the Faiss index and document texts
print("Saving Faiss index and document texts...")
faiss.write_index(index, FAISS_INDEX_PATH)
np.save(DOCUMENT_TEXTS_PATH, documents)

# Load models for generation (tokenizer and model must come from the same checkpoint)
print("Loading models...")
tokenizer = AutoTokenizer.from_pretrained(GENERATOR_MODEL_NAME)
generator = AutoModelForSeq2SeqLM.from_pretrained(GENERATOR_MODEL_NAME)

# Function to generate answers
def generate_answer(query, max_length=50):
    """Retrieve the closest document for ``query`` and generate text from both.

    The query is embedded with the module-level ``sentence_model``, the single
    nearest neighbour is looked up in the module-level Faiss ``index``, and the
    query plus that document are fed to the module-level seq2seq ``generator``.

    Args:
        query: Free-text query string.
        max_length: Upper bound passed to ``generator.generate`` for the length
            of the generated sequence. Defaults to 50, the value previously
            hard-coded here.

    Returns:
        A tuple ``(answer, context)`` where ``answer`` is the decoded generated
        text and ``context`` is the retrieved document text.

    Raises:
        IndexError: If the index search yields no valid neighbour (a negative
            id), e.g. because the index is empty.
    """
    # Encode the query using the sentence transformer
    query_embedding = sentence_model.encode([query], convert_to_numpy=True)

    # Find the single most similar document
    _, neighbor_ids = index.search(query_embedding, 1)
    best_id = int(neighbor_ids[0][0])
    if best_id < 0:
        # A negative id means "no result"; without this guard Python's negative
        # indexing would silently return the *last* document instead.
        raise IndexError("No document found in the index for the given query.")
    context = documents[best_id]

    # Generate answer. truncation=True keeps an over-long query/context within
    # the model's maximum input length instead of failing inside the model.
    encoded = tokenizer(query + " Context: " + context, return_tensors='pt', truncation=True)
    generated = generator.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=max_length,
    )
    answer = tokenizer.decode(generated[0], skip_special_tokens=True)
    return answer, context

if __name__ == '__main__':
    # Smoke-test the retrieval + generation pipeline on two sample prompts.
    demo_queries = (
        "What is the sentiment of this movie review?",
        "Summarize this movie review.",
    )
    for position, query in enumerate(demo_queries):
        answer, context = generate_answer(query)
        # Every report after the first is preceded by a blank line.
        if position == 0:
            print(f"Query: {query}")
        else:
            print(f"\nQuery: {query}")
        print(f"Context: {context}")
        print(f"Answer: {answer}")

Loading dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/699k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Initializing the sentence transformer model...


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/69.6M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Preparing document embeddings...
Creating Faiss index...
Saving Faiss index and document texts...
Loading models...


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/460M [00:00<?, ?B/s]



Query: What is the sentiment of this movie review?
Context: the film is often filled with a sense of pure wonderment and excitement not often seen in today's cinema du sarcasm
Answer:  The film is often filled with a sense of pure wonderment and excitement not often seen in today's cinema du sarcasm. The film has been a hit with some of the world's most popular films. The film was released this week

Query: Summarize this movie review.
Context: this is a very fine movie -- go see it .
Answer: Summarize this movie review. Summarize: This is a very fine movie -- go see it. The movie is about a young couple who are trying to find a way to get through the tough times. The film is about
