# Retriever

In [1]:
import torch
from sentence_transformers import SentenceTransformer
from torch.nn import DataParallel
import numpy as np

class MistralEmbedder:
    """
    Thin wrapper around a SentenceTransformer model exposing the
    `embed_documents` / `embed_query` pair of methods.

    Both methods return plain Python lists of floats (not numpy arrays), so the
    results can be handed to code that expects list-based embeddings.
    """

    def __init__(self, model_name="Salesforce/SFR-Embedding-Mistral", device='cuda'):
        """
        Load the embedding model.

        Parameters:
        - model_name (str): Model identifier handed to SentenceTransformer.
        - device (str): Device string handed to SentenceTransformer unchanged
          (defaults to 'cuda').
        """
        self.model = SentenceTransformer(model_name, device=device)

    @staticmethod
    def _as_list(embeddings):
        """Turn a numpy result into (nested) Python lists; pass anything else through unchanged."""
        if isinstance(embeddings, np.ndarray):
            return embeddings.tolist()
        return embeddings

    def embed_documents(self, texts):
        """
        Generate embeddings for a list of text documents using the Mistral model.

        Parameters:
        - texts (List[str] or str): Texts to embed. A single string is treated
          as a one-element list.

        Returns:
        - List[List[float]]: One embedding (a list of floats) per input text,
          in input order. An empty input yields an empty list.
        """
        # Ensure texts is a concrete list for batch processing
        if isinstance(texts, str):
            texts = [texts]
        else:
            texts = list(texts)

        # Nothing to embed: skip the model call entirely
        if not texts:
            return []

        # Ask for numpy output explicitly so the list conversion below always applies
        embeddings = self.model.encode(texts, convert_to_numpy=True)
        return self._as_list(embeddings)

    def embed_query(self, query):
        """
        Generate an embedding for a single query string using the Mistral model.

        Parameters:
        - query (str): The query string to embed.

        Returns:
        - List[float]: The embedding of the query as a list of floats.
        """
        embedding = self.model.encode(query, convert_to_numpy=True)
        return self._as_list(embedding)

# Initialize the embedding wrapper with its default arguments
# (model "Salesforce/SFR-Embedding-Mistral", device 'cuda').
# The model is loaded here, so this cell can take a while to finish.
mistral_embedder = MistralEmbedder()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

# Generator

In [2]:
from langchain_community.llms import LlamaCpp


n_gpu_layers = 1  # With Metal, setting this to 1 is enough.
n_batch = 512  # Should be between 1 and n_ctx; take the amount of RAM on your Apple Silicon chip into account.

# Make sure the model path is correct for your system!
# NOTE(review): n_ctx is 2048 while the loader log reports llama.context_length = 4096 —
# confirm the smaller window is intentional.
llm = LlamaCpp(
    model_path="llama-2-7b-chat.Q6_K.gguf",
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    n_ctx=2048,
    f16_kv=True,  # MUST be set to True, otherwise you will run into problems after a couple of calls
    verbose=True,
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from llama-2-7b-chat.Q6_K.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 32

# Vector Store

In [4]:
from langchain_community.vectorstores import Chroma
# Build the Chroma vector store on the "MistralEmbed/" persist directory, using
# mistral_embedder to embed both stored documents and incoming queries.
vectorstore = Chroma(persist_directory="MistralEmbed/", embedding_function=mistral_embedder)

In [5]:
# Smoke-test retrieval: fetch the stored documents most similar to a sample question
# and print them to eyeball the quality of the matches.
query = "What is The course 05120?"
docs = vectorstore.similarity_search(query)
print(docs)

[Document(page_content='/0/51/1/10/2/8'), Document(page_content='/0/51/1/10/2/8', metadata={'source': 'Web Scholar PDFs/e01515c6138bc525f7aec30fc85f2adf028d4156.pdf'}), Document(page_content='(cid:51) Takeaway:'), Document(page_content='(cid:51) Takeaway:', metadata={'source': 'Web Scholar PDFs/8aa98fbfb6f1e979dead13ce24075503fe47658e.pdf'})]


In [6]:
import pickle

def load_pickle(file_path):
    """Read the pickle stored at `file_path` and return the unpickled object."""
    # SECURITY: unpickling can execute arbitrary code — only use on trusted files.
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)

def find_new_documents(new_data_file, old_data_file):
    """
    Compare two pickle files and collect what is only in the newer one.

    Both files are loaded with `load_pickle`. Every entry of the new data that
    has no equal entry in the old data is returned, in its original order.
    """
    # Read the newer snapshot first, then the older one
    candidates = load_pickle(new_data_file)
    known = load_pickle(old_data_file)

    # Keep only the entries the old snapshot does not already contain
    unseen = []
    for entry in candidates:
        if entry in known:
            continue
        unseen.append(entry)
    return unseen

# Adjust these file paths to match your pickle file locations.
# The "new" file is compared against the "old" file; entries only present in the new one are kept.
new_data_file_path = 'final_data_splits.pkl'
old_data_file_path = 'splitDocuments.pkl'

# Find the entries that exist in the new pickle but not in the old one
new_documents = find_new_documents(new_data_file_path, old_data_file_path)

# Report how many new top-level entries were found.
# NOTE(review): the inspection cell further down shows each entry is itself a list of
# Document objects, so this count is of groups, not individual documents.
print(f"Found {len(new_documents)} new documents.")

Found 5 new documents.


In [7]:
# Collapse the nested structure: every entry of new_documents is iterated and its
# members are gathered into one flat list.
new_docs_flat = [document for group in new_documents for document in group]

# Push the flattened documents into the vector store; a failure is reported, not raised.
try:
    added_document_ids = vectorstore.add_documents(new_docs_flat)
    print(f"Successfully added {len(added_document_ids)} new documents.")
except Exception as err:
    print(f"Error adding new documents: {err}")

Successfully added 36384 new documents.


In [18]:
# Peek at the leading entries of new_documents to learn their type and contents
for i, doc in enumerate(new_documents[:5]):  # widen or narrow the slice as needed
    print(f"Document {i}: Type {type(doc)}")
    # Dicts are summarised by their keys; anything else by the first 100 characters of str(doc)
    print(f" - Keys: {list(doc.keys())}" if isinstance(doc, dict) else f" - Content: {str(doc)[:100]}")


Document 0: Type <class 'list'>
 - Content: [Document(page_content='3 2 0 2\n\nl u J\n\n0 1\n\n] L C . s c [\n\n2 v 5 8 1 8 1 . 5 0 3 2 : v i X 
Document 1: Type <class 'list'>
 - Content: [Document(page_content='Sweepstakes Slang\n\nBuggy: Vehicle being raced and also a nickname for the 
Document 2: Type <class 'list'>
 - Content: [Document(page_content='Who founded Carnegie Mellon University?\n\nCarnegie Technical Schools was fo
Document 3: Type <class 'list'>
 - Content: [Document(page_content="The course 05120, 'Introduction to Human-Computer Interaction,' offers 5 uni
Document 4: Type <class 'list'>
 - Content: [Document(page_content='Course: 05291\n\nCourse_Title: Learning Media Design\n\nUnits: 12\n\nSec: A\


In [14]:
from langchain.docstore.document import Document

# Create a list of lists, each containing one Document object.
# This mirrors the nested list-of-lists shape of new_documents with five small,
# easily recognisable placeholder entries.
placeholder_docs_nested = [[Document(page_content=f"Placeholder text for document {i}")] for i in range(5)]

In [15]:
# Peek at the leading placeholder entries to confirm their type and contents
for i, doc in enumerate(placeholder_docs_nested[:5]):  # widen or narrow the slice as needed
    print(f"Document {i}: Type {type(doc)}")
    # Dicts are summarised by their keys; anything else by the first 100 characters of str(doc)
    print(f" - Keys: {list(doc.keys())}" if isinstance(doc, dict) else f" - Content: {str(doc)[:100]}")

Document 0: Type <class 'list'>
 - Content: [Document(page_content='Placeholder text for document 0')]
Document 1: Type <class 'list'>
 - Content: [Document(page_content='Placeholder text for document 1')]
Document 2: Type <class 'list'>
 - Content: [Document(page_content='Placeholder text for document 2')]
Document 3: Type <class 'list'>
 - Content: [Document(page_content='Placeholder text for document 3')]
Document 4: Type <class 'list'>
 - Content: [Document(page_content='Placeholder text for document 4')]


In [17]:
# Flatten the nested placeholder list here, so this cell does not depend on a
# later cell having been executed first (on a fresh kernel the name would
# otherwise be undefined and the except branch would hide the NameError).
placeholder_docs_flat = [doc for sublist in placeholder_docs_nested for doc in sublist]

# Add the flat list of Document objects to the vector store; report failures instead of raising.
try:
    added_document_ids = vectorstore.add_documents(placeholder_docs_flat)
    print(f"Successfully added {len(added_document_ids)} placeholder documents.")
except Exception as e:
    print(f"Error adding placeholder documents: {e}")

Successfully added 5 placeholder documents.


In [16]:
# Flatten the list of lists into a single list of Document objects.
# Run this before any cell that reads placeholder_docs_flat.
placeholder_docs_flat = [doc for sublist in placeholder_docs_nested for doc in sublist]


In [11]:
from langchain.docstore.document import Document

# new_documents holds groups (lists) of Document objects, as the inspection output
# in this notebook shows. Stringifying a whole group would store the repr of a list
# as one document's text, so walk into each group instead:
#   - existing Document objects are kept as they are (content and metadata intact);
#   - any other item is wrapped in a Document using its string form;
#   - an entry that is not a list is treated as a group of one.
# NOTE(review): these are the same documents the earlier flatten-and-add cell
# inserts; adding both will store duplicates — confirm which path is intended.
new_doc_objects = [
    item if isinstance(item, Document) else Document(page_content=str(item))
    for group in new_documents
    for item in (group if isinstance(group, list) else [group])
]



In [None]:
# Add the Document objects built in new_doc_objects to the vector store.
# Any exception is caught and printed, so a failure here does not stop the notebook.
try:
    added_document_ids = vectorstore.add_documents(new_doc_objects)
    print(f"Successfully added {len(added_document_ids)} documents.")
except Exception as e:
    print(f"Error adding documents: {e}")

In [None]:

from langchain.docstore.document import Document

# Template cell: wrap raw text strings in Document objects and add them to `vectorstore`
# (the initialized vector store object).
# NOTE(review): the code below reads `docs_texts`, which is not defined in any visible
# cell (the original comment mentioned `new_documents` instead). Define `docs_texts`
# as a list of strings before running this cell.

# Convert each text string into a Document object
docs_to_add = [Document(page_content=text) for text in docs_texts]

# Now, add these Document objects to your vector store
added_document_ids = vectorstore.add_documents(docs_to_add)

# Report how many ids the vector store returned for the added documents
print(f"Added {len(added_document_ids)} documents.")

In [4]:
from langchain.docstore.document import Document
# Manual entry point: replace the placeholder string with the text you want to store,
# then wrap it in a single Document.
text = "..... put the text you copy pasted here......"
doc = Document(page_content=text)

In [5]:
# add_documents takes a list, so wrap the single Document before adding it.
# The call is the last expression of the cell, so its return value (the list of
# ids assigned to the added documents) is displayed as the cell output.
new_docs = [doc]
vectorstore.add_documents(
    new_docs
)

['1cf85403-fd3d-4d47-9538-eecd2d964338']

In [None]:
# Fetch a stored record by id. `get` takes the ids as a list of id strings via the
# `ids` keyword; passing a dict positionally does not select anything meaningful.
# The id below is the one printed by the add_documents cell in this notebook —
# ids are generated per insert, so replace it with the id from your own run.
vectorstore.get(ids=['1cf85403-fd3d-4d47-9538-eecd2d964338'])

In [10]:
from langchain import hub
from langchain_core.runnables import RunnablePassthrough, RunnablePick
from langchain_core.prompts.chat import HumanMessagePromptTemplate, PromptTemplate

# Pull the shared RAG prompt from the hub, then replace its messages with a single
# human message that asks for a one-sentence, concise answer. The template uses the
# same two variables as the chain feeds in: {question} and {context}.
rag_prompt = hub.pull("rlm/rag-prompt")
prompt = HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use one sentence maximum and keep the answer CONCISE. Keep the answer CONCISE.\nQuestion: {question} \nContext: {context} \nAnswer:"))
rag_prompt.messages = [prompt]

In [11]:
# Show the final prompt object to confirm the custom message replaced the pulled one.
print(rag_prompt)

input_variables=['context', 'question'] metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use one sentence maximum and keep the answer CONCISE. Keep the answer CONCISE.\nQuestion: {question} \nContext: {context} \nAnswer:"))]


In [13]:
# Expose the vector store as a retriever (default settings) for use in the QA chain.
retriever = vectorstore.as_retriever()

In [14]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
# Chain
def format_docs(docs):
    """Join the `page_content` of every document into one string, separated by blank lines."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

# Inputs for the prompt: "context" is the retrieved documents rendered by format_docs,
# "question" is the incoming value passed through untouched.
_chain_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}

# Pipeline: build prompt inputs -> fill the prompt -> run the LLM -> parse to a plain string.
qa_chain = _chain_inputs | rag_prompt | llm | StrOutputParser()

In [17]:
import os
from tqdm import tqdm


# Load questions from questions.txt (one question per line; lines are kept in order
# so that line N of the answer file corresponds to line N of the question file)
with open("SubmissionData/train/questions.txt", "r") as f:
    questions = [line.strip() for line in f]

# Output directory for answers
output_dir = "SubmissionData/system_outputs/"
answer_file = os.path.join(output_dir, "MistralEmbeddingsTrain.txt")

# Create the output directory up front; opening the answer file for writing
# would otherwise fail if the folder does not exist yet.
os.makedirs(output_dir, exist_ok=True)

# Run the question-answering loop and save answers
answers = []
with open(answer_file, "w") as f:
    for question in tqdm(questions, desc="Answering questions"):
        response = qa_chain.invoke(question)
        # Strip newlines so every answer occupies exactly one line of the file
        f.write(response.replace("\n","") + "\n")
        # The run is long; flush so finished answers are on disk if it is interrupted
        f.flush()
        answers.append(response)

Answering questions:   0%|          | 0/315 [00:00<?, ?it/s]Llama.generate: prefix-match hit

llama_print_timings:        load time =    4891.83 ms
llama_print_timings:      sample time =      13.26 ms /    38 runs   (    0.35 ms per token,  2865.55 tokens per second)
llama_print_timings: prompt eval time =   10371.75 ms /   493 tokens (   21.04 ms per token,    47.53 tokens per second)
llama_print_timings:        eval time =    6017.01 ms /    37 runs   (  162.62 ms per token,     6.15 tokens per second)
llama_print_timings:       total time =   16541.93 ms /   530 tokens
Answering questions:   0%|          | 1/315 [00:16<1:27:06, 16.65s/it]Llama.generate: prefix-match hit

llama_print_timings:        load time =    4891.83 ms
llama_print_timings:      sample time =      21.73 ms /    50 runs   (    0.43 ms per token,  2301.07 tokens per second)
llama_print_timings: prompt eval time =    7492.29 ms /   351 tokens (   21.35 ms per token,    46.85 tokens per second)
llama_print_timings: