In [247]:
from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import AutoTokenizer, pipeline
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document
from transformers.data.processors.squad import SquadExample

In [248]:
import sys
print(sys.executable)

C:\Users\dkmsf\AppData\Local\Programs\Python\Python312\python.exe


In [249]:
# Load data from a file
file_path = "file_path.txt"  # Specify the path to your file
with open(file_path, "r") as file:
    text = file.read()

# Split the text into documents using RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
# Create a Document object with the text content
document = Document(text)

# Split the document into chunks using RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
docs = text_splitter.split_documents([document])

In [250]:
docs[0]

Document(page_content='Cheesemaking is the craft of making cheese. It involves a series of steps such as curdling milk, draining whey, and pressing the curd. The process varies depending on the type of cheese being made and can take anywhere from a few hours to several months. Some popular types of cheese include cheddar, mozzarella, and brie.')

In [251]:
modelPath = "sentence-transformers/all-MiniLM-l6-v2"
model_kwargs = {'device':'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,     # Provide the pre-trained model's path
    model_kwargs=model_kwargs, # Pass the model configuration options
    encode_kwargs=encode_kwargs # Pass the encoding options
)

In [252]:
text = "This is a test document."
query_result = embeddings.embed_query(text)
query_result[:3]

[-0.03833853825926781, 0.1234646588563919, -0.028642965480685234]

In [253]:
db = FAISS.from_documents(docs, embeddings)

In [254]:
question = "What is cheesemaking?"
searchDocs = db.similarity_search(question)
print(searchDocs[0].page_content)

Cheesemaking is the craft of making cheese. It involves a series of steps such as curdling milk, draining whey, and pressing the curd. The process varies depending on the type of cheese being made and can take anywhere from a few hours to several months. Some popular types of cheese include cheddar, mozzarella, and brie.


In [255]:
# Using "Intel/dynamic_tinybert" tokenizer.
tokenizer = AutoTokenizer.from_pretrained("Intel/dynamic_tinybert")

# Create a question-answering model object by loading the pretrained "Intel/dynamic_tinybert" model.
model = AutoModelForQuestionAnswering.from_pretrained("Intel/dynamic_tinybert")

In [256]:
# Specify the model name you want to use
model_name = "Intel/dynamic_tinybert"

# Load the tokenizer associated with the specified model
tokenizer = AutoTokenizer.from_pretrained(model_name, padding=True, truncation=True, max_length=512)

question_answerer = pipeline(
    "question-answering", 
    model=model_name, 
    tokenizer=tokenizer,
    return_tensors='pt'
)
llm = HuggingFacePipeline(
    pipeline=question_answerer,
    model_kwargs={"temperature": 0.7, "max_length": 512},
)

In [257]:
retriever = db.as_retriever()

In [258]:
docs = retriever.get_relevant_documents("What is Cheesemaking?")
print(docs[0].page_content)

Cheesemaking is the craft of making cheese. It involves a series of steps such as curdling milk, draining whey, and pressing the curd. The process varies depending on the type of cheese being made and can take anywhere from a few hours to several months. Some popular types of cheese include cheddar, mozzarella, and brie.


In [259]:
retriever = db.as_retriever(search_kwargs={"k": 4})
question = "Who is Thomas Jefferson?"

input_example = SquadExample(
    qas_id="1",
    question_text=question,
    context_text="""Cheesemaking is the craft of making cheese. It involves a series of steps such as curdling milk, draining whey, and pressing the curd. The process varies depending on the type of cheese being made and can take anywhere from a few hours to several months. Some popular types of cheese include cheddar, mozzarella, and brie.""",
    answer_text=None,
    start_position_character=None,
    title="Cheese"
)

qa = RetrievalQA.from_chain_type(llm=llm, chain_type="refine", retriever=retriever, return_source_documents=False)

# Extract question text from the input_example
question_text = input_example.question_text

# Ensure text is a string before calling replace
if isinstance(question_text, str):
    question_text = question_text.replace("\n", " ")

# Create an instance of HuggingFaceEmbeddings
huggingface_embeddings = HuggingFaceEmbeddings(model_name="distilbert-base-uncased")

# Pass the question text to the embed_documents method of HuggingFaceEmbeddings
result = huggingface_embeddings.embed_documents([question_text])[0]

print(result)

No sentence-transformers model found with name distilbert-base-uncased. Creating a new one with MEAN pooling.


[-0.04896394535899162, -0.006729791406542063, -0.3647022545337677, 0.004044950008392334, 0.3382938802242279, -0.2157203108072281, 0.2625890374183655, 0.5500601530075073, -0.2230548858642578, -0.21030808985233307, 0.16781382262706757, -0.3424034118652344, -0.33403629064559937, 0.30805960297584534, -0.2800794243812561, 0.13385149836540222, 0.12795516848564148, 0.07784637063741684, -0.553861141204834, 0.18284986913204193, -0.14413373172283173, 0.15191903710365295, -0.07032287865877151, 0.06462229788303375, 0.18543173372745514, 0.08218877762556076, 0.029099229723215103, -0.379950612783432, 0.05148771032691002, -0.11618591099977493, 0.18356195092201233, 0.053147170692682266, -0.20872782170772552, 0.014008245430886745, -0.43047019839286804, -0.07080668210983276, 0.1269363909959793, 0.12476851791143417, -0.09446483105421066, 0.4293730854988098, -0.8170519471168518, -0.3816610276699066, 0.05571054667234421, 0.3743886649608612, -0.366502970457077, -0.2664715051651001, 0.2044128030538559, 0.1299

In [272]:
qa = pipeline("question-answering")

# Define the question and context
question = "Explain Cheesemaking?"
context = """Cheesemaking is the craft of making cheese. It involves a series of steps such as curdling milk, draining whey, and pressing the curd. The process varies depending on the type of cheese being made and can take anywhere from a few hours to several months. Some popular types of cheese include cheddar, mozzarella, and brie."""

# Create a SquadExample object
input_example = SquadExample(
    qas_id="1",
    question_text=question,
    context_text=context,
    answer_text=None,
    start_position_character=None,
    title="Cheese"
)

# Run the question answering pipeline with the example wrapped in a list
result = qa([input_example])
print(result)

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


{'score': 0.07902113348245621, 'start': 82, 'end': 133, 'answer': 'curdling milk, draining whey, and pressing the curd'}
