In [1]:
import os
from dotenv import load_dotenv

# Read key/value pairs from the local 'token.env' file into the process
# environment, then pick up the access token from there.
# api_token is None when the variable is not defined.
load_dotenv(dotenv_path='token.env')
api_token = os.environ.get('api_token')


In [2]:
# Source material for the retriever: a plain-text instruction sheet.
# (An earlier variant loaded "Accelerating-Apache-Spark-3.pdf" through
# langchain.document_loaders.PyPDFLoader instead.)
from langchain.document_loaders import TextLoader

loader = TextLoader(file_path="instructions.txt")
data = loader.load()

In [3]:
# Display the raw text of the first loaded document to sanity-check the load.
data[0].page_content

'[General Gameplay]\n- Remind the player to type their actions clearly and concisely to ensure smooth gameplay.\n- Provide occasional hints, tips, or prompts to guide the player\'s decision-making process.\n- Encourage creativity and exploration while maintaining a sense of challenge and adventure.\n\n[Exploration]\n- The player finds themselves in a dense, enchanted forest teeming with mystical creatures and hidden treasures.\n- Encourage the player to explore the forest by describing the surroundings, potential encounters, and points of interest.\n- Use vivid imagery and descriptive language to immerse the player in the forest environment.\n- Provide hints or clues about nearby landmarks, valuable items, or potential dangers.\n\n[Actions]\n- Exploration: The player can move deeper into the forest to discover new areas and encounters.\n- Combat: Engaging in combat with nearby monsters is an option.\n- Rest: The player can take a break to regain health and stamina.\n- Treasure Hunting:

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter,CharacterTextSplitter

# Split on blank lines so that each "[Section]" block of the instructions
# becomes its own chunk.
#
# chunk_size=10 is intentionally far smaller than any section: every piece
# between separators already exceeds it, so the pieces are emitted one per
# chunk instead of being merged together. The splitter reports this with
# "Created a chunk of size N, which is longer than the specified 10"
# messages, which are expected here.
# NOTE(review): raising chunk_size would presumably let adjacent sections be
# merged into a single chunk — confirm before changing it.
text_splitter = CharacterTextSplitter(separator="\n\n", chunk_size=10, chunk_overlap=1)

In [5]:
# Split the loaded documents into chunks with the splitter configured above.
# The "Created a chunk of size ..., which is longer than the specified 10"
# messages printed by this cell come from the splitter's small chunk_size.
# Alternative tried earlier:
# text_splitter = RecursiveCharacterTextSplitter(separators=["forest"], chunk_size=450, chunk_overlap=0)
docs = text_splitter.split_documents(data)

Created a chunk of size 294, which is longer than the specified 10
Created a chunk of size 430, which is longer than the specified 10
Created a chunk of size 308, which is longer than the specified 10
Created a chunk of size 792, which is longer than the specified 10
Created a chunk of size 329, which is longer than the specified 10


In [6]:
# Report the number of chunks, then print each chunk's text followed by a
# two-line rule so chunk boundaries are easy to spot in the output.
dash_line = "-" * 100
print(len(docs))
for doc in docs:
    print(doc.page_content)
    print(dash_line, dash_line, sep='\n')


6
[General Gameplay]
- Remind the player to type their actions clearly and concisely to ensure smooth gameplay.
- Provide occasional hints, tips, or prompts to guide the player's decision-making process.
- Encourage creativity and exploration while maintaining a sense of challenge and adventure.
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
[Exploration]
- The player finds themselves in a dense, enchanted forest teeming with mystical creatures and hidden treasures.
- Encourage the player to explore the forest by describing the surroundings, potential encounters, and points of interest.
- Use vivid imagery and descriptive language to immerse the player in the forest environment.
- Provide hints or clues about nearby landmarks, valuable items, or potential dangers.
----------------------------------------------------------------------

In [7]:
import torch
from langchain.embeddings import HuggingFaceEmbeddings

# Sentence-embedding model used to vectorise the instruction chunks.
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA (a hard-coded 'cuda' device
# fails there).
model_kwargs = {'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
# Embeddings are left un-normalised.
encode_kwargs = {'normalize_embeddings': False}

embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

In [8]:
from langchain.vectorstores import Chroma

In [9]:
# Build (or update) the persisted Chroma collection in ./instructions.
#
# Position-based ids make this cell safe to re-run: without explicit ids a
# fresh random id is generated for every document on every run, so the same
# chunks pile up in the persisted store and the retriever starts returning
# duplicates. With stable ids a re-run overwrites the existing entries.
# NOTE(review): entries written by earlier runs without these ids are not
# removed; clear the 'instructions' directory once if duplicates already exist.
doc_ids = [f"instructions-{index}" for index in range(len(docs))]
vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    ids=doc_ids,
    persist_directory='instructions',
)

In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Checkpoint for the generator LLM and the dtype its weights are loaded in.
# Note: this rebinds model_name, which previously held the embedding model id.
model_name = "google/gemma-7b-it"
dtype = torch.bfloat16

# Both downloads are authenticated with the token read from token.env.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=api_token,
)
# device_map='auto' leaves the placement of the weights to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=api_token,
    device_map='auto',
    torch_dtype=dtype,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



In [12]:
from transformers import pipeline
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA

# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
# NOTE(review): model is passed as an instantiated object, so the torch_dtype
# in model_kwargs may have no effect here — confirm.
# NOTE(review): confirm that return_tensors='pt' is intended for a pipeline
# whose text output is consumed downstream.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    return_tensors='pt',
    max_new_tokens=100,  # requested cap on newly generated tokens per call
    model_kwargs={"torch_dtype": torch.bfloat16},
)

In [13]:
# Expose the transformers pipeline to LangChain as an LLM object.
# NOTE(review): verify that a temperature supplied via model_kwargs is
# actually applied when an existing pipeline object is passed in.
llm = HuggingFacePipeline(pipeline=pipe, model_kwargs={"temperature": 0.7})

In [14]:
# Retrieval-augmented QA chain: the retriever is asked for 3 chunks per query
# and the chain is built with the "stuff" chain type.
retriever = vectordb.as_retriever(search_kwargs={"k":3})
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)

In [15]:
# Run one question through the chain; the displayed result is a dict with the
# original 'query' and the generated 'result'.
qa.invoke("You are in enchanted woods. What can happen?")

  attn_output = torch.nn.functional.scaled_dot_product_attention(


{'query': 'You are in enchanted woods. What can happen?',
 'result': ' You are in enchanted woods. You can explore the forest, defeat monsters, and uncover hidden treasures.'}