In [None]:
pip install transformers torch langchain-community


In [2]:
import os
import warnings
warnings.filterwarnings("ignore")

import torch

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.prompts.prompt import PromptTemplate
from langchain.chains import LLMChain
from langchain.llms import HuggingFacePipeline

from huggingface_hub import login



In [3]:
# Read the .env file one directory above this notebook and export its entries
# as environment variables; the boolean result of the call is the cell output.
from dotenv import load_dotenv
load_dotenv(verbose=True, dotenv_path="../.env")




True

In [4]:
# Look up the Hugging Face access token in the process environment.
# The value is None when HF_TOKEN has not been defined.
hf_token = os.environ.get("HF_TOKEN")

In [5]:
# Authenticate this session against the Hugging Face Hub with the token read above
login(hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [6]:
# Folder (relative to this notebook) holding the PDF files to be summarised
directory_path = "../data/Training_docs/"



In [7]:
# Build a loader pointed at the PDF folder; nothing is read from disk until .load() is called
directory_loader = PyPDFDirectoryLoader(path=directory_path)



In [8]:
# Read every PDF found by the directory loader into a list of LangChain documents
documents = directory_loader.load()

# Fail fast when nothing was loaded (wrong path, empty folder, unreadable PDFs):
# otherwise the later cells would build an empty prompt and summarise nothing.
if not documents:
    raise ValueError(
        "No documents were loaded; check that the configured directory exists "
        "and contains readable PDF files."
    )



In [9]:
# Concatenate the text of every loaded document, newline-separated, into one string
text_data = "\n".join(page.page_content for page in documents)



In [10]:
# Prompt text for the summarisation request. `{information}` is the placeholder
# that the prompt template later fills with the document text.
query = (
    "\n"
    "    Given the information {information}, please tell me the most important points in the text.\n"
    "    Provide a summary of the text in 5 bullet points.\n"
    "    "
)



In [11]:
# Wrap the query text in a LangChain prompt whose single variable is "information"
prompt_template = PromptTemplate(template=query, input_variables=["information"])




In [None]:
# Load the Llama 3 tokenizer and model weights from the Hugging Face Hub.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# `token=` is the supported way to authenticate the download; the previous
# `use_auth_token=` keyword is deprecated in transformers.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=hf_token,
    torch_dtype=torch.float16,  # load the weights in half precision
    device_map="auto",  # automatic device placement (requires the `accelerate` package)
)



Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]Error while downloading from https://cdn-lfs-us-1.hf.co/repos/55/ac/55acddbb5c2ac2041b89a858eeba82e6130c6160294d75fe51bfa8bd7a4e4518/d8cf9c4d0dd972e1a2131bfe656235ee98221679711a3beef6d46dadf0f20b5c?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model-00001-of-00004.safetensors%3B+filename%3D%22model-00001-of-00004.safetensors%22%3B&Expires=1743971357&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0Mzk3MTM1N319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzU1L2FjLzU1YWNkZGJiNWMyYWMyMDQxYjg5YTg1OGVlYmE4MmU2MTMwYzYxNjAyOTRkNzVmZTUxYmZhOGJkN2E0ZTQ1MTgvZDhjZjljNGQwZGQ5NzJlMWEyMTMxYmZlNjU2MjM1ZWU5ODIyMTY3OTcxMWEzYmVlZjZkNDZkYWRmMGYyMGI1Yz9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=FBrwHNOkBx7l99GXKwVtOjeSdD6x3NOxhVv5c%7E-ELsX7oYkcq3j67FTU0Qi21dsla-txeTkySPgC42y1kRUjIgkUfjiy6hj04aBgLSEvWv6bfNsqgSsDrHD0fyaz5sju9VaCbrIMVnhlkDwuENgZj8cJEsky%7E4SIUC4B6E8%7EX

In [None]:
# Create a text generation pipeline around the loaded model and tokenizer.
text_generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    # Budget for *generated* tokens only. The previous `max_length=1024` also
    # counted the prompt, so a prompt holding whole documents left no room for
    # the summary itself.
    max_new_tokens=512,
    # Request sampling explicitly so that temperature / top_p are applied.
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.1,
)



In [None]:
# Expose the transformers pipeline through LangChain's LLM interface so it can drive a chain
llm = HuggingFacePipeline(
    pipeline=text_generation_pipeline,
)



In [None]:
# Tie the prompt template and the wrapped model together into one runnable chain
chain = LLMChain(prompt=prompt_template, llm=llm)



In [None]:
# Run the chain, supplying the combined document text for the "information" placeholder
output = chain.invoke({"information": text_data})



In [None]:
# Show only the generated summary.
# LLMChain.invoke returns a dict that echoes the inputs (here the full document
# text) next to the model's answer, which is stored under the "text" key by
# default. Printing the whole dict would flood the cell with the source corpus.
summary = output["text"] if isinstance(output, dict) else output
print(summary)