In [36]:
import os
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_core.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain_community.vectorstores import Chroma

In [37]:
import os

In [38]:
urls = [
    'https://github.com/Hannibal046/Awesome-LLM?tab=readme-ov-file#milestone-papers',
    'https://stanford-cs324.github.io/winter2022/lectures/',
    'https://stanford-cs324.github.io/winter2022/lectures/introduction/',
    'https://stanford-cs324.github.io/winter2022/lectures/capabilities/',
    'https://stanford-cs324.github.io/winter2022/lectures/data/',
    'https://stanford-cs324.github.io/winter2022/lectures/training/',
    'https://stanford-cs324.github.io/winter2022/lectures/environment/',
]

In [39]:
loaders = UnstructuredURLLoader(urls=urls)
data = loaders.load()

In [40]:
context = "\n".join(str(p.page_content) for p in data)
     

In [41]:
print("The total number of words in the context:", len(context))

The total number of words in the context: 120248


In [42]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=7000, chunk_overlap=2000)
context = "\n\n".join(str(p.page_content) for p in data)

In [43]:
texts = text_splitter.split_text(context)

In [44]:
texts[0]

"Hannibal046\n\nAwesome-LLM\n\nPublic\n\nNotifications\n    You must be signed in to change notification settings\n\nFork\n    1.3k\n\nStar\n          15.8k\n\nAwesome-LLM: a curated list of Large Language Model\n\nLicense\n\nCC0-1.0 license\n\n15.8k\n          stars\n\n1.3k\n          forks\n\nBranches\n\nTags\n\nActivity\n\nStar\n\nNotifications\n\nCode\n\nIssues\n          0\n\nPull requests\n          1\n\nActions\n\nProjects\n          0\n\nSecurity\n\nInsights\n\nCode\n\nIssues\n\nPull requests\n\nActions\n\nProjects\n\nSecurity\n\nInsights\n\nHannibal046/Awesome-LLM\n\nThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.\n\nmain\n\nBranches\n\nTags\n\nGo to file\n\nCode\n\nFolders and files\n\nName Name Last commit message Last commit date Latest commit History 433 Commits paper_list paper_list resources resources .gitignore .gitignore LICENSE.md LICENSE.md README.md README.md contributing.md contributing.md View all f

In [45]:
print(len(texts))

24


In [46]:
from dotenv import load_dotenv
import os

# Load the .env file
load_dotenv()

# Print all environment variables for debugging
print(os.environ)

environ({'ALLUSERSPROFILE': 'C:\\ProgramData', 'APPDATA': 'C:\\Users\\Lenovo\\AppData\\Roaming', 'APPLICATION_INSIGHTS_NO_DIAGNOSTIC_CHANNEL': 'true', 'BS4': ' c:\\users\\lenovo\\appdata\\local\\packages\\pythonsoftwarefoundation.python.3.9_qbz5n2kfra8p0\\localcache\\local-packages\\python39\\site-packages', 'C++': 'C:\\msys64\\mingw64\\bin', 'C++ COMPILER': 'C:\\msys64\\mingw64\\bin', 'CDS_HOME': 'C:\\cds_spb_home', 'CHOCOLATEYINSTALL': 'C:\\ProgramData\\chocolatey', 'CHOCOLATEYLASTPATHUPDATE': '133367473047977869', 'CHROME_CRASHPAD_PIPE_NAME': '\\\\.\\pipe\\crashpad_15184_ZJDLJFDWMHIOVSXU', 'COMMONPROGRAMFILES': 'C:\\Program Files\\Common Files', 'COMMONPROGRAMFILES(X86)': 'C:\\Program Files (x86)\\Common Files', 'COMMONPROGRAMW6432': 'C:\\Program Files\\Common Files', 'COMPUTERNAME': 'DEV-PC', 'COMSPEC': 'C:\\WINDOWS\\system32\\cmd.exe', 'CUDA_PATH': 'C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12.1', 'CUDA_PATH_V12_1': 'C:\\Program Files\\NVIDIA GPU Computing Toolkit\\C

In [47]:
embeddings = GoogleGenerativeAIEmbeddings(model = "models/embedding-001")

In [48]:
vector_index = Chroma.from_texts(texts, embeddings).as_retriever()

In [49]:
prompt_template = """
  Answer the question as detailed as possible from the provided context, make sure to provide all the details, if the answer is not in
  provided context just say, "answer is not available in the context"\n\n
  Context:\n {context}?\n
  Question: \n{question}\n

  Answer:
"""

prompt = PromptTemplate(template = prompt_template, input_variables = ["context", "question"])

In [50]:
llm = ChatGoogleGenerativeAI(model="gemini-pro",google_api_key=os.environ["GOOGLE_API_KEY"]) 

chain = load_qa_chain(llm, chain_type="stuff", prompt=prompt)

In [51]:
# question = "What are some milestone model architectures and papers in the last few years"
# question = "What are the layers in a transformer block"
# question = "What are trending llms"
question= "Tell me about datasets used to train LLMs and how they’re cleaned"
docs = vector_index.get_relevant_documents(question)
print(docs)


[Document(page_content='Benchmark data contamination.\n\nWhen we are evaluating the capabilities of large language models using benchmark data (e.g., question-answer pairs), it makes a difference whether the benchmark data appears in the training data of the language model. If so, then the benchmark performance will be biased up.\n\nNormally, in machine learning, data hygiene (keeping the training data separate from the test) is relatively easy, but in the case of large language models, both the training data and benchmark data are derived from the Internet, it can be difficult to a priori guarantee their separation.\n\nExample from the XSum summarization dataset:\n\nInput:\n\nThe 48-year-old former Arsenal goalkeeper played for the Royals for four years. He was appointed youth academy director in 2000 and has been director of football since 2003. A West Brom statement said: “He played a key role in the Championship club twice winning promotion to the Premier League in 2006 and 2012.\n

In [52]:
response = chain.invoke(
    {"input_documents":docs, "question": question}
    , return_only_outputs=True)
print(response['output_text'])

Datasets used to train LLMs are derived from the Internet, which makes it difficult to guarantee their separation from benchmark data. 

To address this issue, various filtering and curation techniques are employed:

**Input-and-output contamination:** Both the input and output appear in the training data. Varies from 1.87% to 24.88% (XSum is 15.49%).

**Input contamination:** The input appears in the training data. Varies from 1.8% to 53.6% (QNLI, which is derived from Wikipedia).

**WebText**. The WebText dataset was used to train GPT-2. It was created by scraping all outbound links that received at least 3 karma (upvotes) and filtering out Wikipedia.

**OpenWebText**. OpenWebText is a replication of WebText in spirit. It was created by extracting all the URLs from the Reddit submissions dataset, using Facebook's fastText to filter out non-English text, and removing near duplicates.

**Colossal Clean Crawled Corpus (C4)**. C4 is a larger dataset that was created to train the T5 model