In [1]:
# cell 1: Install dependencies
!pip install -q langchain langchain-community langchain-huggingface faiss-cpu sentence-transformers bitsandbytes accelerate
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install -q transformers datasets

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m49.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m58.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires requests==2.32.4, but you have requests 2.32.5 which is incompatible.[0m[31m
[0m

In [2]:
# cell 2: Import libraries
import os
import torch
from langchain.document_loaders import TextLoader, WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    BitsAndBytesConfig
)



In [3]:
# cell 3: Configuration
MODEL_NAME = "microsoft/DialoGPT-medium"
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
DOCUMENT_SOURCE = "https://en.wikipedia.org/wiki/Large_language_model"

print("Configuration set up successfully!")

Configuration set up successfully!


In [4]:
# cell 4: Load and process documents
def load_documents(source):
    if source.startswith('http'):
        loader = WebBaseLoader(source)
    else:
        loader = TextLoader(source)
    return loader.load()

documents = load_documents(DOCUMENT_SOURCE)
print(f"Loaded {len(documents)} documents")

Loaded 1 documents


In [5]:
# cell 5: Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len
)
texts = text_splitter.split_documents(documents)
print(f"Split into {len(texts)} chunks")

Split into 187 chunks


In [6]:
# cell 6: Create embeddings and vector store
embeddings = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL,
    model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
)

vector_store = FAISS.from_documents(texts, embeddings)
print("Vector store created successfully!")

  embeddings = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Vector store created successfully!


In [9]:
# cell 7: Load custom LLM
def load_custom_llm(model_name=MODEL_NAME):
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map="auto",
        trust_remote_code=True
    )

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        repetition_penalty=1.15
    )

    return HuggingFacePipeline(pipeline=pipe)

llm = load_custom_llm()
print("Custom LLM loaded successfully!")

Device set to use cuda:0


Custom LLM loaded successfully!


In [10]:
# cell 8: Create custom prompt template
prompt_template = """Use the following context to answer the question. If you don't know the answer, just say you don't know.

Context: {context}

Question: {question}

Answer: """

PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

print("Prompt template created!")

Prompt template created!


In [11]:
# cell 9: Create RAG chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(search_kwargs={"k": 3}),
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True
)

print("RAG chain created successfully!")

RAG chain created successfully!


In [12]:
# cell 10: Test function
def ask_question(question):
    result = qa_chain({"query": question})
    print(f"Question: {question}")
    print(f"Answer: {result['result']}")
    print("\nSources:")
    for doc in result['source_documents']:
        print(f"- {doc.metadata.get('source', 'Unknown')}")
    print("="*50)

In [13]:
# cell 11: Test the RAG system
questions = [
    "What are large language models?",
    "How are they trained?",
    "What are their applications?"
]

for question in questions:
    ask_question(question)

  result = qa_chain({"query": question})
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question: What are large language models?
Answer: Use the following context to answer the question. If you don't know the answer, just say you don't know.

Context: History[edit]
The number of publications about large language models by year grouped by publication types.
The training compute of notable large models in FLOPs vs publication date over the period 2010–2024. For overall notable models (top left), frontier models (top right), top language models (bottom left) and top models within leading companies (bottom right). The majority of these models are language models.
The training compute of notable large AI models in FLOPs vs publication date over the period 2017–2024. The majority of large models are language models or multimodal models with language capacity.

The training compute of notable large AI models in FLOPs vs publication date over the period 2017–2024. The majority of large models are language models or multimodal models with language capacity.
Before the emergence o

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Question: How are they trained?
Answer: Use the following context to answer the question. If you don't know the answer, just say you don't know.

Context: The Reflexion method[71] constructs an agent that learns over multiple episodes. At the end of each episode, the LLM is given the record of the episode, and prompted to think up "lessons learned", which would help it perform better at a subsequent episode. These "lessons learned" are stored as a form of long-term memory and given to the agent in the subsequent episodes.[71]
Monte Carlo tree search can use an LLM as rollout heuristic. When a programmatic world model is not available, an LLM can also be prompted with a description of the environment to act as world model.[72]

Fine-tuning[edit]
Before being fine-tuned, most LLMs are next-token predictors. The fine-tuning adjust the output of an LLM to seem more conversational via techniques like reinforcement learning from human feedback (RLHF) or constitutional AI.[46]
Instruction fin

In [14]:
# cell 12: Save the vector store
vector_store.save_local("faiss_index")
print("Vector store saved to 'faiss_index'")

Vector store saved to 'faiss_index'


In [16]:
# cell 13: Load and test saved system
def load_rag_system():
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    vector_store = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

    llm = load_custom_llm()

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever(search_kwargs={"k": 3}),
        chain_type_kwargs={"prompt": PROMPT},
        return_source_documents=True
    )
    return qa_chain

print("Testing loaded system...")
loaded_qa = load_rag_system()
result = loaded_qa({"query": "What is the main purpose of LLMs?"})
print(f"Answer: {result['result']}")

Testing loaded system...


Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Answer: Use the following context to answer the question. If you don't know the answer, just say you don't know.

Context: Extensibility[edit]
Beyond basic text generation, various techniques have been developed to extend LLM capabilities, including the use of external tools and data sources, improved reasoning on complex problems, and enhanced instruction-following or autonomy through prompting methods.

For open-ended exploration, an LLM can be used to score observations for their "interestingness", which can be used as a reward signal to guide a normal (non-LLM) reinforcement learning agent.[73] Alternatively, it can propose increasingly difficult tasks for curriculum learning.[74] Instead of outputting individual actions, an LLM planner can also construct "skills", or functions for complex action sequences. The skills can be stored and later invoked, allowing increasing levels of abstraction in planning.[74]
Multiple agents with memory can interact socially.[75]

Reasoning[edit]
LL