In [None]:
import os

os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_API_KEY"]="<your-key>"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["HUGGING_FACE_API_KEY"] = "<your-key>"

In [None]:
!pip install langchain langchain-chroma chromadb langchain-huggingface sentence-transformers transformers torch accelerate

In [3]:
import pandas as pd
import os
from langchain.schema import Document
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain_chroma import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

In [None]:
from google.colab import drive
drive.mount('<your-drive-location>')


Mounted at /content/drive


In [None]:
folder_path = 'folder location in your drive - path of it'
file_list = os.listdir(folder_path)
print("Files in folder:", file_list)

csv_file = [f for f in file_list if f.endswith('.csv')][1]
# only reading 5001 rows NOT 2.5 Million rows the actual dataset can be found on kaggle: urbandict-word-defs
df_final = pd.read_csv(os.path.join(folder_path, csv_file), nrows=5001)

df_final.head()

Files in folder: ['urbandict-word-defs.csv', 'filtered_urban_data.csv']


Unnamed: 0,word,definition
0,word,definition
1,janky,undesirable; less-than optimum.
2,slumpin',"low down and funky, but [knee deep] enough to ..."
3,yayeeyay,"affirmation; suggestion of encouragement, appr..."
4,hard-core,anything out of our league that can be good or...


In [6]:
df_final.shape

(5001, 2)

In [7]:
def create_documents_from_df(df, num_rows=None):
    """
    Convert DataFrame to LangChain Document objects
    sample_size: if specified, only use first N rows (for testing)
    """
    if num_rows:
        df = df.head(num_rows)

    documents = []
    for idx, row in df.iterrows():
        # Create document content combining word and definition
        content = f"Word: {row['word']}\nDefinition: {row['definition']}"

        # Create metadata
        metadata = {
            "word": row['word'],
            "definition": row['definition'],
            "source": "urban_dictionary",
            "row_id": idx
        }

        doc = Document(page_content=content, metadata=metadata)
        documents.append(doc)

    return documents


In [9]:
document = create_documents_from_df(df_final, num_rows=5001)

In [10]:
# text splitting
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,  # Small chunks
    chunk_overlap=50,
    length_function=len,
)

split_docs = text_splitter.split_documents(document)
print(f"After splitting: {len(split_docs)} chunks")

After splitting: 5037 chunks


In [11]:
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}

embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [12]:
# vector store with HuggingFace embeddings
vector_store = Chroma.from_documents(
    documents=split_docs,
    embedding=embeddings,
    persist_directory="./urban_dict_chroma_db"  # Persist to disk
)

In [13]:
# retriever
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5}  # Return top 5 most similar documents
)

In [21]:
#  LOCAL LLM using HuggingFace

def setup_local_llm(model_choice="qwen3-0.6b"):
    """
    Set up a local LLM using HuggingFace transformers
    Optimized for Google Colab
    """

    model_options = {
        "qwen3-0.6b": "Qwen/Qwen3-0.6B",
        "qwen3-1.7b"  : "Qwen/Qwen3-1.7B"
    }

    model_name = model_options[model_choice]

    print(f"Loading {model_choice} ({model_name})...")
    print("This may take a few minutes on first run...")

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # Load model with optimizations for Colab
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,  # Use half precision to save memory
        device_map="auto",          # Automatically handle GPU/CPU placement
        trust_remote_code=True,
        low_cpu_mem_usage=True      # Optimize memory usage
    )

    # Create text generation pipeline
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,         # Reasonable length for dictionary responses
        temperature=0.2,            # Low temperature for consistent responses
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.1,
        return_full_text=False
    )

    # Wrap in LangChain
    hf_llm = HuggingFacePipeline(pipeline=pipe)

    print(f"✅ {model_choice} loaded successfully!")
    return hf_llm

In [22]:
llm = setup_local_llm() #using the default model

Loading qwen3-0.6b (Qwen/Qwen3-0.6B)...
This may take a few minutes on first run...


Device set to use cpu


✅ qwen3-0.6b loaded successfully!


In [23]:
# RAG chain with local LLM
prompt_template = """
You are an expert on Urban Dictionary slang and informal language. Use the following Urban Dictionary definitions to answer the question.

Context from Urban Dictionary:
{context}

Question: {question}

Instructions:
- Provide a clear, accurate answer based on the Urban Dictionary definitions provided
- Include the relevant slang terms and their definitions
- If multiple related terms are found, explain the differences
- Be conversational but informative
- If the context doesn't contain relevant information, say so

Answer:
"""

In [24]:
PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

In [25]:
# Create RAG chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True
)

In [26]:
def clean_response(text):
    """Clean up the model response to remove repetition and unwanted text"""
    # Split by "Answer:" and take the first meaningful response
    if "Answer:" in text:
        parts = text.split("Answer:")
        if len(parts) > 1:
            text = parts[1].strip()

    # Remove repeated sentences
    sentences = text.split('. ')
    unique_sentences = []
    seen = set()

    for sentence in sentences:
        sentence = sentence.strip()
        if sentence and sentence not in seen and len(sentence) > 10:
            unique_sentences.append(sentence)
            seen.add(sentence)

    # Join back and clean up
    cleaned = '. '.join(unique_sentences)
    if cleaned and not cleaned.endswith('.'):
        cleaned += '.'

    return cleaned

In [27]:
# Test the RAG system with LOCAL LLM
def query_urban_dictionary(question):
    """Query the Urban Dictionary RAG system using local LLM"""
    result = qa_chain.invoke({"query": question})  # Using invoke instead of deprecated __call__

    print(f"Question: {question}")
    print(f"Answer: {result['result']}")
    print("\nSource documents:")
    for i, doc in enumerate(result['source_documents']):
        print(f"{i+1}. Word: {doc.metadata['word']}")
        print(f"   Definition: {doc.metadata['definition'][:100]}...")
    print("-" * 50)

    return result

# Test queries
print("\n" + "="*50)
print("TESTING THE RAG SYSTEM")
print("="*50)

test_queries = [
    "What does 'janky' mean?",
    "Tell me about words related to being cool or awesome",
]

for query in test_queries:
    query_urban_dictionary(query)


TESTING THE RAG SYSTEM
Question: What does 'janky' mean?
Answer: The word "janky" means undesirable or far from being optimal. It also refers to something that's messed up or not perfect. The term "stanky" is used for qualities of uncleanliness or as a prostitute. "Grimy" means doing something messed up, often in a humorous way.
Answer:

**Answer:**

The word "janky" means undesirable or far from being optimal. It also refers to something that's messed up or not perfect. The term "stanky" describes qualities of uncleanliness or as a prostitute. "Grimy" means doing something messed up, often in a humorous way. 

If you have more questions about these terms, feel free to ask! 😊
```json
{
  "answer": "The word 'janky' means undesirable or far from being optimal. It also refers to something that's messed up or not perfect. The term'stanky' describes qualities of uncleanliness or as a prostitute. 'Grimy' means doing something messed up, often in a humorous way."
}
``` 
```json
{
  "answer"