In [1]:
!pip install transformers torch gensim numpy scipy matplotlib



In [2]:
import torch
from transformers import AutoTokenizer, AutoModel
import gensim.downloader as api
import numpy as np
from scipy.spatial.distance import cosine
import matplotlib.pyplot as plt

In [4]:
from transformers import AutoTokenizer, AutoModel
import torch
from scipy.spatial.distance import cosine

# Load pre-trained BERT model and tokenizer using Auto classes.
# Both are created from the same 'bert-base-uncased' checkpoint name and are
# kept as module-level names because get_word_embedding() reads them directly.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

# Function to get the word embedding from BERT
def get_word_embedding(word):
    """Embed `word` using the module-level BERT `tokenizer` and `model`.

    The text is tokenized into PyTorch tensors, passed through the model with
    gradient tracking disabled, and the final hidden states are averaged
    along dim 1 before size-1 dimensions are squeezed away.

    Args:
        word: Text to embed.

    Returns:
        torch.Tensor: the mean-pooled, squeezed last hidden state.
    """
    encoded = tokenizer(word, return_tensors='pt')
    # Inference only: no autograd graph is needed for a lookup.
    with torch.no_grad():
        model_out = model(**encoded)
    # NOTE(review): presumably dim 1 is the token axis and the mean also
    # covers any special tokens the tokenizer adds -- confirm.
    pooled = model_out.last_hidden_state.mean(dim=1)
    return pooled.squeeze()

# Function to perform word arithmetic
def word_arithmetic(word1, word2, word3):
    """Compute the embedding-space expression `word1 - word2 + word3`.

    Each word is embedded with `get_word_embedding` (in argument order) and
    the three vectors are combined element-wise.

    Args:
        word1: Word whose embedding is the starting point.
        word2: Word whose embedding is subtracted.
        word3: Word whose embedding is added.

    Returns:
        The resulting vector, of the same type `get_word_embedding` returns.
    """
    first, second, third = (
        get_word_embedding(term) for term in (word1, word2, word3)
    )
    return first - second + third

# Function to find the most similar word
def find_most_similar(target_vec, word_list):
    """Pick the candidate word whose embedding is closest to `target_vec`.

    Args:
        target_vec: Vector to compare every candidate against.
        word_list: Candidate words; each one is embedded with
            `get_word_embedding`.

    Returns:
        A `(word, similarity)` tuple for the best-scoring candidate, where
        similarity is `1 - cosine(target_vec, candidate_embedding)`.

    Raises:
        IndexError: if `word_list` is empty (there is no first-ranked entry).
    """
    scored = [
        (candidate, 1 - cosine(target_vec, get_word_embedding(candidate)))
        for candidate in word_list
    ]
    # Highest similarity first; the head of the ranking is the answer.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[0]

# New examples of word arithmetic.
# Each entry is (word1, word2, word3, candidate_words): the loop below computes
# word1 - word2 + word3 and then picks the closest word from candidate_words.
examples = [
    ('tourist', 'museum', 'beach', ['holiday', 'ocean', 'resort', 'city', 'mountain']),
    ('student', 'lecture', 'lab', ['experiment', 'homework', 'research', 'professor', 'classroom']),
    ('smartphone', 'app', 'website', ['tablet', 'software', 'browser', 'internet', 'device']),
    ('musician', 'concert', 'art', ['gallery', 'performance', 'painting', 'exhibit', 'theater']),
    ('chef', 'recipe', 'restaurant', ['dish', 'menu', 'cook', 'cuisine', 'bistro'])
]

# Perform word arithmetic and find the most similar word for each example
for word1, word2, word3, options in examples:
    result_emb = word_arithmetic(word1, word2, word3)
    # The winner is chosen only among `options`, not from a full vocabulary.
    most_similar, similarity = find_most_similar(result_emb, options)
    print(f"{word1} - {word2} + {word3} is most similar to: {most_similar} (similarity: {similarity:.4f})")



tourist - museum + beach is most similar to: resort (similarity: 0.7643)
student - lecture + lab is most similar to: experiment (similarity: 0.7924)
smartphone - app + website is most similar to: internet (similarity: 0.7568)
musician - concert + art is most similar to: painting (similarity: 0.7791)
chef - recipe + restaurant is most similar to: menu (similarity: 0.7991)


In [5]:
!pip install langchain groq
!pip install langchain-groq groq
!pip install -U langchain-community
!pip install langchain langchain-community huggingface_hub faiss-cpu
!pip install sentence-transformers

Collecting langchain
  Downloading langchain-0.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groq
  Downloading groq-0.11.0-py3-none-any.whl.metadata (13 kB)
Collecting langchain-core<0.4.0,>=0.3.0 (from langchain)
  Downloading langchain_core-0.3.0-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain)
  Downloading langchain_text_splitters-0.3.0-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.121-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting httpx<1,>=0.23.0 (from groq)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->groq)
  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->groq)
  Downloading h11-0.14.0-py3-n

In [10]:

pip install langchain faiss-cpu transformers wikipedia

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11679 sha256=71e988b2e2a708a7d7e057e3bce5ef016357190694f5ea8f5fddcf9968a12e43
  Stored in directory: /root/.cache/pip/wheels/5e/b6/c5/93f3dec388ae76edc830cb42901bb0232504dfc0df02fc50de
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [11]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI  # Assuming `ChatGroq` works similarly to OpenAI models
from langchain_groq import ChatGroq
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import WikipediaLoader
import os

# Set Groq API key without hardcoding it in the notebook.
# A GROQ_API_KEY already present in the environment is left untouched (the
# previous unconditional assignment replaced it with an empty string).
# Otherwise the key is read interactively so it is never saved with the file.
if not os.environ.get("GROQ_API_KEY"):
    from getpass import getpass

    os.environ["GROQ_API_KEY"] = getpass("Enter your Groq API key: ")

# Step 1: Choose 5 articles (article titles are just examples, adjust as needed)
articles = [
    "Artificial Intelligence",
    "Quantum Computing",
    "Climate Change",
    "Ancient Civilizations",
    "Space Exploration"
]

# Step 2: Load and process each article
# Everything every loader returns is accumulated into one flat `documents` list.
# NOTE(review): WikipediaLoader appears to treat its argument as a search query
# rather than an exact page title, so one title may yield several pages --
# confirm, and limit the number of loaded docs if only one page is wanted.
documents = []
for article_title in articles:
    loader = WikipediaLoader(article_title)
    article_text = loader.load()
    documents.extend(article_text)

# Step 3: Split the documents into chunks
# chunk_size / chunk_overlap are passed straight to the splitter; consecutive
# chunks are configured to overlap by 200 so context is shared across a boundary.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs_chunks = text_splitter.split_documents(documents)

# Step 4: Create embeddings and store in a VectorDB
# The FAISS index is built from the chunks using the MiniLM sentence-transformer.
embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
vector_db = FAISS.from_documents(docs_chunks, embedding_model)

# Step 5: Initialize the Groq LLM
# No key is passed here. NOTE(review): presumably ChatGroq picks up the
# GROQ_API_KEY environment variable set earlier in this cell -- confirm.
llm = ChatGroq(model_name="mixtral-8x7b-32768")  # Ensure `ChatGroq` is compatible with `langchain`

# Step 6: Load the QA chain using the appropriate LLM
# `qa_chain` is a module-level name that run_query() relies on.
qa_chain = load_qa_chain(llm, chain_type="stuff")

# Step 7: Define the query function using the chain and retriever
def run_query(query):
    """Answer `query` with the retrieval-augmented QA pipeline.

    Looks up the chunks most similar to the query in the module-level
    `vector_db`, then hands those chunks and the question to the
    module-level `qa_chain`.

    Args:
        query: Natural-language question.

    Returns:
        The answer text produced by the chain.
    """
    docs = vector_db.similarity_search(query)
    # `Chain.run` is deprecated (it emits a deprecation warning); `invoke`
    # takes all inputs as a single dict and returns a dict of results, with
    # the combine-documents chain's answer stored under "output_text".
    result = qa_chain.invoke({"input_documents": docs, "question": query})
    return result["output_text"]

# Step 8: Define the queries to run on the RAG system (5 are listed here)
queries = [
    "What are the latest advancements in renewable energy technology?",
    "Can you explain the concept of quantum entanglement?",
    "What are the primary effects of global warming on marine life?",
    "How did ancient civilizations contribute to modern mathematics?",
    "What are the current goals of international space agencies?"
]

# Step 9: Run each query and record results
# Queries are numbered from 1 in the printed output.
for i, query in enumerate(queries, 1):
    response = run_query(query)
    print(f"Query {i}: {query}")
    print(f"Response: {response}\n")



  lis = BeautifulSoup(html).find_all('li')
  embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

stuff: https://python.langchain.com/v0.2/docs/versions/migrating_chains/stuff_docs_chain
map_reduce: https://python.langchain.com/v0.2/docs/versions/migrating_chains/map_reduce_chain
refine: https://python.langchain.com/v0.2/docs/versions/migrating_chains/refine_chain
map_rerank: https://python.langchain.com/v0.2/docs/versions/migrating_chains/map_rerank_docs_chain

See also guides on retrieval and question-answering here: https://python.langchain.com/v0.2/docs/how_to/#qa-with-rag
  qa_chain = load_qa_chain(llm, chain_type="stuff")
  result = qa_chain.run(input_documents=docs, question=query)


Query 1: What are the latest advancements in renewable energy technology?
Response: The latest advancements in renewable energy technology include:

1. Floating solar panels: These solar panels are installed on water bodies such as reservoirs, lakes, and lagoons, which can help to reduce evaporation and algae growth. They are especially useful in areas with limited land availability.

2. Perovskite solar cells: These are a type of thin-film solar cell that can be made using low-cost materials and can be printed or rolled onto flexible surfaces. They have the potential to reach efficiencies comparable to traditional silicon solar cells.

3. Advanced wind turbines: Taller wind turbines with longer blades can capture more energy from the wind, making them more efficient. Floating wind turbines can be installed in deeper waters, opening up new areas for wind energy development.

4. Energy storage: Advances in battery technology, such as lithium-ion batteries, have made it possible to store