# Installing Libraries

In [None]:
!pip install langchain langchain-openai langchain-community langchain-chroma python-dotenv sentence-transformers faiss-cpu scikit-learn plotly transformers


Collecting langchain-openai
  Downloading langchain_openai-0.3.18-py3-none-any.whl.metadata (2.3 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.24-py3-none-any.whl.metadata (2.5 kB)
Collecting langchain-chroma
  Downloading langchain_chroma-0.2.4-py3-none-any.whl.metadata (1.1 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting langchain-core<1.0.0,>=0.3.58 (from langchain)
  Downloading langchain_core-0.3.61-py3-none-any.whl.metadata (5.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx

# Imports

In [150]:
# Langchain Family
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import ChatOpenAI, OpenAI
from langchain_chroma import Chroma
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings


# Basic Libraries
import os
import glob
import numpy as np

# Metrics & Visualization
from sklearn.manifold import TSNE
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
from IPython.display import Markdown, display
from sklearn.metrics.pairwise import cosine_similarity
import faiss


In [151]:
# Fetch the OpenAI key stored under the name 'OPENAI_TOKEN' via google.colab's userdata
from google.colab import userdata

openai_token = userdata.get('OPENAI_TOKEN')

# Report the outcome; on success only the first two characters of the key are
# echoed, followed by a fixed mask
if not openai_token:
    print("Failed to get OpenAI Token")
else:
    print("OpenAI Token Retrieved Successfully")
    print(f"{openai_token[:2]}-xxx-xxx")

OpenAI Token Retrieved Successfully
sk-xxx-xxx


# Loading & Chunking Documents

In [152]:
# Load every .txt file of the knowledge base with LangChain's loaders.
# The glob pattern below has no wildcard, so it yields the folder itself when it
# exists; the recursive "**/*.txt" pattern handed to DirectoryLoader is what
# descends into sub-folders.
folders = glob.glob("knowledge-base2/")
text_loader_kwargs = {'encoding': 'utf-8'}

# Collected documents, each tagged with a doc_type derived from its file name
documents = []

for folder in folders:
    loader = DirectoryLoader(
        folder,
        glob="**/*.txt",
        loader_cls=TextLoader,
        loader_kwargs=text_loader_kwargs,
    )
    for doc in loader.load():
        # doc_type = source file name without its directory and extension
        source_name = os.path.basename(doc.metadata["source"])
        doc.metadata["doc_type"] = os.path.splitext(source_name)[0]
        documents.append(doc)

# Number of documents loaded
len(documents)

51

In [153]:
# Inspect the metadata (source path and derived doc_type) of one loaded document, index 1
documents[1].metadata

{'source': 'knowledge-base2/Genetic Engineering.txt',
 'doc_type': 'Genetic Engineering'}

In [154]:
# Split the documents into overlapping chunks, cutting on newline characters.
# Target size is 500 characters with 50 characters of overlap between neighbours.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=500,
    chunk_overlap=50,
)
chunks = text_splitter.split_documents(documents)



In [155]:
# Number of chunks produced by the splitter
len(chunks)

222

In [156]:
# Inspect one chunk (fixed index 5) to see its metadata and page content
chunks[5]

Document(metadata={'source': 'knowledge-base2/Genetic Engineering.txt', 'doc_type': 'Genetic Engineering'}, page_content='One of the most widely used techniques in genetic engineering is gene cloning, which involves making many copies of a specific gene. Another powerful tool is CRISPR-Cas9, a revolutionary gene editing technology that allows for precise modifications to the DNA sequence in living cells. Genetic engineering has numerous applications. In medicine, it is used to produce therapeutic proteins (e.g., insulin), develop gene therapies for genetic disorders, and create genetically modified cells for immunotherapy. In agriculture, it is used to develop crops with increased yields, enhanced nutritional value, and resistance to pests and herbicides. While offering immense potential, genetic engineering also raises ethical and safety concerns that need to be carefully considered and regulated.')

In [157]:
# List the distinct doc_type labels carried by the chunks.
# A set has no guaranteed iteration order, so the labels are sorted before
# printing to keep the output stable from run to run.
doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)
print(f"Document types found: {', '.join(sorted(doc_types))}")

Document types found: machine_learning, Numerical Methods, genetic_engineering, pharmacokinetics, Probability and Statistics, Ecology, database_systems, Electromagnetism, Pharmaceutical Chemistry, Software Engineering, structural_engineering, thermodynamics_1, fluid_dynamics, Artificial Intelligence, Materials Science Engineering, Genetics, quantum_computing, Pharmacology, Chemical Thermodynamics, organic_chemistry, neural_networks, differential_equations, optics, Protein Chemistry, Linear algebra, immunology_1, Molecular biology, polymer_science, molecular_biology, Physical Chemistry, Numerical Analysis, Quantum Mechanics, Immunology, Biomaterials Engineering, Thermodynamics, Analytical Chemistry, Biotechnology, astrophysics, cryptography, Data structures, biochemistry, Clinical Pharmacy, Semiconductor Physics, Fluid mechanics, nanotechnology, Control Systems Engineering, Renewable energy, Genetic Engineering, semiconductor_physics, Organic Chemistry, Calculus_


In [158]:
# Plain keyword search (no embeddings): print every chunk whose text contains
# the case-sensitive substring 'AI'
for chunk in chunks:
    if 'AI' not in chunk.page_content:
        continue
    print(chunk.metadata['source'], chunk)
    print("--------------------------------------")

knowledge-base2/Artificial Intelligence.txt page_content='Artificial Intelligence (AI) in computer science is the field dedicated to creating systems that can perform tasks that typically require human intelligence. This includes learning from experience, understanding natural language, recognizing patterns, solving problems, and making decisions. The goal of AI is to develop intelligent agents, which are systems that can perceive their environment and take actions to maximize their chances of achieving a goal.' metadata={'source': 'knowledge-base2/Artificial Intelligence.txt', 'doc_type': 'Artificial Intelligence'}
--------------------------------------
knowledge-base2/Artificial Intelligence.txt page_content='AI encompasses a wide range of approaches and subfields. Machine learning, a prominent subfield, focuses on developing algorithms that allow computers to learn from data without being explicitly programmed. This includes supervised learning (learning from labeled data), unsuperv

# Embedding Chunks

In [159]:
# Embedding model used for all chunks and queries below: the
# "sentence-transformers/all-MiniLM-L6-v2" checkpoint wrapped by HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Tokenization

In [160]:
# Tokenizer belonging to the same checkpoint as the embedding model
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# Sample sentence to push through the tokenizer
query = "i'am ahmad hudhud with student number 12220718"

# Sub-word pieces of the sentence, and the matching vocabulary ids with the
# special tokens added
tokens = tokenizer.tokenize(query)
token_ids = tokenizer.encode(query, add_special_tokens=True)

print("Token Strings:", tokens)
print("Token IDs:", token_ids)

# Map the ids back to text; the decoded string includes the special tokens
decoded = tokenizer.decode(token_ids)
print(decoded)

Token Strings: ['i', "'", 'am', 'ahmad', 'hu', '##dh', '##ud', 'with', 'student', 'number', '122', '##20', '##7', '##18']
Token IDs: [101, 1045, 1005, 2572, 10781, 15876, 16425, 6784, 2007, 3076, 2193, 13092, 11387, 2581, 15136, 102]
[CLS] i ' am ahmad hudhud with student number 12220718 [SEP]


# Storing Embeddings in the Chroma Vector Database

In [161]:
# Directory name passed as persist_directory to the Chroma vector store below
db_name = "vector_store"

In [162]:
# If a persisted Chroma store already exists at db_name, open it with the same
# embedding function and delete its collection, so the store is rebuilt from scratch
if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

In [163]:
# Build the Chroma vector store: every chunk is embedded and stored together
# with its vector under the db_name directory
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name,
)

# Report how many entries the underlying collection now holds
stored_count = vectorstore._collection.count()
print(f"Vectorstore created with {stored_count} documents")

Vectorstore created with 222 documents


In [164]:
# Read embeddings, texts and metadata back out of the collection behind the vector store
collection = vectorstore._collection
ds = collection.get(include=['embeddings', 'documents', 'metadatas'])

# Embedding vectors of the stored chunks as one NumPy array
vectors = np.array(ds['embeddings'])

# Text of each stored chunk
docs = ds['documents']

# doc_type label of each stored chunk, read from its metadata
doc_types = [meta['doc_type'] for meta in ds['metadatas']]

In [165]:
# Dimensionality of the stored vectors, measured on the first one
sample_embedding = ds["embeddings"][0]
dimensions = len(sample_embedding)

print(f"The vectors have {dimensions:,} dimensions")

The vectors have 384 dimensions


# t-SNE

In [166]:

# Reduce the stored embeddings to 2-D with t-SNE (random_state fixed so the layout is repeatable)
tsne = TSNE(n_components=2, perplexity=5, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# One colour per doc_type from seaborn's "hsv" palette, formatted as rgb(...) strings for Plotly
unique_doc_types = sorted(set(doc_types))
palette = sns.color_palette("hsv", len(unique_doc_types))
doc_type_to_color = {doc: f'rgb({int(r*255)}, {int(g*255)}, {int(b*255)})' for doc, (r, g, b) in zip(unique_doc_types, palette)}

# Row indices of the points belonging to each doc_type, gathered in a single
# pass instead of rescanning every point once per doc_type
indices_by_type = {}
for i, t in enumerate(doc_types):
    indices_by_type.setdefault(t, []).append(i)

fig = go.Figure()

# One trace per doc_type so that each type gets its own legend entry and colour
for doc_type in unique_doc_types:
    indices = indices_by_type[doc_type]
    fig.add_trace(go.Scatter(
        x=reduced_vectors[indices, 0],
        y=reduced_vectors[indices, 1],
        mode='markers',
        name=doc_type,
        marker=dict(size=10, color=doc_type_to_color[doc_type], opacity=0.7),
        # Hover text: the doc_type plus the first 100 characters of the chunk
        text=[f"<b>{doc_type}</b><br>{docs[i][:100]}..." for i in indices],
        hoverinfo='text'
    ))

fig.update_layout(
    title='2D Visualization of Chroma Vector Store (t-SNE)',
    xaxis_title='t-SNE Dimension 1',
    yaxis_title='t-SNE Dimension 2',
    width=1000,
    height=700,
    legend=dict(itemsizing='constant', font=dict(size=10)),
    margin=dict(r=20, b=20, l=20, t=40)
)

fig.show()


# Calling the Agent

In [167]:
# Conversation memory (kept under the "chat_history" key) and a retriever over the Chroma store
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
retriever = vectorstore.as_retriever()

# Chat model that answers the questions
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0.7, openai_api_key=openai_token)

# Chain that combines the chat model, the retriever and the conversation memory
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

# Query 1: a topic covered by the knowledge base (numerical analysis)
query = "Hi i'm ahmad , Define me numerical analysis"
result = conversation_chain.invoke({"question": query})
display(Markdown(result["answer"]))


Numerical analysis is the study of algorithms for solving mathematical problems that arise in science and engineering. It focuses on approximating solutions for problems that often do not have analytical solutions expressed in simple formulas. Numerical analysis provides methods to approximate these solutions using numerical techniques that can be implemented on computers. The field encompasses various numerical methods for tasks such as solving equations, numerical integration and differentiation, and optimizing functions, all while addressing issues of accuracy and stability of the algorithms.

In [168]:
# Query 2: another knowledge-base topic (organic chemistry), sent through the
# same chain and therefore the same conversation memory
query = "Explain me organic chemistry"
result = conversation_chain.invoke({"question": query})
display(Markdown(result["answer"]))


Organic chemistry is the branch of chemistry that focuses on the study of organic compounds, which are primarily composed of carbon atoms. This field examines the structure, properties, composition, reactions, and synthesis of these compounds. While carbon is the central element, organic compounds can also include other elements such as hydrogen, oxygen, nitrogen, sulfur, phosphorus, and halogens.

One of the key features of organic chemistry is the unique ability of carbon to form stable bonds with itself and with other elements, allowing for an immense diversity of organic molecules. These molecules form the foundation of life and are also essential in various synthetic materials.

Organic chemists use various representations, like Lewis structures and skeletal structures, to illustrate the arrangement of atoms and bonds in organic molecules. Functional groups, which are specific groups of atoms within a molecule, play a critical role in determining the chemical behavior and reactivity of those molecules.

A significant aspect of organic chemistry is organic synthesis, which involves constructing complex organic molecules from simpler ones. This is crucial for the development of pharmaceuticals, polymers, and other materials. Techniques such as spectroscopy (including NMR, IR, and Mass Spectrometry) are important tools for identifying and characterizing organic compounds.

Overall, organic chemistry is fundamental to many scientific disciplines, including biochemistry, medicine, and materials science, and its principles are applied in various practical applications, such as drug discovery and the creation of new materials.

In [169]:
# Query 3: a question unrelated to the knowledge-base topics, to see how the
# chain answers when retrieval has nothing relevant to offer
query = "Who is Elon Musk ? "

result = conversation_chain.invoke({"question":query})
display(Markdown(result["answer"]))

I don't know.

# Search using FAISS LSH

In [None]:
# Text of every chunk, in the same order as `chunks`, so a row index into the
# matrix below is also an index into `chunks`
texts = [doc.page_content for doc in chunks]

# Embed all chunk texts with one batched embed_documents call instead of one
# embed_query call per chunk; the result is stored as float32 for FAISS
embeddings_matrix = np.array(embeddings.embed_documents(texts), dtype='float32')

# Embedding dimensionality, read from the matrix itself
d = embeddings_matrix.shape[1]

# LSH index over d-dimensional vectors, using d hash bits per vector
index_lsh = faiss.IndexLSH(d, d)
index_lsh.add(embeddings_matrix)

# Query to search for, embedded as a 1 x d float32 matrix
query_text = "What is the definition of Linear Algebra"
query_embedding = np.array([embeddings.embed_query(query_text)]).astype('float32')

# D holds the distances and I the matching row indices of the 3 nearest
# chunks; both have one row per query
D, I = index_lsh.search(query_embedding, 3)

# Print TOP 3 Locality Sensitive Hashing Chunks
print("Top 3 LSH results:")
print('')

# Score, first 300 characters of the chunk and its metadata for every hit of
# the single query (row 0 of D and I)
for dist, idx in zip(D[0], I[0]):
    if idx < 0:
        # FAISS fills missing results with index -1; texts[-1] would silently
        # print the last chunk instead of a real hit
        continue
    print(f"Score (Distance): {dist}")
    print(f"Text:\n{texts[idx][:300]}...\n")
    print(f"Metadata:\n{chunks[idx].metadata}\n")

    print("="*80)


Top 3 LSH results:

Score (Distance): 72.0
Text:
Linear algebra is a branch of mathematics concerning linear equations, linear functions, and their representations through matrices and vector spaces. It is a fundamental tool in many areas of mathematics, science, and engineering, providing the language and methods to solve systems of linear equati...

Metadata:
{'source': 'knowledge-base2/Linear algebra.txt', 'doc_type': 'Linear algebra'}

Score (Distance): 104.0
Text:
Vector spaces are abstract mathematical structures that generalize the concept of vectors in two or three dimensions. A vector space is a set of vectors that can be added together and multiplied by scalars, satisfying certain axioms. Concepts like linear independence, span, basis, and dimension are ...

Metadata:
{'source': 'knowledge-base2/Linear algebra.txt', 'doc_type': 'Linear algebra'}

Score (Distance): 109.0
Text:
A central object in linear algebra is the matrix, a rectangular array of numbers. Matrices are used t

# Visualization of Query & Top 3 Chunks with LSH

In [None]:
# Embedding rows of the top-3 chunks returned by the LSH search
selected_embeddings = embeddings_matrix[I[0]]

# Put the query first and the results after it, so all points share one space
# (row 0 = query, rows 1.. = results)
all_embeddings = np.concatenate([query_embedding, selected_embeddings], axis=0)

# t-SNE down to 2 dimensions, with perplexity 1 and a fixed random_state
tsne = TSNE(n_components=2, perplexity=1, random_state=42)
embeddings_2d = tsne.fit_transform(all_embeddings)

# Dump the intermediate arrays for inspection
print("Selected Embeddings : \n\n", selected_embeddings)
print('')

print("All Embeddings \n\n", all_embeddings)
print('')
print("2D Embeddings \n\n", embeddings_2d)

Selected Embeddings : 

 [[-0.04863872 -0.00986886 -0.07452533 ...  0.02935886  0.01650819
  -0.00553349]
 [-0.06330035 -0.01101384 -0.05653913 ... -0.01363987  0.01144241
  -0.04564484]
 [ 0.00312493  0.04330307 -0.10415188 ...  0.0410157  -0.00191511
  -0.00465369]]

All Embeddings 

 [[-0.06299161  0.00611919 -0.10115462 ... -0.01281732  0.03399163
  -0.03131092]
 [-0.04863872 -0.00986886 -0.07452533 ...  0.02935886  0.01650819
  -0.00553349]
 [-0.06330035 -0.01101384 -0.05653913 ... -0.01363987  0.01144241
  -0.04564484]
 [ 0.00312493  0.04330307 -0.10415188 ...  0.0410157  -0.00191511
  -0.00465369]]

2D Embeddings 

 [[ 251.15245      6.6231065]
 [ 191.91019    -61.612106 ]
 [ 230.82188   -203.64743  ]
 [  45.818714   -43.02613  ]]


In [None]:

# Scale the LSH distances of the hits to the 0-1 range expected by the colour map
norm = matplotlib.colors.Normalize(vmin=min(D[0]), vmax=max(D[0]))
cmap = plt.get_cmap('RdYlBu_r')

# One rgb(...) string per result, chosen by its distance from the query
rgba_values = [cmap(norm(dist)) for dist in D[0]]
colors = [
    f'rgb({int(r*255)}, {int(g*255)}, {int(b*255)})'
    for r, g, b, _ in rgba_values
]

# Query point (row 0 of the 2-D embeddings), drawn in blue
query_trace = go.Scatter(
    x=[embeddings_2d[0,0]], y=[embeddings_2d[0,1]],
    mode='markers',
    marker=dict(size=15, color='blue'),
    name='Query'
)

# Result points (rows 1..), coloured by distance and labelled with it
results_trace = go.Scatter(
    x=embeddings_2d[1:,0], y=embeddings_2d[1:,1],
    mode='markers+text',
    marker=dict(size=12, color=colors),
    text=[f"Dist: {dist:.2f}" for dist in D[0]],
    textposition="top center",
    name='Top 3 Results'
)

fig = go.Figure()
fig.add_trace(query_trace)
fig.add_trace(results_trace)

fig.update_layout(title="t-SNE visualization of FAISS LSH Search - Ahmad Hudhud - 1220718")
fig.show()


# Search using Scikit-Learn Cosine Similarity

In [None]:
# Cosine similarity between the single query (row 0) and every chunk embedding
cos_sim = cosine_similarity(query_embedding, embeddings_matrix)[0]

# Indices of the 3 highest scores, best first: take the tail of the ascending
# order and reverse it
top_idx = np.argsort(cos_sim)[-3:][::-1]

# Retrieve Top 3 Cos-Sim Chunks
print("Top 3 Cosine Similarity results:")
print('')

# Score, first 300 characters of the chunk and its metadata for each hit
for idx in top_idx:
    score = cos_sim[idx]
    print(f"Score: {score}")
    print(f"Text: {texts[idx][:300]}...\n")
    print(f"Metadata:\n{chunks[idx].metadata}\n")


Top 3 Cosine Similarity results:

Score: 0.795343279838562
Text: Linear algebra is a branch of mathematics concerning linear equations, linear functions, and their representations through matrices and vector spaces. It is a fundamental tool in many areas of mathematics, science, and engineering, providing the language and methods to solve systems of linear equati...

Metadata:
{'source': 'knowledge-base2/Linear algebra.txt', 'doc_type': 'Linear algebra'}

Score: 0.5926274061203003
Text: Vector spaces are abstract mathematical structures that generalize the concept of vectors in two or three dimensions. A vector space is a set of vectors that can be added together and multiplied by scalars, satisfying certain axioms. Concepts like linear independence, span, basis, and dimension are ...

Metadata:
{'source': 'knowledge-base2/Linear algebra.txt', 'doc_type': 'Linear algebra'}

Score: 0.5866361856460571
Text: A central object in linear algebra is the matrix, a rectangular array of numbers.

In [None]:
# Embedding rows of the top-3 chunks found by cosine similarity
selected_cos_embeddings = embeddings_matrix[top_idx]

# Stack the query on top so that row 0 is the query and rows 1.. are the results
all_cos_embeddings = np.vstack([query_embedding, selected_cos_embeddings])

# Use a dedicated t-SNE instance rather than whichever `tsne` global was
# assigned last: only 4 points are projected here, and the instance built for
# the full vector store uses perplexity=5, which scikit-learn rejects for so
# few samples
tsne_cos = TSNE(perplexity=1, n_components=2, random_state=42)
embeddings_2d_cos = tsne_cos.fit_transform(all_cos_embeddings)

# Debugging Embeddings for Cosine Similarity Results
print("Selected Embeddings : \n\n",selected_cos_embeddings)
print('')

print("All Embeddings \n\n",all_cos_embeddings )
print('')
print("2D Embeddings \n\n",embeddings_2d_cos )



Selected Embeddings : 

 [[-0.04863872 -0.00986886 -0.07452533 ...  0.02935886  0.01650819
  -0.00553349]
 [-0.06330035 -0.01101384 -0.05653913 ... -0.01363987  0.01144241
  -0.04564484]
 [ 0.00312493  0.04330307 -0.10415188 ...  0.0410157  -0.00191511
  -0.00465369]]

All Embeddings 

 [[-0.06299161  0.00611919 -0.10115462 ... -0.01281732  0.03399163
  -0.03131092]
 [-0.04863872 -0.00986886 -0.07452533 ...  0.02935886  0.01650819
  -0.00553349]
 [-0.06330035 -0.01101384 -0.05653913 ... -0.01363987  0.01144241
  -0.04564484]
 [ 0.00312493  0.04330307 -0.10415188 ...  0.0410157  -0.00191511
  -0.00465369]]

2D Embeddings 

 [[ 251.15245      6.6231065]
 [ 191.91019    -61.612106 ]
 [ 230.82188   -203.64743  ]
 [  45.818714   -43.02613  ]]


In [None]:
# t-SNE space visualization of the cosine similarity results

# Similarity scores of the top hits, scaled to the 0-1 range expected by the colour map
norm = matplotlib.colors.Normalize(vmin=min(cos_sim[top_idx]), vmax=max(cos_sim[top_idx]))
cmap = plt.get_cmap('coolwarm')

# One rgb(...) string per result, chosen by its similarity score
rgba_values_cos = [cmap(norm(score)) for score in cos_sim[top_idx]]
colors_cos = [
    f'rgb({int(r*255)}, {int(g*255)}, {int(b*255)})'
    for r, g, b, _ in rgba_values_cos
]

fig_cos = go.Figure()

# Query point (row 0 of the 2-D embeddings), drawn in red
fig_cos.add_trace(go.Scatter(
    x=[embeddings_2d_cos[0,0]], y=[embeddings_2d_cos[0,1]],
    mode='markers',
    marker=dict(size=15, color='red'),
    name='Query'
))

# Result points (rows 1..), coloured by score and labelled with it
fig_cos.add_trace(go.Scatter(
    x=embeddings_2d_cos[1:,0], y=embeddings_2d_cos[1:,1],
    mode='markers+text',
    marker=dict(size=12, color=colors_cos),
    text=[f"Score: {cos_sim[idx]:.2f}" for idx in top_idx],
    textposition="top center",
    name='Top 3 Results'
))

fig_cos.update_layout(title="t-SNE visualization of Cosine Similarity Search - Ahmad Hudhud - 1220718")
fig_cos.show()


## Thanks For Reviewing my code.
### by Ahmad Hudhud - 12220718