In [1]:
import re
import json
import langchain

In [2]:
from langchain_community.document_loaders import JSONLoader
loader = JSONLoader(
    file_path='temp.json',
    jq_schema='.[]',
    text_content=False
)

docs = loader.load()


for index,doc in enumerate(docs):
    temp = json.loads(doc.page_content)
    data = ""
    del doc.metadata['source']
    del doc.metadata['seq_num']    


    for key,value in temp.items():
        if(key != "description" and key != "course_link"):
            if isinstance(value, list):
                # Join string representations of list items with spaces
                value = ' '.join(str(v) for v in value)
            doc.metadata[key] = value 
    
    for key, value in temp.items():
        if key == "description":
            if value is not None:
                # Replace newlines with spaces for uniformity
                desc = value.replace('\n', ' ')
                # Remove unnecessary '*' and '-' characters
                desc = re.sub(r'[\*\-]', '', desc)
                # Remove extra spaces
                desc = re.sub(r'\s+', ' ', desc).strip()
                # Split on bullet points (•)
                bullet_parts = [part.strip() for part in re.split(r'•', desc) if part.strip()]
                sentences = []
                for part in bullet_parts:
                    # Instead of splitting on every period, only split on periods that are not part of ordered lists (e.g., "a.", "1.")
                    # We'll use a regex to split on periods that are NOT preceded by a single letter/number and a space
                    # This will keep "a. " or "1. " together
                    sub_sentences = re.split(r'(?<!\b[a-zA-Z0-9])\.(?!\d)', part)
                    for s in sub_sentences:
                        s = s.strip()
                        if s:
                            sentences.append(s)
                # Remove extra spaces from each sentence
                sentences = [re.sub(r'\s+', ' ', s).strip() for s in sentences]
                # Reconstruct with each bullet/sentence on a new line, ending with a period
                updated_description = '.\n'.join(sentences)
                if not updated_description.endswith('.'):
                    updated_description += '.'
                data += f"{key} : {updated_description}"
            else:
                data += f"{key} : "
    del doc.metadata["class_time"]
    doc.metadata["index"]: index
    doc.page_content = data
    


In [3]:
print(docs[1])

page_content='description : Brief history of cancer.
Thoughts on the metabolic and genetic basis of cancer since early 1920’s. How genetic basis of cancer became the mainstay of understanding cancer.
Oncogenes and Tumor suppressor genes.
Clonal origin of cancer.
Stem cells versus cancer stem cells.
Immunology of cancer.
Epigenetics of Cancer.
Proteomics, transcriptomics, metabolomics of cancer.
Role of mitochondria in cancer.
Reemergence of metabolic basis of cancer.
GWAS in cancer.
Cancer as an evolutionary process.
Application of game theory in cancer.
Yeast as a model to understand cancer.
Bioenergetics of cancer: rate versus efficiency.
Nongenetic heterogeneity in cancer.' metadata={'course_code': 'BB703', 'course_name': 'Cancer Genetics And Metabolism', 'department': 'Biosciences & Bioengineering', 'instructors': '', 'tags': '', 'credits': 6, 'prerequisites': '', 'is_running': False, 'venue': '', 'duration': '', 'slot': ''}


In [4]:
from langchain_huggingface import HuggingFaceEmbeddings

embedding_model = HuggingFaceEmbeddings(
    model_name = 'sentence-transformers/all-MiniLM-L6-v2'
)

  from .autonotebook import tqdm as notebook_tqdm


In [160]:
vector_store.delete_collection()

In [5]:
from langchain.vectorstores import Chroma

vector_store = Chroma(
    embedding_function = embedding_model,
    persist_directory='course_vector_database',
    collection_name = 'sample'
)

  vector_store = Chroma(


In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 100, chunk_overlap=0)
chunks = text_splitter.split_documents(docs)

In [7]:
print(docs[1])

page_content='description : Brief history of cancer.
Thoughts on the metabolic and genetic basis of cancer since early 1920’s. How genetic basis of cancer became the mainstay of understanding cancer.
Oncogenes and Tumor suppressor genes.
Clonal origin of cancer.
Stem cells versus cancer stem cells.
Immunology of cancer.
Epigenetics of Cancer.
Proteomics, transcriptomics, metabolomics of cancer.
Role of mitochondria in cancer.
Reemergence of metabolic basis of cancer.
GWAS in cancer.
Cancer as an evolutionary process.
Application of game theory in cancer.
Yeast as a model to understand cancer.
Bioenergetics of cancer: rate versus efficiency.
Nongenetic heterogeneity in cancer.' metadata={'course_code': 'BB703', 'course_name': 'Cancer Genetics And Metabolism', 'department': 'Biosciences & Bioengineering', 'instructors': '', 'tags': '', 'credits': 6, 'prerequisites': '', 'is_running': False, 'venue': '', 'duration': '', 'slot': ''}


In [8]:
len(chunks)

1682

In [9]:
vector_store.add_documents(chunks)

['7b9fd7af-db4c-40b8-bfa9-8a937d536caf',
 '4abaee83-fb8a-4e47-b9cc-95b90f6a61b3',
 '31524ad7-8018-4d75-90fc-ab5fa9432ec9',
 '353fd681-b1ea-4191-9c1a-4c889d010c94',
 '4af458fc-10d9-45e1-8a76-07a76a179f41',
 '09ff8a77-31e3-4dd5-b159-1383e404b7df',
 '740dc6b1-2cd1-42e0-8437-63059fcce655',
 'c977ffa5-79bb-4312-8741-a3f6b51246ba',
 '17640a18-5866-45bd-a8d6-1016f04600b5',
 '67f72eca-857e-4455-bdf2-523da4b94fc7',
 'eb6b9705-f907-4e8a-93fe-3df19484efd7',
 '6a8b6164-932f-464e-8424-b56a3331bd71',
 '26f1eada-3401-436a-a7d9-377f6a1c0cb1',
 '978c0e7b-4bd4-499c-bb46-5c4e441c15f1',
 'adcc4233-a589-42eb-a8e9-c2023a442ebd',
 '17d0d26b-f5b9-4598-8361-cd476b021f40',
 'f568fdca-01e2-4e12-a12b-0ec8cf7e22ad',
 'ec956e85-4e05-4eb4-b1eb-0457a986159b',
 '110933d3-693e-489a-b5a7-c21b0f0ed0cb',
 '35b773aa-456f-4598-92f9-d16434a4089e',
 'c032bd3d-2ce8-4dfa-b8b5-ca09ceeb705c',
 'e89a5cce-5853-484c-a6ca-5d36eae809c7',
 '29d4a0fb-7614-4df4-ba55-b9c4466d8cbf',
 'fbdf86db-1ca2-4aec-beae-d7a171837bec',
 '0d4eeb0b-226b-

In [10]:
retriever = vector_store.as_retriever(search_kwargs = {"k":3})

In [11]:
query = "AE694"

results = retriever.invoke(query)
for result in results:
    print(result,'\n')

page_content='description : IV.' metadata={'slot': '', 'duration': '', 'venue': '', 'credits': 4, 'department': 'Biosciences & Bioengineering', 'prerequisites': '', 'tags': '', 'course_code': 'BB792', 'instructors': '', 'course_name': 'Communication Skills Ii', 'is_running': False} 

page_content='description :' metadata={'instructors': 'I - Abhilash Chandy', 'credits': 6, 'duration': 'FullSemester', 'course_name': 'Essentials Of Turbulence', 'course_code': 'ME724', 'is_running': True, 'prerequisites': '', 'venue': 'ESE 104', 'department': 'Mechanical Engineering', 'slot': '5', 'tags': 'Theory'} 

page_content='description :' metadata={'credits': 6, 'tags': '', 'venue': '', 'department': 'Shailesh J. Mehta School of Management', 'course_name': 'Industrial Psychology', 'duration': '', 'instructors': '', 'prerequisites': '', 'course_code': 'MG642', 'slot': '', 'is_running': False} 

