## Data Loading

In [7]:
import os
from langchain_community.document_loaders import CSVLoader

In [8]:
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
csv_path = os.path.join(parent_dir, "data", "shl_scraped_catalog.csv")

loader = CSVLoader(file_path=csv_path,encoding='utf-8')

In [9]:
docs=loader.load()

In [10]:
for i in range (0,2):
    print(docs[i].page_content)

assessment_name: Global Skills Development Report
url: https://www.shl.com/products/product-catalog/view/global-skills-development-report/
remote_testing: Yes
adaptive_testing: No
test_type: A, E, B, C, D, P
description: This report is designed to be given to individuals who have completed the Global Skills Assessment (GSA). With coverage across the Great 8 Domains, this measure of self-reported behaviors offers a complete overview of their current skills. Participants receive actionable tips on leveraging their top skill strengths and how they might develop their growth skills.
duration: N/A
job_levels: Director, Entry-Level, Executive, General Population, Graduate, Manager, Mid-Professional, Front Line Manager, Supervisor,
ï»¿assessment_name: .NET Framework 4.5
url: https://www.shl.com/products/product-catalog/view/net-framework-4-5/
remote_testing: Yes
adaptive_testing: Yes
test_type: K
description: The.NET Framework 4.5 test measures knowledge of .NET environment. Designed for expe

## Data Chunking

In [11]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [12]:
splitter=RecursiveCharacterTextSplitter(
    chunk_size=900,
    chunk_overlap=100
)
splitted_text=splitter.split_documents(docs)
splitted_text[0]

Document(metadata={'source': 'c:\\Users\\Ankit\\Desktop\\SHL-GenAI-Assessment-Recommendation\\data\\shl_scraped_catalog.csv', 'row': 0}, page_content='\ufeffassessment_name: Global Skills Development Report\nurl: https://www.shl.com/products/product-catalog/view/global-skills-development-report/\nremote_testing: Yes\nadaptive_testing: No\ntest_type: A, E, B, C, D, P\ndescription: This report is designed to be given to individuals who have completed the Global Skills Assessment (GSA). With coverage across the Great 8 Domains, this measure of self-reported behaviors offers a complete overview of their current skills. Participants receive actionable tips on leveraging their top skill strengths and how they might develop their growth skills.\nduration: N/A\njob_levels: Director, Entry-Level, Executive, General Population, Graduate, Manager, Mid-Professional, Front Line Manager, Supervisor,')

## Embedding the chunked documents

In [14]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

os.chdir("..")
persist_dir = "app/chroma_shl_db"

vectordb = Chroma.from_documents(
    documents=splitted_text,
    embedding=embedding_model,
    persist_directory=persist_dir
)

print("ChromaDB created and saved successfully!")


ChromaDB created and saved successfully!
