In [1]:
#importing the necessary libraries
import os
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pinecone
import streamlit as st
import openai
from langchain.vectorstores import Pinecone

  from tqdm.autonotebook import tqdm


In [12]:
# Confirm that the required API keys are configured WITHOUT echoing their values:
# rendered output is saved with the notebook/app, so writing a secret there leaks it.
for label, secret_name in (
  ("Openai api key:", "OPENAI_API_KEY"),
  ("Pinecone api key:", "PINECONE_API_KEY"),
):
  # Only the presence of the key is reported, never the key itself.
  st.write(label, "configured" if secret_name in st.secrets else "MISSING")

In [2]:
#loading the documents from the data directory
# `directory` is a relative path, so it is resolved against the notebook's working directory
directory = "data"

def load_docs(directory):
  """Load the documents found under ``directory``.

  Args:
    directory: Path handed to ``DirectoryLoader``.

  Returns:
    The result of ``DirectoryLoader(directory).load()``.
  """
  return DirectoryLoader(directory).load()

# Load everything under `directory`; the bare `len(...)` expression is shown as the cell output
documents = load_docs(directory)
len(documents)

2

In [4]:
"""splitting the documents into smaller chunks to ensure the size of each chunk is manageable and that no relevant information
is missed out"""
def split_docs(documents, chunk_size=1000, chunk_overlap=20):
  """Split documents into smaller chunks.

  Chunking keeps each piece of text at a manageable size, and the overlap
  between chunks is there so that relevant information is not missed out.

  Args:
    documents: Documents to split; passed to ``split_documents``.
    chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as ``chunk_size``.
    chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as ``chunk_overlap``.

  Returns:
    The chunks returned by ``RecursiveCharacterTextSplitter.split_documents``.
  """
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
  return text_splitter.split_documents(documents)

# Split with the default chunk_size/chunk_overlap and print how many chunks were produced
docs = split_docs(documents)
print(len(docs))

145


In [5]:
#displaying the page content of one of the split chunks
# NOTE(review): index 10 looks like an arbitrary sample; it assumes at least 11 chunks exist
print(docs[10].page_content)

Single sign-on using ORCID  Track user engagement  Auto-grading of assessment questions  Badging  Data analytics  Mobile-ready  10,000+ users, autoscaling

Badging

In order for the TOPS Open Science badge to be offered as part of in- person or virtual workshops, the following minimum requirements must be met.

The event must be registered with the TOPS team.  Teach at least one module in full (Note: Although modules can stand alone, we recommend teaching Ethos of Open Science alongside your chosen module, as it provides the foundational understanding for Open Science practices and benefits to users).

Have at least one certified instructor or apply for a waiver

(application link TBD).

Must use TOPS Open Science 101 curriculum materials.  Survey course participants before and after completion, using a survey that TOPS will provide for your use.


In [6]:
#creating the embedding model used to convert the split chunks of text into a format that the AI model can understand
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [7]:
#checking the dimension of the embedded query (length of the result for a sample sentence)
# NOTE(review): presumably the Pinecone index has to be created with this same dimension — confirm
query_result = embeddings.embed_query("Hello world")
len(query_result)

384

In [8]:
#storing the embeddings in the Pinecone vector database
# Name of the Pinecone index that receives the chunks.
index_name = "langchain-chatbot"

# Connect to Pinecone using the key kept in Streamlit secrets.
pinecone.init(api_key=st.secrets["PINECONE_API_KEY"], environment="gcp-starter")

# Embed every chunk with `embeddings` and store it in the index.
# NOTE(review): re-running this cell presumably uploads the chunks again — verify before re-executing.
index = Pinecone.from_documents(docs, embeddings, index_name=index_name)

In [9]:
# List the available Pinecone indexes; the bare expression is shown as the cell output.
# NOTE(review): this repeats the pinecone.init(...) call made in the cell that builds `index`.
pinecone.init(api_key=st.secrets["PINECONE_API_KEY"], environment="gcp-starter")
pinecone.list_indexes()

['langchain-chatbot']

In [10]:
#querying the stored embeddings through the vector store's similarity search
def get_similar_docs(query, k=1, score=False):
  """Search the module-level ``index`` for the entries most similar to ``query``.

  Args:
    query: Text to search for.
    k: Forwarded to the search call as ``k``.
    score: When truthy, ``index.similarity_search_with_score`` is used;
      otherwise ``index.similarity_search``.

  Returns:
    Whatever the selected search method returns.
  """
  search = index.similarity_search_with_score if score else index.similarity_search
  return search(query, k=k)

# Example query: uses the defaults of get_similar_docs (k=1, no scores) and displays the result
query = "What is open science"
similar_docs = get_similar_docs(query)
similar_docs

[Document(page_content='Open Science:\n\nOpen science is a movement that promotes transparency, collaboration, and the sharing of scientific knowledge and research findings. It aims to make scientific research more accessible, reproducible, and inclusive. Here are some key aspects of open science:\n\nOpen Access: Open access refers to making research publications, data, and other scientific materials freely available to the public. This allows anyone, regardless of their affiliation or location, to access and benefit from scientific knowledge.\n\nOpen Data: Open data involves sharing research data openly, typically in digital formats. This practice enables others to analyze, reproduce, and build upon existing research, fostering scientific progress.', metadata={'source': 'data\\open science.txt'})]