## Installing Dependencies

In [1]:
pip install langchain langchain_openai langchain_community datasets streamlit ragas PyPDF2

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


## Imports

In [2]:
from dotenv import load_dotenv
import os

# Load the .env file
load_dotenv()

# Get the API key from the environment variable
openai_api_key = os.getenv("OPENAI_API_KEY")


In [3]:
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate 
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision

)

## Loading Pdf

In [4]:
pdf = r"C:\Users\abhisheka\OneDrive - USEReady Technology Private Limited\Desktop\RAGAS\Streamlit with Databricks.pdf"

if pdf is not None:
    pdf_reader = PdfReader(pdf)
    text = ""
    for page in pdf_reader.pages:
        text += page.extract_text()

In [5]:
#text

## Chunking, Embedding and Retrieval 

In [6]:

text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=500,
    chunk_overlap=100,
    length_function=len,
)

chunks = text_splitter.split_text(text)
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_texts(chunks, embeddings)

## Prompt Template

In [7]:
retriever = vectorstore.as_retriever()

llm = ChatOpenAI(model_name = "gpt-3.5-turbo", temperature=0)

template = """ you are a helpful pdf assistant. 
        Given the following pdf, answer the question based on the context.
        If you don't know the answer, just say that you don't know. 
        Do not make up an answer.

        Question: {question}
        Context: {context}
        
        Answer:"""

prompt = ChatPromptTemplate.from_template(template)

## RAG Chain

In [8]:
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    |StrOutputParser()
    
)

## RAGAS

In [9]:
questions = ["what are the common uses of databricks?",
                "when can we integrate databricks into streamlit?"]

ground_truths=[["Databricks is commonly used for big data analytics and processing, particularly for Apache Spark-based data processing tasks. It provides a unified analytics platform that integrates with various data sourcesand supports collaborative data science workflows. Additionally, it's utilized for machine learning, data engineering, and real-time analytics applications."],
                ["We can integrate Databricks into your Streamlit app for data processing tasks by leveraging Databricks as a backend service. For example, you can use Databricks for heavy-duty data transformations, machine learning model training, or large-scale data analysis tasks."]]


answers=[]
contexts=[]

for query in questions:
    answers.append(rag_chain.invoke(query))
    contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])


  warn_deprecated(


In [10]:
contexts

[["STREAMLIT\nAPPS WITH \nDATABRICKS \nML apps that can have\npowerful backend support\nwith DatabricksCOMMON USES OF \nDATABRICKS\n•Databricks is commonly used for big data analytics and \nprocessing, particularly for Apache Spark-based data \nprocessing tasks. It provides a unified analytics platform \nthat integrates with various data sources and supports \ncollaborative data science workflows. Additionally, it's \nutilized for machine learning, data engineering, and \nreal-time analytics applications.",
  "1. Data Preprocessing : Databricks can handle the preprocessing of PDF files, \nextracting text and relevant information from them efficiently at scale.\n2. Data Analysis : You can use Databricks to analyze the extracted text data, \nperform natural language processing tasks, and extract insights to aid in the \nscreening process.\n3. Model Training : If you're using machine learning models, Databricks can be \nused to train these models on large datasets, optimizing performance 

## Data

In [11]:
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truths": ground_truths}
    

In [12]:
data

{'question': ['what are the common uses of databricks?',
  'when can we integrate databricks into streamlit?'],
 'answer': ['Common uses of Databricks include big data analytics and processing, particularly for Apache Spark-based data processing tasks. It also provides a unified analytics platform that integrates with various data sources and supports collaborative data science workflows. Additionally, Databricks is utilized for machine learning, data engineering, and real-time analytics applications.',
  'You can integrate Databricks into Streamlit for data processing tasks by leveraging Databricks as a backend service.'],
 'contexts': [["STREAMLIT\nAPPS WITH \nDATABRICKS \nML apps that can have\npowerful backend support\nwith DatabricksCOMMON USES OF \nDATABRICKS\n•Databricks is commonly used for big data analytics and \nprocessing, particularly for Apache Spark-based data \nprocessing tasks. It provides a unified analytics platform \nthat integrates with various data sources and sup

In [13]:

dataset = Dataset.from_dict(data)

In [14]:
dataset

Dataset({
    features: ['question', 'answer', 'contexts', 'ground_truths'],
    num_rows: 2
})

## Evaluation

In [15]:
result = evaluate(
    dataset,
    metrics=[context_precision, context_recall, faithfulness, answer_relevancy]
)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`


Evaluating:   0%|          | 0/8 [00:00<?, ?it/s]

In [21]:
result['faithfulness']

1.0

In [50]:
df = result.to_pandas()

In [51]:
df

Unnamed: 0,question,answer,contexts,ground_truths,ground_truth,context_precision,context_recall,faithfulness,answer_relevancy
0,what are the common uses of databricks?,Common uses of Databricks include big data ana...,[STREAMLIT\nAPPS WITH \nDATABRICKS \nML apps t...,[Databricks is commonly used for big data anal...,Databricks is commonly used for big data analy...,1.0,1.0,1.0,0.977396
1,when can we integrate databricks into streamlit?,You can integrate Databricks into Streamlit fo...,"[heavy-duty data transformations, machine \nle...",[We can integrate Databricks into your Streaml...,We can integrate Databricks into your Streamli...,1.0,1.0,1.0,0.929206
