# Introducing RAG (Retrieval-Augmented Generation)

Notebook summary:

1. Installation

2. Document Retrieval

3. Retrieval Augmented Generation

**April 29, 2024: upgraded from gpt-4 to gpt-4-turbo for content generation**

# 1. Installation

In [11]:
!pip install tiktoken



In [12]:
!pip install cohere



In [13]:
#Importing openai
try:
  import openai
except:
  !pip install openai
  import openai

In [14]:
# API key
# Store your key in a file and read it from there (you could type it directly in
# the notebook, but it would then be visible to anybody next to you).
import os

from google.colab import drive

drive.mount('/content/drive')

# `with` closes the file even if reading fails. strip() removes the trailing
# newline that readline() keeps; otherwise that newline becomes part of the key.
with open("drive/MyDrive/files/api_key.txt", "r") as f:
    API_KEY = f.readline().strip()

# Fail here, next to the cause, rather than later with an opaque API error.
if not API_KEY:
    raise ValueError("drive/MyDrive/files/api_key.txt does not contain an API key on its first line")

# The OpenAI key: export it through the environment and set it on the module.
os.environ['OPENAI_API_KEY'] = API_KEY
openai.api_key = os.getenv("OPENAI_API_KEY")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
!pip install ipywidgets



In [16]:
!pip install beautifulsoup4 requests transformers



# 2. Document Retrieval

In [17]:
import requests
from bs4 import BeautifulSoup
from transformers import pipeline

In [18]:
def select_urls_based_on_query(user_query):
    """Choose the source URLs that correspond to a keyword found in the query.

    The word 'climate' is looked up without regard to case and wins when both
    keywords are present; 'RAG' has to appear in exactly that casing.

    Args:
        user_query: The user's question as a string.

    Returns:
        A new list of URL strings for the matched topic, or an empty list when
        neither keyword occurs in the query.
    """
    # Topic -> pages to retrieve (replace with actual URLs).
    sources_by_topic = {
        "climate": [
            "https://en.wikipedia.org/wiki/Climate_change",
            "https://en.wikipedia.org/wiki/Effects_of_climate_change",
        ],
        "RAG": [
            "https://en.wikipedia.org/wiki/Large_language_model",
            "https://huggingface.co/blog/ray-rag",
        ],
    }

    mentions_climate = "climate" in user_query.lower()
    mentions_rag = "RAG" in user_query

    # Climate is checked first, so it takes precedence over RAG.
    if mentions_climate:
        return sources_by_topic["climate"]

    # No keyword at all falls through to an empty result.
    return sources_by_topic["RAG"] if mentions_rag else []

In [19]:
def fetch_and_summarize(user_query, timeout=10):
    """Download the pages selected for ``user_query`` and summarize each one.

    Args:
        user_query: The user's question; handed to
            ``select_urls_based_on_query`` to decide which URLs to fetch.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A list of summary strings in URL order, one for every page that was
        fetched successfully and contained text. The list is empty when no
        URL matches the query.

    Raises:
        requests.exceptions.RequestException: If a request fails at the
            network level or exceeds ``timeout``.
    """
    urls = select_urls_based_on_query(user_query)
    if not urls:
        # Nothing to retrieve: return before loading the summarization model.
        return []

    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

    summaries = []
    for url in urls:
        # Without a timeout a stalled server would block this cell indefinitely.
        page = requests.get(url, timeout=timeout)
        if not page.ok:
            # An HTTP error page is not article content; summarizing it would
            # put misleading text into the prompt.
            print(f"Skipping {url}: HTTP {page.status_code}")
            continue

        soup = BeautifulSoup(page.content, 'html.parser')

        # Prefer the <article> element and fall back to every <p> on the page.
        # This is a generic heuristic and might need adjusting per website.
        article = soup.find('article')
        if article is not None:
            article_text = article.get_text()
        else:
            article_text = ' '.join(para.get_text() for para in soup.find_all('p'))

        # Drop surrounding whitespace first so the 1024-character budget is
        # spent on text, then truncate so the input is not too long for the model.
        article_text = article_text.strip()[:1024]
        if not article_text:
            print(f"Skipping {url}: no text could be extracted")
            continue

        summary = summarizer(article_text, max_length=130, min_length=30, do_sample=False)[0]['summary_text']
        summaries.append(summary)

    return summaries

# 3. Retrieval Augmented Generation

In [20]:
from openai import OpenAI
import ipywidgets as widgets
from IPython.display import display

# Shared API client used by openai_chat(). No key is passed explicitly;
# presumably the SDK picks up OPENAI_API_KEY from the environment, which the
# API-key cell sets — confirm against the OpenAI SDK documentation.
client = OpenAI()

# Name of the chat model sent with every request made by openai_chat().
AImodel = "gpt-4-turbo" # or select another model

# Function to interact with OpenAI's model
def openai_chat(input_text, document_excerpt, web_article_summary):
    """Ask the chat model a question, grounded in the retrieved context.

    Args:
        input_text: The user's question.
        document_excerpt: Text of a reference document excerpt.
        web_article_summary: Summary of the retrieved web pages, either as one
            string or as a list/tuple of strings (joined with newlines).

    Returns:
        The assistant's reply as a string; an empty string when the API
        returns no message content.
    """
    # Flatten a list of summaries into plain text. Interpolating the list
    # itself would put its Python repr (brackets, quotes, escapes) in the prompt.
    if isinstance(web_article_summary, (list, tuple)):
        summary_text = "\n".join(str(part) for part in web_article_summary)
    else:
        summary_text = str(web_article_summary)

    # Start the OpenAI API call to generate a chat response
    response = client.chat.completions.create(
        model=AImodel,  # Specifies the model to use
        messages=[
            {
                # "system" role for providing contextual information. The
                # wording is topic-neutral because the excerpt depends on the
                # query and is not always about one fixed subject.
                "role": "system",
                "content": f"The following is an excerpt from a reference document relevant to the user's question: {document_excerpt}"
            },
            {
                # Second "system" message carrying the retrieved web summaries
                "role": "system",
                "content": f"The following are summaries of web articles retrieved for the user's question: {summary_text}"
            },
            {
                # "user" role for the actual query the model will respond to
                "role": "user",
                "content": input_text
            }
        ],
        temperature=0.1,  # Controls randomness. Lower values make responses more deterministic.
        max_tokens=150,   # Sets the maximum length of the response in terms of tokens (words/parts of words).
        top_p=0.9,        # Nucleus sampling: A higher value increases diversity of the response.
        frequency_penalty=0.5,  # Reduces repetition of the same text. Higher values discourage repetition.
        presence_penalty=0.5    # Reduces repetition of similar topics. Higher values encourage new topics.
    )
    # The response object is not a dictionary; it is a ChatCompletion instance,
    # so the content is reached with dot notation. The content may be None, and
    # callers expect a string, so fall back to an empty one.
    return response.choices[0].message.content or ""

import html

input_text = "What are the impacts of climate change?"
#input_text = "What is RAG"

# 1. you can create a function specifically for your domain with different cases:
# Start from an empty excerpt so a query that matches neither keyword still
# runs instead of failing on an undefined variable.
document_excerpt = ""
# Check if 'climate' is in the user query
if "climate" in input_text.lower():
    document_excerpt = "Climate change refers to significant changes in global temperatures and weather patterns over time."
# Check if 'RAG' is in the user query. The test runs on the original text,
# matching select_urls_based_on_query: an upper-case "RAG" can never be found
# in input_text.lower().
elif "RAG" in input_text:
    document_excerpt = "OpenAI documentation states that RAG or retrieval augmented generation can tell the model about relevant documents."


# 2. and/or you can automate the retrieval
summaries = fetch_and_summarize(input_text)
# Join the per-page summaries into one text so the prompt receives prose
# rather than the repr of a Python list.
web_article_summary = "\n".join(summaries)

iresponse = openai_chat(input_text, document_excerpt, web_article_summary)
# Escape the model's text before it is rendered as HTML, then replace \n with
# HTML line breaks.
formatted_response = html.escape(iresponse).replace('\n', '<br>')
display(widgets.HTML(value=formatted_response))  # Display response as HTML

HTML(value='The impacts of climate change are extensive and affect various aspects of the environment and huma…