# IRS - GEMMA

## Setup and Installations

In [1]:
# Switch the kernel's working directory to the project checkout; later cells use
# relative references (requirements.txt, `import config`) that presumably live
# in this directory - verify.
# NOTE(review): this is an absolute, machine-specific Windows path; adjust it to
# your own checkout before running.
%cd E:\Github_Repo\Info-Retrieve-AI

E:\Github_Repo\Info-Retrieve-AI


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
# Install required packages
!pip install -r requirements.txt

# Install TensorFlow and Keras
!pip install -q -U tensorflow
!pip install -q -U keras-nlp
!pip install -q -U keras>=3



In [3]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Import necessary libraries
import os
import config as cfg
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import pinecone
from pinecone import Pinecone, ServerlessSpec
import google.generativeai as genai

# Configure the Gemini API using config.py file
# The key is read from `cfg.GOOGLE_API_KEY` (config.py) instead of being written
# into the notebook itself.
# NOTE(review): no later cell visible here calls `genai` - confirm it is still needed.
genai.configure(api_key=cfg.GOOGLE_API_KEY)

In [4]:
# Validate installation
# Keras chooses its backend when the package is first imported, so the backend
# has to be in the environment *before* `import keras` runs. `os` is imported in
# the imports cell above.
os.environ["KERAS_BACKEND"] = "tensorflow"

import tensorflow as tf
import keras
print("TensorFlow version:", tf.__version__)
print("Keras version:", keras.__version__)

# Import keras_nlp for working with Gemma
# NOTE(review): the recorded run of this cell ended with
# "ModuleNotFoundError: No module named 'distutils'". distutils was removed from
# the standard library in Python 3.12, so the environment probably needs
# `setuptools` installed (or an older interpreter) - confirm which import raised it.
import keras_nlp

ModuleNotFoundError: No module named 'distutils'

In [None]:
## Gemma Model Initialization

# Configuration for Kaggle API
# The credentials come from config.py and are exported as environment variables
# before the preset is requested below (the recorded output shows the preset
# files being downloaded from kaggle.com).
os.environ['KAGGLE_KEY'] = cfg.KAGGLE_KEY
os.environ['KAGGLE_USERNAME'] = cfg.KAGGLE_USERNAME

# Select TensorFlow as the backend
# NOTE(review): `keras` is imported in an earlier cell, and Keras reads
# KERAS_BACKEND when it is first imported, so this assignment presumably does
# not change the backend of an already-imported keras - confirm.
os.environ["KERAS_BACKEND"] = "tensorflow"

# Select TensorFlow's "cuda_malloc_async" GPU allocator, presumably to reduce
# memory fragmentation.
# NOTE(review): this setting picks an allocator; nothing here pre-allocates 100%
# of GPU memory, despite what the previous comment on this line claimed.
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"

# Set a random seed for reproducibility
keras.utils.set_random_seed(42)

# Initialize the Gemma model
gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset("gemma_2b_en")
# Alternative preset kept for reference (instruction-tuned variant):
# gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset("gemma_1.1_instruct_2b_en")

# Print model summary to confirm the model is loaded correctly
gemma_lm.summary()

Downloading from https://www.kaggle.com/api/v1/models/keras/gemma/keras/gemma_2b_en/2/download/metadata.json...
100%|██████████| 143/143 [00:00<00:00, 38.3kB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/gemma/keras/gemma_2b_en/2/download/task.json...
Downloading from https://www.kaggle.com/api/v1/models/keras/gemma/keras/gemma_2b_en/2/download/config.json...
100%|██████████| 555/555 [00:00<00:00, 1.18MB/s]
Downloading from https://www.kaggle.com/api/v1/models/keras/gemma/keras/gemma_2b_en/2/download/model.weights.h5...
100%|██████████| 4.67G/4.67G [06:31<00:00, 12.8MB/s]


## Web Scraper

In [None]:
class BlogScraper:
    """Scrape a blog listing page and the body text of each post it links to.

    Args:
        url: Address of the listing page.
        headers: HTTP headers sent with every request (e.g. a User-Agent).
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout ``requests`` can block indefinitely.
    """

    def __init__(self, url, headers, timeout=30):
        self.url = url
        self.headers = headers
        self.timeout = timeout

    def _fetch(self, url):
        """Return the response for ``url``, or None on a network error or non-200 status."""
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
        except requests.RequestException as exc:
            print(f"Request to {url} failed: {exc}")
            return None
        if response.status_code != 200:
            return None
        return response

    def scrape(self):
        """Collect heading, link and text content for every post on the listing page.

        Returns:
            A list of dicts with the keys ``Index`` (1-based position on the
            listing page), ``Heading``, ``Hyperlink`` and ``Content``. Posts that
            cannot be fetched or parsed are reported and skipped, so indices may
            have gaps. The list is empty when the page has no listing container.
            ``None`` is returned when the listing page itself cannot be fetched.
        """
        # Local import keeps this block self-contained; urljoin leaves absolute
        # links untouched and resolves relative ones against the listing URL.
        from urllib.parse import urljoin

        response = self._fetch(self.url)
        if response is None:
            print("Failed to fetch the webpage.")
            return None

        soup = BeautifulSoup(response.content, 'html.parser')
        box = soup.find('div', class_='gridbox gridbox-170-970')
        if box is None:
            # The page loaded but its layout is not the expected one.
            print("No blog listing container found on the page.")
            return []
        items = box.find_all('div', class_='card-title headingC sans')

        data = []
        for index, item in enumerate(items, start=1):
            title = item.text.strip()

            anchor = item.find('a')
            href = anchor.get('href') if anchor is not None else None
            if not href:
                print(f"No hyperlink found for post: {title}")
                continue
            link = urljoin(self.url, href)

            link_response = self._fetch(link)
            if link_response is None:
                print(f"Failed to fetch content for hyperlink: {link}")
                continue

            link_soup = BeautifulSoup(link_response.content, 'html.parser')
            body = link_soup.find('div', class_='wysiwyg')
            if body is None:
                print(f"No article body found for hyperlink: {link}")
                continue

            content = body.get_text(separator='\n').strip()
            data.append({"Index": index, "Heading": title, "Hyperlink": link, "Content": content})

        return data

## BlogIndexer

In [None]:
# BlogIndexer
class BlogIndexer:
    """Embed scraped blog posts and store them in a Pinecone index.

    Args:
        url: Listing page handed to ``BlogScraper``.
        headers: HTTP headers handed to ``BlogScraper``.
        index_name: Name of the Pinecone index.
        index_host: Host URL of the Pinecone index; the defaults reproduce the
            index this notebook was written against.
    """

    def __init__(self, url, headers, index_name="blog-index",
                 index_host='https://blog-index-ntt4sfk.svc.aped-4627-b74a.pinecone.io'):
        self.scraper = BlogScraper(url, headers)
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.index_name = index_name
        # Obtain the index handle through the Pinecone client object, which is
        # the documented entry point of the client already imported by this file.
        self.index = Pinecone(api_key=cfg.PINECONE_API_KEY).Index(name=index_name, host=index_host)
        # The result is discarded; the call presumably serves as an early
        # connectivity/credentials check.
        self.index.describe_index_stats()

    def _scraped_items(self):
        """Run the scraper and return its items, or an empty list when it returned None."""
        return self.scraper.scrape() or []

    def index_content(self):
        """Scrape the blog, embed "<heading>. <content>" per post and upsert the vectors.

        Each vector id is the post's ``Index`` as a string, and the post content
        is stored as ``content`` metadata so the QA system can retrieve it.
        """
        data = self._scraped_items()
        if not data:
            print("No content to index.")
            return

        upsert_data = []
        for item in data:
            combined_text = f"{item['Heading']}. {item['Content']}"
            embedding = self.model.encode(combined_text, convert_to_tensor=False)
            # Include content in metadata for retrieval in the QA system
            # NOTE(review): Pinecone limits metadata size per vector; very long
            # posts may need truncating - confirm against the service limits.
            upsert_data.append((str(item['Index']), embedding.tolist(), {'content': item['Content']}))
        self.index.upsert(vectors=upsert_data)
        print("Content indexed successfully.")

    def view_scraped_data(self):
        """Scrape the blog and print every scraped record (nothing when scraping failed)."""
        for item in self._scraped_items():
            print(item)

    def test_embeddings(self):
        """Scrape the blog and print the first five embedding values of each post."""
        for item in self._scraped_items():
            embedding = self.model.encode(f"{item['Heading']}. {item['Content']}", convert_to_tensor=False)
            print(f"Index: {item['Index']}, Heading: {item['Heading']}, Embedding: {embedding[:5]}...")

In [None]:
# Usage: build the indexer for the automotive-and-mobility blog listing, then
# print every record the scraper returns.
indexer = BlogIndexer(
    url='https://escalent.co/thought-leadership/blog/?industry=automotive-and-mobility',
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    },
)
indexer.view_scraped_data()

In [None]:
# Embed every scraped post and upsert the vectors into the Pinecone index.
indexer.index_content()
# Sanity check: print the first five embedding values of each post.
# Note that this call runs the scraper again; it does not reuse the data that
# index_content() fetched.
indexer.test_embeddings()

## QASystem - Gemma

In [None]:
import pandas as pd

class QASystem:
    """Retrieval-augmented question answering on top of a ``BlogIndexer``.

    Args:
        model: Text generator exposing ``generate(prompt, max_length=...)``.
        indexer_instance: Object providing ``model.encode`` (sentence embedding)
            and ``index.query`` (Pinecone index).

    Attributes:
        logs: DataFrame with one ``Query``/``Response`` row per answered query.
    """

    def __init__(self, model, indexer_instance):
        self.model = model
        self.indexer = indexer_instance
        self.logs = pd.DataFrame(columns=['Query', 'Response'])

    def query_to_embedding(self, query):
        """Return the embedding of ``query`` as a plain list of floats."""
        embedding = self.indexer.model.encode(query, convert_to_tensor=False)
        return embedding.tolist()

    def retrieve_context(self, query_embedding, top_k=3):
        """Return the stored content of the ``top_k`` best matches, joined by spaces.

        The result is an empty string when the index returns no matches.
        """
        query_results = self.indexer.index.query(vector=query_embedding, top_k=top_k, include_metadata=True)
        documents = [match['metadata']['content'] for match in query_results.get('matches', [])]
        return " ".join(documents)

    @staticmethod
    def _extract_text(response, prompt):
        """Normalise a ``generate`` result to the newly generated text only."""
        if isinstance(response, str):
            # A string prompt yields a string result from keras_nlp causal LMs.
            text = response
        else:
            # Pipeline-style result: a list of {'generated_text': ...} dicts.
            text = response[0]['generated_text']
        # Generators commonly echo the prompt; keep only the continuation.
        if text.startswith(prompt):
            text = text[len(prompt):]
        return text.strip()

    def answer_query(self, query, max_length=512):
        """Answer ``query`` from retrieved context and record the exchange in ``logs``.

        Args:
            query: The user's question.
            max_length: Passed through to ``model.generate``.
                NOTE(review): for keras_nlp models this presumably bounds the
                total sequence length including the prompt, so long contexts may
                leave little room for the answer - confirm and raise if needed.

        Returns:
            The generated answer, or a fixed fallback sentence when no context
            could be retrieved.
        """
        print("Generating query embedding...")
        query_embedding = self.query_to_embedding(query)
        print("Retrieving context...")
        context = self.retrieve_context(query_embedding)
        if not context:
            response_text = "I couldn't find enough information to answer your question."
        else:
            # Retrieved context first, then a separator, then the question.
            augmented_query = context + "\n\n---\n\n" + query
            prompt = f"You are a Question and Answering bot designed to answer questions using the provided context. Do not answer questions that are asked outside the context. Here's the user question:\n\n{augmented_query}"
            print("Generating response based on the context...")
            response = self.model.generate(prompt, max_length=max_length)
            response_text = self._extract_text(response, prompt)
            print("Response generated.")
        # Log the query and the response; appending in place avoids rebuilding
        # the whole frame with pd.concat on every call.
        self.logs.loc[len(self.logs)] = [query, response_text]
        return response_text

    def save_logs_to_csv(self, filename="gemma_query_logs.csv"):
        """Write the query log to ``filename`` as CSV, without the index column."""
        self.logs.to_csv(filename, index=False)
        print(f"Logs saved to {filename}.")

    def print_log(self):
        """Print the query log, or a notice when nothing has been logged yet."""
        if self.logs.empty:
            print("No entries in the log.")
        else:
            print(self.logs)

In [None]:
# Usage
# Rebinds `indexer` to a new BlogIndexer; the URL and headers are the same as in
# the earlier usage cell.
indexer = BlogIndexer(
    url='https://escalent.co/thought-leadership/blog/?industry=automotive-and-mobility',
    headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}
)

# `gemma_lm` is the model loaded in the Gemma initialization cell; QASystem uses
# it for generation and uses the indexer for query embedding and retrieval.
qa_system = QASystem(gemma_lm, indexer)