In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install PyPDF2 InstructorEmbedding langchain langchain-community langchainhub langchain-mistralai transformers sentence-transformers faiss-gpu
!pip install -q -U bitsandbytes
!pip install -q -U peft
!pip install -q -U accelerate

In [None]:
!pip install fastapi
!pip install uvicorn
!pip install pyngrok
!pip install nest_asyncio
!pip install python-multipart

In [None]:
!ngrok config add-authtoken 2c2PwQHIfIGaGan2nGOCD7Hau02_m9sPZNHgzv2T4mmrhr1a

In [None]:
import PyPDF2 as pdf
from langchain_mistralai.chat_models import ChatMistralAI
from langchain_mistralai.embeddings import MistralAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain, ConversationChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
import transformers
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from langchain_core.runnables import RunnablePassthrough

# Initializing Model and Tokenizer for future use

In [None]:
# Single source of truth for the checkpoint location, shared by model and tokenizer.
MODEL_PATH = "/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1"

# Load the weights 4-bit quantized, with bfloat16 as the compute dtype.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    ),
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Function to: Generate documents from a PDF

In [None]:
def extract_documents_from_pdf(path, chunk_size = 1000, chunk_overlap = 200):
    """Read a PDF and split its text into overlapping chunks.

    Args:
        path: Location of the PDF, passed straight to ``PyPDF2.PdfReader``.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``.

    Returns:
        The documents produced by ``splitter.create_documents`` for the
        flattened text of the whole PDF.
    """
    reader = pdf.PdfReader(path)
    # Iterate the public ``pages`` sequence instead of the private
    # ``_get_num_pages`` / ``_get_page`` helpers. ``or ""`` guards against a
    # page for which no text could be extracted.
    page_texts = [page.extract_text() or "" for page in reader.pages]
    # Turn line breaks into spaces (rather than deleting them) so the last
    # word of one line is not glued to the first word of the next; pages are
    # separated by a space for the same reason.
    paper = " ".join(page_text.replace("\n", " ") for page_text in page_texts)
    splitter = RecursiveCharacterTextSplitter(chunk_size = chunk_size, chunk_overlap = chunk_overlap)
    return splitter.create_documents([paper])

# Function to: Create vectors from the documents and return a retriever

In [None]:
def get_retriever(documents):
    """Index the documents in a FAISS vector store and return a retriever.

    Args:
        documents: Non-empty list of documents to index.

    Returns:
        A retriever over the FAISS store, configured with ``k = 4``.

    Raises:
        ValueError: If ``documents`` is empty, since there is nothing to index.
    """
    if not documents:
        raise ValueError("Cannot build a retriever from an empty document list")
    embeddings = HuggingFaceEmbeddings()
    # FAISS.from_documents embeds the documents itself, so no separate
    # embed_documents pass is needed beforehand.
    db = FAISS.from_documents(documents, embeddings)
    return db.as_retriever(search_kwargs={"k": 4})

# Function to: Create an LLM chain that answers the query

In [None]:
def create_llm_chain(retriever, model, tokenizer):
    """Build the LLMChain that prompts the model with a context and a question.

    Args:
        retriever: Accepted for call compatibility; not used inside this function.
        model: Model handed to the transformers text-generation pipeline.
        tokenizer: Tokenizer handed to the same pipeline.

    Returns:
        An ``LLMChain`` whose prompt expects the variables ``context`` and
        ``question``.
    """
    # Settings for the text-generation pipeline.
    # NOTE(review): no do_sample flag is passed here -- confirm that the
    # temperature setting is actually honoured during generation.
    generation_kwargs = {
        "model": model,
        "tokenizer": tokenizer,
        "task": "text-generation",
        "temperature": 0.2,
        "repetition_penalty": 1.1,
        "return_full_text": True,
        "max_new_tokens": 800,
    }
    generator = transformers.pipeline(**generation_kwargs)

    # Instruction-style prompt; {context} and {question} are filled in per query.
    instruction_template = """
### [INST] You are an AI assistant specialized in analyzing and answering questions based on academic papers. You have access to a large database of research papers spanning various fields. Please provide a detailed answer to the question based on the given context. If the context is not sufficient to answer the question, kindly let me know that additional information is needed, and I will try to provide more relevant context from the research paper database. Here is the context:
{context}

### QUESTION:
{question}

[/INST]
"""
    wrapped_llm = HuggingFacePipeline(pipeline=generator)
    qa_prompt = PromptTemplate(
        template=instruction_template,
        input_variables=["context", "question"],
    )

    # Tie the prompt and the wrapped pipeline together.
    return LLMChain(llm=wrapped_llm, prompt=qa_prompt)

# Function to: Create a Chain that helps query the datastore for the relevant documents

In [None]:
def _format_docs(docs):
    """Join the page contents of the retrieved documents into one context string."""
    return "\n\n".join(doc.page_content for doc in docs)


def create_rag_chain(retriever, llm_chain):
    """Compose retrieval and generation into a single runnable chain.

    The incoming query is sent to the retriever to fetch context and is also
    passed through unchanged as the question; both are fed to ``llm_chain``.

    Args:
        retriever: Runnable retriever that returns documents for a query.
        llm_chain: Chain taking ``context`` and ``question`` inputs.

    Returns:
        The composed runnable; invoke it with the query string.
    """
    rag_chain = (
        # Feed the prompt the documents' text rather than the repr of a list
        # of Document objects, which would add metadata and escaping noise.
        {"context": retriever | _format_docs, "question": RunnablePassthrough()}
        | llm_chain
    )

    return rag_chain

# Function to: Combine all the previous functions to build the complete chain for a PDF

In [None]:
def create_llm(path, model, tokenizer, chunk_size = 1000, chunk_overlap = 200):
    """Build the complete retrieval + generation chain for one PDF.

    Args:
        path: Location of the PDF to index.
        model: Model used for generation.
        tokenizer: Tokenizer matching ``model``.
        chunk_size: Passed on to ``extract_documents_from_pdf``.
        chunk_overlap: Passed on to ``extract_documents_from_pdf``.

    Returns:
        The chain returned by ``create_rag_chain``.
    """
    # Chunk the PDF and index the chunks in one step.
    retriever = get_retriever(
        extract_documents_from_pdf(path, chunk_size, chunk_overlap)
    )
    # Wrap the model in a prompt chain, then put retrieval in front of it.
    return create_rag_chain(retriever, create_llm_chain(retriever, model, tokenizer))

# Module-level handle for the active chain; None until a chain is assigned to it.
llm = None

In [None]:
# Location of a sample PDF among the Kaggle inputs.
path = "/kaggle/input/rag_pdf/rag_pdf.pdf"
# Disabled: builds the chain directly from the sample PDF above.
#llm = create_llm(path, model, tokenizer, 1000, 200)

In [None]:
def summarize(text):
    """Summarize ``text`` with the module-level ``model`` and ``tokenizer``.

    Args:
        text: The passage to summarize.

    Returns:
        The model output flattened to one line: each line is stripped, empty
        lines are dropped, and the rest are joined with single spaces.
    """
    text_generation_pipeline = transformers.pipeline(
        model=model,
        tokenizer=tokenizer,
        task="text-generation",
        temperature=0.4,
        repetition_penalty=1.1,
        return_full_text=True,
        max_new_tokens=600,
    )
    # Named ``summary_llm`` so it is not confused with the module-level ``llm``
    # that holds the PDF question-answering chain.
    summary_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)
    prompt = f'''
    ### [INST] 
    Instruction: Summarize the following piece of information and cover only the important points, try keeping it concise and small and return the response in paragraph format:

    ### QUESTION:
    {text} 

    [/INST]
    '''
    # ``invoke`` is the current Runnable entry point; ``predict`` is deprecated.
    ans = summary_llm.invoke(prompt)
    # Join lines with a space (not an empty string) so the words on either
    # side of a line break are not fused together.
    return " ".join(line.strip() for line in ans.splitlines() if line.strip())

# Testing the LLM

In [None]:
# Sample passage (a news article) kept as input for manual summarization tests.
text = '''Czech Republic’s Krystyna Pyszkova on Saturday won the Miss World 2024 title at a grand event in Mumbai. Reigning Miss World Karolina Bielawska from Poland crowned her successor at the star-studded finale here.
Apart from studying for dual degrees in Law and Business Administration, Krystyna also works as a model. She established the Krystyna Pyszko Foundation and remains actively involved in its initiatives, according to the Miss World website.
After Pyszkova, Miss Lebanon Yasmina Zaytoun was named the first runner-up. 
India, which hosted the event after 28 years, was represented by 22-year-old Sini Shetty. Mumbai-born Shetty, who was crowned Femina Miss India World in 2022, was unable to make it to the top 4 of the contest. India has won the prestigious title six times — Reita Faria (1966), Aishwarya Rai Bachchan (1994), Diana Hayden (1997), Yukta Mookhey (1999), Priyanka Chopra Jonas (2000), and Manushi Chillar (2017). The 71st Miss World pageant, which witnessed the participation of contestants from 112 countries of the world, was held at the Jio World Convention Centre in BKC here.
Part of the 12-judge panel for the finale were film producer Sajid Nadiadwala; actors Kriti Sanon, Pooja Hegde; cricketer Harbhajan Singh; news personality Rajat Sharma, social worker Amruta Fadnavis; Vineet Jain, MD of Bennett, Coleman & Co. Limited; Julia Morley, Chairperson and CEO of the Miss World Organization; Jamil Saidi, Strategic Partner & Host – Miss World India, and three former Miss Worlds, including Chillar.
Filmmaker Karan Johar and former Miss World Megan Young hosted the event, which kickstarted on a high note with performances by singers Shaan, Neha Kakkar, and Tony Kakkar. A video message by Chopra Jonas highlighting the importance of ’beauty with purpose’, a tagline associated with the Miss World pageant, was also played at the event.
The cast of Sanjay Leela Bhansali’s maiden web series Heeramandi: The Diamond Bazaar” — Manisha Koirala, Sonakshi Sinha, Aditi Rao Hydari, Richa Chadha, Sharmin Segal, and Sanjeeda Sheikh — also walked the stage with 13 fast-track Miss World contestants on the show’s newly released song “Sakal Ban”. The month-long Miss World event featured a series of rigorous competitions, including talent showcases, sports challenges, and charitable initiatives — all aimed at highlighting the qualities that make these competitors the ambassadors of change.
'''
# Second sample passage (notes on machine-learning dataset types).
text2 = '''
Diff types of dataset used in ML
In machine learning, there are several types of datasets commonly used for training and evaluating models:
Training Dataset: This dataset is used to train the machine learning model. It consists of a set of input-output pairs that the model learns from during the training process.
Validation Dataset: This dataset is used to tune the hyperparameters of the model and to evaluate its performance during training. It helps prevent overfitting by providing an independent dataset that the model hasn't seen during training.
Test Dataset: The test dataset is used to evaluate the final performance of the trained model. It serves as an unseen dataset to assess how well the model generalizes to new, unseen data.
Cross-Validation Dataset: In k-fold cross-validation, the dataset is divided into k subsets. The model is trained and evaluated k times, each time using a different subset as the validation set and the remaining data for training. This helps provide a more reliable estimate of the model's performance.
These are some of the common types of datasets used in machine learning, each tailored to different types of tasks and data structures.
'''
# Show the character count of the second sample as the cell output.
len(text2)

In [None]:
# Summarize the second sample, print the summary's length, then display the summary.
ans = summarize(text2)
print(len(ans))
ans

# Creating a FastAPI Server

In [None]:
from fastapi import FastAPI, Request, File, UploadFile
from fastapi.middleware.cors import CORSMiddleware
import uvicorn
import nest_asyncio
from pyngrok import ngrok

In [None]:
app = FastAPI()

# The API is open to every origin. Credentials are disabled because a wildcard
# origin must not be combined with allow_credentials=True: that would let any
# website issue credentialed cross-origin requests. The endpoints in this
# notebook only read request bodies, so no cookies or auth headers are needed.
origins = ["*"]
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

In [None]:
# Directory under the Kaggle working area where uploaded PDF files are stored.
UPLOAD_DIRECTORY = "/kaggle/working/pdfs"  # Directory where PDF files will be saved

# Create the upload directory up front; exist_ok makes re-running this cell harmless.
os.makedirs(UPLOAD_DIRECTORY, exist_ok=True)

In [None]:
@app.get("/")
def read_root():
    """Root endpoint: respond with a fixed greeting payload."""
    greeting = {"Hello": "World"}
    return greeting

# Read the text from the JSON request body and return its summary.
@app.post("/summarize/")
async def get_summary(request: Request):
    """Summarize the ``text`` field of a JSON request body.

    Args:
        request: Incoming request whose JSON body should look like
            ``{"text": "<passage>"}``.

    Returns:
        ``{"summary": ...}`` on success, or ``{"error": ...}`` when the body
        is not valid JSON, has no string ``text`` field, or the text is blank.
    """
    try:
        data = await request.json()
    except ValueError:
        # json.JSONDecodeError is a ValueError; report it instead of a 500.
        return {"error": "Request body must be valid JSON"}
    input_text = data.get("text") if isinstance(data, dict) else None
    if not isinstance(input_text, str):
        return {"error": "Request body must contain a 'text' string"}
    if not input_text.strip():
        return {"error": "Input text is empty"}
    return {"summary": summarize(input_text)}

In [None]:
@app.post("/upload/")
async def upload_file(file: UploadFile = File(...)):
    """Store an uploaded PDF and rebuild the module-level chain from it.

    Args:
        file: The uploaded file (multipart form field ``file``).

    Returns:
        ``{"res": ...}`` once the chain has been created, or ``{"error": ...}``
        when the upload carries no usable filename.
    """
    global llm
    # The filename is supplied by the client: keep only its last path
    # component so the file cannot be written outside UPLOAD_DIRECTORY.
    filename = os.path.basename(file.filename or "")
    if filename in ("", ".", ".."):
        return {"error": "Uploaded file has no valid filename"}
    destination = os.path.join(UPLOAD_DIRECTORY, filename)
    with open(destination, "wb") as buffer:
        buffer.write(await file.read())
    # Build the chain from the exact path that was just written, so the saved
    # location and the loaded location can never drift apart.
    llm = create_llm(destination, model, tokenizer, 1000, 200)
    return {"res":"Created and Linked PDF with LLM"}
    
# Read the question from the JSON request body and answer it with the active chain.
@app.post("/rag_pdf/")
async def queryPDF(request: Request):
    """Answer a question using the chain stored in the module-level ``llm``.

    Args:
        request: Incoming request whose JSON body should look like
            ``{"text": "<question>"}``.

    Returns:
        ``{"res": <answer>}`` on success, or ``{"error": ...}`` when no chain
        is available yet or the body has no non-empty ``text`` string.
    """
    # ``llm`` stays None until a chain has been created, so fail with a clear
    # message instead of an AttributeError on ``None.invoke``.
    if llm is None:
        return {"error": "No PDF has been uploaded yet; call /upload/ first"}
    try:
        data = await request.json()
    except ValueError:
        # json.JSONDecodeError is a ValueError; report it instead of a 500.
        return {"error": "Request body must be valid JSON"}
    input_text = data.get("text") if isinstance(data, dict) else None
    if not isinstance(input_text, str) or not input_text.strip():
        return {"error": "Request body must contain a non-empty 'text' string"}
    res = llm.invoke(input_text).get("text")
    print(res)
    return {"res":res}

In [None]:
import os

In [None]:
# Open an ngrok tunnel to local port 8000 and print its public URL.
ngrok_tunnel = ngrok.connect(8000)
print("Public Url: "+ngrok_tunnel.public_url)
# NOTE(review): presumably required so uvicorn.run can start inside the
# notebook's already-running event loop -- confirm.
nest_asyncio.apply()
# Serve the FastAPI app on the same port the tunnel was opened for.
uvicorn.run(app, port=8000)