# RAG Approach

In [None]:
!pip install -q weaviate-client langchain tiktoken pypdf accelerate rapidocr-onnxruntime bitsandbytes accelerate xformers einops langchain faiss-cpu transformers sentence-transformers

In [None]:
import os

# Read the Weaviate connection settings from the environment so that real
# credentials never have to be saved inside the notebook. The placeholder
# strings remain as fallbacks; either set the environment variables or
# replace the placeholders before connecting.
WEAVIATE_CLUSTER = os.environ.get("WEAVIATE_CLUSTER", "<Cluster-Path>")
WEAVIATE_API_KEY = os.environ.get("WEAVIATE_API_KEY", "<API Key>")

In [None]:
from langchain.vectorstores import Weaviate
import weaviate

# Connect to the Weaviate cluster, authenticating with the API key.
WEAVIATE_URL = WEAVIATE_CLUSTER

client = weaviate.Client(
    url=WEAVIATE_URL,
    auth_client_secret=weaviate.AuthApiKey(WEAVIATE_API_KEY),
)

In [None]:
# fixing unicode error in google colab
import locale


def _utf8_preferred_encoding(do_setlocale=True):
    """Replacement for ``locale.getpreferredencoding`` that always reports UTF-8.

    The optional ``do_setlocale`` argument of the real function is accepted
    (and ignored) so that callers which pass it, e.g.
    ``locale.getpreferredencoding(False)``, do not fail with ``TypeError``.
    """
    return "UTF-8"


locale.getpreferredencoding = _utf8_preferred_encoding

In [None]:
# NOTE: sentence-transformers is also in the package list of the first install
# cell, so this command has nothing new to install once that cell has run.
!pip install -q sentence-transformers

In [None]:
# Embedding model: a HuggingFace sentence-transformer, wrapped for LangChain.
from langchain.embeddings import HuggingFaceEmbeddings

embedding_model_name = "sentence-transformers/all-mpnet-base-v2"

# To force GPU execution, additionally pass model_kwargs={"device": "cuda"}.
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

# LangChain can load several kinds of PDF documents; see the documentation for the available loaders:

https://python.langchain.com/docs/modules/data_connection/document_loaders/pdf/

In [None]:
# Read the source PDF into a list of page documents.
from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader(
    "/content/RAG_No_Titles.pdf",
    # Also pull text out of embedded images (presumably via the
    # rapidocr-onnxruntime package installed earlier -- confirm).
    extract_images=True,
)
pages = loader.load()

In [None]:
# Show the loaded page documents as the cell output for a quick visual check.
pages

In [None]:
# Break the loaded pages into smaller, slightly overlapping chunks.
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,   # upper bound on the size of each chunk
    chunk_overlap=20,  # amount shared between neighbouring chunks
)
docs = text_splitter.split_documents(pages)

In [None]:
# Show the resulting chunks as the cell output for a quick visual check.
docs

In [None]:
# Embed every chunk with `embeddings` and store it in Weaviate through the
# already-connected `client`.
vector_db = Weaviate.from_documents(
    docs,
    embeddings,
    client=client,
    # presumably makes searches use the locally computed vectors rather than
    # Weaviate's own text module -- confirm against the LangChain docs
    by_text=False,
)

## Text Summarization

In [None]:
from typing import List
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, BitsAndBytesConfig
import torch
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import ConversationalRetrievalChain, RetrievalQA
from langchain.callbacks.tracers import ConsoleCallbackHandler
from langchain_core.vectorstores import VectorStoreRetriever
from langchain.memory import ConversationBufferMemory
from langchain.vectorstores import FAISS

# Choose the compute device: use the GPU whenever CUDA is available.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print("Device:", device)
if device == 'cuda':
    # Report which GPU (device 0) was found.
    print(torch.cuda.get_device_name(0))

# Example output on a GPU runtime:
# >>> Device: cuda
# >>> Tesla T4

In [None]:
from langchain import HuggingFacePipeline

# Summarization pipeline wrapped as a LangChain LLM.
summarization_pipeline = transformers.pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
    tokenizer=AutoTokenizer.from_pretrained("facebook/bart-large-cnn"),
    max_length=250,  # Set a larger max_length value
    # Clip over-long prompts to the model's input limit instead of letting
    # them fail inside the model.
    truncation=True,
    # Run on the GPU detected earlier; without this the pipeline stays on the
    # CPU even when `device` is 'cuda'. (0 = first GPU, -1 = CPU.)
    device=0 if device == "cuda" else -1,
)

LLM = HuggingFacePipeline(pipeline=summarization_pipeline)

### For the Dataset

In [None]:
import pandas as pd
from tqdm import tqdm
import re

# Function to preprocess text
# Matches any single character that is neither a word character nor whitespace.
_SPECIAL_CHARACTERS = re.compile(r"[^\w\s]")


def preprocess_text(text):
    """Return ``text`` with all special characters removed.

    Word characters (letters, digits, underscore) and whitespace are kept;
    everything else is dropped.
    """
    cleaned = _SPECIAL_CHARACTERS.sub("", text)
    # Additional preprocessing steps can be added here
    return cleaned

# Function to process each text and perform summarization
def process_text(text, k=3):
    """Summarize the stored passages that are most similar to ``text``.

    Runs a single similarity search against ``vector_db``, cleans each hit
    with ``preprocess_text``, joins the cleaned passages into one prompt and
    passes it to ``LLM``.

    Args:
        text: Query text used for the similarity search.
        k: Maximum number of passages to retrieve (defaults to 3).

    Returns:
        The value returned by ``LLM.invoke`` for the assembled prompt.

    Raises:
        ValueError: If the similarity search returns no passages.
    """
    # Query the vector store once; the results are reused for every passage
    # instead of repeating the identical search per passage.
    matches = vector_db.similarity_search(text, k=k)
    if not matches:
        raise ValueError("No similar documents found for the given text.")

    # Preprocess each document separately, then join with a space so the last
    # word of one passage does not fuse with the first word of the next.
    context = " ".join(preprocess_text(match.page_content) for match in matches)
    prompt = "Summarize This Data in 2 to 5 sentences : " + context
    return LLM.invoke(prompt)

# Read the dataset and keep only the rows whose "index" label falls inside the
# range entered by the user (label slicing with .loc includes both ends).
start_index = int(input("Enter the Start index : "))
end_index = int(input("Enter the End index : "))
dataset_path = "ML_Project_main_hate_fake_with_index.xlsx"
data = pd.read_excel(dataset_path)
data.set_index("index", inplace=True)
data = data.loc[start_index:end_index]


def _clean_summary(summary):
    """Keep only printable ASCII (0x20-0x7E) in a summary string.

    This removes non-ASCII characters and control characters in one pass.
    Non-string values, such as the ``None`` recorded for a failed row, are
    returned unchanged.
    """
    if not isinstance(summary, str):
        return summary
    return re.sub(r"[^\x20-\x7E]", "", summary)


# Check if "Pre_Processed_English_text" column exists
if "Pre_Processed_English_text" in data.columns:
    texts = data["Pre_Processed_English_text"]

    # Summarize every row. A failed row is recorded as None so that
    # `summaries` always has one entry per row; stopping early would leave the
    # list shorter than the frame and make the column assignment below fail.
    summaries = []
    progress = tqdm(
        texts.items(),
        total=len(texts),  # lets the progress bar show completion and ETA
        desc="Processing texts",
        unit="text",
    )
    for row_index, text in progress:
        try:
            summaries.append(process_text(text))
        except Exception as e:
            # Report the dataset's own index label so the row can be located.
            print(f"Error processing text at index {row_index}: {e}")
            summaries.append(None)

    # Add the summaries to the dataframe
    data["Summary"] = summaries

    # Remove characters that are not printable ASCII before writing to Excel
    data["Summary"] = data["Summary"].apply(_clean_summary)

    # Save the updated dataframe
    data.to_excel("ML_Project_main_hate_fake_with_summaries.xlsx", index=True)

else:
    print("Column 'Pre_Processed_English_text' not found in the dataset.")


In [None]:
# Send the generated workbook to the browser as a download, using the
# google.colab file helper.
from google.colab import files

files.download("ML_Project_main_hate_fake_with_summaries.xlsx")