In [1]:
%%capture --no-display
!pip install PyPDF2
!pip install python-docx
!pip install transformers


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2


In [2]:
import os
import warnings
warnings.filterwarnings("ignore")
import re
import string
import pandas as pd
from PyPDF2 import PdfReader
from docx import Document
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import textwrap
from transformers import AutoModelForCausalLM, AutoTokenizer

In [3]:
# Document Preprocessing

# Text Extraction Function
def extract_text(filepath):
    ext = os.path.splitext(filepath)[1].lower()
    if ext == '.txt':
        with open(filepath, 'r', encoding='utf-8') as file:
            return file.read()
    elif ext == '.pdf':
        reader = PdfReader(filepath)
        return ' '.join(page.extract_text() for page in reader.pages)
    elif ext == '.docx':
        doc = Document(filepath)
        return '\n'.join([para.text for para in doc.paragraphs])
    elif ext == '.csv':
        df = pd.read_csv(filepath)
        return df.to_string()
    else:
        raise ValueError(f"Unsupported file format: {ext}")

# Text Normalization Function
def normalize_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove extra whitespace
    text = '\n'.join([re.sub(r'\s+', ' ', line) for line in text.splitlines()])
    return text

# Paragraph Segmentation Function
def segment_text(text):
    paragraphs = text.split('\n')
    return [para.strip() for para in paragraphs if para.strip()]

# Main Preprocessing Function
def preprocess_document(filepath):
    try:
        raw_text = extract_text(filepath)
        print(f"Extracted text from {filepath}:\n{raw_text}\n")
        normalized_text = normalize_text(raw_text)
        print(f"Normalized text from {filepath}:\n{normalized_text}\n")
        paragraphs = segment_text(normalized_text)
        print(f"Segmented paragraphs from {filepath}:\n{paragraphs}\n")
        return paragraphs
    except Exception as e:
        print(f"Error processing file {filepath}: {e}")
        return []


In [4]:

# Retrieval System Development

def retrieve_relevant_chunks(documents, query, top_k=5):
    # Vectorization using TF-IDF
    vectorizer = TfidfVectorizer(max_features=50000)
    doc_vectors = vectorizer.fit_transform(documents)
    query_vector = vectorizer.transform([query])

    # Compute Cosine Similarities
    similarities = cosine_similarity(query_vector, doc_vectors).flatten()

    # Retrieve Top Relevant Chunks
    top_k_indices = similarities.argsort()[-top_k:][::-1]

    top_chunks = [(documents[idx], similarities[idx]) for idx in top_k_indices]

    return top_chunks

In [25]:
# Generation Model Integration

# Function to format input for the language model
def format_input(query, retrieved_context):
    """
    Combines the user query with the retrieved context to provide input for the model.
    """
    return f"Answer Only the provided query without any comments or more words\nContext: {retrieved_context}\nQuery: {query}\nResponse:"

# Function to generate a response using the language model
def generate_response(query, retrieved_context, model, tokenizer, max_length=500): # Increased max_length to 500
    """
    Generates a response based on the query and retrieved context.
    """
    # Prepare the input
    formatted_input = format_input(query, retrieved_context)

    # Tokenize input
    inputs = tokenizer(formatted_input, return_tensors="pt")

    # Generate output
    output = model.generate(
        inputs['input_ids'],
        max_length=max_length, # Using the updated max_length
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        top_p=0.9,
        temperature=0.7,
    )

    # Decode the response
    response = tokenizer.decode(output[0], skip_special_tokens=True)

    return response


In [13]:
# Combining Retrieval and Generation

# Step 1: Preprocess the document
file_path = "/content/Test.txt"
paragraphs = preprocess_document(file_path)

# Step 2: Split document into manageable chunks
chunk_size = 500
documents = textwrap.wrap(' '.join(paragraphs), width=chunk_size)
print(f"Total number of chunks: {len(documents)}\n")


Extracted text from /content/Test.txt:
Artificial Intelligence (AI) is a transformative technology that has revolutionized multiple industries. AI refers to the ability of machines to simulate human intelligence by performing tasks such as recognizing patterns, making decisions, and understanding natural language. The applications of AI are vast and varied, ranging from healthcare to entertainment, from finance to transportation.

In healthcare, AI is being used to assist in diagnosing diseases, particularly in the field of radiology. Machine learning algorithms can analyze medical images such as X-rays, CT scans, and MRIs to detect abnormalities like tumors, fractures, and other conditions. AI is also used in predicting patient outcomes, recommending personalized treatment plans, and even identifying potential drug candidates.

AI is also playing a significant role in the financial industry. Banks and financial institutions are using AI algorithms to detect fraudulent activities, asse

In [14]:
# Step 3: Define query and retrieve relevant chunks
query = "What are the applications of artificial intelligence in healthcare?"
top_chunks = retrieve_relevant_chunks(documents, query, top_k=5)

print(f"\nQuery: {query}\n")
print("Top Relevant Chunks:")
for idx, (chunk, score) in enumerate(top_chunks):
    print(f"(Score: {score:.2f})\nChunk {idx + 1}: {chunk[:200]}...\n{'*'*200} ")


Query: What are the applications of artificial intelligence in healthcare?

Top Relevant Chunks:
(Score: 0.46)
Chunk 1: artificial intelligence ai is a transformative technology that has revolutionized multiple industries ai refers to the ability of machines to simulate human intelligence by performing tasks such as re...
******************************************************************************************************************************************************************************************************** 
(Score: 0.20)
Chunk 2: is likely to play an even greater role in shaping the future of humanity in conclusion ai is already making a significant impact across a wide range of industries its applications in healthcare financ...
******************************************************************************************************************************************************************************************************** 
(Score: 0.13)
Chunk 3: increasingly importan

In [15]:
# Step 4: Load pre-trained language model
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Step 5: Combine retrieval and generation
combined_context = ' '.join([chunk for chunk, _ in top_chunks])
response = generate_response(query, combined_context, model, tokenizer)
print("\nGenerated Response:")
print(response)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Generated Response:
Context: artificial intelligence ai is a transformative technology that has revolutionized multiple industries ai refers to the ability of machines to simulate human intelligence by performing tasks such as recognizing patterns making decisions and understanding natural language the applications of ai are vast and varied ranging from healthcare to entertainment from finance to transportation in healthcare ai is being used to assist in diagnosing diseases particularly in the field of radiology machine is likely to play an even greater role in shaping the future of humanity in conclusion ai is already making a significant impact across a wide range of industries its applications in healthcare finance automotive entertainment education and natural language processing are just the beginning as ai technology continues to evolve it will undoubtedly lead to new breakthroughs and innovations that will transform the way we live and work however it is important to address th

# Testing and Evaluation

In [26]:
# Example test case for testing the RAG system
def test_rag_system():
    # Example test case
    test_query = "How is AI used in education?"
    test_retrieved_context = retrieve_relevant_chunks(documents, query, top_k=1)[0][0]
    print("\nTest Query:", test_query)
    print("*"*200)
    print("Retrieved Context:", test_retrieved_context)
    print("*"*200)
    generation = generate_response(test_query, test_retrieved_context, model, tokenizer)
    print("Generated Response:", generation)

test_rag_system()

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Test Query: How is AI used in education?
********************************************************************************************************************************************************************************************************
Retrieved Context: artificial intelligence ai is a transformative technology that has revolutionized multiple industries ai refers to the ability of machines to simulate human intelligence by performing tasks such as recognizing patterns making decisions and understanding natural language the applications of ai are vast and varied ranging from healthcare to entertainment from finance to transportation in healthcare ai is being used to assist in diagnosing diseases particularly in the field of radiology machine
********************************************************************************************************************************************************************************************************
Generated Response: Answer Only the prov

In [40]:
import re

text = """Generated Response: Answer Only the provided query without any comments or more words
Context: artificial intelligence ai is a transformative technology that has revolutionized multiple industries ai refers to the ability of machines to simulate human intelligence by performing tasks such as recognizing patterns making decisions and understanding natural language the applications of ai are vast and varied ranging from healthcare to entertainment from finance to transportation in healthcare ai is being used to assist in diagnosing diseases particularly in the field of radiology machine
Query: How is AI used in education?
Response: AI is used for education in many different ways. It is the most important tool in a school education. The most common use of AI in schools is to teach children how to use computers to solve problems.
The most popular use is in science and technology. In science, AI can be used as a tool to help scientists understand the world around them. AI has been used by scientists to understand how the universe works. Scientists can use AI to learn about the laws of physics and to predict the future. This is also the main use for AI for the education of children. Science is an important field for children to study. Children are taught to think and act in ways that are not possible in other fields. For example, they are exposed to different kinds of information. They are able to see the patterns of the natural world and understand what is happening in it. These are the kinds that children learn to do. However, the more children are trained in these fields, and the better they can understand and use these kinds, it is possible that they will be able, in some way, to become scientists."""

# Extract everything starting from "Response"
result = re.search(r"Response:.*$", text[50:], re.DOTALL)
if result:
    print(result.group(0))

Response: AI is used for education in many different ways. It is the most important tool in a school education. The most common use of AI in schools is to teach children how to use computers to solve problems.
The most popular use is in science and technology. In science, AI can be used as a tool to help scientists understand the world around them. AI has been used by scientists to understand how the universe works. Scientists can use AI to learn about the laws of physics and to predict the future. This is also the main use for AI for the education of children. Science is an important field for children to study. Children are taught to think and act in ways that are not possible in other fields. For example, they are exposed to different kinds of information. They are able to see the patterns of the natural world and understand what is happening in it. These are the kinds that children learn to do. However, the more children are trained in these fields, and the better they can understa