In [None]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [None]:
import numpy as np
import faiss
import torch
import random
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [21]:
with open("/content/random.txt", 'r') as f:
  documents = f.readlines()

In [22]:
documents = [doc.strip() for doc in documents if doc.strip() != '']
documents.sort()
documents

['1. Text Extraction (OCR)',
 '2. Error Detection and Correction',
 '3. Text Comparison and Refinement',
 '4. Optional: Google or LanguageTool Integration',
 '5. Final Response Generation',
 'A list of identified errors',
 'API Development and User Interface',
 'API: We will develop a REST API to handle each step: text extraction, error correction, text comparison, and final response generation. The API allows easy integration with other systems and can be used by anyone to process handwritten text from images.',
 'Building a system that processes images of handwritten text in Indian languages, identifies errors, and suggests corrections. The system is built with an API and user interface for easy interaction.',
 'CrewaI orchestrates the entire workflow, from text extraction to correction and response generation. LangChain organizes the different AI models and ensures that each task is done in the correct order, providing a seamless experience for the user.',
 'First, we extract handwr

In [23]:
np.random.seed(101)
torch.manual_seed(101)
random.seed(101)

# Document Embedding

In [55]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
doc_embeddings = model.encode(documents)
print(doc_embeddings)

[[-0.05316069  0.05555414 -0.00919429 ...  0.0272691   0.0599813
  -0.04487196]
 [-0.01159597  0.06390091  0.01319832 ...  0.07713197  0.00241299
  -0.05842974]
 [-0.02194517  0.02161855  0.02577686 ...  0.02913478  0.05337152
   0.00896772]
 ...
 [-0.13060412 -0.02226    -0.03629162 ...  0.07001153  0.02418887
   0.03272324]
 [-0.07493397 -0.00990006  0.01441692 ...  0.10979135  0.11299946
  -0.06084405]
 [-0.04083433  0.05712456  0.05311821 ...  0.04117409 -0.0238548
  -0.00155993]]


In [26]:
doc_embeddings = np.array(doc_embeddings)
print(doc_embeddings, "\n", doc_embeddings.shape)

[[-0.05316069  0.05555414 -0.00919429 ...  0.0272691   0.0599813
  -0.04487196]
 [-0.01159597  0.06390091  0.01319832 ...  0.07713197  0.00241299
  -0.05842974]
 [-0.02194517  0.02161855  0.02577686 ...  0.02913478  0.05337152
   0.00896772]
 ...
 [-0.13060412 -0.02226    -0.03629162 ...  0.07001153  0.02418887
   0.03272324]
 [-0.07493397 -0.00990006  0.01441692 ...  0.10979135  0.11299946
  -0.06084405]
 [-0.04083433  0.05712456  0.05311821 ...  0.04117409 -0.0238548
  -0.00155993]] 
 (26, 384)


# FAISS Vector DB

In [27]:
embedding_dim = doc_embeddings.shape[1]

# gpu = faiss.StandardGpuResource()   # use this when you have installed faiss-gpu
index = faiss.IndexFlatL2(embedding_dim)   # for static documents , use hierarchical datastructure instead of L2(Euclidean)
faiss.normalize_L2(doc_embeddings)
print(index.is_trained)

True


In [28]:
index.add(doc_embeddings)

# Retrieve relevant documents

In [60]:
query = "what is text extraction?"
query_embedding = model.encode([query])
faiss.normalize_L2(query_embedding) #normalize iff the documents gets normalized

top_k = 5
_, top_indices = index.search(query_embedding, top_k)
print(top_indices.shape)

(1, 5)


In [61]:
for idx in top_indices[0]:
  print(f"Document {idx}: {documents[idx]}\n")

Document 0: 1. Text Extraction (OCR)

Document 14: IndicTrOCR for extracting text

Document 10: First, we extract handwritten text from images using IndicTrOCR, an OCR model for Indian languages. This model converts the image into machine-readable text. The process is managed through a REST API developed with CrewaI.

Document 21: The original text

Document 24: We use IndicBERT, a pre-trained AI model, to detect and correct errors in the extracted text. The API will send the extracted text to IndicBERT for grammatical and spelling error correction, and the corrected text is returned in the API response.



# Generate context aware response

In [62]:
context = " ".join([documents[idx] for idx in top_indices[0]])
print(context)
prompt = f"Question: {query}\n\n context: {context}\n\n Answer: "

path = "gpt2"
gpt_model = GPT2LMHeadModel.from_pretrained(path)
tokenizer = GPT2Tokenizer.from_pretrained(path)

1. Text Extraction (OCR) IndicTrOCR for extracting text First, we extract handwritten text from images using IndicTrOCR, an OCR model for Indian languages. This model converts the image into machine-readable text. The process is managed through a REST API developed with CrewaI. The original text We use IndicBERT, a pre-trained AI model, to detect and correct errors in the extracted text. The API will send the extracted text to IndicBERT for grammatical and spelling error correction, and the corrected text is returned in the API response.


In [63]:
print(prompt)

Question: what is text extraction?

 context: 1. Text Extraction (OCR) IndicTrOCR for extracting text First, we extract handwritten text from images using IndicTrOCR, an OCR model for Indian languages. This model converts the image into machine-readable text. The process is managed through a REST API developed with CrewaI. The original text We use IndicBERT, a pre-trained AI model, to detect and correct errors in the extracted text. The API will send the extracted text to IndicBERT for grammatical and spelling error correction, and the corrected text is returned in the API response.

 Answer: 


In [64]:
input_ids = tokenizer.encode(prompt, return_tensors='pt')
print(input_ids)

tensor([[24361,    25,   644,   318,  2420, 22236,    30,   628,  4732,    25,
           352,    13,  8255,  5683,  7861,   357,  4503,    49,     8,  1423,
           291,  2898,  4503,    49,   329, 37895,  2420,  3274,    11,   356,
          7925, 45916,  2420,   422,  4263,  1262,  1423,   291,  2898,  4503,
            49,    11,   281,   440,  9419,  2746,   329,  3942,  8950,    13,
           770,  2746, 26161,   262,  2939,   656,  4572,    12, 46155,  2420,
            13,   383,  1429,   318,  5257,   832,   257, 30617,  7824,  4166,
           351, 17652,    64,    40,    13,   383,  2656,  2420,   775,   779,
          1423,   291, 13246,    51,    11,   257,   662,    12, 35311,  9552,
          2746,    11,   284,  4886,   290,  3376,  8563,   287,   262, 21242,
          2420,    13,   383,  7824,   481,  3758,   262, 21242,  2420,   284,
          1423,   291, 13246,    51,   329, 14599, 44935,   290, 24993,  4049,
         17137,    11,   290,   262, 19267,  2420,  

# num beams

generally, transformers generate one token per time step from the softmax layer of decoder, here we are allowing the model to predict num_beams = 5 means -> 5 output words get generated from the softmax layer of decoder. since, it is different from temperature, dont get confused with temperature.

In [69]:
output_ids = gpt_model.generate(input_ids, max_length=200, num_beams=5, early_stopping=True)
print(output_ids)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[24361,    25,   644,   318,  2420, 22236,    30,   628,  4732,    25,
           352,    13,  8255,  5683,  7861,   357,  4503,    49,     8,  1423,
           291,  2898,  4503,    49,   329, 37895,  2420,  3274,    11,   356,
          7925, 45916,  2420,   422,  4263,  1262,  1423,   291,  2898,  4503,
            49,    11,   281,   440,  9419,  2746,   329,  3942,  8950,    13,
           770,  2746, 26161,   262,  2939,   656,  4572,    12, 46155,  2420,
            13,   383,  1429,   318,  5257,   832,   257, 30617,  7824,  4166,
           351, 17652,    64,    40,    13,   383,  2656,  2420,   775,   779,
          1423,   291, 13246,    51,    11,   257,   662,    12, 35311,  9552,
          2746,    11,   284,  4886,   290,  3376,  8563,   287,   262, 21242,
          2420,    13,   383,  7824,   481,  3758,   262, 21242,  2420,   284,
          1423,   291, 13246,    51,   329, 14599, 44935,   290, 24993,  4049,
         17137,    11,   290,   262, 19267,  2420,  

In [70]:
response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(f"response: {response}")

response: Question: what is text extraction?

 context: 1. Text Extraction (OCR) IndicTrOCR for extracting text First, we extract handwritten text from images using IndicTrOCR, an OCR model for Indian languages. This model converts the image into machine-readable text. The process is managed through a REST API developed with CrewaI. The original text We use IndicBERT, a pre-trained AI model, to detect and correct errors in the extracted text. The API will send the extracted text to IndicBERT for grammatical and spelling error correction, and the corrected text is returned in the API response.

 Answer: __________________

1. Text Extraction (OCR) IndicTrOCR for extracting text First, we extract handwritten text from images using IndicTrOCR, an OCR model for Indian languages. This model converts the image into machine-readable text. The process is managed through a REST API
