In [1]:
import pytesseract
from pdf2image import convert_from_path
import pandas as pd
import os

# Path to your Tesseract-OCR installation
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Update this path if necessary

# Function to convert PDF to images and extract text using OCR
def extract_text_from_pdf(pdf_path):
    # Convert PDF pages to images
    images = convert_from_path(pdf_path)
    text = ""
    for img in images:
        # Use pytesseract to do OCR on the image
        img_text = pytesseract.image_to_string(img)
        text += img_text + "\n"
    return text

# Function to create a mapping DataFrame from extracted text
def create_mapping_from_extracted_text(extracted_text, unique_words):
    # Split the text into lines
    lines = extracted_text.splitlines()
    
    # Prepare a mapping DataFrame
    mappings = []
    for word in unique_words:
        # Find the line that contains the current unique Sanskrit word
        for line in lines:
            if word in line:
                # Assuming the format is "Sanskrit Word - Transliteration - Meaning"
                parts = line.split('-')
                if len(parts) == 3:
                    transliteration = parts[1].strip()
                    meaning = parts[2].strip()
                    mappings.append([word, transliteration, meaning])
                break  # Exit loop after finding the word

    # Create a DataFrame
    df = pd.DataFrame(mappings, columns=['Unique Sanskrit Word', 'Transliteration', 'Meaning'])
    return df

# Main execution
if __name__ == "__main__":
    pdf_path = 'dictallcheck.pdf'  # Update with your PDF path
    csv_path = 'unique_sanskrit_words_ordered.csv'  # Update with your CSV path
    
    # Read unique Sanskrit words from the CSV file
    unique_sanskrit_words = pd.read_csv(csv_path)['Unique_Sanskrit_Words'].tolist()
    
    # Extract text from the PDF
    extracted_text = extract_text_from_pdf(pdf_path)
    
    # Print extracted text for review
    print("Extracted Text:")
    print(extracted_text)
    
    # Create mapping from the extracted text and unique Sanskrit words
    mapping_df = create_mapping_from_extracted_text(extracted_text, unique_sanskrit_words)
    
    # Show the first few entries of the DataFrame
    print("\nMapping DataFrame:")
    print(mapping_df.head(100))  # Display first 100 entries as a test


Extracted Text:
Online Sanskrit Dictionary
February 12, 2003

Introduction

The following is a list of Sanskrit words printed in Devanagari with its transliterated form
and a short meaning provided as a reference source. This cannot be a substitute for a good
printed Sanskrit-English dictionary. However, we anticipate this to aid a student of Sanskrit in

the on-line world.

The list of words is a compilation from various sources such as messages on sanskrit-digest,
translated documents such as Bhagavadgita, atharvashiirshha, raamarakshaa et cetera, and other
files accessible on the web. The words are encoded in ITRANS transliteration scheme so as to

print them in Devanagari.

There is a copyright on this file to the extent of preventing misuse on other internet sites

and ego-trips of individuals.

We recommend not to copy and post this file on any other site since we periodically update
and correct this list and we do not want different versions of file floating around the internet.

In [3]:
import fitz  # PyMuPDF
import re
import pandas as pd

# Open the PDF file
pdf_path = 'dictallcheck.pdf'
doc = fitz.open(pdf_path)

# Extract text from page 3 (index 2 for 0-based indexing)
page_text = doc[2].get_text("text")

# Initialize lists to store transliterations and meanings
transliterations = []
meanings = []

# Regular expression to match transliterations in parentheses and meanings after "="
pattern = re.compile(r'\(([^)]+)\)\s*=\s*(.+)')

# Process each line individually to maintain order
for line in page_text.splitlines():
    # Search for transliteration and meaning pairs in each line
    match = pattern.search(line)
    if match:
        transliteration = match.group(1).strip()  # Text within parentheses
        meaning = match.group(2).strip()          # Text after '=' sign
        # Append to lists if they are not page numbers or irrelevant data
        if not re.search(r'\bpage\b|\bPage\b|\d+', transliteration) and not re.search(r'\bpage\b|\bPage\b|\d+', meaning):
            transliterations.append(transliteration)
            meanings.append(meaning)

# Create a DataFrame and save to CSV
df = pd.DataFrame({'Transliteration': transliterations, 'Meaning': meanings})
df.to_csv('ordered_transliterations_meanings.csv', index=False)

# Display the extracted data
for transliteration, meaning in zip(transliterations, meanings):
    print(f"Transliteration: {transliteration}, Meaning: {meaning}")


In [4]:
import fitz  # PyMuPDF
import re
import pandas as pd

# Open the PDF file
pdf_path = 'dictallcheck.pdf'
doc = fitz.open(pdf_path)

# Check the number of pages in the PDF
print(f"Number of pages in PDF: {doc.page_count}")

# Extract text from page 3 (index 2 for 0-based indexing)
page_text = doc[2].get_text("text")

# Initialize lists to store transliterations and meanings
transliterations = []
meanings = []

# Regular expression to match transliterations in parentheses and meanings after "="
pattern = re.compile(r'\(([^)]+)\)\s*=\s*(.+)')

# Process each line individually to maintain order
for line in page_text.splitlines():
    # Search for transliteration and meaning pairs in each line
    match = pattern.search(line)
    if match:
        transliteration = match.group(1).strip()  # Text within parentheses
        meaning = match.group(2).strip()          # Text after '=' sign
        # Append to lists if they are not page numbers or irrelevant data
        if not re.search(r'\bpage\b|\bPage\b|\d+', transliteration) and not re.search(r'\bpage\b|\bPage\b|\d+', meaning):
            transliterations.append(transliteration)
            meanings.append(meaning)
            print(f"Found: Transliteration: {transliteration}, Meaning: {meaning}")  # Debug print

# Create a DataFrame and save to CSV
if transliterations and meanings:  # Check if lists are not empty
    df = pd.DataFrame({'Transliteration': transliterations, 'Meaning': meanings})
    df.to_csv('ordered_transliterations_meanings.csv', index=False)
    print("Data saved to 'ordered_transliterations_meanings.csv'")
else:
    print("No data extracted. Please check the PDF content and regex patterns.")

# Close the document
doc.close()


Number of pages in PDF: 221
No data extracted. Please check the PDF content and regex patterns.


In [5]:
import fitz  # PyMuPDF
import re
import pandas as pd

# Open the PDF file
pdf_path = 'dictallcheck.pdf'
doc = fitz.open(pdf_path)

# Extract text from page 3 (index 2 for 0-based indexing)
page_text = doc[2].get_text("text")

# Print the extracted text for inspection
print("Extracted Text from Page 3:")
print(page_text)

# Initialize lists to store transliterations and meanings
transliterations = []
meanings = []

# Regular expression to match transliterations in parentheses and meanings after "="
# Modify this pattern based on the actual format of your text
pattern = re.compile(r'\(([^)]+)\)\s*=\s*(.+)')  # Original pattern

# Process each line individually
for line in page_text.splitlines():
    print(f"Line: {line}")  # Log each line
    match = pattern.search(line)  # Try to find matches
    if match:
        transliteration = match.group(1).strip()  # Text within parentheses
        meaning = match.group(2).strip()          # Text after '=' sign
        if not re.search(r'\bpage\b|\bPage\b|\d+', transliteration) and not re.search(r'\bpage\b|\bPage\b|\d+', meaning):
            transliterations.append(transliteration)
            meanings.append(meaning)
            print(f"Found: Transliteration: {transliteration}, Meaning: {meaning}")  # Debug print

# Create a DataFrame and save to CSV if data was found
if transliterations and meanings:
    df = pd.DataFrame({'Transliteration': transliterations, 'Meaning': meanings})
    df.to_csv('ordered_transliterations_meanings.csv', index=False)
    print("Data saved to 'ordered_transliterations_meanings.csv'")
else:
    print("No data extracted. Please check the PDF content and regex patterns.")

# Close the document
doc.close()


Extracted Text from Page 3:
 

	
	 !
"$#&%('*),+.-0/213546
7
tJ




0Q;=\)2^0Q47;FIMD
K
2>67,94
798;:=<

 NMIMIM03I


.
@9A
H1jeD=D=IMa9D
A
2^\
>

I


,92^.
>@?
8
ACB

I
_
,7Ne67H
@
I


IbG>2>IM4

\
A
.<IMD}H1.1;F81N|67H
A
4W.128`2>,/.O0Q2>D
A
D
A
U
@
IM,/.<H

>
?
8
A
BED,:GFG<(HIKJ

I
_
,7Ne69H
@
IM4
@
IMIM03;
v
IM\
_
@c

a
A
IY0Q;F,7G.
@9A
E
A
IYU~2^,ML
IM,5Ne69H
@
I@NcE;[.
@
E
@
;=8
@
.
@7ACA
D
A
U
@
IM,/.
>
?
O

I
_
,9G^I


a}2e47jTlc2T0QG>IM,
>
?
O
H

I
_
,7G>I
v
I


q
A
D=4
>
?
OPQ?
F
ORTS

I
_
,9G^IMH<I
_
,7G>;=,9IM6


EJ;=.
@
>
?
O
UWV

I
_
,7G>IM;
k%

DF;=\ba9Hlea}2545jrU9IY03.<H
>
?
O
U,XKY
A[Z
A@\
:
?
P]XKY,R_^
FG`
V

I
_
,9G^IM;FHO.<67H
@9@7?
6
X
ITI
_
,9H1ITHO.<IM,P6767a
@
;
k%

@
I
X
;=,9GoH<IY.1;FH
q
A
4rE;=.
@
HO.10Q2^,9GoD=;F\%acHba
>
?
cER

I
_
,1IT,7I

IM,72>;=,/.<\
A
,/.
>
?
Y
<
?
O

I
_
,/.1IY0QI
_
,7G>I


;F,7,
A
0xa}2e47j2^0d;=,9,
A
0x,9IY.1670
A
l
B
AA
D=;F,7G>Hl5;=,9H1;F4


In [6]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import pandas as pd

# Open the PDF file
pdf_path = 'dictallcheck.pdf'
doc = fitz.open(pdf_path)

# Initialize lists to store transliterations and meanings
transliterations = []
meanings = []

# Loop through each page and convert to an image
for page_num in range(len(doc)):
    # Get the page and convert it to an image
    page = doc[page_num]
    pix = page.get_pixmap()
    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    
    # Use Tesseract to do OCR on the image
    text = pytesseract.image_to_string(img)

    # Print extracted text for debugging
    print(f"Extracted Text from Page {page_num + 1}:")
    print(text)
    
    # Process the extracted text for transliterations and meanings
    # Adjust regex pattern as necessary based on the extracted text format
    pattern = re.compile(r'\(([^)]+)\)\s*=\s*(.+)')
    
    for line in text.splitlines():
        match = pattern.search(line)
        if match:
            transliteration = match.group(1).strip()
            meaning = match.group(2).strip()
            transliterations.append(transliteration)
            meanings.append(meaning)

# Create a DataFrame and save to CSV if data was found
if transliterations and meanings:
    df = pd.DataFrame({'Transliteration': transliterations, 'Meaning': meanings})
    df.to_csv('ordered_transliterations_meanings.csv', index=False)
    print("Data saved to 'ordered_transliterations_meanings.csv'")
else:
    print("No data extracted. Please check the PDF content and regex patterns.")

# Close the document
doc.close()


Extracted Text from Page 1:
Online Sanskrit Dictionary
February 12, 2003

te transliterated form
cannot be a substitute for a good

‘The following
‘and a short meaning provided as a reference source, Thi
printed Sanskrit-English dictionary

the on-line world,

of Sanskrit words printed in Devanagart wit

nha, raamarakshaa et ectera, and other
files accessible on the web. The words are encoded in ITRANS transliterat
ping chem in Devanagari

scheme x0 a8 to

‘There is a copyright on this file to the extent of preventing misuse on other internet sites
and ego-trips of individuals.

We recommend not to copy and post this fle on any other site since we periodically update
list and we do not wane different versions of file floating around the internet.
We have soem people copying this work and calling of their own.

and correct th

‘Wo request you to provide corrections, and more importantly many suc addi
your own collection.

kkh g gh N’

ch ohh j jh IN

TThD DLN

tihddha

bohm

yr Lv sh sh

In [7]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import re
import pandas as pd

# Open the PDF file
pdf_path = 'dictallcheck.pdf'
doc = fitz.open(pdf_path)

# Initialize lists to store transliterations and meanings
transliterations = []
meanings = []

# Loop through each page and convert to an image
for page_num in range(len(doc)):
    # Get the page and convert it to an image
    page = doc[page_num]
    pix = page.get_pixmap()
    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    
    # Use Tesseract to perform OCR on the image
    text = pytesseract.image_to_string(img)

    # Print extracted text for debugging
    print(f"Extracted Text from Page {page_num + 1}:")
    print(text)
    
    # Regular expression to match transliterations in parentheses and meanings after "="
    pattern = re.compile(r'\(([^)]+)\)\s*=\s*(.+)')

    # Process the extracted text for transliterations and meanings
    for line in text.splitlines():
        match = pattern.search(line)
        if match:
            transliteration = match.group(1).strip()
            meaning = match.group(2).strip()
            transliterations.append(transliteration)
            meanings.append(meaning)

# Create a DataFrame and save to CSV if data was found
if transliterations and meanings:
    df = pd.DataFrame({'Transliteration': transliterations, 'Meaning': meanings})
    df.to_csv('ordered_transliterations_meanings.csv', index=False)
    print("Data saved to 'ordered_transliterations_meanings.csv'")
else:
    print("No data extracted. Please check the PDF content and regex patterns.")

# Close the document
doc.close()


Extracted Text from Page 1:
Online Sanskrit Dictionary
February 12, 2003

te transliterated form
cannot be a substitute for a good

‘The following
‘and a short meaning provided as a reference source, Thi
printed Sanskrit-English dictionary

the on-line world,

of Sanskrit words printed in Devanagart wit

nha, raamarakshaa et ectera, and other
files accessible on the web. The words are encoded in ITRANS transliterat
ping chem in Devanagari

scheme x0 a8 to

‘There is a copyright on this file to the extent of preventing misuse on other internet sites
and ego-trips of individuals.

We recommend not to copy and post this fle on any other site since we periodically update
list and we do not wane different versions of file floating around the internet.
We have soem people copying this work and calling of their own.

and correct th

‘Wo request you to provide corrections, and more importantly many suc addi
your own collection.

kkh g gh N’

ch ohh j jh IN

TThD DLN

tihddha

bohm

yr Lv sh sh

KeyboardInterrupt: 

In [9]:
import pdfplumber
import pandas as pd

# Load the CSV file containing unique Sanskrit words
input_csv_path = 'unique_sanskrit_words_ordered.csv'
words_df = pd.read_csv(input_csv_path)

# Load the PDF file
pdf_path = 'dictallcheck.pdf'

# Create a dictionary to hold the words and their corresponding page numbers
word_page_dict = {word: [] for word in words_df['Unique_Sanskrit_Words']}

# Function to find page numbers for each word
def find_word_page_numbers(pdf_path, words):
    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            # Extract text from the page
            text = page.extract_text()
            if text:  # Check if the page contains any text
                for word in words:
                    if word in text and page_number not in word_page_dict[word]:
                        word_page_dict[word].append(page_number)

# Run the search for words
find_word_page_numbers(pdf_path, words_df['Unique_Sanskrit_Words'])

# Convert the dictionary to a DataFrame
word_page_df = pd.DataFrame([(word, pages) for word, pages in word_page_dict.items()],
                             columns=['Sanskrit_Word', 'Page_Numbers'])

# Format Page_Numbers for easier reading
word_page_df['Page_Numbers'] = word_page_df['Page_Numbers'].apply(lambda x: ', '.join(map(str, x)))

# Save results to a new CSV file
word_page_df.to_csv('sanskrit_words_with_page_numbers.csv', index=False)

print(word_page_df)


      Sanskrit_Word Page_Numbers
0                 ॐ             
1              ॐकार             
2                 अ             
3             अंकुश             
4     अंकुशधारिणम्‌             
...             ...          ...
9284   शल्वृपोप्णहट             
9285          ष्णं;             
9286      ॐल्टपाश्प             
9287          पिणत)             
9288        हस्तपाद             

[9289 rows x 2 columns]


In [10]:
import pandas as pd

# Load the two CSV files
sanskrit_words_df = pd.read_csv('unique_sanskrit_words_ordered.csv')  # Contains Sanskrit words
transliteration_meaning_df = pd.read_csv('sanskrit_dataset_modified.csv')  # Contains transliterations and meanings

# Assuming the transliteration_meaning_df has columns: 'Transliteration' and 'Meaning'
# and sanskrit_words_df has a column 'Unique_Sanskrit_Words' that corresponds to the transliteration.

# Merge the two DataFrames based on the common key (assuming Unique_Sanskrit_Words corresponds to Transliteration)
merged_df = pd.merge(transliteration_meaning_df, sanskrit_words_df, left_on='Transliteration', right_on='Unique_Sanskrit_Words', how='outer')

# Optionally, you can drop the 'Unique_Sanskrit_Words' column if it's redundant
merged_df.drop(columns=['Unique_Sanskrit_Words'], inplace=True)

# Save the merged DataFrame to a new CSV file
merged_df.to_csv('merged_transliteration_meaning_sanskrit.csv', index=False)

print(merged_df)


      Transliteration                            Meaning
0                 NaN                                NaN
1                 NaN                                NaN
2                 AUM                   Primordial Sound
3              Dayate                             to fly
4       DukRiJNkaraNa  grammatic formula “DukRi.nkaraNa”
...               ...                                ...
18393             NaN                                NaN
18394             NaN                                NaN
18395             NaN                                NaN
18396             NaN                                NaN
18397             NaN                                NaN

[18398 rows x 2 columns]


In [11]:
import pandas as pd

# Load your data
df = pd.read_csv('unique_sanskrit_words_ordered.csv')

# Calculate the total number of characters in the "Unique_Sanskrit_Words" column
total_characters = df['Unique_Sanskrit_Words'].str.len().sum()
print(f"Total characters: {total_characters}")


Total characters: 64082


In [15]:
import fitz  # PyMuPDF for PDF text extraction
import pandas as pd
import pytesseract
from PIL import Image, ImageDraw, ImageFont
import io
import re

# Configure Tesseract path if necessary
# pytesseract.pytesseract.tesseract_cmd = r'/path/to/tesseract'  # Uncomment and specify path if needed

def convert_pdf_to_images(pdf_path):
    """
    Convert each page of the PDF into an image.
    """
    images = []
    with fitz.open(pdf_path) as doc:
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            pix = page.get_pixmap()
            img = Image.open(io.BytesIO(pix.tobytes("png")))
            images.append(img)
    return images

def extract_sanskrit_text(image):
    """
    Perform OCR on the image to extract exact Sanskrit words.
    """
    sanskrit_text = pytesseract.image_to_string(image, lang="san")
    sanskrit_words = re.findall(r'[\u0900-\u097F]+', sanskrit_text)  # Devanagari Unicode range
    return sanskrit_words


def create_image_from_text(text, font_path="arial.ttf", font_size=32):
    """
    Create an image from text to perform OCR for transliteration.
    """
    font = ImageFont.truetype(font_path, font_size)
    text_bbox = font.getbbox(text)  # Get bounding box of the text
    text_width, text_height = text_bbox[2] - text_bbox[0], text_bbox[3] - text_bbox[1]
    image = Image.new("L", (text_width + 20, text_height + 20), 255)  # 255 for white background
    draw = ImageDraw.Draw(image)
    draw.text((10, 10), text, font=font, fill=0)  # 0 for black text
    return image




def extract_transliteration(word):
    """
    Convert Sanskrit word to image and perform OCR for transliteration.
    """
    word_image = create_image_from_text(word)
    transliteration = pytesseract.image_to_string(word_image, lang="eng")
    return transliteration.strip()

def pdf_to_csv(pdf_path, csv_path, font_path="arial.ttf"):
    """
    Convert encoded PDF to CSV with exact Sanskrit words and their transliterations.
    """
    images = convert_pdf_to_images(pdf_path)
    all_data = []

    for image in images:
        sanskrit_words = extract_sanskrit_text(image)
        
        for word in sanskrit_words:
            transliteration = extract_transliteration(word)
            all_data.append({
                "Sanskrit Word": word,
                "Transliteration": transliteration,
            })

    df = pd.DataFrame(all_data, columns=["Sanskrit Word", "Transliteration"])
    df.to_csv(csv_path, index=False, encoding='utf-8')
    print(f"CSV file saved as: {csv_path}")

# Usage example
pdf_path = "dictallcheck.pdf"  # Replace with the path to your encoded PDF
csv_path = "output_test.csv"  # Replace with desired output CSV file path
pdf_to_csv(pdf_path, csv_path)


CSV file saved as: output_test.csv


In [16]:
import fitz  # PyMuPDF for PDF text extraction
import pandas as pd
import pytesseract
from PIL import Image
import io
import re

# Configure Tesseract path if necessary
# pytesseract.pytesseract.tesseract_cmd = r'/path/to/tesseract'  # Uncomment and specify path if needed

def convert_pdf_page_to_image(pdf_path, page_num):
    """
    Convert a specific page of the PDF into an image.
    """
    with fitz.open(pdf_path) as doc:
        page = doc.load_page(page_num)
        pix = page.get_pixmap()
        img = Image.open(io.BytesIO(pix.tobytes("png")))
    return img

def extract_sanskrit_words(image, page_number):
    """
    Perform OCR on the image and extract Sanskrit words before "(" in each line.
    """
    sanskrit_text = pytesseract.image_to_string(image, lang="san")  # Sanskrit OCR
    sanskrit_words = []
    
    # Process each line, only capturing Sanskrit words before "("
    for line in sanskrit_text.splitlines():
        match = re.match(r'^([\u0900-\u097F]+)', line)  # Devanagari characters before "("
        if match:
            word = match.group(1).strip()
            sanskrit_words.append({"Sanskrit Word": word, "Page Number": page_number})
    return sanskrit_words

def pdf_page_to_csv(pdf_path, csv_path, page_num=2):
    """
    Extract Sanskrit words from a specific page in PDF and save to CSV.
    """
    # Convert specified PDF page to image
    image = convert_pdf_page_to_image(pdf_path, page_num - 1)  # Page number starts from 0
    sanskrit_data = extract_sanskrit_words(image, page_num)
    
    # Create DataFrame and save to CSV
    df = pd.DataFrame(sanskrit_data, columns=["Sanskrit Word", "Page Number"])
    df.to_csv(csv_path, index=False, encoding='utf-8')
    print(f"CSV file saved as: {csv_path}")

# Usage example
pdf_path = "dictallcheck.pdf"  # Replace with the path to your encoded PDF
csv_path = "test_page3.csv"  # Replace with desired output CSV file path
pdf_page_to_csv(pdf_path, csv_path, page_num=3)


CSV file saved as: test_page3.csv


In [17]:
import fitz  # PyMuPDF
import pandas as pd
import pytesseract
from PIL import Image
import io
import re

# Configure Tesseract path if necessary
# pytesseract.pytesseract.tesseract_cmd = r'/path/to/tesseract'  # Uncomment and specify path if needed

def convert_pdf_page_to_image(pdf_path, page_num):
    """
    Convert a specific page of the PDF into an image.
    """
    with fitz.open(pdf_path) as doc:
        page = doc.load_page(page_num)
        pix = page.get_pixmap()
        img = Image.open(io.BytesIO(pix.tobytes("png")))
    return img

def extract_sanskrit_words(image, page_number):
    """
    Perform OCR on the image and extract only Sanskrit words at the beginning of each line.
    """
    sanskrit_text = pytesseract.image_to_string(image, lang="san")  # Sanskrit OCR
    sanskrit_words = []
    
    # Process each line, extracting only Sanskrit words at the start of the line
    for line in sanskrit_text.splitlines():
        # Capture Sanskrit words (Devanagari script) until the first "(" or any non-Sanskrit characters
        match = re.match(r'^([\u0900-\u097F]+)', line)
        if match:
            word = match.group(1).strip()
            sanskrit_words.append({"Sanskrit Word": word, "Page Number": page_number})
    
    return sanskrit_words

def pdf_page_to_csv(pdf_path, csv_path, page_num=2):
    """
    Extract Sanskrit words from a specific page in PDF and save to CSV.
    """
    # Convert specified PDF page to image
    image = convert_pdf_page_to_image(pdf_path, page_num - 1)  # Page number starts from 0
    sanskrit_data = extract_sanskrit_words(image, page_num)
    
    # Create DataFrame and save to CSV
    df = pd.DataFrame(sanskrit_data, columns=["Sanskrit Word", "Page Number"])
    df.to_csv(csv_path, index=False, encoding='utf-8')
    print(f"CSV file saved as: {csv_path}")

# Usage example
pdf_path = "dictallcheck.pdf"  # Replace with the path to your encoded PDF
csv_path = "test1_page3.csv"  # Replace with desired output CSV file path
pdf_page_to_csv(pdf_path, csv_path, page_num=3)


CSV file saved as: test1_page3.csv


In [20]:
import pytesseract
import fitz  # PyMuPDF
from pdf2image import convert_from_path
from PIL import Image, ImageEnhance, ImageFilter
import pandas as pd
import re
import io

# Set Tesseract executable path if needed (adjust path as per your installation)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def convert_pdf_page_to_image(pdf_path, page_num):
    """
    Convert a specific page of the PDF into an image.
    """
    with fitz.open(pdf_path) as doc:
        page = doc.load_page(page_num)
        pix = page.get_pixmap(dpi=300)  # Increase DPI for better resolution
        img = Image.open(io.BytesIO(pix.tobytes("png")))
    return img

def preprocess_image(image):
    """
    Enhance and preprocess the image for better OCR accuracy.
    """
    # Convert to grayscale
    image = image.convert("L")

    # Enhance contrast
    enhancer = ImageEnhance.Contrast(image)
    image = enhancer.enhance(2)

    # Resize image to improve OCR detection of smaller characters
    image = image.resize((image.width * 2, image.height * 2), Image.LANCZOS)

    # Apply a slight blur to reduce noise
    image = image.filter(ImageFilter.MedianFilter())
    
    return image

def extract_sanskrit_words(image, page_number):
    """
    Perform OCR on the image and extract only Sanskrit words at the beginning of each line.
    """
    # Preprocess the image for better OCR accuracy
    image = preprocess_image(image)

    # OCR with Sanskrit language setting
    sanskrit_text = pytesseract.image_to_string(image, lang="san")  # Sanskrit OCR

    # Extract Sanskrit words before "(" and keep only the first word in each line
    sanskrit_words = []
    for line in sanskrit_text.splitlines():
        match = re.match(r"^(\S+)\s*\(", line)  # Match Sanskrit words before "("
        if match:
            sanskrit_word = match.group(1)
            sanskrit_words.append({"Sanskrit Word": sanskrit_word, "Page Number": page_number})
    
    return sanskrit_words

def pdf_page_to_csv(pdf_path, csv_path, page_num=3):
    """
    Extract Sanskrit words from a specific page in PDF and save to CSV.
    """
    # Convert specified PDF page to image
    image = convert_pdf_page_to_image(pdf_path, page_num - 1)  # Page number starts from 0
    sanskrit_data = extract_sanskrit_words(image, page_num)

    # Create DataFrame and save to CSV
    df = pd.DataFrame(sanskrit_data, columns=["Sanskrit Word", "Page Number"])
    df.to_csv(csv_path, index=False)
    print(f"Data saved to {csv_path}")

# Usage example
pdf_path = "dictallcheck.pdf"  # Replace with the path to your encoded PDF
csv_path = "output1_page3.csv"  # Replace with desired output CSV file path
pdf_page_to_csv(pdf_path, csv_path, page_num=3)


Data saved to output1_page3.csv


In [21]:
import pytesseract
import fitz  # PyMuPDF
from pdf2image import convert_from_path
from PIL import Image, ImageEnhance, ImageFilter
import pandas as pd
import re
import io

# Set Tesseract executable path if needed (adjust path as per your installation)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def convert_pdf_page_to_image(pdf_path, page_num):
    """
    Convert a specific page of the PDF into an image.
    """
    with fitz.open(pdf_path) as doc:
        page = doc.load_page(page_num)
        pix = page.get_pixmap(dpi=300)  # Increase DPI for better resolution
        img = Image.open(io.BytesIO(pix.tobytes("png")))
    return img

def preprocess_image(image):
    """
    Enhance and preprocess the image for better OCR accuracy.
    """
    # Convert to grayscale
    image = image.convert("L")

    # Enhance contrast
    enhancer = ImageEnhance.Contrast(image)
    image = enhancer.enhance(2)

    # Resize image to improve OCR detection of smaller characters
    image = image.resize((image.width * 2, image.height * 2), Image.LANCZOS)

    # Apply a slight blur to reduce noise
    image = image.filter(ImageFilter.MedianFilter())
    
    return image

def extract_sanskrit_data(image, page_number):
    """
    Perform OCR on the image and extract Sanskrit word, transliteration, and meaning.
    """
    # Preprocess the image for better OCR accuracy
    image = preprocess_image(image)

    # OCR with Sanskrit language setting
    sanskrit_text = pytesseract.image_to_string(image, lang="san")  # Sanskrit OCR

    # Extract Sanskrit word, transliteration, and meaning
    sanskrit_data = []
    for line in sanskrit_text.splitlines():
        # Match Sanskrit word, transliteration, and meaning
        match = re.match(r"^(\S+)\s*\(([^)]+)\)\s*=\s*(.+)", line)
        
        if match:
            sanskrit_word = match.group(1)  # Sanskrit word before "("
            transliteration = match.group(2)  # Text inside parentheses
            meaning = match.group(3)  # Text after "="
            sanskrit_data.append({
                "Sanskrit Word": sanskrit_word,
                "Transliteration": transliteration,
                "Meaning": meaning,
                "Page Number": page_number
            })
    
    return sanskrit_data

def pdf_page_to_csv(pdf_path, csv_path, page_num=3):
    """
    Extract Sanskrit words, transliterations, and meanings from a specific page in PDF and save to CSV.
    """
    # Convert specified PDF page to image
    image = convert_pdf_page_to_image(pdf_path, page_num - 1)  # Page number starts from 0
    sanskrit_data = extract_sanskrit_data(image, page_num)

    # Create DataFrame and save to CSV
    df = pd.DataFrame(sanskrit_data, columns=["Sanskrit Word", "Transliteration", "Meaning", "Page Number"])
    df.to_csv(csv_path, index=False)
    print(f"Data saved to {csv_path}")

# Usage example
pdf_path = "dictallcheck.pdf"  # Replace with the path to your encoded PDF
csv_path = "output_page3.csv"  # Replace with desired output CSV file path
pdf_page_to_csv(pdf_path, csv_path, page_num=3)


Data saved to output_page3.csv


In [22]:
import pytesseract
import fitz  # PyMuPDF
from pdf2image import convert_from_path
from PIL import Image, ImageEnhance, ImageFilter
import pandas as pd
import re
import io

# Set Tesseract executable path if needed (adjust path as per your installation)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def convert_pdf_page_to_image(pdf_path, page_num):
    """
    Convert a specific page of the PDF into an image.
    """
    with fitz.open(pdf_path) as doc:
        page = doc.load_page(page_num)
        pix = page.get_pixmap(dpi=300)  # Increase DPI for better resolution
        img = Image.open(io.BytesIO(pix.tobytes("png")))
    return img

def preprocess_image(image, lang="san"):
    """
    Enhance and preprocess the image for better OCR accuracy.
    Use specific preprocessing depending on language.
    """
    # Convert to grayscale
    image = image.convert("L")

    # Enhance contrast
    enhancer = ImageEnhance.Contrast(image)
    image = enhancer.enhance(2)

    # Resize image to improve OCR detection of smaller characters
    image = image.resize((image.width * 2, image.height * 2), Image.LANCZOS)

    # Apply a slight blur to reduce noise
    image = image.filter(ImageFilter.MedianFilter())

    return image

def extract_sanskrit_data(image, page_number):
    """
    Perform OCR on the image and extract Sanskrit word, transliteration, and meaning.
    """
    # Preprocess the image for better OCR accuracy
    image = preprocess_image(image, lang="san")

    # OCR for Sanskrit and English
    sanskrit_text = pytesseract.image_to_string(image, lang="san+eng")

    # Extract Sanskrit word, transliteration, and meaning
    sanskrit_data = []
    for line in sanskrit_text.splitlines():
        # Match Sanskrit word, transliteration, and meaning
        match = re.match(r"^(\S+)\s*\(([^)]+)\)\s*=\s*(.+)", line)

        if match:
            sanskrit_word = match.group(1)  # Sanskrit word before "("
            transliteration = match.group(2)  # Text inside parentheses
            meaning = match.group(3)  # Text after "="

            # Filter out any non-English characters from transliteration and meaning
            transliteration = re.sub(r"[^a-zA-Z\s]", "", transliteration)
            meaning = re.sub(r"[^a-zA-Z\s]", "", meaning)
            
            sanskrit_data.append({
                "Sanskrit Word": sanskrit_word,
                "Transliteration": transliteration.strip(),
                "Meaning": meaning.strip(),
                "Page Number": page_number
            })
    
    return sanskrit_data

def pdf_page_to_csv(pdf_path, csv_path, page_num=3):
    """
    Extract Sanskrit words, transliterations, and meanings from a specific page in PDF and save to CSV.
    """
    # Convert specified PDF page to image
    image = convert_pdf_page_to_image(pdf_path, page_num - 1)  # Page number starts from 0
    sanskrit_data = extract_sanskrit_data(image, page_num)

    # Create DataFrame and save to CSV
    df = pd.DataFrame(sanskrit_data, columns=["Sanskrit Word", "Transliteration", "Meaning", "Page Number"])
    df.to_csv(csv_path, index=False)
    print(f"Data saved to {csv_path}")

# Usage example
pdf_path = "dictallcheck.pdf"  # Replace with the path to your encoded PDF
csv_path = "output_page3.csv"  # Replace with desired output CSV file path
pdf_page_to_csv(pdf_path, csv_path, page_num=3)


Data saved to output_page3.csv


In [23]:
import os
import pytesseract
import fitz  # PyMuPDF
from pdf2image import convert_from_path
from PIL import Image, ImageEnhance, ImageFilter
import pandas as pd
import re
import io

# Set Tesseract executable path if needed (adjust path as per your installation)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def convert_pdf_page_to_image(pdf_path, page_num):
    """
    Convert a specific page of the PDF into an image.
    """
    with fitz.open(pdf_path) as doc:
        page = doc.load_page(page_num)
        pix = page.get_pixmap(dpi=300)  # Increase DPI for better resolution
        img = Image.open(io.BytesIO(pix.tobytes("png")))
    return img

def preprocess_image(image, lang="san"):
    """
    Enhance and preprocess the image for better OCR accuracy.
    Use specific preprocessing depending on language.
    """
    # Convert to grayscale
    image = image.convert("L")

    # Enhance contrast
    enhancer = ImageEnhance.Contrast(image)
    image = enhancer.enhance(2)

    # Resize image to improve OCR detection of smaller characters
    image = image.resize((image.width * 2, image.height * 2), Image.LANCZOS)

    # Apply a slight blur to reduce noise
    image = image.filter(ImageFilter.MedianFilter())

    return image

def extract_sanskrit_data(image, page_number):
    """
    Perform OCR on the image and extract Sanskrit word, transliteration, and meaning.
    """
    # Preprocess the image for better OCR accuracy
    image = preprocess_image(image, lang="san")

    # OCR for Sanskrit and English
    sanskrit_text = pytesseract.image_to_string(image, lang="san+eng")

    # Extract Sanskrit word, transliteration, and meaning
    sanskrit_data = []
    for line in sanskrit_text.splitlines():
        # Match Sanskrit word, transliteration, and meaning
        match = re.match(r"^(\S+)\s*\(([^)]+)\)\s*=\s*(.+)", line)

        if match:
            sanskrit_word = match.group(1)  # Sanskrit word before "("
            transliteration = match.group(2)  # Text inside parentheses
            meaning = match.group(3)  # Text after "="

            # Filter out any non-English characters from transliteration and meaning
            transliteration = re.sub(r"[^a-zA-Z\s]", "", transliteration)
            meaning = re.sub(r"[^a-zA-Z\s]", "", meaning)
            
            sanskrit_data.append({
                "Sanskrit Word": sanskrit_word,
                "Transliteration": transliteration.strip(),
                "Meaning": meaning.strip(),
                "Page Number": page_number
            })
    
    return sanskrit_data

def save_all_pages_to_csv(pdf_path, output_folder):
    """
    Extract Sanskrit words, transliterations, and meanings from all pages in the PDF and save each to a separate CSV.
    """
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)
    
    # Open the PDF
    with fitz.open(pdf_path) as doc:
        total_pages = doc.page_count
        
        # Process each page
        for page_num in range(total_pages):
            print(f"Processing page {page_num + 1} of {total_pages}...")
            
            # Convert current PDF page to image
            image = convert_pdf_page_to_image(pdf_path, page_num)
            
            # Extract Sanskrit data
            sanskrit_data = extract_sanskrit_data(image, page_num + 1)
            
            # Create DataFrame and save to CSV
            df = pd.DataFrame(sanskrit_data, columns=["Sanskrit Word", "Transliteration", "Meaning", "Page Number"])
            csv_path = os.path.join(output_folder, f"page_{page_num + 1}.csv")
            df.to_csv(csv_path, index=False)
            print(f"Page {page_num + 1} saved to {csv_path}")

# Usage example
pdf_path = "dictallcheck.pdf"  # Replace with the path to your encoded PDF
output_folder = "output_csv_files"  # Replace with the desired output folder name
save_all_pages_to_csv(pdf_path, output_folder)


Processing page 1 of 221...
Page 1 saved to output_csv_files\page_1.csv
Processing page 2 of 221...
Page 2 saved to output_csv_files\page_2.csv
Processing page 3 of 221...
Page 3 saved to output_csv_files\page_3.csv
Processing page 4 of 221...
Page 4 saved to output_csv_files\page_4.csv
Processing page 5 of 221...
Page 5 saved to output_csv_files\page_5.csv
Processing page 6 of 221...
Page 6 saved to output_csv_files\page_6.csv
Processing page 7 of 221...
Page 7 saved to output_csv_files\page_7.csv
Processing page 8 of 221...
Page 8 saved to output_csv_files\page_8.csv
Processing page 9 of 221...
Page 9 saved to output_csv_files\page_9.csv
Processing page 10 of 221...
Page 10 saved to output_csv_files\page_10.csv
Processing page 11 of 221...
Page 11 saved to output_csv_files\page_11.csv
Processing page 12 of 221...
Page 12 saved to output_csv_files\page_12.csv
Processing page 13 of 221...
Page 13 saved to output_csv_files\page_13.csv
Processing page 14 of 221...
Page 14 saved to output

In [24]:
import os
import pandas as pd

# Set the path to the folder containing the CSV files
input_folder = 'output_csv_files'
output_folder = 'sorted_csv_files'

# Create the output folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Loop through all files in the input folder
for filename in os.listdir(input_folder):
    if filename.endswith('.csv'):
        # Create the full path to the input file
        input_file_path = os.path.join(input_folder, filename)
        
        # Read the CSV file
        df = pd.read_csv(input_file_path)
        
        # Sort the DataFrame (adjust the column name as needed)
        sorted_df = df.sort_values(by=df.columns[0])  # Sorting by the first column
        
        # Create the full path for the output file
        output_file_path = os.path.join(output_folder, filename)
        
        # Save the sorted DataFrame to a new CSV file
        sorted_df.to_csv(output_file_path, index=False)

print("Sorting completed. Sorted files are saved in:", output_folder)


Sorting completed. Sorted files are saved in: sorted_csv_files


In [3]:
import os
import pytesseract
import fitz  # PyMuPDF
from PIL import Image, ImageEnhance, ImageFilter
import pandas as pd
import re
import io

# Set Tesseract executable path if needed (adjust path as per your installation)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def convert_pdf_page_to_image(pdf_path, page_num):
    """
    Convert a specific page of the PDF into an image.
    """
    with fitz.open(pdf_path) as doc:
        page = doc.load_page(page_num)
        pix = page.get_pixmap(dpi=300)  # Increase DPI for better resolution
        img = Image.open(io.BytesIO(pix.tobytes("png")))
    return img

def preprocess_image(image, lang="san"):
    """
    Enhance and preprocess the image for better OCR accuracy.
    Use specific preprocessing depending on language.
    """
    # Convert to grayscale
    image = image.convert("L")

    # Enhance contrast
    enhancer = ImageEnhance.Contrast(image)
    image = enhancer.enhance(2)

    # Resize image to improve OCR detection of smaller characters
    image = image.resize((image.width * 2, image.height * 2), Image.LANCZOS)

    # Apply a slight blur to reduce noise
    image = image.filter(ImageFilter.MedianFilter())

    return image

def extract_sanskrit_data(image, page_number):
    """
    Perform OCR on the image and extract Sanskrit word, transliteration, and meaning.
    """
    # Preprocess the image for better OCR accuracy
    image = preprocess_image(image, lang="san")

    # OCR for Sanskrit and English
    sanskrit_text = pytesseract.image_to_string(image, lang="san+eng")

    # Extract Sanskrit word, transliteration, and meaning
    sanskrit_data = []
    for line in sanskrit_text.splitlines():
        # Match Sanskrit word, transliteration, and meaning
        match = re.match(r"^(\S+)\s*\(([^)]+)\)\s*=\s*(.+)", line)

        if match:
            sanskrit_word = match.group(1)  # Sanskrit word before "("
            transliteration = match.group(2)  # Text inside parentheses
            meaning = match.group(3)  # Text after "="

            # Filter out any non-English characters from transliteration and meaning
            transliteration = re.sub(r"[^a-zA-Z\s]", "", transliteration)
            meaning = re.sub(r"[^a-zA-Z\s]", "", meaning)
            
            sanskrit_data.append({
                "Sanskrit Word": sanskrit_word,
                "Transliteration": transliteration.strip(),
                "Meaning": meaning.strip(),
                "Page Number": page_number
            })
    
    return sanskrit_data

def save_all_pages_to_single_csv(pdf_path, output_csv):
    """
    Extract Sanskrit words, transliterations, and meanings from all pages in the PDF and save them all to a single CSV.
    """
    # List to accumulate all data from each page
    all_data = []
    
    # Open the PDF
    with fitz.open(pdf_path) as doc:
        total_pages = doc.page_count
        
        # Process each page
        for page_num in range(total_pages):
            print(f"Processing page {page_num + 1} of {total_pages}...")
            
            # Convert current PDF page to image
            image = convert_pdf_page_to_image(pdf_path, page_num)
            
            # Extract Sanskrit data and append to all_data
            sanskrit_data = extract_sanskrit_data(image, page_num + 1)
            all_data.extend(sanskrit_data)
    
    # Create DataFrame with all data and save to a single CSV
    df = pd.DataFrame(all_data, columns=["Sanskrit Word", "Transliteration", "Meaning", "Page Number"])
    df.to_csv(output_csv, index=False)
    print(f"All pages saved to {output_csv}")

# Usage example
pdf_path = "dictallcheck.pdf"  # Replace with the path to your encoded PDF
output_csv = "dataset_final.csv"  # The final CSV file to save all data
save_all_pages_to_single_csv(pdf_path, output_csv)


Processing page 1 of 221...
Processing page 2 of 221...
Processing page 3 of 221...
Processing page 4 of 221...
Processing page 5 of 221...
Processing page 6 of 221...
Processing page 7 of 221...
Processing page 8 of 221...
Processing page 9 of 221...
Processing page 10 of 221...
Processing page 11 of 221...
Processing page 12 of 221...
Processing page 13 of 221...
Processing page 14 of 221...
Processing page 15 of 221...
Processing page 16 of 221...
Processing page 17 of 221...
Processing page 18 of 221...
Processing page 19 of 221...
Processing page 20 of 221...
Processing page 21 of 221...
Processing page 22 of 221...
Processing page 23 of 221...
Processing page 24 of 221...
Processing page 25 of 221...
Processing page 26 of 221...
Processing page 27 of 221...
Processing page 28 of 221...
Processing page 29 of 221...
Processing page 30 of 221...
Processing page 31 of 221...
Processing page 32 of 221...
Processing page 33 of 221...
Processing page 34 of 221...
Processing page 35 of 2

In [4]:
import os
import pytesseract
import fitz  # PyMuPDF
from PIL import Image, ImageEnhance, ImageFilter
import pandas as pd
import re
import io

# Set Tesseract executable path if needed (adjust path as per your installation)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def convert_pdf_page_to_image(pdf_path, page_num):
    """
    Convert a specific page of the PDF into an image.
    """
    with fitz.open(pdf_path) as doc:
        page = doc.load_page(page_num)
        pix = page.get_pixmap(dpi=400)  # Further increase DPI for better resolution
        img = Image.open(io.BytesIO(pix.tobytes("png")))
    return img

def preprocess_image(image, lang="san"):
    """
    Enhance and preprocess the image for better OCR accuracy.
    """
    # Convert to grayscale
    image = image.convert("L")

    # Enhance contrast
    enhancer = ImageEnhance.Contrast(image)
    image = enhancer.enhance(2.5)  # Increase contrast enhancement for clarity

    # Resize image to improve OCR detection of smaller characters
    image = image.resize((image.width * 2, image.height * 2), Image.LANCZOS)

    # Apply a slight blur to reduce noise
    image = image.filter(ImageFilter.MedianFilter(size=3))

    return image

def extract_sanskrit_data(image, page_number):
    """
    Perform OCR on the image and extract Sanskrit word, transliteration, and meaning.
    """
    # Preprocess the image for better OCR accuracy
    image = preprocess_image(image, lang="san")

    # OCR with specific configuration to handle word segmentation
    custom_config = r'--psm 6 -c tessedit_char_blacklist=|!@#'  # Further segmentation tuning
    sanskrit_text = pytesseract.image_to_string(image, lang="san+eng", config=custom_config)

    # Enhanced regex to capture punctuation in transliteration and meaning
    sanskrit_data = []
    for line in sanskrit_text.splitlines():
        # Match Sanskrit word, transliteration, and meaning with improved punctuation support
        match = re.match(r"^(\S+)\s*\(([^)]+)\)\s*=\s*(.+)", line)

        if match:
            sanskrit_word = match.group(1)  # Sanskrit word before "("
            transliteration = match.group(2)  # Text inside parentheses, allowing punctuation
            meaning = match.group(3)  # Text after "=", allowing punctuation

            sanskrit_data.append({
                "Sanskrit Word": sanskrit_word,
                "Transliteration": transliteration.strip(),
                "Meaning": meaning.strip(),
                "Page Number": page_number
            })
    
    return sanskrit_data

def save_single_page_to_csv(pdf_path, page_num, output_csv):
    """
    Extract Sanskrit words, transliterations, and meanings from a single page in the PDF and save to CSV for testing.
    """
    # Convert PDF page to image
    image = convert_pdf_page_to_image(pdf_path, page_num)
    
    # Extract Sanskrit data from the page
    sanskrit_data = extract_sanskrit_data(image, page_num + 1)
    
    # Create DataFrame and save to CSV
    df = pd.DataFrame(sanskrit_data, columns=["Sanskrit Word", "Transliteration", "Meaning", "Page Number"])
    df.to_csv(output_csv, index=False)
    print(f"Page {page_num + 1} saved to {output_csv}")

# Test with a single page
pdf_path = "dictallcheck.pdf"  # Replace with the path to your encoded PDF
output_csv = "test_page_output.csv"  # The CSV file for testing a single page
test_page_num = 3  # Adjust this to the page number you want to test (0 = first page)
save_single_page_to_csv(pdf_path, test_page_num, output_csv)


Page 4 saved to test_page_output.csv


In [5]:
import os
import pytesseract
import fitz  # PyMuPDF
from PIL import Image, ImageEnhance, ImageFilter
import pandas as pd
import re
import io

# Set Tesseract executable path if needed (adjust path as per your installation)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def convert_pdf_page_to_image(pdf_path, page_num):
    """
    Convert a specific page of the PDF into an image.
    """
    with fitz.open(pdf_path) as doc:
        page = doc.load_page(page_num)
        pix = page.get_pixmap(dpi=400)  # Further increase DPI for better resolution
        img = Image.open(io.BytesIO(pix.tobytes("png")))
    return img

def preprocess_image(image, lang="san"):
    """
    Enhance and preprocess the image for better OCR accuracy.
    """
    # Convert to grayscale
    image = image.convert("L")

    # Enhance contrast
    enhancer = ImageEnhance.Contrast(image)
    image = enhancer.enhance(2.5)  # Increase contrast enhancement for clarity

    # Resize image to improve OCR detection of smaller characters
    image = image.resize((image.width * 2, image.height * 2), Image.LANCZOS)

    # Apply a slight blur to reduce noise
    image = image.filter(ImageFilter.MedianFilter(size=3))

    return image

def extract_sanskrit_data(image, page_number):
    """
    Perform OCR on the image and extract Sanskrit word, transliteration, and meaning.
    """
    # Preprocess the image for better OCR accuracy
    image = preprocess_image(image, lang="san")

    # OCR with specific configuration to handle word segmentation
    custom_config = r'--psm 6 -c tessedit_char_blacklist=|!@#'  # Further segmentation tuning
    sanskrit_text = pytesseract.image_to_string(image, lang="san+eng", config=custom_config)

    # Enhanced regex to capture punctuation in transliteration and meaning
    sanskrit_data = []
    for line in sanskrit_text.splitlines():
        # Match Sanskrit word, transliteration, and meaning with improved punctuation support
        match = re.match(r"^(\S+)\s*\(([^)]+)\)\s*=\s*(.+)", line)

        if match:
            sanskrit_word = match.group(1)  # Sanskrit word before "("
            transliteration = match.group(2)  # Text inside parentheses, allowing punctuation
            meaning = match.group(3)  # Text after "=", allowing punctuation

            sanskrit_data.append({
                "Sanskrit Word": sanskrit_word,
                "Transliteration": transliteration.strip(),
                "Meaning": meaning.strip(),
                "Page Number": page_number
            })
    
    return sanskrit_data

def save_all_pages_to_single_csv(pdf_path, output_csv):
    """
    Extract Sanskrit words, transliterations, and meanings from all pages in the PDF and save to a single CSV.
    """
    all_data = []  # Collect data from all pages

    # Open the PDF
    with fitz.open(pdf_path) as doc:
        total_pages = doc.page_count
        
        # Process each page
        for page_num in range(total_pages):
            print(f"Processing page {page_num + 1} of {total_pages}...")
            
            # Convert current PDF page to image
            image = convert_pdf_page_to_image(pdf_path, page_num)
            
            # Extract Sanskrit data
            sanskrit_data = extract_sanskrit_data(image, page_num + 1)
            
            # Append data to the main list
            all_data.extend(sanskrit_data)

    # Create DataFrame from all collected data and save to a single CSV
    df = pd.DataFrame(all_data, columns=["Sanskrit Word", "Transliteration", "Meaning", "Page Number"])
    df.to_csv(output_csv, index=False)
    print(f"All pages saved to {output_csv}")

# Usage example
pdf_path = "dictallcheck.pdf"  # Replace with the path to your encoded PDF
output_csv = "dataset_final1.csv"  # The final CSV file containing all pages' data
save_all_pages_to_single_csv(pdf_path, output_csv)


Processing page 1 of 221...
Processing page 2 of 221...
Processing page 3 of 221...
Processing page 4 of 221...
Processing page 5 of 221...
Processing page 6 of 221...
Processing page 7 of 221...
Processing page 8 of 221...
Processing page 9 of 221...
Processing page 10 of 221...
Processing page 11 of 221...
Processing page 12 of 221...
Processing page 13 of 221...
Processing page 14 of 221...
Processing page 15 of 221...
Processing page 16 of 221...
Processing page 17 of 221...
Processing page 18 of 221...
Processing page 19 of 221...
Processing page 20 of 221...
Processing page 21 of 221...
Processing page 22 of 221...
Processing page 23 of 221...
Processing page 24 of 221...
Processing page 25 of 221...
Processing page 26 of 221...
Processing page 27 of 221...
Processing page 28 of 221...
Processing page 29 of 221...
Processing page 30 of 221...
Processing page 31 of 221...
Processing page 32 of 221...
Processing page 33 of 221...
Processing page 34 of 221...
Processing page 35 of 2

In [7]:
print(data.columns)



Index(['Sanskrit Word', 'Transliteration', 'Meaning', 'Page Number'], dtype='object')


In [8]:
import os
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Load the main dataset CSV
data = pd.read_csv('dataset_final1.csv')

# Trim whitespace from column names (in case of extra spaces)
data.columns = data.columns.str.strip()

# Initialize an empty list to store anomalies
anomalies = []

# Define the path to the folder containing page-wise Sanskrit data
pages_folder = 'pages'

# Loop through each row in the main dataset
for index, row in data.iterrows():
    page_number = row['Page Number']
    main_sanskrit_word = row['Sanskrit Word']

    # Construct the file path for the current page number
    page_file_path = os.path.join(pages_folder, f'page_{page_number}.csv')
    
    # Check if the page file exists
    if os.path.exists(page_file_path):
        # Load the page-specific data
        page_data = pd.read_csv(page_file_path)
        
        # Extract the Sanskrit words column from the page file (assuming it's named 'Sanskrit Word')
        page_sanskrit_words = page_data['Sanskrit Word'].tolist()
        
        # Check if the main Sanskrit word is in the page's word list
        if main_sanskrit_word not in page_sanskrit_words:
            # Try fuzzy matching to find the closest match
            closest_match, match_score = process.extractOne(main_sanskrit_word, page_sanskrit_words, scorer=fuzz.ratio)

            # If the match score is high (e.g., > 85), assume it's a close enough match and correct it
            if match_score > 85:
                print(f"Correcting '{main_sanskrit_word}' to '{closest_match}' on page {page_number}")
                data.at[index, 'Sanskrit Word'] = closest_match
            else:
                # If no close match is found, log the anomaly
                anomalies.append({
                    'Page Number': page_number,
                    'Sanskrit Word': main_sanskrit_word,
                    'issue': 'No close match found on page'
                })
    else:
        # Log an anomaly if the page file doesn't exist
        anomalies.append({
            'Page Number': page_number,
            'Sanskrit Word': main_sanskrit_word,
            'issue': 'Page file not found'
        })

# Convert the anomalies to a DataFrame and append it to the original data
anomalies_df = pd.DataFrame(anomalies)
final_data = pd.concat([data, anomalies_df], ignore_index=True)

# Save the final DataFrame with corrections and anomalies to a new CSV
final_data.to_csv('dataset_corrected.csv', index=False)
print("Process completed. Check 'dataset_corrected.csv' for results.")


Process completed. Check 'dataset_corrected.csv' for results.


In [11]:
import os
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import re

# Load the main dataset CSV
data = pd.read_csv('dataset_final1.csv')

# Trim whitespace from column names (in case of extra spaces)
data.columns = data.columns.str.strip()

# Initialize a list to store unmatched words to append at the end of the CSV
unmatched_words = []

# Define the path to the folder containing page-wise Sanskrit data
pages_folder = 'pages'

# Sanskrit character range pattern to identify Sanskrit text
sanskrit_pattern = re.compile(r'[\u0900-\u097F]+')

# Loop through each row in the main dataset
for index, row in data.iterrows():
    main_sanskrit_word = row['Sanskrit Word']
    
    # Check if 'Sanskrit Word' does not contain Sanskrit characters (indicating it's in English or another language)
    if pd.notna(main_sanskrit_word) and not sanskrit_pattern.search(main_sanskrit_word):
        page_number = row['Page Number']
        
        # Construct the file path for the current page number
        page_file_path = os.path.join(pages_folder, f'page_{page_number}.csv')
        
        # Check if the page file exists
        if os.path.exists(page_file_path):
            # Load the page-specific data
            page_data = pd.read_csv(page_file_path)
            
            # Extract the Sanskrit words column from the page file (assuming it's named 'Sanskrit Word')
            page_sanskrit_words = page_data['Sanskrit Word'].tolist()
            
            # Try fuzzy matching to find the closest match in the page's Sanskrit words list
            closest_match, match_score = process.extractOne(main_sanskrit_word, page_sanskrit_words, scorer=fuzz.ratio)

            # If the match score is high (e.g., > 85), assume it's a close enough match and correct it
            if match_score > 85:
                print(f"Correcting '{main_sanskrit_word}' to '{closest_match}' on page {page_number}")
                data.at[index, 'Sanskrit Word'] = closest_match
            else:
                # If no close match is found, keep the row as it is in unmatched_words
                unmatched_words.append(row)
        else:
            # If the page file does not exist, keep the row as it is in unmatched_words
            unmatched_words.append(row)

# Convert unmatched words to a DataFrame with the exact rows
unmatched_df = pd.DataFrame(unmatched_words)

# Concatenate the main data with unmatched words DataFrame, adding unmatched rows at the end
final_data = pd.concat([data, unmatched_df], ignore_index=True)

# Save the final DataFrame with corrections and unmatched words at the end to a new CSV
final_data.to_csv('dataset_corrected.csv', index=False)
print("Process completed. Check 'dataset_corrected.csv' for results.")


Process completed. Check 'dataset_corrected.csv' for results.


In [12]:
import os
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import re

# Load the main dataset CSV
data = pd.read_csv('dataset_final1.csv')

# Trim whitespace from column names (in case of extra spaces)
data.columns = data.columns.str.strip()

# Initialize a list to store unmatched words to append at the end of the CSV
unmatched_words = []

# Define the path to the folder containing page-wise Sanskrit data
pages_folder = 'pages'

# Sanskrit character range pattern to identify Sanskrit text
sanskrit_pattern = re.compile(r'[\u0900-\u097F]+')

# Loop through each row in the main dataset
for index, row in data.iterrows():
    main_sanskrit_word = row['Sanskrit Word']
    
    # Check if 'Sanskrit Word' does not contain Sanskrit characters (indicating it's in English or another language)
    if pd.notna(main_sanskrit_word) and not sanskrit_pattern.search(main_sanskrit_word):
        page_number = row['Page Number']
        
        # Construct the file path for the current page number
        page_file_path = os.path.join(pages_folder, f'page_{page_number}.csv')
        
        # Check if the page file exists
        if os.path.exists(page_file_path):
            # Load the page-specific data
            page_data = pd.read_csv(page_file_path)
            
            # Extract the Sanskrit words column from the page file (assuming it's named 'Sanskrit Word')
            page_sanskrit_words = page_data['Sanskrit Word'].tolist()
            
            # Try fuzzy matching to find the closest match in the page's Sanskrit words list
            closest_match, match_score = process.extractOne(main_sanskrit_word, page_sanskrit_words, scorer=fuzz.ratio)

            # If the match score is high (e.g., > 85), assume it's a close enough match and correct it
            if match_score > 85:
                print(f"Correcting '{main_sanskrit_word}' to '{closest_match}' on page {page_number}")
                data.at[index, 'Sanskrit Word'] = closest_match
            else:
                # If no close match is found, move the row to unmatched_words and remove from original position
                unmatched_words.append(row)
                data = data.drop(index)
        else:
            # If the page file does not exist, move the row to unmatched_words and remove from original position
            unmatched_words.append(row)
            data = data.drop(index)

# Convert unmatched words to a DataFrame with the exact rows
unmatched_df = pd.DataFrame(unmatched_words)

# Concatenate the main data with unmatched words DataFrame, adding unmatched rows at the end
final_data = pd.concat([data, unmatched_df], ignore_index=True)

# Save the final DataFrame with corrections and unmatched words at the end to a new CSV
final_data.to_csv('dataset_corrected1.csv', index=False)
print("Process completed. Check 'dataset_corrected.csv' for results.")


Process completed. Check 'dataset_corrected.csv' for results.


In [4]:
import pandas as pd
import fitz  # PyMuPDF for PDF handling
import pytesseract
from PIL import Image  # Import Image from Pillow

# Configure pytesseract for OCR
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Update the path as needed

# Function to perform OCR on a specific page of a PDF
def extract_text_from_pdf(pdf_path, page_number):
    try:
        doc = fitz.open(pdf_path)
        if 1 <= page_number <= len(doc):
            page = doc[page_number - 1]  # Pages are 0-indexed
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)  # Use Pillow to convert Pixmap to Image
            text = pytesseract.image_to_string(img, lang='san')  # Sanskrit OCR
            return text
        else:
            return None
    except Exception as e:
        print(f"Error during OCR on page {page_number}: {e}")
        return None

# Load Excel file
excel_path = "notprocessedvalues.xlsx"
pdf_path = "dictallcheck.pdf"
output_excel_path = "corrected_sanskrit_words.xlsx"

df = pd.read_excel(excel_path)

# Iterate over each row in the DataFrame
for idx, row in df.iterrows():
    word = row['Sanskrit Word']
    page_number = int(row['Page Number'])

    # Extract text from the PDF page
    ocr_text = extract_text_from_pdf(pdf_path, page_number)

    if ocr_text:
        # Simplistic approach to find corrected text (takes the first line of OCR text)
        corrected_word = None
        for line in ocr_text.splitlines():
            line = line.strip()
            if line:  # Ensure line has content
                corrected_word = line
                break

        if corrected_word and corrected_word != word:
            print(f"Row {idx}: Correcting '{word}' to '{corrected_word}'")
            df.at[idx, 'Sanskrit Word'] = corrected_word

# Save the corrected DataFrame to a new Excel file
df.to_excel(output_excel_path, index=False)


Row 0: Correcting 'Apa' to '(णव जा उत गलप'
Row 1: Correcting 'awattt:' to '(णव जा उत गलप'
Row 2: Correcting 'ag' to 'अकुल्ाविदाः (भ'
Row 3: Correcting 'weit' to 'अकुल्ाविदाः (भ'
Row 4: Correcting 'aes:' to 'अकुल्ाविदाः (भ'
Row 5: Correcting 'wag' to '०९१५४'
Row 6: Correcting 'aq' to '(वागत) = प ००८०५ णा ला'
Row 7: Correcting 'agile:' to '(वागत) = प ००८०५ णा ला'
Row 8: Correcting 'afacad' to '(वागत) = प ००८०५ णा ला'
Row 9: Correcting 'afastt' to 'यच (११५०९४२)'
Row 10: Correcting 'ayq' to 'यच (११५०९४२)'
Row 11: Correcting 'afte' to 'अनित्यः (  ।'
Row 12: Correcting 'aarat' to 'अनित्यः (  ।'
Row 13: Correcting 'aqestq' to 'अनित्यः (  ।'
Row 14: Correcting 'TUS:' to 'न्वित (५९१७१) = (20) ए०५व ०९ १५००८'
Row 15: Correcting 'wedadat' to 'न्वित (५९१७१) = (20) ए०५व ०९ १५००८'
Row 16: Correcting 'afst' to 'अन्धी (भातः) = (०५) ०८५५०,'
Row 17: Correcting 'wrist:' to 'अमलान्‌ (*१०१।१११.१) = १५५'
Row 18: Correcting 'wAdied' to 'अमलान्‌ (*१०१।१११.१) = १५५'
Row 19: Correcting 'aftq' to 'अरण्य; (षत) = (

In [6]:
import pandas as pd
import fitz  # PyMuPDF for PDF handling
import pytesseract
from PIL import Image

# Paths to files
incorrect_words_path = "notprocessedvalues.xlsx"  # Your Excel file with incorrect words
reference_words_path = "unique_sanskrit_words_ordered.csv"  # Reference file in CSV (no header)
pdf_path = "dictallcheck.pdf"
output_path = "corrected_with_reference.xlsx"

# Load Excel file with incorrect words
df_incorrect = pd.read_excel(incorrect_words_path)

# Load reference CSV without a header
df_reference = pd.read_csv(reference_words_path, header=None)  # No header
reference_words = df_reference[0].dropna().tolist()  # Extract the first column as a list

# Function to perform OCR on a specific page
def extract_text_with_preprocessing(pdf_path, page_number):
    try:
        doc = fitz.open(pdf_path)
        if 1 <= page_number <= len(doc):
            page = doc[page_number - 1]
            pix = page.get_pixmap(dpi=300)  # Enhance resolution
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            img = img.convert("L")  # Convert to grayscale
            img = img.point(lambda x: 0 if x < 128 else 255, '1')  # Binarization
            text = pytesseract.image_to_string(img, lang='san')  # Sanskrit OCR
            return text.splitlines()  # Return as list of lines
        else:
            return None
    except Exception as e:
        print(f"Error during OCR on page {page_number}: {e}")
        return None

# Iterate through the incorrect words DataFrame
for idx, row in df_incorrect.iterrows():
    word = row['Sanskrit Word']
    page_number = int(row['Page Number'])

    # Perform OCR on the specified page
    ocr_text_lines = extract_text_with_preprocessing(pdf_path, page_number)

    if ocr_text_lines:
        # Match OCR lines with the reference words
        corrected_word = None
        for line in ocr_text_lines:
            line = line.strip()
            if line in reference_words:
                corrected_word = line
                break

        # Update the incorrect word if a match is found
        if corrected_word and corrected_word != word:
            print(f"Row {idx}: Correcting '{word}' to '{corrected_word}'")
            df_incorrect.at[idx, 'Sanskrit Word'] = corrected_word

# Save the corrected DataFrame
df_incorrect.to_excel(output_path, index=False)
print(f"Corrected file saved to {output_path}")


Row 14: Correcting 'TUS:' to '13'
Row 15: Correcting 'wedadat' to '13'
Corrected file saved to corrected_with_reference.xlsx


In [8]:
import pandas as pd

# Function to convert Excel to CSV
def excel_to_csv(excel_file, csv_file):
    # Load the Excel file
    try:
        df = pd.read_excel(excel_file)
        # Save it as a CSV file with UTF-8 encoding
        df.to_csv(csv_file, index=False, encoding='utf-8')
        print(f"Excel file '{excel_file}' has been successfully converted to CSV: '{csv_file}'")
    except Exception as e:
        print(f"Error while converting Excel to CSV: {e}")

# Example usage
excel_file = 'DATASET.xlsx'  # Update with the path to your Excel file
csv_file = 'DATASET.csv'  # Update with the desired output CSV file name

# Call the function to convert
excel_to_csv(excel_file, csv_file)


Excel file 'DATASET.xlsx' has been successfully converted to CSV: 'DATASET.csv'
