<a href="https://colab.research.google.com/github/DhruvK278/Invoice-OCR/blob/main/Invoice_OCR_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
!pip install pytesseract pillow pandas
!sudo apt-get install tesseract-ocr -y

import pytesseract
from pytesseract import Output
from PIL import Image
import pandas as pd
from google.colab import files

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [2]:
# =============================
# Step 1: Upload image
# =============================
print("📂 Please upload your invoice image...")
uploaded = files.upload()

# With no uploaded file there is no first key to read; fail with an explicit
# message instead of the bare IndexError that indexing an empty list gives.
if not uploaded:
    raise ValueError("❌ No file was uploaded. Re-run this cell and choose an invoice image.")

# Only the first uploaded file is processed; any further uploads are ignored.
image_path = next(iter(uploaded))
img = Image.open(image_path)

📂 Please upload your invoice image...


Saving invoice.jpg to invoice.jpg


In [3]:
# =============================
# Step 2: OCR full data
# =============================
# Run OCR on the uploaded image, requesting the result as a DataFrame, and
# discard every entry whose `text` value is missing.
ocr_data = pytesseract.image_to_data(
    img,
    output_type=Output.DATAFRAME,
).dropna(subset=['text'])

In [4]:
# =============================
# Step 3: Find header row dynamically
# =============================
# Vertical tolerance (in the units of the OCR `top` coordinate) used to decide
# which words belong to the header line and where the table body starts.
HEADER_Y_TOLERANCE = 5

# Anchor on the first OCR entry whose text contains "Description"
# (case-insensitive).
header_row = ocr_data[ocr_data['text'].str.contains("Description", case=False, na=False)]
if header_row.empty:
    raise ValueError("❌ Could not find header row. Check OCR output.")

header_y = header_row.iloc[0]['top']

# Get all words in header line, ordered left-to-right.
# Both parallel lists below are derived from this single sorted frame so that
# col_positions[i] always belongs to column_names[i]. Sorting only the
# positions (and leaving the names in OCR order) pairs names with the wrong
# x coordinate whenever OCR does not emit the header words left-to-right.
header_words = ocr_data[
    (ocr_data['top'] >= header_y - HEADER_Y_TOLERANCE)
    & (ocr_data['top'] <= header_y + HEADER_Y_TOLERANCE)
].sort_values('left', kind='stable')

# x position of each header word, ascending
col_positions = header_words['left'].tolist()

# Header text for each position, in the same order as col_positions
column_names = header_words['text'].tolist()

# Keep only rows below header
table_data = ocr_data[ocr_data['top'] > header_y + HEADER_Y_TOLERANCE]


In [5]:
# =============================
# Step 4: Group words into lines
# =============================
# Walk the table words in their existing order. A word stays on the current
# line when its `top` is within 10 units of the previous word's `top`;
# otherwise it opens a new line. `lines` ends up as a list of lists of rows.
lines = []
last_top = None

for _, row in table_data.iterrows():
    same_line = last_top is None or abs(row['top'] - last_top) <= 10
    if not same_line or not lines:
        # First word overall, or a vertical jump: start a fresh line.
        lines.append([])
    lines[-1].append(row)
    last_top = row['top']

In [6]:
# Step 5: Assign words to closest header column
# =============================
def assign_columns(line, header_positions, headers):
    """Distribute the words of one text line over the table columns.

    Each word is assigned to the header whose x position is closest to the
    word's ``left`` value (the first header wins on a tie). Words landing in
    the same column are joined with single spaces, in input order.

    Args:
        line: Words of one line; anything ``pd.DataFrame`` accepts as rows
            (e.g. a list of Series or dicts) providing ``left`` and ``text``.
        header_positions: x position of each header, parallel to ``headers``.
        headers: Column names; every name becomes a key of the result.

    Returns:
        dict mapping each header to the joined text assigned to it
        (``""`` for columns that received no word).

    Raises:
        ValueError: if ``line`` has words but ``header_positions`` is empty.
    """
    row_data = {h: "" for h in headers}
    words = pd.DataFrame(line)

    if len(words) and len(header_positions) == 0:
        raise ValueError("❌ No header positions available; cannot assign words to columns.")

    for _, word in words.iterrows():
        text = word['text']
        # Missing text carries no content and cannot be concatenated.
        if pd.isna(text):
            continue
        left = word['left']
        # min() over indices returns the first closest header, matching
        # list.index(min(...)) tie-breaking.
        col_idx = min(
            range(len(header_positions)),
            key=lambda i: abs(left - header_positions[i]),
        )
        header = headers[col_idx]
        # str() guards against OCR tokens that were parsed as numbers.
        row_data[header] += (" " if row_data[header] else "") + str(text)
    return row_data

# One column-keyed dict per grouped text line.
rows = [
    assign_columns(text_line, col_positions, column_names)
    for text_line in lines
]


In [9]:
import re

def merge_multiline_rows(rows, key_col="No.", desc_col="Description"):
    """Fold continuation lines into the description of the preceding row.

    A row whose ``key_col`` value is empty or whitespace-only is treated as a
    continuation of the most recent keyed row: its ``desc_col`` text is
    appended (space-separated) to that row's description. Values in the
    continuation row's other columns are discarded, and continuation rows
    that appear before any keyed row are dropped.

    Args:
        rows: Iterable of dicts with string values for ``key_col`` and
            ``desc_col``.
        key_col: Column whose non-blank value marks the start of a new item.
        desc_col: Column that collects the continuation text.

    Returns:
        A new list of new dicts; the input rows are not modified.

    Raises:
        KeyError: if a row lacks ``key_col`` (or ``desc_col`` when needed).
    """
    merged_rows = []
    for row in rows:
        if row[key_col].strip():
            # Copy so that later merging never writes into the caller's dict.
            merged_rows.append(dict(row))
            continue

        # Continuation line — append to last row's description, skipping
        # blank fragments so no dangling separator is added.
        if merged_rows and row[desc_col].strip():
            merged_rows[-1][desc_col] += " " + row[desc_col]
    return merged_rows

# Recompute the per-line column dicts from `lines`, then fold continuation
# lines into the preceding item. The first two header names serve as the key
# column and the description column respectively.
rows = merge_multiline_rows(
    [assign_columns(line, col_positions, column_names) for line in lines],
    key_col=column_names[0],
    desc_col=column_names[1],
)

# Build the table and trim surrounding whitespace from every string cell.
df = pd.DataFrame(rows).map(
    lambda cell: cell.strip() if isinstance(cell, str) else cell
)

print("\n✅ --- Cleaned Table with Merged Descriptions ---\n")
print(df)



✅ --- Cleaned Table with Merged Descriptions ---

       No.                                        Description  \
0      ils  6'x3' Marble Counter Dining Table Top Multi Fu...   
1       2.  30 Inches Floral Art Table Top Round Marble Ta...   
2        a  6'x3' Black Marble Inlaid Center Table Top Pea...   
3       4.  Luxurious Pattern Stone Table Top Marble Confe...   
4       Sy  36 Inches Marble Dinning Top Inlay Meeting Tab...   
5       6.  Semi precious Stone Dinning Table Top Marble K...   
6       Ee                                  Used t shirt Blue   
7  SUMMARY                                                      

                  Qty UM Net price worth VAT [%] Gross  
0         Height 4,00                                   
1  Inlaid Coffee 5,00                                   
2         Dining 4,00                                   
3        Dinning 1,00                                   
4          Table 1,00                                   
5    Work Inlaid 5,00

In [10]:
# =============================
# Step 6: Create DataFrame
# =============================
df = pd.DataFrame(rows)
# DataFrame.applymap is deprecated and emits a FutureWarning; use
# DataFrame.map, which this notebook already relies on when building the
# merged table. Trims surrounding whitespace from every string cell.
df = df.map(lambda x: x.strip() if isinstance(x, str) else x)

print("\n✅ --- Parsed Table ---\n")
print(df)


✅ --- Parsed Table ---

       No.                                        Description  \
0      ils  6'x3' Marble Counter Dining Table Top Multi Fu...   
1       2.  30 Inches Floral Art Table Top Round Marble Ta...   
2        a  6'x3' Black Marble Inlaid Center Table Top Pea...   
3       4.  Luxurious Pattern Stone Table Top Marble Confe...   
4       Sy  36 Inches Marble Dinning Top Inlay Meeting Tab...   
5       6.  Semi precious Stone Dinning Table Top Marble K...   
6       Ee                                  Used t shirt Blue   
7  SUMMARY                                                      

                  Qty UM Net price worth VAT [%] Gross  
0         Height 4,00                                   
1  Inlaid Coffee 5,00                                   
2         Dining 4,00                                   
3        Dinning 1,00                                   
4          Table 1,00                                   
5    Work Inlaid 5,00                          

  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


In [11]:
# =============================
# 📌 Step 8: Create clean DataFrame
# =============================
df = pd.DataFrame(rows)
# DataFrame.applymap is deprecated and emits a FutureWarning; use
# DataFrame.map, which this notebook already relies on when building the
# merged table. Trims surrounding whitespace from every string cell.
df = df.map(lambda x: x.strip() if isinstance(x, str) else x)

print("\n✅ --- Parsed Table ---\n")
print(df)


✅ --- Parsed Table ---

       No.                                        Description  \
0      ils  6'x3' Marble Counter Dining Table Top Multi Fu...   
1       2.  30 Inches Floral Art Table Top Round Marble Ta...   
2        a  6'x3' Black Marble Inlaid Center Table Top Pea...   
3       4.  Luxurious Pattern Stone Table Top Marble Confe...   
4       Sy  36 Inches Marble Dinning Top Inlay Meeting Tab...   
5       6.  Semi precious Stone Dinning Table Top Marble K...   
6       Ee                                  Used t shirt Blue   
7  SUMMARY                                                      

                  Qty UM Net price worth VAT [%] Gross  
0         Height 4,00                                   
1  Inlaid Coffee 5,00                                   
2         Dining 4,00                                   
3        Dinning 1,00                                   
4          Table 1,00                                   
5    Work Inlaid 5,00                          

  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
