In [1]:
import pandas as pd

import camelot

import re
from glob import glob
from pprint import pprint
from datetime import datetime

from typing import List, Tuple

  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


In [2]:
class Extract:
    """Pull line-item rows out of PDF tables and accumulate them in one frame.

    An instance is configured with the expected line-item column headers.
    ``extractedTableFromPDF`` reads every table of one PDF into a raw frame,
    ``extractDataFrame`` reduces that raw frame to the numbered line-item rows
    and tags them with the label/value pairs found in the metadata cell,
    ``add`` appends the result to ``processedDataFrame`` and ``save`` writes
    the accumulated frame to CSV.
    """

    def __init__(self, columns: List[str]):
        """Store the expected headers and start with an empty result frame.

        Args:
            columns: Header names of the line-item table, in table order.
                The first entry names the column that holds the numeric
                serial number used to recognise line-item rows.
        """
        self.columns: List[str] = columns
        self.processedDataFrame: pd.DataFrame = pd.DataFrame(columns=self.columns)

    def __checkColumnName(self, col: pd.Series) -> bool:
        """Return True when any cell of ``col`` is one of the expected headers.

        Cells are compared case-insensitively after newlines are replaced by
        spaces and runs of spaces are collapsed, so a header that was wrapped
        across lines inside the PDF (e.g. ``"Sl\\nNo."``) still matches.
        """
        # Computed once per column instead of once per cell.
        expectedHeaders = [name.lower() for name in self.columns]

        def normalise(text: str) -> str:
            tokens = text.lower().replace("\n", " ").split(" ")
            return " ".join(token for token in tokens if token and not token.isspace())

        return bool(col.astype(str).apply(normalise).isin(expectedHeaders).any())

    def extractedTableFromPDF(self, filePath):
        """Read every table in the PDF and stack them into one raw frame.

        Tables whose column count equals ``len(self.columns)`` are relabelled
        with the expected headers so that their cells line up under the named
        columns. Tables of any other width keep camelot's positional labels
        and therefore end up in separate columns of the result.

        Args:
            filePath: Path of the PDF handed to ``camelot.read_pdf``.

        Returns:
            A frame that always contains ``self.columns`` (possibly followed
            by positional columns) with a fresh 0..n-1 index.
        """
        pages = camelot.read_pdf(filePath, pages="all")

        # Seed with an empty frame so the expected columns exist (and come
        # first) even when no table matches or the PDF has no tables at all.
        frames = [pd.DataFrame(columns=self.columns)]
        for table in pages:
            frame = table.df
            # Only an exact width match can be relabelled: assigning fewer
            # names than there are columns raises ValueError, which used to
            # discard the whole PDF as soon as one wider table appeared.
            if len(frame.columns) == len(self.columns):
                frame = frame.copy()  # leave camelot's own frame untouched
                frame.columns = self.columns
            frames.append(frame)

        # One concat at the end instead of re-copying the growing frame on
        # every iteration.
        return pd.concat(frames, axis=0, ignore_index=True)

    def __extractTable(self, row: pd.Series) -> bool:
        """Return True when the row's first expected column holds a number."""
        return str(row[self.columns[0]]).isnumeric()

    def __extractLabelDate(self, row: str) -> Tuple[List[str], List[object]]:
        """Split the metadata cell into labels and their values.

        The cell is expected to hold newline-separated text: four labels,
        then one value that is kept as a raw string, then values formatted
        like ``"05 Dec, 2024"`` that are parsed into ``datetime`` objects.

        Returns:
            ``(labels, values)`` where ``values[0]`` is a ``str`` and the
            remaining entries are ``datetime`` instances.

        Raises:
            IndexError: If the cell has fewer than five lines.
            ValueError: If a value after the fifth line is not in the
                ``"%d %b, %Y"`` format.
        """
        parts = row.split("\n")

        dateFormat = "%d %b, %Y"
        labels = parts[:4]
        values = [parts[4]] + [
            datetime.strptime(text, dateFormat) for text in parts[5:]
        ]

        return (labels, values)

    def extractDataFrame(self, df: pd.DataFrame):
        """Reduce a raw frame to its line-item rows plus metadata columns.

        Args:
            df: Raw frame as produced by ``extractedTableFromPDF``. The cell
                in the third row of the first column must hold the
                newline-separated metadata text.

        Returns:
            A frame with ``self.columns`` restricted to rows whose first
            column is numeric, plus one constant column per metadata label.

        Raises:
            ValueError: If the number of columns containing a header cell
                differs from ``len(self.columns)``.
        """
        labels, values = self.__extractLabelDate(df.iloc[2, 0])

        # Keep only the columns in which a header cell was found.
        isHeaderColumn = df.apply(self.__checkColumnName, axis=0)
        df = df[df.columns[isHeaderColumn]].copy()
        df.columns = self.columns

        # Drop header rows, totals and anything else without a serial number.
        isLineItem = df.apply(self.__extractTable, axis=1)
        df = df[isLineItem].copy()

        for label, value in zip(labels, values):
            df[label] = value

        return df

    def add(self, dataFrame: pd.DataFrame):
        """Append ``dataFrame`` to ``processedDataFrame``.

        Columns that are entirely missing are dropped first; if nothing is
        left afterwards the accumulated frame is not modified.
        """
        dataFrame = dataFrame.dropna(how="all", axis=1)
        if dataFrame.empty:
            return

        self.processedDataFrame = pd.concat(
            [self.processedDataFrame, dataFrame], ignore_index=True
        )

    def save(self, fileName):
        """Write the accumulated frame to ``fileName`` as CSV with a fresh index."""
        self.processedDataFrame.reset_index(drop=True).to_csv(fileName)

In [None]:
# Headers of the line-item table, in the order they appear in the invoices.
extract = Extract(
    [
        "Sl No.",
        "Product Name",
        "HSN",
        "Qty. Del.",
        "Price Per Unit",
        "UoM",
        "Total",
    ]
)

# (path, exception) pairs for every PDF that could not be processed.
extractionError = []

# glob yields matches in arbitrary, platform-dependent order; sorting keeps
# the printed numbering and the row order of the result reproducible.
pdfPaths = sorted(glob("./../data/raw/Blink It/*.pdf"))

for fileNumber, pdfPath in enumerate(pdfPaths, start=1):

    print(f"{fileNumber}: '{pdfPath}'")

    try:
        df = extract.extractedTableFromPDF(pdfPath)
        if not df.empty:
            df = extract.extractDataFrame(df)

            if not df.empty:
                extract.add(df)
    except Exception as e:
        # Best effort: one unreadable invoice must not stop the batch.
        # Failures are kept in extractionError for inspection afterwards.
        extractionError.append((pdfPath, e))

extract.processedDataFrame

1: './../data/raw/Blink It\Invoice-CPCMH25-GRN-1269666-1.pdf'
2: './../data/raw/Blink It\Invoice-CPCMH25-GRN-1274368-1.pdf'
3: './../data/raw/Blink It\Invoice-CPCMH25-GRN-1276694-1.pdf'
4: './../data/raw/Blink It\Invoice-CPCMH25-GRN-1279272-1.pdf'
5: './../data/raw/Blink It\Invoice-CPCMH25-GRN-1281280-1.pdf'
6: './../data/raw/Blink It\Invoice-CPCMH25-GRN-1283870-1.pdf'
7: './../data/raw/Blink It\Invoice-CPCMH25-GRN-1286137-1.pdf'
8: './../data/raw/Blink It\Invoice-CPCMH25-GRN-1288444-1.pdf'
9: './../data/raw/Blink It\Invoice-CPCMH25-GRN-1291612-1.pdf'
10: './../data/raw/Blink It\Invoice-CPCMH25-GRN-1293847-1.pdf'
11: './../data/raw/Blink It\Invoice-CPCMH25-GRN-1294697-1.pdf'
12: './../data/raw/Blink It\Invoice-CPCMH25-GRN-1295966-1.pdf'
13: './../data/raw/Blink It\Invoice-CPCMH25-GRN-1296693-1.pdf'
14: './../data/raw/Blink It\Invoice-CPCMH25-GRN-1298679-1.pdf'
15: './../data/raw/Blink It\Invoice-CPCMH25-GRN-1300959-1.pdf'
16: './../data/raw/Blink It\Invoice-CPCMH25-GRN-1303813-1.pdf'
1

Unnamed: 0,Sl No.,Product Name,HSN,Qty. Del.,Price Per Unit,UoM,Total,Invoice Number,Order Date,Invoice Date,Delivery Date
0,1,"BH-Baby Bottle Gourd\nSmall, 1 Piece (250 - 35...",07099390,14,11.5,Kilogram,₹ 161.00,2025/18238/1,2024-12-05,2024-12-05,2024-12-05
1,2,"BH - Fresh Cabbage, 400\ngm - 600 gm (1 Unit)",07049000,31,18,Per piece,₹ 558.00,2025/18238/1,2024-12-05,2024-12-05,2024-12-05
2,3,"BH - Cauliflower, 1 Unit\n(400 - 600 gm)",07041000,31,18,Per piece,₹ 558.00,2025/18238/1,2024-12-05,2024-12-05,2024-12-05
3,4,"BH-Desi Tomato, 505 -\n515 gm",07020000,32,31,Count,₹ 992.00,2025/18238/1,2024-12-05,2024-12-05,2024-12-05
4,5,"BH - Fresh Garlic\n(Lehsun), 200 gm",07049000,19,96,Count,₹ 1824.00,2025/18238/1,2024-12-05,2024-12-05,2024-12-05
...,...,...,...,...,...,...,...,...,...,...,...
8896,3,"BH - Fresh Cabbage, 400\ngm - 600 gm (1 Unit)",07049000,209,18,Per piece,₹ 3762.00,2026/18238/227,2025-06-29,2025-06-29,2025-06-29
8897,4,"BH - Cauliflower, 300 gm\n- 500 gm (1 Unit)",07031010,170,47,Count,₹ 7990.00,2026/18238/227,2025-06-29,2025-06-29,2025-06-29
8898,5,"BH - Fresh Green\nCapsicum, 250 - 280 gm",07041000,277,23.5,Count,₹ 6509.50,2026/18238/227,2025-06-29,2025-06-29,2025-06-29
8899,6,"BH - Fresh Cucumber\n(Kheera), 500 - 700 gm",07049000,506,18.8,Count,₹ 9512.80,2026/18238/227,2025-06-29,2025-06-29,2025-06-29


In [5]:
# Persist the accumulated line items; Extract.save resets the index and
# writes the frame to the given CSV path.
extract.save("./../data/cleaned/Blinkit.csv")