In [2]:
import pandas as pd
from datetime import datetime

from os.path import join

from pprint import pprint

from typing import List, Tuple

import re

In [3]:
# Input and output folders (relative paths, resolved against the working directory).
RAW_FOLDER_PATH = "./../data/raw"
CLEANED_FOLDER_PATH = "./../data/cleaned"

# Workbooks this notebook can be pointed at. Keeping them in one tuple makes
# the active choice explicit instead of relying on which assignment runs last.
WORKBOOK_FILES = (
    "Zepto 10-07-2025 - 20-06-2025.xlsx",
    "Swiggy 22-06-2025 - 10-07-2025.xlsx",
    "Swiggy 01-01-2025 - 15-04-2025.xlsx",
    "Swiggy 09-04-2025 - 22-06-2025.xlsx",
)

# Active workbook: change the index to process a different file.
fileName = WORKBOOK_FILES[-1]

# sheet_name=None makes read_excel return every sheet as {sheet name: DataFrame}.
excelBook: dict[str, pd.DataFrame] = pd.read_excel(
    join(RAW_FOLDER_PATH, fileName), sheet_name=None
)

In [None]:
class Extract:
    def __checkColumnName(self, col: pd.Series):
        return (
            col.astype(str)
            .apply(
                lambda string: " ".join(
                    list(
                        filter(
                            lambda x: not x.isspace() and x,
                            string.lower().replace("\n", "").split(" "),
                        )
                    )
                )
            )
            .isin(list(map(lambda x: x.lower(), self.columns)))
            .any()
        )

    def __init__(self, column: List[str]):
        self.columns: List[str] = column
        self.processedDataFrame: pd.DataFrame = pd.DataFrame(columns=self.columns)

    def __extractTable(self, row: pd.Series) -> bool:
        return str(row[self.columns[0]]).isnumeric()

    def __extractDateLocation(self, rawString: str) -> Tuple[datetime, str] | False:
        datePattern = "^(\d){1,2}[-/](\d){1,2}[-/](\d){4}"
        dateMached = re.match(datePattern, rawString)

        if dateMached:
            dateFormat = "%d-%m-%Y"
            date = datetime.strptime(
                re.sub(r"[-/]", "-", dateMached.group(0)), dateFormat
            )
            area = " ".join(
                list(
                    filter(
                        lambda x: x.isalpha(),
                        rawString[dateMached.span()[1] :].split(" "),
                    )
                )
            ).title()
            return (date, area)
        return (False, False)

    def extractDataFrame(self, sheetName: str, sheetData: pd.DataFrame):
        # sheetData.columns = self.columns[:-2]
        date, location = self.__extractDateLocation(sheetName)
        if date == False:
            print(sheetName)
            return pd.DataFrame()

        sheetData = sheetData[
            sheetData.columns[sheetData.apply(self.__checkColumnName, axis=0)]
        ].copy()

        sheetData.columns = self.columns
        sheetData = sheetData[sheetData.apply(self.__extractTable, axis=1)].copy()

        sheetData["Date"] = date
        sheetData["Location"] = location

        return sheetData

    def add(self, dataFrame: pd.DataFrame):
        dataFrame = dataFrame.dropna(how="all", axis=1)
        if not dataFrame.empty:
            self.processedDataFrame = pd.concat(
                [self.processedDataFrame, dataFrame], ignore_index=True
            )

    def save(self, fileName):
        self.processedDataFrame.to_csv(fileName, index=False)

In [5]:
# Column names to extract. They are applied to the matched sheet columns by
# position, so keep them in the same left-to-right order as the workbook.
extract = Extract(
    [
        # "No",
        "Article Code",
        "Item Description",
        # "Uom",
        "UoM",
        "Dispatched Qty",
        # "Invoice Qty.",
        # "Received Qty",
        "Rate",
        "Total Amount",
        # "Amount",
        # "Date",
        # "Location",
    ]
)

# Names of sheets that were empty, yielded no rows, or raised while parsing.
errorSheets: List[str] = []


for sheetName, sheetData in excelBook.items():
    if sheetData.empty:
        errorSheets.append(sheetName)
        continue

    try:
        df = extract.extractDataFrame(sheetName, sheetData)
        if df.empty:
            errorSheets.append(sheetName)
        else:
            extract.add(df)
    except Exception as error:
        # Record the failure and keep going so one malformed sheet does not
        # silently drop every sheet that follows it.
        print(
            f"{sheetName}: could not parse sheet with "
            f"{len(sheetData.columns)} columns ({error!r})"
        )
        errorSheets.append(sheetName)


print("The following sheets are not parsed: ")
pprint(errorSheets)

The following sheets are not parsed: 
[]


In [6]:
# Write the cleaned rows next to the other cleaned files, reusing the
# workbook's name with its extension swapped for ".csv".
extract.save(join(CLEANED_FOLDER_PATH, f"{fileName.rsplit('.', 1)[0]}.csv"))