In [1]:
import re
from collections import Counter
from datetime import datetime
from glob import glob
from math import floor
from os.path import join
from pprint import pprint
from typing import List, Tuple

import numpy as np
import pandas as pd

In [2]:
def nameExtracter(rightNamesList: List[str], wrongName: str) -> str:
    """Return the entry of ``rightNamesList`` that best matches ``wrongName``.

    Similarity is a case-insensitive Sorensen-Dice coefficient computed over
    character multisets::

        2 * shared_characters / (len(wrongName) + len(candidate))

    Character order is ignored, so this tolerates typos and extra text around
    the name, but it is not an edit distance.

    Args:
        rightNamesList: Canonical names to choose from.
        wrongName: Raw, possibly misspelt text to match against the candidates.

    Returns:
        The candidate with the highest score; on a tie, the one that appears
        first in ``rightNamesList``.

    Raises:
        ValueError: If ``rightNamesList`` is empty.
    """
    if not rightNamesList:
        raise ValueError("rightNamesList must contain at least one candidate name")

    counterOriginal = Counter(wrongName.lower())
    originalSize = sum(counterOriginal.values())

    def score(rightName: str) -> float:
        counterLocation = Counter(rightName.lower())
        combinedSize = originalSize + sum(counterLocation.values())
        # Both strings empty: nothing to compare, and the plain formula
        # would divide by zero.
        if combinedSize == 0:
            return 0.0
        return 2 * sum((counterOriginal & counterLocation).values()) / combinedSize

    # max() keeps the first candidate among equal scores.
    return max(rightNamesList, key=score)


# Bar length used by the previous pretty() call; drives the padding that is
# printed before the next redraw.
prev = 0


def pretty(label: str, completed: float, total: int, length: int = 30) -> None:
    """Redraw a single-line text progress bar on stdout.

    The line has the form ``<label> [====------] NN%`` and is written without a
    trailing newline, preceded by a carriage return so that successive calls
    overwrite each other.

    Args:
        label: Text shown in front of the bar.
        completed: Amount of work done so far.
        total: Amount of work overall. A value of zero or less is rendered as
            an empty bar at 0% instead of dividing by zero.
        length: Width of the bar in characters.

    Side effects:
        Updates the module-level ``prev`` with ``length``.
    """
    global prev

    fraction = completed / total if total > 0 else 0.0
    # Keep the bar inside its brackets even if the caller overshoots.
    fraction = min(max(fraction, 0.0), 1.0)

    filled = int(fraction * length)
    # Derive the unfilled part from the filled part: truncating both
    # independently can leave the bar one character short.
    unfilled = length - filled

    print(
        # Padding emitted before the carriage return -- presumably meant to
        # blank leftovers of a previous, longer line.
        " " * prev * 2
        + f"\r{label} ["
        + "=" * filled
        + "-" * unfilled
        + f"] {int(fraction * 100)}%",
        end="",
    )
    prev = length


def generateInvoiceId(date: datetime, locationCode: str, invoiceVersion: int):
    """Build an invoice id: ``DDMMYYYY`` + ``"U"`` + location code + version."""
    dayMonthYear = format(date, "%d%m%Y")
    return "".join([dayMonthYear, "U", str(locationCode), str(invoiceVersion)])


def generatePONo(date: datetime, storeId: str, supplierId: str):
    """Build a purchase-order number: ``YYYYMMDD-<storeId>-<supplierId>``."""
    parts = (date.strftime("%Y%m%d"), str(storeId), str(supplierId))
    return "-".join(parts)

##### Hard-coded configuration


In [3]:
# Folders, relative to the notebook's working directory, that hold the raw
# workbooks and receive the cleaned output.
RAW_FOLDER_PATH = "./../../upgrade_mandi/data/raw"
CLEANED_FOLDER_PATH = "./../../upgrade_mandi/data/cleaned"

# NOTE(review): not referenced by any cell visible in this notebook.
SWIGGY_RAW = join(RAW_FOLDER_PATH, "swiggy")

# Per-domain (customer) configuration:
#   "path"           -> sub-folder names; "raw" is joined onto RAW_FOLDER_PATH
#                       when the workbooks are loaded.
#   "input-columns"  -> header names looked for in each sheet; they also become
#                       the column names of the extracted table.
#   "output-columns" -> NOTE(review): not read by any visible cell.
#   "locations"      -> canonical location name -> details copied onto every
#                       extracted row ("retailer", "shipping-address") or used
#                       to build the PO number ("storeId").
domains = {
    "Swiggy": {
        "path": {
            "raw": "swiggy",
            # NOTE(review): key is spelt "cleand"; left untouched because it is
            # a runtime key that other code may look up.
            "cleand": "swiggy",
        },
        "input-columns": [
            "Article Code",
            "Item Description",
            "UoM",
            "Dispatched Qty",
            "Rate",
            "Total Amount",
        ],
        "output-columns": [
            "Sr",
            "Article Code",
            "Item Description",
            "UoM",
            "Dispatched Qty",
            # NOTE(review): spelt "Recieved"; left as-is because it is data.
            "Recieved Qty",
            "Rate",
            "Total Amount",
        ],
        "locations": {
            "Ayodhya Nagar": {
                "shipping-address": "Gadewar Lawns Plot No.31; 32; 33; 36; 37 And 38; K. H. No; 72/2; Situated At Gadewar Lawn; Shri Ram Wadi",
                "retailer": "Rajidi Retail Pvt Ltd",
                "code": "AN",
                "storeId": "1403419",
            },
            "Byramji": {
                "shipping-address": "Unit nos - 59 to 71 Lower Ground Floor Ginger Square' City Survey No - 1049",
                "retailer": "Rajidi Retail Pvt Ltd",
                "code": "B",
                "storeId": "1392084",
            },
            "Dharampeth": {
                "shipping-address": "Plot No. 151; CTS No. 135 Puja Sabhagrah; Ravi Nagar Square; Ram Nagar",
                "retailer": "Swinsta Ent Private Limited",
                "code": "DH",
                "storeId": "1397624",
            },
            "Mahal": {
                "shipping-address": "Unit no - G-1; Plot no.58; sardar patel timber Dhantoli; NAGPUR- 440027",
                "retailer": "Rajidi Retail Pvt Ltd",
                "code": "MH",
                "storeId": "1393571",
            },
            "Manish Nagar": {
                "shipping-address": "Ground floor 'Jayanti Mansion III; Manish nagar  Nagpur Maharashtra",
                "retailer": "Rajidi Retail Pvt Ltd",
                "code": "MN",
                "storeId": "1392532",
            },
            "Nandanvan": {
                "shipping-address": "Vinayak Tower; Lower Ground Floor; Survey No.212 Gurudev Nagar Main Road; New Nanadanvan",
                "retailer": "Swinsta Ent Private Limited",
                "code": "NA",
                "storeId": "1397035",
            },
            "Sai Mandir": {
                "shipping-address": "Khasra No 18/2; city Survey No.718; House No. 781/B; Situated at Village Ajni",
                "retailer": "Swinsta Ent Private Limited",
                "code": "S",
                "storeId": "1399707",
            },
        },
    },
    "Zepto": {
        "path": {
            "raw": "zepto",
            "cleand": "zepto",
        },
        "input-columns": [
            "No",
            "Article Name",
            "UoM",
            "Invoice Qty.",
            "Rate",
            "Amount",
        ],
        # NOTE(review): these entries have no "code" or "storeId", and their
        # shipping addresses are empty. Extract.extractDataFrame reads
        # "storeId", so it would raise KeyError for a Zepto sheet.
        "locations": {
            "Gokulpeth": {"shipping-address": "", "retailer": "Dorgheria"},
            "Mahada": {"shipping-address": "", "retailer": "Dorgheria"},
            "Khamla": {"shipping-address": "", "retailer": "Dorgheria"},
            "Garoba Maidan": {"shipping-address": "", "retailer": "Dorgheria"},
            "Raghuji Nagar": {"shipping-address": "", "retailer": "Dorgheria"},
            "Zingabai Takli": {"shipping-address": "", "retailer": "Dorgheria"},
            "Bhupesh Nagar": {"shipping-address": "", "retailer": "Dorgheria"},
            "Besa": {"shipping-address": "", "retailer": "Dorgheria"},
        },
    },
}

##### Import raw workbooks


In [4]:
# Resolve each domain's workbook paths once, so the files that are loaded and
# the files that are reported below are guaranteed to be the same list.
# Sorting makes the load order independent of the order glob returns.
rawWorkbookPaths: dict[str, List[str]] = {
    domain: sorted(
        glob(join(RAW_FOLDER_PATH, domains[domain]["path"]["raw"], "*.xlsx"))
    )
    for domain in domains
}

# sheet_name=None makes read_excel return every sheet of a workbook as a
# {sheet name: DataFrame} dict, so each domain maps to a list of such dicts.
excelBooks: dict[str, List[dict[str, pd.DataFrame]]] = {
    domain: [pd.read_excel(path, sheet_name=None) for path in paths]
    for domain, paths in rawWorkbookPaths.items()
}

print(
    f"Found {len(excelBooks)} domains with {sum(len(books) for books in excelBooks.values())} files."
)

pprint(rawWorkbookPaths)

Found 2 domains with 4 files.
{'Swiggy': ['./../../upgrade_mandi/data/raw\\swiggy\\Swiggy 09-04-2025 to  '
            '22-06-2025.xlsx',
            './../../upgrade_mandi/data/raw\\swiggy\\Swiggy 22-06-2025 to '
            'Current.xlsx',
            './../../upgrade_mandi/data/raw\\swiggy\\Swiggy 28-01-2025 to '
            '08-04-2025.xlsx'],
 'Zepto': ['./../../upgrade_mandi/data/raw\\zepto\\Zepto 10-07-2025 - '
           '20-06-2025.xlsx']}


##### Extractor


In [7]:
class Extract:
    def __checkColumnName(self, col: pd.Series):
        return (
            col.astype(str)
            .apply(
                lambda string: " ".join(
                    list(
                        filter(
                            lambda x: not x.isspace() and x,
                            string.lower().replace("\n", "").split(" "),
                        )
                    )
                )
            )
            .isin(list(map(lambda x: x.lower(), self.columns)))
            .any()
        )

    def __extractTable(self, row: pd.Series) -> bool:
        return str(row[self.columns[0]]).isnumeric()

    def __extractDateLocation(self, rawString: str) -> Tuple[datetime, str] | False:
        datePattern = "^(\d){1,2}[-/](\d){1,2}[-/](\d){4}"
        dateMached = re.match(datePattern, rawString)

        if dateMached:
            dateFormat = "%d-%m-%Y"
            date = datetime.strptime(
                re.sub(r"[-/]", "-", dateMached.group(0)), dateFormat
            ).strftime("%d-%m-%Y")

            area = nameExtracter(
                list(self.locations.keys()),
                rawString,
            ).title()

            return (date, area)
        return (False, False)

    # dict[str, dict[str, str]]
    def __init__(self, columns: List[str], locations: dict, vendorName: str):
        self.columns: List[str] = columns
        self.locations = locations
        self.vendorName = vendorName

        self.locationParsed = {x: [] for x in self.locations}
        self.processedDataFrame: pd.DataFrame = pd.DataFrame(columns=self.columns)

    def extractDataFrame(self, sheetName: str, sheetData: pd.DataFrame):
        # sheetData.columns = self.columns[:-2]
        date, location = self.__extractDateLocation(sheetName)
        if date == False:
            return pd.DataFrame()

        sheetData = sheetData[
            sheetData.columns[sheetData.apply(self.__checkColumnName, axis=0)]
        ].copy()

        sheetData.columns = self.columns
        sheetData = sheetData[sheetData.apply(self.__extractTable, axis=1)].copy()

        invoiceVersion = 1
        numbers = re.findall(r"\d+(?:\.\d+)?", sheetName[11:])
        if numbers:
            invoiceVersion = int(numbers[0])

        sheetData["Date"] = date
        sheetData["Location"] = location
        sheetData["Invoice Version"] = invoiceVersion
        sheetData["Retailer"] = self.locations[location]["retailer"]
        sheetData["Vendor Name"] = self.vendorName
        sheetData["Shipping Address"] = self.locations[location]["shipping-address"]
        sheetData["PO No"] = generatePONo(
            datetime.strptime(date, "%d-%m-%Y"),
            self.locations[location]["storeId"],
            "74227878",
        )

        sheetData["Invoice Version"].astype(int)

        self.locationParsed[location].append(sheetName)

        return sheetData

    def add(self, dataFrame: pd.DataFrame):
        dataFrame = dataFrame.dropna(how="all", axis=1)
        if not dataFrame.empty:
            self.processedDataFrame = pd.concat(
                [self.processedDataFrame, dataFrame], ignore_index=True
            )

    def save(self, fileName, split=-1):
        self.processedDataFrame.to_csv(f"{fileName}.csv", index=False)

##### Run


In [8]:
# domain name -> Extract instance holding that domain's accumulated rows
extractedDomains: dict[str, Extract] = {}

# domain name -> one {sheet name: reason} entry per sheet that was not parsed
errorSheets: dict[str, List[dict[str, str]]] = {domain: [] for domain in domains}

for domain in domains:
    # Zepto is not processed yet -- presumably because its location entries
    # lack the "storeId" that Extract.extractDataFrame needs.
    if domain == "Zepto":
        continue

    extract = Extract(
        domains[domain]["input-columns"],
        domains[domain]["locations"],
        "Upgrade Mandi",
    )
    for excelBook in excelBooks[domain]:
        # Count every sheet, parsed or not, so the bar reaches 100% per workbook.
        for count, (sheetName, sheetData) in enumerate(excelBook.items(), start=1):
            if sheetData.empty:
                errorSheets[domain].append({sheetName: "Empty sheet"})
            else:
                # Extraction itself can raise (unexpected layout, unknown
                # location, invalid date). Record the sheet and carry on, so
                # one bad sheet cannot hide the sheets that follow it.
                try:
                    df = extract.extractDataFrame(sheetName, sheetData)
                    if not df.empty:
                        extract.add(df)
                    else:
                        errorSheets[domain].append({sheetName: "Empty DataFrame"})
                except Exception as e:
                    errorSheets[domain].append({sheetName: str(e)})
            pretty(domain, count, len(excelBook), 30)
        print()
    print(f"Processed {domain} data.\n")

    extractedDomains[domain] = extract

print("The following sheets are not parsed: ")
pprint(errorSheets)

Processed Swiggy data.

The following sheets are not parsed: 
{'Swiggy': [], 'Zepto': []}


In [None]:
# Show the first accumulated Swiggy row as a quick check of the extraction.
# NOTE(review): the output saved with this cell shows Date as 06-22-2025, which
# does not match the %d-%m-%Y format that Extract writes -- re-run to refresh.
extractedDomains["Swiggy"].processedDataFrame.head(1)

Unnamed: 0,Article Code,Item Description,UoM,Dispatched Qty,Rate,Total Amount,Date,Location,Invoice Version,Retailer,Vendor Name,Shipping Address,PO No
0,8439,Beans Diced,200g,2,40,80,06-22-2025,Manish Nagar,1.0,Rajidi Retail Pvt Ltd,Upgrade Mandi,Ground floor 'Jayanti Mansion III; Manish naga...,20250622-1392532-74227878


In [None]:
# Recompute "Total Amount" as quantity x rate instead of trusting the value
# typed into the sheet. The multiplication is done column-wise, which also
# works when no rows were extracted. astype(int) truncates any fractional part.
swiggyFrame = extractedDomains["Swiggy"].processedDataFrame
swiggyFrame["Total Amount"] = (
    swiggyFrame["Dispatched Qty"] * swiggyFrame["Rate"]
).astype(int)

In [None]:
# Write each domain's accumulated table to
# <CLEANED_FOLDER_PATH>/<domain in lower case>/<domain>.csv
for domain, domainExtract in extractedDomains.items():
    outputStem = join(CLEANED_FOLDER_PATH, domain.lower(), domain)
    domainExtract.save(outputStem)

##### Test
