In [1]:
import re
from collections import Counter
from datetime import datetime
from glob import glob
from math import floor
from os.path import join
from pprint import pprint
from typing import List, Tuple

import numpy as np
import pandas as pd

In [2]:
def nameExtracter(rightNamesList: List[str], wrongName: str) -> str:
    """Return the entry of ``rightNamesList`` that best matches ``wrongName``.

    Similarity is the Dice coefficient computed on case-insensitive character
    multisets: ``2 * shared / (len(wrongName) + len(candidate))``, where
    ``shared`` is the number of characters the two strings have in common
    (counted with multiplicity). Character order is ignored.

    Args:
        rightNamesList: Candidate names; one of them is always returned.
        wrongName: The (possibly misspelled or decorated) text to classify.

    Returns:
        The best-scoring candidate. Ties go to the earliest candidate.

    Raises:
        ValueError: If ``rightNamesList`` is empty.
    """
    if not rightNamesList:
        raise ValueError("rightNamesList must contain at least one candidate name")

    counterOriginal = Counter(wrongName.lower())
    originalSize = sum(counterOriginal.values())

    def similarity(rightName: str) -> float:
        counterCandidate = Counter(rightName.lower())
        combinedSize = originalSize + sum(counterCandidate.values())
        if combinedSize == 0:
            # Both strings are empty: score 0 instead of dividing 0 by 0.
            return 0.0
        shared = sum((counterOriginal & counterCandidate).values())
        return 2 * shared / combinedSize

    # max() keeps the first maximal element, i.e. the lowest-index best match.
    return max(rightNamesList, key=similarity)


# Bar length used by the previous pretty() call; read and updated by pretty().
prev = 0


def pretty(label: str, completed: float, total: int, length: int = 30) -> None:
    """Print a single-line text progress bar without a trailing newline.

    The line has the form ``<label> [====----] NN%`` and starts with a carriage
    return so that successive calls overwrite each other in a terminal.

    Args:
        label: Text shown in front of the bar.
        completed: Number of units finished so far.
        total: Total number of units. ``0`` is treated as "nothing to do" and
            rendered as a full bar instead of raising ZeroDivisionError.
        length: Width of the bar in characters.
    """
    global prev
    fraction = completed / total if total else 1.0
    filled = int(fraction * length)
    # Derive the unfilled part from the filled part; truncating both segments
    # independently could leave the bar one character short of `length`.
    remaining = length - filled
    print(
        # Padding emitted before the carriage return, sized from the previous
        # call's bar length (presumably meant to blank out leftovers).
        " " * prev * 2
        + f"\r{label} ["
        + "=" * filled
        + "-" * remaining
        + f"] {int(fraction * 100)}%",
        end="",
    )
    prev = length

##### Hard-coded configuration


In [3]:
# Relative path of the folder that is searched for raw "*.xlsx" workbooks.
RAW_FOLDER_PATH = "./../data/raw"
# Relative path of the folder that receives one cleaned CSV per domain.
CLEANED_FOLDER_PATH = "./../data/cleaned"

# Domain names: used to classify workbook paths and as keys of the per-domain
# settings and results.
domains = ["Swiggy", "Zepto"]

##### Load raw workbooks


In [4]:
# List the raw workbooks once so that the files that get loaded and the files
# that get printed below are guaranteed to be the same. sorted() makes the
# order independent of the order in which the file system lists them.
rawWorkbookPaths: List[str] = sorted(glob(join(RAW_FOLDER_PATH, "*.xlsx")))

# Workbook paths grouped by the domain whose name best matches the path.
excelPaths: dict[str, List[str]] = {
    domain: [
        path for path in rawWorkbookPaths if domain == nameExtracter(domains, path)
    ]
    for domain in domains
}

# One entry per workbook; sheet_name=None loads every sheet of the workbook
# as a {sheet name: DataFrame} mapping.
excelBooks: dict[str, List[dict[str, pd.DataFrame]]] = {
    domain: [pd.read_excel(path, sheet_name=None) for path in excelPaths[domain]]
    for domain in domains
}

print(
    f"Found {len(excelBooks)} domains with {sum(len(books) for books in excelBooks.values())} files."
)

pprint(excelPaths)

Found 2 domains with 4 files.
{'Swiggy': ['./../data/raw\\Swiggy 01-01-2025 - 15-04-2025.xlsx',
            './../data/raw\\Swiggy 09-04-2025 - 22-06-2025.xlsx',
            './../data/raw\\Swiggy 22-06-2025 - 10-07-2025.xlsx'],
 'Zepto': ['./../data/raw\\Zepto 10-07-2025 - 20-06-2025.xlsx']}


##### Extractor


In [5]:
class Extract:
    def __checkColumnName(self, col: pd.Series):
        return (
            col.astype(str)
            .apply(
                lambda string: " ".join(
                    list(
                        filter(
                            lambda x: not x.isspace() and x,
                            string.lower().replace("\n", "").split(" "),
                        )
                    )
                )
            )
            .isin(list(map(lambda x: x.lower(), self.columns)))
            .any()
        )

    def __extractTable(self, row: pd.Series) -> bool:
        return str(row[self.columns[0]]).isnumeric()

    def __extractDateLocation(self, rawString: str) -> Tuple[datetime, str] | False:
        datePattern = "^(\d){1,2}[-/](\d){1,2}[-/](\d){4}"
        dateMached = re.match(datePattern, rawString)

        if dateMached:
            dateFormat = "%d-%m-%Y"
            date = datetime.strptime(
                re.sub(r"[-/]", "-", dateMached.group(0)), dateFormat
            ).strftime("%m-%d-%Y")

            # print(self.locations.keys())
            area = nameExtracter(
                list(self.locations.keys()),
                rawString,
            ).title()

            return (date, area)
        return (False, False)

    # dict[str, dict[str, str]]
    def __init__(self, columns: List[str], locations: List[str], vendorName: str):
        self.columns: List[str] = columns
        self.locations = locations
        self.vendorName = vendorName

        self.locationParsed = {x: [] for x in self.locations}
        self.processedDataFrame: pd.DataFrame = pd.DataFrame(columns=self.columns)

    def extractDataFrame(self, sheetName: str, sheetData: pd.DataFrame):
        # sheetData.columns = self.columns[:-2]
        date, location = self.__extractDateLocation(sheetName)
        if date == False:
            print(sheetName)
            return pd.DataFrame()

        sheetData = sheetData[
            sheetData.columns[sheetData.apply(self.__checkColumnName, axis=0)]
        ].copy()

        sheetData.columns = self.columns
        sheetData = sheetData[sheetData.apply(self.__extractTable, axis=1)].copy()

        invoiceVersion = 1
        numbers = re.findall(r"\d+(?:\.\d+)?", sheetName[11:])
        if numbers:
            invoiceVersion = int(numbers[0])

        sheetData["Date"] = date
        sheetData["Location"] = location
        self.locationParsed[location].append(sheetName)
        sheetData["Invoice Version"] = invoiceVersion
        sheetData["Retailer"] = self.locations[location]["retailer"]
        sheetData["Vendor Name"] = self.vendorName
        sheetData["Shipping Address"] = self.locations[location]["shipping-address"]
        sheetData["Invoice Version"].astype(int)

        return sheetData

    def add(self, dataFrame: pd.DataFrame):
        dataFrame = dataFrame.dropna(how="all", axis=1)
        if not dataFrame.empty:
            self.processedDataFrame = pd.concat(
                [self.processedDataFrame, dataFrame], ignore_index=True
            )

    def save(self, fileName):
        self.processedDataFrame.to_csv(fileName, index=False)

##### Run


In [6]:
# Per-domain extraction settings, keyed by domain name.
#   "columns":   header names of the invoice table, passed to Extract as the
#                expected columns.
#   "locations": location name -> details; Extract reads the "retailer" and
#                "shipping-address" entries of the matched location.
inputParameters = {
    "Swiggy": {
        "columns": [
            "Article Code",
            "Item Description",
            "UoM",
            "Dispatched Qty",
            "Rate",
            "Total Amount",
        ],
        "locations": {
            "Nandanvan": {
                "shipping-address": 'Vinayak Tower", Lower Ground Floor, Survey No.212 Gurudev Nagar Main Road, New Nanadanvan',
                "retailer": "Swinsta",
            },
            "Dharampeth": {
                "shipping-address": "Plot No. 151, CTS No. 135Puja Sabhagrah, Ravi Nagar Square, Ram Nagar",
                "retailer": "Swinsta",
            },
            "Mahal": {
                "shipping-address": "Unit no - G-1, Plot no.58, sardar patel timber Dhantoli, NAGPUR- 440027",
                "retailer": "Rajidi",
            },
            "Ayodhya Nagar": {
                "shipping-address": "Gadewar Lawns Plot No.31,32,33,36,37And 38,K. H. No, 72/2, Situated At Gadewar Lawn , Shri Ram Wadi",
                "retailer": "Rajidi",
            },
            "Sai Mandir": {
                "shipping-address": "Khasra No 18/2, city Survey No.718, House No. 781/B, Situated at Village Ajni",
                "retailer": "Swinsta",
            },
            "Manish Nagar": {
                "shipping-address": 'Ground floor "Jayanti Mansion III"Manish nagar  Nagpur Maharashtra',
                "retailer": "Rajidi",
            },
            "Byramji": {
                "shipping-address": 'Unit nos - 59 to 71 Lower Ground FloorGinger Square" City Survey No - 1049',
                "retailer": "Rajidi",
            },
        },
    },
    "Zepto": {
        "columns": [
            "No",
            "Article Name",
            "UoM",
            "Invoice Qty.",
            "Rate",
            "Amount",
        ],
        # Shipping addresses are left empty for every Zepto location.
        "locations": {
            "Gokulpeth": {"shipping-address": "", "retailer": "Dorgheria"},
            "Mahada": {"shipping-address": "", "retailer": "Dorgheria"},
            "Khamla": {"shipping-address": "", "retailer": "Dorgheria"},
            "Garoba Maidan": {"shipping-address": "", "retailer": "Dorgheria"},
            "Raghuji Nagar": {"shipping-address": "", "retailer": "Dorgheria"},
            "Zingabai Takli": {"shipping-address": "", "retailer": "Dorgheria"},
            "Bhupesh Nagar": {"shipping-address": "", "retailer": "Dorgheria"},
            "Besa": {"shipping-address": "", "retailer": "Dorgheria"},
        },
    },
}

# One Extract per domain, holding that domain's accumulated rows.
extractedDomains: dict[str, Extract] = {}

# Names of the sheets that produced no rows, grouped by domain.
errorSheets: dict[str, List[str]] = {domain: [] for domain in domains}

for domain in domains:
    extract = Extract(
        inputParameters[domain]["columns"],
        inputParameters[domain]["locations"],
        "Upgrade Mandi",
    )
    for excelBook in excelBooks[domain]:
        for count, (sheetName, sheetData) in enumerate(excelBook.items(), start=1):
            if sheetData.empty:
                errorSheets[domain].append(sheetName)
            else:
                try:
                    df = extract.extractDataFrame(sheetName, sheetData)
                    if df.empty:
                        errorSheets[domain].append(sheetName)
                    else:
                        extract.add(df)
                except Exception:
                    # A sheet that cannot be parsed is recorded and skipped;
                    # the remaining sheets of the workbook are still processed
                    # so the report below lists every sheet that was left out.
                    errorSheets[domain].append(sheetName)
            # Advance for every sheet visited so the bar ends at 100%.
            pretty(domain, count, len(excelBook), 30)
        print()
    print(f"Processed {domain} data.\n")

    extractedDomains[domain] = extract

print("The following sheets are not parsed: ")
pprint(errorSheets)

Processed Swiggy data.

Processed Zepto data.

The following sheets are not parsed: 
{'Swiggy': [], 'Zepto': []}


In [19]:
# Write each domain's accumulated rows to "<domain>.csv" in the cleaned folder.
for domain, extract in extractedDomains.items():
    extract.save(join(CLEANED_FOLDER_PATH, f"{domain}.csv"))

##### Test


In [63]:
# For every sheet of the first Swiggy workbook, take the cell in the fourth
# column at row label 3 ("Po") together with the sheet name ("loc"), then show
# the names of the sheets where that cell reads as "nan".
ser = pd.DataFrame(
    {
        "Po": [sheetData.iloc[:, 3][3] for sheetData in excelBooks["Swiggy"][0].values()],
        "loc": list(excelBooks["Swiggy"][0]),
    }
)
ser.loc[ser["Po"].map(lambda po: str(po) == "nan"), "loc"]

3       08-04-2025 DHARAMPETH
4            08-04-2025 MAHAL
9       07-04-2025 DHARAMPETH
10           07-04-2025 MAHAL
20           06-04-2025 MAHAL
                ...          
342          04-02-2025 MAHAL
348      03-02-2025 DHARMPETH
352      01-02-2025 DHARMPETH
361    31-01-2025  DHARAMPETH
367         30-01-2025 Dharam
Name: loc, Length: 112, dtype: object

In [None]:
# Group the sheets of the first Swiggy workbook by the text found in rows 1-3
# of their first column (the concatenated cells of that slice).
# NOTE(review): "count" holds each sheet's position in the workbook, so the
# grouped sum adds up positions rather than counting sheets — confirm intent.
df = pd.DataFrame(
    {
        "loc": ["".join(sheet.iloc[:, 0][1:4]) for sheet in excelBooks["Swiggy"][0].values()],
        "add": list(excelBooks["Swiggy"][0]),
        "count": list(range(len(excelBooks["Swiggy"][0]))),
    }
)

df.groupby("loc")["count"].sum()

loc
Shipping Address :-Gadewar Lawns Plot No.31,32,33,36,37And 38,K. H. No, 72/2, Situated At Gadewar Lawn , Shri Ram Wadi      905
Shipping Address :-Ground floor "Jayanti Mansion III"Manish nagar  Nagpur Maharashtra                                     11092
Shipping Address :-Khasra No 18/2, city Survey No.718, House No. 781/B, Situated at Village Ajni,                         11512
Shipping Address :-Plot No. 151, CTS No. 135Puja Sabhagrah , Ravi Nagar Square, Ram Nagar                                 11664
Shipping Address :-Unit no - G-1, Plot no.58 ,sardar patel timber Dhantoli , NAGPUR- 440027                               11231
Shipping Address :-Unit nos - 59 to 71 Lower Ground FloorGinger Square" City Survey No - 1049                             12421
Shipping Address :-Vinayak Tower" , Lower Ground Floor, Survey No.212 Gurudev Nagar Main Road , New Nanadanvan ,          11300
Name: count, dtype: int64

In [None]:
# Show the column labels of the rows accumulated for the Swiggy domain.
extractedDomains["Swiggy"].processedDataFrame.columns

Index(['Article Code', 'Item Description', 'UoM', 'Dispatched Qty', 'Rate',
       'Total Amount', 'Date', 'Location', 'Invoice Version', 'Retailer',
       'Vendor Name'],
      dtype='object')

In [None]:
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle
from reportlab.lib import colors


def create_styled_table_pdf(filename):
    """Write a PDF to ``filename`` containing one styled sample table.

    The table has a header row and two body rows of placeholder text. The
    header gets a grey background, whitesmoke bold text and extra bottom
    padding; body rows get a beige background; every cell is centred and
    framed by a black grid.
    """
    document = SimpleDocTemplate(filename, pagesize=letter)

    rows = [
        ["Header 1", "Header 2", "Header 3"],
        ["Row 1, Col 1", "Row 1, Col 2", "Row 1, Col 3"],
        ["Row 2, Col 1", "Row 2, Col 2", "Row 2, Col 3"],
    ]

    # Cell ranges are (column, row) pairs; -1 means the last column or row.
    headerStart, headerEnd = (0, 0), (-1, 0)
    bodyStart, lastCell = (0, 1), (-1, -1)
    styleCommands = [
        ("BACKGROUND", headerStart, headerEnd, colors.grey),
        ("TEXTCOLOR", headerStart, headerEnd, colors.whitesmoke),
        ("ALIGN", headerStart, lastCell, "CENTER"),
        ("FONTNAME", headerStart, headerEnd, "Helvetica-Bold"),
        ("BOTTOMPADDING", headerStart, headerEnd, 12),
        ("BACKGROUND", bodyStart, lastCell, colors.beige),
        ("GRID", headerStart, lastCell, 1, colors.black),
    ]

    sampleTable = Table(rows)
    sampleTable.setStyle(TableStyle(styleCommands))
    document.build([sampleTable])


# Generate the sample report in the current working directory.
create_styled_table_pdf("styled_table_report.pdf")

In [None]:
# Show rows 1-3 of the first column of the first sheet in the first Swiggy workbook.
list(excelBooks["Swiggy"][0].values())[0].iloc[:, 0][1:4]

1                                  Shipping Address :-
2    Vinayak Tower" , Lower Ground Floor, Survey No...
3          Gurudev Nagar Main Road , New Nanadanvan , 
Name: SWINSTA ENT. PRIVATE LIMITATE, dtype: object

In [None]:
# Scratch template listing the Swiggy location names. The values were never
# filled in, which made the cell a SyntaxError; None placeholders keep the
# notebook runnable. Presumably superseded by inputParameters["Swiggy"]["locations"].
{
    "Nandanvan": None,
    "Byramji": None,
    "Dharampeth": None,
    "Mahal": None,
    # Spelled as in inputParameters ("Ayodhya Nagar") so the names agree.
    "Ayodhya Nagar": None,
    "Sai Mandir": None,
    "Manish Nagar": None,
}

In [None]:
# Print the sheets whose column labels name neither retailer.
# `df` is a DataFrame by the time this cell runs, so iterating `df.items()`
# yields columns (Series) and `.columns` fails; iterate the workbook's sheets.
# NOTE(review): the first Swiggy workbook is assumed here, matching the other
# scratch cells — confirm which workbook was intended.
for sheetName, sheetData in excelBooks["Swiggy"][0].items():
    if (
        "RAJIDI RETAILS PRIVATE LIMITED" not in sheetData.columns
        and "SWINSTA ENT. PRIVATE LIMITATE" not in sheetData.columns
    ):
        print(sheetName)