In [42]:
import pandas as pd
import os
import re
import regex


def _readData(targetTable :str,method:str) -> dict:
    
    if method == "LOCAL":
        dir = os.getcwd() + "/conversion_table/"
        match targetTable:
            case "BRAND":
                dir = dir + "brand_conversion.csv"
            case "COUNTRY":
                dir = dir + "country_conversion.csv"
            case "SPEC":
                dir = dir + "spec_conversion.csv"
            case "PRODUCT":
                dir = dir + "product_conversion.csv"
            case "WAREHOUSE":
                dir = dir + "warehouse_conversion.csv"
            case "SUPPLIER":
                dir = dir + "supplier_conversion.csv"
            case "CATEGORY":
                dir = dir + "product_conversion.csv"
            case "PACKING":
                dir = dir + "packing_conversion.csv"
            case "WEIGHTUNIT":
                dir = dir + "weightunit_conversion.csv"
            case _:
                raise TypeError("Undefined type : " + targetTable )

        df = pd.read_csv(dir)
        if targetTable == "PRODUCT":
            #product is a special case. There is a "category" column which affects the algorithm 
            df = df.drop(["category"], axis=1)
        elif targetTable == "CATEGORY":
            df = df[['category', 'Standard_product']]
            category_dict = df.groupby('category')['Standard_product'].apply(list).to_dict()
            return category_dict
        
        df = df.map(lambda x: x.lower() if isinstance(x, str) else x).T    
        new_columns = df.iloc[0]
        df_new = df[1:].reset_index(drop=True)
        df_new.columns = new_columns
        result_dict = df_new.to_dict('list')
        dataDict = {k: [i for i in v if type(i) != float] for k, v in result_dict.items()}
        return dataDict
    
    elif method == "AWS":
        return
            

    return 

def _compareString(stringValue: str, textlist: list) -> bool:
    """Tell whether any entry of ``textlist`` occurs inside ``stringValue``.

    The comparison is a case-insensitive substring test. Entries are passed
    through ``str`` first, so non-string entries are compared by their text
    form. An empty ``textlist`` yields ``False``.
    """
    haystack = stringValue.lower()
    return any(str(candidate).lower() in haystack for candidate in textlist)

def _match(concatText: str, dataDict: dict) -> str | None:
    """Return the first key of ``dataDict`` that ``concatText`` refers to.

    A key matches when ``concatText`` contains, case-insensitively, either
    the key itself or any of the aliases stored under that key. Keys are
    tried in the dictionary's iteration order and the first hit wins.

    Args:
        concatText: Free text to search in.
        dataDict: Mapping of standard name -> iterable of aliases.

    Returns:
        The matching key exactly as stored in ``dataDict``, or ``None`` when
        nothing matches.
    """
    haystack = concatText.lower()
    for key, aliases in dataDict.items():
        if _compareString(haystack, aliases) or key.lower() in haystack:
            return key
    return None

def _matchList(concatText: str, dataDict: dict) -> list:
    """Collect every key of ``dataDict`` that ``concatText`` refers to.

    A key qualifies when ``concatText`` contains, case-insensitively, the key
    itself or one of the aliases listed under it. Keys are returned in the
    dictionary's iteration order; the list is empty when nothing matches.
    """
    haystack = concatText.lower()
    return [
        name
        for name, aliases in dataDict.items()
        if _compareString(haystack, list(aliases)) or (name.lower() in haystack)
    ]


def getBrand(concatText: str) -> str | None:
    """Return the standard brand name mentioned in ``concatText``.

    The brand table is loaded through ``_readData("BRAND", "LOCAL")`` and
    searched with ``_match``.

    Returns:
        The matching table key converted to upper case, or ``None`` when no
        brand is found (a message is printed in that case).
    """
    brandDict = _readData("BRAND", "LOCAL")
    standardName = _match(concatText, brandDict)

    # _match reports "nothing found" as None, so testing only for "" would
    # never trigger the message.
    if not standardName:
        print("no _match is found :" + concatText)

    if standardName is None:
        return None
    return standardName.upper()

def getCountry(concatText: str) -> str | None:
    """Return the standard country name mentioned in ``concatText``.

    The country table is loaded through ``_readData("COUNTRY", "LOCAL")`` and
    searched with ``_match``.

    Returns:
        The matching table key as stored in the table (it is not
        upper-cased), or ``None`` when no country is found (a message is
        printed in that case).
    """
    countryDict = _readData("COUNTRY", "LOCAL")
    standardCountry = _match(concatText, countryDict)

    # _match reports "nothing found" as None, so testing only for "" would
    # never trigger the message.
    if not standardCountry:
        print("no _match is found :" + concatText)
    return standardCountry

def getSpec(concatText: str) -> list:
    """Return every standard spec mentioned in ``concatText``, upper-cased.

    The spec table comes from ``_readData("SPEC", "LOCAL")`` and is searched
    with ``_matchList``. String keys are upper-cased, any other key is passed
    through unchanged. When nothing matches a message is printed and an empty
    list is returned.
    """
    found = _matchList(concatText, _readData("SPEC", "LOCAL"))
    if len(found) == 0:
        print("no _match is found")

    upper_cased = []
    for spec in found:
        if isinstance(spec, str):
            upper_cased.append(spec.upper())
        else:
            upper_cased.append(spec)
    return upper_cased

def getProduct(concatText: str) -> str:
    """Return the standard product name mentioned in ``concatText``.

    The product table is loaded through ``_readData("PRODUCT", "LOCAL")`` and
    searched with ``_match``.

    Returns:
        The matching table key converted to upper case, or ``""`` when no
        product is found (a message is printed in that case).
    """
    productDict = _readData("PRODUCT", "LOCAL")
    standardProduct = _match(concatText, productDict)

    # _match reports "nothing found" as None, so testing only for "" would
    # never trigger the message.
    if not standardProduct:
        print("no _match is found")
        return ""
    return standardProduct.upper()
    

def getWarehoue(concatText: str) -> list:
    """Return every warehouse name mentioned in ``concatText``.

    The warehouse table comes from ``_readData("WAREHOUSE", "LOCAL")`` and is
    searched with ``_matchList``. The matching table keys are returned as
    stored; when nothing matches a message is printed and the empty list is
    returned.
    """
    found = _matchList(concatText, _readData("WAREHOUSE", "LOCAL"))
    if found:
        return found
    print("no _match is found")
    return found

def getSupplier(concatText: str) -> str | None:
    """Return the supplier name mentioned in ``concatText``.

    The supplier table is loaded through ``_readData("SUPPLIER", "LOCAL")``
    and searched with ``_match``, so only the first matching supplier is
    reported.

    Returns:
        The matching table key as stored in the table, or ``None`` when no
        supplier is found (a message is printed in that case).
    """
    supplierDict = _readData("SUPPLIER", "LOCAL")
    supplier = _match(concatText, supplierDict)

    # _match reports "nothing found" as None, so testing only for "" would
    # never trigger the message.
    if not supplier:
        print("no _match is found")
    return supplier
    

def getCategory(concatText: str) -> str:
    """Return the upper-cased category of the product named in ``concatText``.

    The product is resolved with ``getProduct`` first; its name is then
    looked up in the category table from ``_readData("CATEGORY", "LOCAL")``
    via ``_match``. An empty string is returned when either the product or
    its category cannot be determined.
    """
    product = getProduct(concatText)
    if product == "":
        return ""

    category = _match(product, _readData("CATEGORY", "LOCAL"))
    return "" if category is None else category.upper()

def getPacking(concatText: str) -> str | None:
    """Return the packing type mentioned in ``concatText``.

    The packing table is loaded through ``_readData("PACKING", "LOCAL")`` and
    searched with ``_match``, so only the first matching packing type is
    reported.

    Returns:
        The matching table key as stored in the table, or ``None`` when no
        packing type is found (a message is printed in that case).
    """
    packingDict = _readData("PACKING", "LOCAL")
    packing = _match(concatText, packingDict)

    # _match reports "nothing found" as None, so testing only for "" would
    # never trigger the message.
    if not packing:
        print("no _match is found")
    return packing

def getPrice(concatText: str) -> float:
    # Direct and simplified regex pattern focusing on capturing numeric values
    # Optionally preceded by $ and followed by zero or more spaces and then units or the end of the string
    pattern = r"\$?\s*(\d+(?:\.\d+)?)\s*(?:KG|LB|/KG|/P|b|/lb|/磅|/包|/k|\Z)?"
    
    # Searching for all matches, considering case insensitivity for units
    matches = re.findall(pattern, concatText, flags=re.IGNORECASE)

    if matches:
        # Assuming the first match is the relevant price
        try:
            return float(matches[0])
        except ValueError:
            print(f"Conversion issue with: '{concatText}', found: {matches[0]}")
            return 0.0

    print(f"No valid price found in: '{concatText}'")
    return None
    

def getWeightUnit(concatText: str) -> str:
    """Return the weight unit mentioned in ``concatText``.

    The weight-unit table is loaded through
    ``_readData("WEIGHTUNIT", "LOCAL")`` and searched with ``_matchList``.
    When several units match, the last one in table order is returned.

    Returns:
        The matching table key as stored in the table, or ``""`` when no
        unit is found (a message is printed in that case).
    """
    weightUnitDict = _readData("WEIGHTUNIT", "LOCAL")
    weightUnit = _matchList(concatText, weightUnitDict)
    if not weightUnit:
        print("no _match is found")
        # Indexing the empty list would raise IndexError; report "no unit"
        # with an empty string instead.
        return ""
    return weightUnit[-1]



In [48]:
# Sample inputs for a manual check of getPrice: dollar amounts with and
# without a space after "$", amounts followed by a unit, bare numbers, and
# one mixed Chinese/English product line.
test_cases = [
    "$113.0",
    "Price is: $45.50",
    "Cost: $99/KG",
    "It's $1.99/LB this week.",
    "$100 is the total.",
    "The price is $ 123.45 per unit.",
    "A discount at $5 off",
    "Now only $0.99!",
    "Save on our deal of $100.00 savings",
    "This item costs $ 50",
    "$75",
    "$123.99 per package",
    "It was $100.00 before the discount.",
    "Expect to pay around $40/kg for quality goods.",
    "Prices start at $9.99/lb for our premium product line.",
    "Just $199.99 for a limited time!",
    "Get it now for the low price of $250!",
    "On sale: $150!",
    "Originally $300, now just $150!",
    "Special promotion: only $0.99 per lb!",
    "40/lb",
    "50/",
    "30",
    # NOTE(review): getPrice uses the first number it finds, which for this
    # line is the 4 in "4k" rather than 22.90 - confirm that is intended.
    "西冷美麗華4k+3條庄留單prime22.90P 23.00P"
    # Add more test cases as needed
]

# Run getPrice on every sample and print the input next to the extracted
# value so the results can be checked by eye.
for case in test_cases:
    result = getPrice(case)
    print(f"Extracted from '{case}': {result}")

Extracted from '$113.0': 113.0
Extracted from 'Price is: $45.50': 45.5
Extracted from 'Cost: $99/KG': 99.0
Extracted from 'It's $1.99/LB this week.': 1.99
Extracted from '$100 is the total.': 100.0
Extracted from 'The price is $ 123.45 per unit.': 123.45
Extracted from 'A discount at $5 off': 5.0
Extracted from 'Now only $0.99!': 0.99
Extracted from 'Save on our deal of $100.00 savings': 100.0
Extracted from 'This item costs $ 50': 50.0
Extracted from '$75': 75.0
Extracted from '$123.99 per package': 123.99
Extracted from 'It was $100.00 before the discount.': 100.0
Extracted from 'Expect to pay around $40/kg for quality goods.': 40.0
Extracted from 'Prices start at $9.99/lb for our premium product line.': 9.99
Extracted from 'Just $199.99 for a limited time!': 199.99
Extracted from 'Get it now for the low price of $250!': 250.0
Extracted from 'On sale: $150!': 150.0
Extracted from 'Originally $300, now just $150!': 300.0
Extracted from 'Special promotion: only $0.99 per lb!': 0.99
Ext

In [None]:
# Load the product conversion table and preview it without its "category"
# column.
df = pd.read_csv(os.getcwd() + "/conversion_table/"+"product_conversion.csv")
df = df.drop(columns=["category"])
df

In [None]:
# Load the lookup dictionaries used for ad-hoc checks in this notebook. The
# loader defined in this file is named _readData (with a leading underscore).
warehouseDict = _readData("WAREHOUSE","LOCAL")
specDict = _readData("SPEC","LOCAL")
brandDict = _readData("BRAND","LOCAL")
productDict = _readData("PRODUCT","LOCAL")
display(productDict)

In [None]:
# Build a {category: [Standard_product, ...]} mapping directly from a CSV and
# display it.
# NOTE(review): this cell reads category_conversion.csv, whereas
# _readData("CATEGORY", "LOCAL") builds the same mapping from
# product_conversion.csv - confirm which file is the intended source.
df = pd.read_csv(os.getcwd() + "/conversion_table/category_conversion.csv")
category_dict = df.groupby('category')['Standard_product'].apply(list).to_dict()
category_dict

In [None]:
# Quick single-input check of getPrice; the recorded run earlier in this
# notebook printed 113.0 for this input.
getPrice("$113.0")


In [None]:
# List every spec key found in a sample product line. The helper defined in
# this file is named _matchList (with a leading underscore); specDict must
# have been loaded by an earlier cell.
print(_matchList("西冷美麗華4k+3條庄留單prime22.90P 23.00P",specDict))

In [None]:
def use_regex(input_text):
    """Match ``input_text`` from its start against a fixed pattern.

    The pattern requires, in order: one or more ASCII letters or digits, the
    literal text 牛下肩肉眼卷, a parenthesised group (possibly empty), and one
    or more ASCII letters. Matching ignores case.

    Returns the ``re.Match`` object, or ``None`` when the start of the text
    does not fit the pattern.
    """
    return re.match(
        r"[A-Za-z0-9]+牛下肩肉眼卷\([^)]*\)[A-Za-z]+",
        input_text,
        re.IGNORECASE,
    )


# The sample has no parenthesised group, so this prints None.
print(use_regex("dfa牛下肩肉眼卷daf"))