In [66]:
# the get method is currently using local csv in long run it should be updated to AWS
import pandas as pd
import os
import re
import regex


def _readData(targetTable :str,method:str) -> dict:
    
    if method == "LOCAL":
        dir = os.getcwd() + "/conversion_table/"
        match targetTable:
            case "BRAND":
                dir = dir + "brand_conversion.csv"
            case "COUNTRY":
                dir = dir + "country_conversion.csv"
            case "SPEC":
                dir = dir + "spec_conversion.csv"
            case "PRODUCT":
                dir = dir + "product_conversion.csv"
            case "WAREHOUSE":
                dir = dir + "warehouse_conversion.csv"
            case "SUPPLIER":
                dir = dir + "supplier_conversion.csv"
            case "CATEGORY":
                dir = dir + "category_conversion.csv"
            case "PACKING":
                dir = dir + "packing_conversion.csv"
            case "WEIGHTUNIT":
                dir = dir + "weightunit_conversion.csv"
            case _:
                raise TypeError("Undefined type : " + targetTable )

        df = pd.read_csv(dir)
        if targetTable == "PRODUCT":
            #product is a special case. There is a "category" column which affects the algorithm 
            df = df.drop(["category"], axis=1)
        elif targetTable == "CATEGORY":
            category_dict = df.groupby('category')['Standard_product'].apply(list).to_dict()
            return category_dict
        
        df = df.map(lambda x: x.lower() if isinstance(x, str) else x).T    
        new_columns = df.iloc[0]
        df_new = df[1:].reset_index(drop=True)
        df_new.columns = new_columns
        result_dict = df_new.to_dict('list')
        dataDict = {k: [i for i in v if type(i) != float] for k, v in result_dict.items()}
        return dataDict
    
    elif method == "AWS":
        return
            

    return 

def _compareString(stringValue:str, textlist:list)-> str:
    stringValueLower = stringValue.lower()  # Convert stringValue to lowercase
    for text in textlist:
        if str(text).lower() in stringValueLower:  # Convert text to lowercase and compare
            return True
    return False

def _match(concatText:str, dataDict:dict,) -> str:
    for key, value in dataDict.items():
        if _compareString(concatText,list(value)) or key in concatText:
            return key
    return None

def _matchList(concatText:str, dataDict:dict,) -> list:
    returnList = []
    for key, value in dataDict.items():
        if _compareString(concatText,list(value)) or (key in concatText):
            returnList.append(key)
            
    return returnList


def getBrand(concatText:str) -> str :
    # this function return the strandard brand name 
    standardName = ""
    brandDict = _readData("BRAND","LOCAL")
    standardName = _match(concatText,brandDict)

    if standardName == "":
        print("no _match is found")
    
    return standardName

def getCountry(concatText:str) -> str :
    standardCountry = ""
    CountryDict = _readData("COUNTRY","LOCAL")
    standardCountry = _match(concatText,CountryDict)

    if standardCountry == "":
        print("no _match is found")
    return standardCountry

def getSpec(concatText:str) -> list:
    standardSpec = []
    specDict = _readData("SPEC","LOCAL")
    standardSpec = _matchList(concatText,specDict)
    if not standardSpec:
        print("no _match is found")
    return standardSpec

def getProduct(concatText:str) -> str:
    standardProduct = ""
    productDict = _readData("PRODUCT","LOCAL")
    standardProduct = _match(concatText, productDict)
    if standardProduct == "":
        print("no _match is found")
    return standardProduct

def getWarehoue(concatText:str) -> list:
    # Searches the given string `concatText` for warehouse names and returns a list of found names.
    # Initializes an empty list `standardWarehoue` to store the matching warehouse names.
    # Retrieves a dictionary or list of warehouse names from a local data source through `_readData` with "WAREHOUSE" and "LOCAL" as parameters.
    # Calls `_matchList` to search `concatText` against `warehouseDict`, looking for matches.
    # If matches are found, `standardWarehoue` is updated with the list of matching names; if not, prints a message indicating no match.
    # Returns a list of identified warehouse names, which can be empty if no matches are found.

    standardWarehoue = []
    warehouseDict = _readData("WAREHOUSE","LOCAL")
    standardWarehoue = _matchList(concatText,warehouseDict)
    if not standardWarehoue:
        print("no _match is found")
    return standardWarehoue

def getSupplier(concatText:str) ->str:
    # Searches the given string for a supplier name and returns it.
    # Initializes an empty string `supplier` to store the found supplier name.
    # Retrieves a dictionary or list of supplier names from a local data source via `_readData` with "SUPPLIER" and "LOCAL" as arguments.
    # Uses `_matchList` function to compare `concatText` with `supplierDict` for any matching supplier names.
    # If a matching supplier name is found, `supplier` is set to this name; if not, a message is printed indicating no match.
    # Returns the name of the matched supplier, or an empty string if no match is found.

    supplier = ""
    supplierDict = _readData("SUPPLIER","LOCAL")
    supplier = _match(concatText,supplierDict)
    if supplier == "":
        print("no _match is found")
    return supplier
    

def getCategory(concatText:str) -> str:
    product = getProduct(concatText)
    if product != "":
        categoryDict = _readData("CATEGORY","LOCAL")
        category = _match(concatText,categoryDict)
        return category

    return ""

def getPacking(concatText:str) ->str:
    # Searches the given string `concatText` for packing types and returns the found type.
    # Initializes an empty string `packing` for storing the result.
    # Retrieves a dictionary or list of known packing types from a local data source via `_readData` with "PACKING" and "LOCAL" as parameters.
    # Uses `_matchList` to search `concatText` against `packingDict` for a match.
    # If a match is found, `packing` is updated with the matching packing type; if not, a message indicating no match is printed.
    # Returns the identified packing type, or an empty string if no match is found.

    packing = ""
    packingDict = _readData("PACKING","LOCAL")
    packing = _match(concatText,packingDict)
    if packing == "":
        print("no _match is found")
    return packing

def getPrice(concatText:str) -> float:
    # This function takes a single string argument named `concatText` and returns a floating-point number.
    # It utilizes a regular expression (regex) to search the input text for price figures.
    # The regex pattern is designed to identify numbers (with or without decimal points) that are immediately
    # followed by specific markers or units such as '$', '/', 'KG', 'LB', '/KG', '/P', 'b', '/lb', '/磅', '/包', or '/k',
    # ensuring a broad range of price formats can be recognized and extracted. These markers indicate currency symbols
    # or units of measure but are not included in the match.
    # If a price is found, the first matched number is converted to a float and returned.
    # If no price information is detected, a message is printed to indicate no match was found, and 0.0 is returned.
    # The primary use of this function is to efficiently extract numerical price information from strings that contain
    # pricing data in various formats.
    
    
    pattern = r'(?:(?<=\$)\d+(?:\.\d+)?|\d+(?:\.\d+)?)(?:(?=/)|(?=KG)|(?=LB)|(?=/KG)|(?=/P)|(?=b)|(?=/lb)|(?=/磅)|(?=/包)|(?=/k))'
    match = regex.findall(pattern, concatText)
    if match:
        return float(match[0])
    else:
        print("No match found.")
        return 0.0
    

def getWeightUnit(concatText:str) -> str:
    # This function searches for and returns the weight unit found in a given string `concatText`.
    # It initializes an empty string `weightUnit` to store the found weight unit.
    # The `weightUnitDict` is obtained by calling `_readData` with parameters "WEIGHTUNIT" and "LOCAL",
    # which presumably reads a predefined list or dictionary of weight units from a local data source.
    # `_matchList` function is then called with `concatText` and `weightUnitDict` to search for a matching weight unit in the text.
    # If a match is found, `weightUnit` is set to the matching unit; otherwise, a message is printed indicating no match was found.
    # Finally, the found weight unit (or an empty string if no match is found) is returned.

    weightUnit = ""
    weightUnitDict = _readData("WEIGHTUNIT","LOCAL")
    weightUnit = _match(concatText,weightUnitDict)
    if weightUnit == "":
        print("no _match is found")
    return weightUnit



In [83]:
df = pd.read_csv(os.getcwd() + "/conversion_table/"+"product_conversion.csv")
df = df.drop(["category"], axis=1)
df

Unnamed: 0,Standard_product,Common_product,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,牛前筒骨,,,,,,
1,牛腩,牛坑腩,,,,,
2,牛下肩肉眼卷,肩胛肉眼卷,肩胛肉眼,肩胛,,,
3,肩胛板翼,肩胛翼,板翼,牛頸脊,,,
4,牛仔扒,牛細腩包,,,,,
...,...,...,...,...,...,...,...
240,手工花枝鑲油條,,,,,,
241,玉米魚餅,,,,,,
242,花枝酥,,,,,,
243,芝士條,,,,,,


In [79]:
warehouseDict = readData("WAREHOUSE","LOCAL")
specDict = readData("SPEC","LOCAL")
brandDict = readData("BRAND","LOCAL")
productDict = readData("PRODUCT","LOCAL")
display(productDict)

  result_dict = df_new.to_dict('list')
  result_dict = df_new.to_dict('list')
  result_dict = df_new.to_dict('list')


{'牛前筒骨': [],
 '牛腩': ['牛坑腩'],
 '牛下肩肉眼卷': ['肩胛肉眼卷', '肩胛肉眼', '肩胛'],
 '肩胛板翼': ['肩胛翼', '板翼', '牛頸脊'],
 '牛仔扒': ['牛細腩包'],
 '牛冧': ['牛林'],
 '鯉魚管': [],
 '牛肋條': [],
 '金錢𦟌': ['金𦟌', '金展'],
 '牛𦟌': ['牛 前 𦟌', '牛 後 𦟌', '牛水展'],
 '牛柳頭': ['cuberoll'],
 '牛肉眼': ['肉眼', '眼肉', 'ribeye'],
 '有骨肉眼': ['燒牛肉', 'op rib'],
 '斧頭扒': [],
 '西冷': [],
 '牛柳': [],
 '牛板腱': ['牛板建', '肩胛脊肉'],
 '牛t骨': [],
 '牛仔骨': [],
 '肩胛肥牛肉': ['肩胛肥牛', '三角', '肥牛肉肩胛'],
 '去骨牛小排': ['牛小排'],
 'pastrami': ['肥牛肉 pastrami', 'pastrami 肥牛肉', 'pastrami 肥牛'],
 'superplate肥牛肉': ['super plate', 'super upper plate', 'superplate'],
 '封門柳': ['橫隔肌'],
 '白牛仔湯骨': [],
 '白牛仔扒': [],
 '牛肺': [],
 '牛百葉': [],
 '牛金錢肚': [],
 '牛骨髓': ['牛骨水'],
 '牛肚': [],
 '牛孖筋': ['孖筋'],
 '牛大腸': [],
 '牛尾': [],
 '牛舌': ['牛脷', '牛利'],
 '豬梅肉': ['梅頭', '梅肉'],
 '豬二肉': ['二肉'],
 '豬扒': ['豬大排'],
 '豬肉眼': ['三肉', '豬肉眼三肉'],
 '豬背骨': [],
 '豬腩帶骨': ['帶骨豬腩'],
 '豬腩挑骨': ['挑骨豬腩'],
 '豬肋排': ['肋排', '力排'],
 '豬四肉': ['四肉'],
 '豬脊膘油': ['豬背油'],
 '豬下梅肉': [],
 '豬扒法式': ['法式豬扒'],
 '豬肋條': [],
 '豬手': [],
 '豬腿筒骨': ['筒骨', '豬前筒骨', '後筒骨', '

In [65]:
df = pd.read_csv(os.getcwd() + "/conversion_table/category_conversion.csv")
category_dict = df.groupby('category')['Standard_product'].apply(list).to_dict()
category_dict

{'Beef': ['牛前筒骨',
  '牛腩',
  '牛下肩肉眼卷',
  '肩胛板翼',
  '牛仔扒',
  '牛冧',
  '鯉魚管',
  '牛肋條',
  '金錢𦟌',
  '牛𦟌',
  '牛柳頭',
  '牛肉眼',
  '有骨肉眼',
  '斧頭扒',
  '西冷',
  '牛柳',
  '牛板腱',
  '牛T骨',
  '牛仔骨',
  '肩胛肥牛肉',
  '去骨牛小排',
  'Pastrami',
  'SuperPlate肥牛肉',
  '封門柳',
  '白牛仔湯骨',
  '白牛仔扒',
  '白牛仔扒',
  '牛肺',
  '牛百葉',
  '牛金錢肚',
  '牛骨髓',
  '牛肚',
  '牛孖筋',
  '牛大腸',
  '牛尾',
  '牛舌',
  '西冷 Striploin サ-ロイン',
  '肉眼 Ribeye リプアイロ-ル',
  '肋益肉 Rib Cap リプキャッ',
  '牛柳 Tenderloin ヒレ',
  '牛上脑/爱胛肉眼 Chuck Roll 力タ口-ス',
  '沙巴當 Zabuton(Chuck Flap) ザプトン',
  '上局胛肉/前腿肉 Clod ツデ',
  '肩胛小排/三角腩 Chuck Rib サンカクバラ',
  '牛小排/内腹肉 Short Rib ウチバラ',
  '胸腹肉/外腹肉 Short Plate ソトバラ',
  '内後腿肉 Top Round ウチモモ',
  '外後瑟肉 Bottom Round ソトモモ',
  '和尚頭/牛冧 Knuckle マル',
  '上後腰脊多 Top SirloinButt ラム',
  '西冷 Striploin サ-ロイン',
  '肉眼 Ribeye リプアイロ-ル',
  '肋蓋肉 Rib Cap リプキャップ',
  '西冷 Striploin サ-ロイン',
  '西冷 Striploin サ-ロイン',
  '肉眼 Ribeye リプロ-ス',
  '肉眼 Ribeye リプロ-ス',
  '肉眼頂蓋 Rib Cap りフ* ロ-スカフ*リ',
  '肉眼頂蓋 Rib Cap りフ* ロ-スカフ*リ',
  '牛柳 Tenderloin ヒレ',
  '牛柳 Tenderloin ヒレ',
  '牛上腾/肩

In [70]:
getCategory("10* 牛仔骨 美麗華 4+ 3條庄  231.00/P 光1")


  result_dict = df_new.to_dict('list')


'Beef'

In [80]:
print(matchList("西冷美麗華4k+3條庄留單prime22.90P 23.00P",specDict))

['4k+', '3條庄', 'prime', 'm', 'pr']


In [6]:
def use_regex(input_text):
    pattern = re.compile(r"[A-Za-z0-9]+牛下肩肉眼卷\([^)]*\)[A-Za-z]+", re.IGNORECASE)
    return pattern.match(input_text)

print(use_regex("dfa牛下肩肉眼卷daf"))

None
