In [30]:
import pandas as pd
import os
import re
import regex
import csv


def _readData(targetTable :str,method:str) -> dict:
    
    if method == "LOCAL":
        dir = os.getcwd() + "/conversion_table/"
        if targetTable == "BRAND":
            dir += "brand_conversion.csv"
        elif targetTable == "COUNTRY":
            dir += "country_conversion.csv"
        elif targetTable == "SPEC":
            dir += "spec_conversion.csv"
        elif targetTable == "PRODUCT":
            dir += "product_conversion.csv"
        elif targetTable == "WAREHOUSE":
            dir += "warehouse_conversion.csv"
        elif targetTable == "SUPPLIER":
            dir += "supplier_conversion.csv"
        elif targetTable == "CATEGORY":
            dir += "product_conversion.csv"
        elif targetTable == "PACKING":
            dir += "packing_conversion.csv"
        elif targetTable == "WEIGHTUNIT":
            dir += "weightunit_conversion.csv"
        else:
            raise TypeError("Undefined type: " + targetTable)

        df = pd.read_csv(dir)
        if targetTable == "PRODUCT":
            #product is a special case. There is a "category" column which affects the algorithm 
            df = df.drop(["category"], axis=1)
        elif targetTable == "CATEGORY":
            df = df[['category', 'Standard_product']]
            categories_order = ['Wagyu'] + sorted(df['category'].unique().tolist())
            categories_order.remove('Wagyu')
            df['category'] = pd.Categorical(df['category'], categories=categories_order, ordered=True)
            

            # Group by 'category' and convert to dictionary
            category_dict = df.groupby('category')['Standard_product'].apply(list).to_dict()

            if 'Wagyu' in category_dict:
                wagyu_products = category_dict['Wagyu']
                # Create a new dictionary with "Wagyu" first
                new_category_dict = {'Wagyu': wagyu_products}
                # Add the rest of the categories except "Wagyu"
                for category, products in category_dict.items():
                    if category != 'Wagyu':
                        new_category_dict[category] = products
                category_dict = new_category_dict
                
            return category_dict
        
        df = df.map(lambda x: x.lower() if isinstance(x, str) else x).T    
        new_columns = df.iloc[0]
        df_new = df[1:].reset_index(drop=True)
        df_new.columns = new_columns
        result_dict = df_new.to_dict('list')
        dataDict = {k: [i for i in v if type(i) != float] for k, v in result_dict.items()}
        return dataDict
    
    elif method == "AWS":
        return
            

    return 

def _compareString(stringValue: str, textlist: list) -> bool:
    stringValueLower = stringValue.lower()  # Convert stringValue to lowercase
    for text in textlist:
        if str(text).lower() in stringValueLower:  # Convert text to lowercase and compare
            return True
    return False

def _compareString_v2(stringValue: str, textlist: list) -> (bool, bool):
    stringValueLower = stringValue.lower()
    partial_match = False
    exact_match = False
    for text in textlist:
        text_lower = str(text).lower()
        if text_lower in stringValueLower:
            partial_match = True
            # Check for exact matches by splitting words and checking
            if text_lower == stringValueLower or f" {text_lower} " in f" {stringValueLower} "  :
                exact_match = True
                break
    return partial_match, exact_match

def _match_v2(concatText: str, dataDict: dict) -> str:
    """Return the key of ``dataDict`` that best matches ``concatText``.

    Each key is tested together with its alias list through
    ``_compareString_v2``. Priority, highest first:

    1. the first key with an exact match whose own name also occurs in the
       text (returned immediately);
    2. the first key with an exact match through one of its aliases;
    3. the first key with only a substring (partial) match.

    Previously an earlier partial match blocked a later exact alias match;
    exact matches now always win, as the original comments intended.

    Args:
        concatText: Free text to search (compared case-insensitively).
        dataDict: ``{standard name: iterable of aliases}``.

    Returns:
        The matching key, or None when nothing matches.
    """
    concatTextLower = concatText.lower()
    first_exact = None
    first_partial = None
    for key, value in dataDict.items():
        partial_match, exact_match = _compareString_v2(concatTextLower, [key] + list(value))

        if exact_match:
            if key.lower() in concatTextLower:
                # The standard name itself appears in the text: best possible hit.
                return key
            if not first_exact:
                first_exact = key
        elif partial_match and not first_partial:
            first_partial = key

    # Exact alias matches outrank partial ones regardless of dict order.
    return first_exact if first_exact else first_partial

def _match(concatText: str, dataDict: dict) -> str:
    """Return the first key whose name or any alias occurs in ``concatText``.

    Matching is a case-insensitive substring test, in dict order. Returns
    None when no key matches.
    """
    haystack = concatText.lower()
    return next(
        (
            key
            for key, aliases in dataDict.items()
            if _compareString(haystack, list(aliases)) or key.lower() in haystack
        ),
        None,
    )

def _matchList(concatText: str, dataDict: dict) -> list:
    """Return every key whose name or any alias occurs in ``concatText``.

    Matching is a case-insensitive substring test; keys are returned in dict
    order. The list is empty when nothing matches.
    """
    haystack = concatText.lower()
    return [
        key
        for key, aliases in dataDict.items()
        if _compareString(haystack, list(aliases)) or (key.lower() in haystack)
    ]


def getBrand(concatText:str) -> str :
    """Return the standard brand name found in ``concatText``, upper-cased.

    The brand table is loaded with ``_readData("BRAND", "LOCAL")`` and matched
    with ``_match_v2``.

    Returns:
        The upper-cased brand key, or None when no brand matches (a message is
        printed in that case).
    """
    brandDict = _readData("BRAND","LOCAL")
    standardName = _match_v2(concatText,brandDict)

    # _match_v2 signals "no match" with None, never with "", so the old
    # `== ""` test could not fire and the message was never printed.
    if standardName is None:
        print("no _match is found :" + concatText)
        return None

    return standardName.upper()

def getCountry(concatText:str) -> str :
    """Return the standard country key found in ``concatText``.

    The country table is loaded with ``_readData("COUNTRY", "LOCAL")`` and
    matched with ``_match_v2``.

    Returns:
        The matching key exactly as stored in the table dict, or None when no
        country matches (a message is printed in that case). None is kept as
        the "no match" value because that is what callers have received so far.
    """
    CountryDict = _readData("COUNTRY","LOCAL")
    standardCountry = _match_v2(concatText,CountryDict)

    # _match_v2 reports "no match" as None; the old `== ""` test never fired.
    if standardCountry is None:
        print("no _match is found :" + concatText)
    return standardCountry

def getSpec(concatText:str) -> list:
    """Return every spec key found in ``concatText``, upper-cased.

    Uses ``_readData("SPEC", "LOCAL")`` and ``_matchList``. Prints a message
    and returns an empty list when nothing matches.
    """
    matched_specs = _matchList(concatText, _readData("SPEC","LOCAL"))
    if len(matched_specs) == 0:
        print("no _match is found")

    upper_specs = []
    for spec in matched_specs:
        # Non-string keys are passed through untouched.
        upper_specs.append(spec.upper() if isinstance(spec, str) else spec)
    return upper_specs

def getProduct(concatText:str) -> str:
    """Return the standard product name found in ``concatText``, upper-cased.

    The product table is loaded with ``_readData("PRODUCT", "LOCAL")`` and
    matched with ``_match_v2``.

    Returns:
        The upper-cased product key, or "" when no product matches (a message
        is printed in that case).
    """
    productDict = _readData("PRODUCT","LOCAL")
    standardProduct = _match_v2(concatText, productDict)

    # _match_v2 reports "no match" as None; the old `== ""` test never fired,
    # so the message was silently skipped.
    if standardProduct is None:
        print("no _match is found")
        return ""

    return standardProduct.upper()
    

def getWarehoue(concatText:str) -> list:
    """Return every warehouse key found in ``concatText``.

    The warehouse table comes from ``_readData("WAREHOUSE", "LOCAL")`` and is
    searched with ``_matchList``. Prints a message when nothing matches; the
    returned list is then empty.
    """
    matched_warehouses = _matchList(concatText, _readData("WAREHOUSE","LOCAL"))
    if len(matched_warehouses) == 0:
        print("no _match is found")
    return matched_warehouses

def getSupplier(concatText:str) ->str:
    """Return the supplier key found in ``concatText``.

    The supplier table comes from ``_readData("SUPPLIER", "LOCAL")`` and is
    searched with ``_match_v2`` (a single best match, not a list).

    Returns:
        The matching key exactly as stored in the table dict, or None when no
        supplier matches (a message is printed in that case).
    """
    supplierDict = _readData("SUPPLIER","LOCAL")
    supplier = _match_v2(concatText,supplierDict)

    # _match_v2 reports "no match" as None; the old `== ""` test never fired.
    if supplier is None:
        print("no _match is found")
    return supplier
    

def getCategory(concatText:str) -> str:
    """Return the upper-cased category of the product found in ``concatText``.

    The product is resolved with ``getProduct`` and then looked up in the
    table from ``_readData("CATEGORY", "LOCAL")`` via ``_match_v2``. Returns
    "" when no product or no category is found.
    """
    product = getProduct(concatText)
    if product == "":
        return ""

    category = _match_v2(product, _readData("CATEGORY","LOCAL"))
    return "" if category is None else category.upper()

def getPacking(concatText:str) ->str:
    """Return the packing key found in ``concatText``.

    The packing table comes from ``_readData("PACKING", "LOCAL")`` and is
    searched with ``_match_v2``. Prints a message and returns None when
    nothing matches.
    """
    matched_packing = _match_v2(concatText, _readData("PACKING","LOCAL"))
    if matched_packing is None:
        print("no _match is found")
    return matched_packing

def getPrice(concatText: str) -> float:
    """Extract a price from free text.

    Every number in the text is collected (an optional leading ``$`` and an
    optional trailing unit are allowed but not required) and the LAST one is
    returned.

    Args:
        concatText: Text that may contain one or more numbers.

    Returns:
        The last number as a float; None when the text has no number (a
        message is printed); 0.0 if the captured text cannot be converted.
    """
    # Number, optionally preceded by "$" and followed by a unit or end of text.
    # Everything except the digits is optional, so any number matches.
    pattern = r"\$?\s*(\d+(?:\.\d+)?)\s*(?:KG|LB|/KG|/P|b|/lb|/磅|/包|/k|\Z)?"

    matches = re.findall(pattern, concatText, flags=re.IGNORECASE)
    if not matches:
        print(f"No valid price found in: '{concatText}'")
        return None

    # The last number in the text is taken as the price.
    candidate = matches[-1]
    try:
        return float(candidate)
    except ValueError:
        # Report the value that was actually converted (the message used to
        # show the first match instead).
        print(f"Conversion issue with: '{concatText}', found: {candidate}")
        return 0.0

def getPriceWord(concatText: str) -> float:
    """Extract a price that is introduced by a marker.

    A number is collected when it sits at the very start of the text, directly
    after ``$``, or directly after one of the listed unit markers. The LAST
    collected number is returned.

    Args:
        concatText: Text that may contain one or more prices.

    Returns:
        The last collected number as a float, or None when nothing is
        collected or the captured text cannot be converted (a message is
        printed in both cases).
    """
    # NOTE(review): each unit alternative is wrapped in \b...\b and must be
    # followed immediately by a digit. A letter or CJK unit next to a digit
    # has no word boundary in between, so in practice only start-of-text, "$"
    # and a "/" between a word character and the number can fire. Confirm
    # whether that is intended.
    pattern = r"(?:^|\$|\b(?:/|P|p|B|b|KG|k|磅|公斤|LB|lb|包|kg)\b)(\d+(?:\.\d+)?)"

    matches = re.findall(pattern, concatText, flags=re.IGNORECASE)
    if not matches:
        print(f"No valid price found in: '{concatText}'")
        return None

    # The last collected number is taken as the price.
    candidate = matches[-1]
    try:
        return float(candidate)
    except ValueError:
        # Report the value that was actually converted (the message used to
        # show the first match instead).
        print(f"Conversion issue with: '{concatText}', found: {candidate}")
        return None

def getWeightUnit(concatText:str) -> str:
    """Return the weight unit key found in ``concatText``.

    The unit table comes from ``_readData("WEIGHTUNIT", "LOCAL")`` and is
    searched with ``_matchList``; when several units match, the last one in
    table order is returned. Falls back to "lb" (after printing a message)
    when nothing matches.
    """
    found_units = _matchList(concatText, _readData("WEIGHTUNIT","LOCAL"))
    if found_units:
        return found_units[-1]

    print("no _match is found")
    return "lb"

def _add_tag_to_row(target_file: str, row_number: int, new_tag: str):
    """
    Adds a new tag to the last column of a specific row in a CSV file, filling any empty columns first.
    
    Args:
    target_file (str): The path to the CSV file.
    row_number (int): The row number (1-based index) to modify.
    new_tag (str): The tag to add to the last column of the specified row.
    """
    try:
        # Read all rows from the file
        with open(target_file, mode='r', newline='') as file:
            rows = list(csv.reader(file))

        # Check if the specified row number is within the range of existing rows
        if 0 < row_number <= len(rows):
            # Target the specific row (convert row_number from 1-based to 0-based)
            target_row = rows[row_number - 1]

            # Find the first empty column in the row, if any
            try:
                empty_index = target_row.index('')
                target_row[empty_index] = new_tag  # Fill the first empty column found
            except ValueError:
                # No empty columns found, append new tag to the end
                target_row.append(new_tag)
            
            # Write the modified rows back to the file
            with open(target_file, mode='w', newline='') as file:
                writer = csv.writer(file)
                writer.writerows(rows)
            print("Tag added successfully.")
        else:
            print(f"Row {row_number} does not exist in the file.")
    except FileNotFoundError:
        print("File not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

def _find_tag_in_csv(target_file: str, tag: str) -> int:
    """
    Search for a tag in a CSV file. Returns the row number where the tag is found.
    If the tag is not found, returns -1.

    Args:
    target_file (str): The path to the CSV file.
    tag (str): The tag to search for.

    Returns:
    int: The row number of the found tag, or -1 if the tag is not found.
    """
    try:
        with open(target_file, mode='r', newline='') as file:
            reader = csv.reader(file)
            # Iterate through rows in the file
            for index, row in enumerate(reader):
                if tag in row:  # Check if tag is in any column of the row
                    return index + 1  # Return row number (1-based index)
    except FileNotFoundError:
        print("File not found.")
        return -1  # Optionally, could raise an exception or handle differently
    except Exception as e:
        print(f"An error occurred: {e}")
        return -1

    return -1  # Return -1 if the tag is not found in any row

def _add_tag_to_last_row(target_file: str, new_tag: str):
    """
    Appends a new tag to the last row of a CSV file.

    Args:
    target_file (str): The path to the CSV file.
    new_tag (str): The tag to append to the last row.
    """
    try:
        # Open the file in append mode
        with open(target_file, mode='a', newline='') as file:
            writer = csv.writer(file)
            # Append a new row with the new tag
            writer.writerow([])
            writer.writerow([new_tag])
        print("Tag added successfully to the last row.")
    except FileNotFoundError:
        print("File not found. Please check the path.")
    except Exception as e:
        print(f"An error occurred: {e}")

def _add_tag_to_last_row_category(target_file: str, new_tag: str,category):
    """
    Appends a new tag to the last row of a CSV file.

    Args:
    target_file (str): The path to the CSV file.
    new_tag (str): The tag to append to the last row.
    """
    try:
        # Open the file in append mode
        with open(target_file, mode='a', newline='') as file:
            writer = csv.writer(file)
            # Append a new row with the new tag
            writer.writerow([])
            writer.writerow([category,new_tag])
        print("Tag added successfully to the last row.")
    except FileNotFoundError:
        print("File not found. Please check the path.")
    except Exception as e:
        print(f"An error occurred: {e}")

def _canFindTag(rowNum:int)->bool :
    if rowNum == -1:
        return False;
    else:
        return True

def addCommonName(target_file: str, newCommonName: str, oldCommonName = None, category = None) -> bool:
    """Register a new name in one of the conversion tables.

    Two modes:

    * ``oldCommonName`` given: ``newCommonName`` is added as another alias on
      the row that already contains ``oldCommonName``.
    * ``oldCommonName`` omitted: ``newCommonName`` is appended as a new row.
      For the "PRODUCT" table a ``category`` is required and is written in
      front of the name.

    Args:
        target_file: Table identifier - "BRAND", "COUNTRY", "SPEC", "PRODUCT",
            "WAREHOUSE", "SUPPLIER", "PACKING" or "WEIGHTUNIT" (not a path).
        newCommonName: The name to add.
        oldCommonName: An existing name whose row should receive the alias.
        category: Only used when appending a new "PRODUCT" row.

    Returns:
        True when the write helper was invoked, False when the request was
        rejected (old name missing, new name already present, or category
        missing). The write helpers report their own I/O errors by printing.

    Raises:
        TypeError: ``target_file`` is not a known table identifier.
    """
    csv_names = {
        "BRAND": "brand_conversion.csv",
        "COUNTRY": "country_conversion.csv",
        "SPEC": "spec_conversion.csv",
        "PRODUCT": "product_conversion.csv",
        "WAREHOUSE": "warehouse_conversion.csv",
        "SUPPLIER": "supplier_conversion.csv",
        "PACKING": "packing_conversion.csv",
        "WEIGHTUNIT": "weightunit_conversion.csv",
    }
    if target_file not in csv_names:
        raise TypeError("Undefined type: " + target_file)

    csv_path = os.path.join(os.getcwd(), "conversion_table", csv_names[target_file])

    if oldCommonName is not None:
        # Alias mode: the existing name must be present, the new one must not.
        rowNum = _find_tag_in_csv(csv_path, oldCommonName)
        if not _canFindTag(rowNum):
            print("找不到所需的標籤")  # message: "required tag not found"
            return False
        if _canFindTag(_find_tag_in_csv(csv_path, newCommonName)):
            print("標籤已經存在a")  # message: "tag already exists"
            return False
        _add_tag_to_row(csv_path, rowNum, newCommonName)
        print("tag successfully added")
        return True

    # New-row mode: refuse duplicates.
    if _canFindTag(_find_tag_in_csv(csv_path, newCommonName)):
        print("標籤已經存在b")  # message: "tag already exists"
        return False

    if target_file != "PRODUCT":
        _add_tag_to_last_row(csv_path, newCommonName)
        return True

    # Product rows carry their category in front of the name.
    if category is None:
        print("缺少category")  # message: "category is missing"
        return False
    _add_tag_to_last_row_category(csv_path, newCommonName, category)
    return True
        





In [32]:
# Smoke test: resolve the standard product name from a free-text order line.
getProduct("40包 阿拉斯加 銀雪魚 200克 $36.0 包 加1")


['和牛上腦']
(False, False)
['和牛針扒']
(False, False)
['和牛燴扒']
(False, False)
['和牛尾扒']
(False, False)
['和牛三筋']
(False, False)
['和牛牛胸']
(False, False)
['和牛小肉']
(False, False)
['和牛肉眼']
(False, False)
['和牛肋益肉']
(False, False)
['和牛牛柳']
(False, False)
['和牛牛上脑']
(False, False)
['和牛沙巴當']
(False, False)
['和牛上局胛肉']
(False, False)
['和牛肩胛小排']
(False, False)
['和牛牛小排']
(False, False)
['和牛胸腹肉']
(False, False)
['和牛内後腿肉']
(False, False)
['和牛外後腿肉']
(False, False)
['和牛牛冧']
(False, False)
['和牛上後腰脊']
(False, False)
['和牛西冷']
(False, False)
['和牛肋蓋肉']
(False, False)
['和牛臀肉']
(False, False)
['和牛帶蓋肉眼']
(False, False)
['和牛牛腩']
(False, False)
['和牛牛仔骨']
(False, False)
['和牛三角']
(False, False)
['牛前筒骨']
(False, False)
['牛下肩肉眼卷']
(False, False)
['牛腩']
(False, False)
['肩胛板翼']
(False, False)
['牛仔扒']
(False, False)
['牛冧']
(False, False)
['鯉魚管']
(False, False)
['牛肋條']
(False, False)
['金錢𦟌']
(False, False)
['牛𦟌']
(False, False)
['牛柳頭']
(False, False)
['牛肉眼']
(False, False)
['有骨肉眼']
(False, False)
['斧頭扒']
(False, False)
['西冷']
(

  result_dict = df_new.to_dict('list')


'銀鱈魚'

In [15]:
# Adds "tet3" as an alias on the SPEC row that contains "+300g".
# This rewrites spec_conversion.csv on disk; re-running it is rejected because
# the tag then already exists. The fourth argument (category) is not used when
# an old name is supplied.
addCommonName("SPEC","tet3","+300g","tet") 

Tag added successfully.
tag successfully added


True

In [23]:
# Sample strings for a manual check of getPrice.
# getPrice returns the LAST number it finds, so a string holding several
# numbers (e.g. "Originally $300, now just $150!") yields the final one.
test_cases = [
    "$113.0",
    "Price is: $45.50",
    "Cost: $99/KG",
    "It's $1.99/LB this week.",
    "$100 is the total.",
    "The price is $ 123.45 per unit.",
    "A discount at $5 off",
    "Now only $0.99!",
    "Save on our deal of $100.00 savings",
    "This item costs $ 50",
    "$75",
    "$123.99 per package",
    "It was $100.00 before the discount.",
    "Expect to pay around $40/kg for quality goods.",
    "Prices start at $9.99/lb for our premium product line.",
    "Just $199.99 for a limited time!",
    "Get it now for the low price of $250!",
    "On sale: $150!",
    "Originally $300, now just $150!",
    "Special promotion: only $0.99 per lb!",
    "40/lb",
    "50/",
    "30",
    "西冷美麗華4k+3條庄留單prime22.90P 23.00P"
    # Add more test cases as needed
]

# Running the test loop: print the price extracted from each sample
for case in test_cases:
    result = getPrice(case)
    print(f"Extracted from '{case}': {result}")

Extracted from '$113.0': 113.0
Extracted from 'Price is: $45.50': 45.5
Extracted from 'Cost: $99/KG': 99.0
Extracted from 'It's $1.99/LB this week.': 1.99
Extracted from '$100 is the total.': 100.0
Extracted from 'The price is $ 123.45 per unit.': 123.45
Extracted from 'A discount at $5 off': 5.0
Extracted from 'Now only $0.99!': 0.99
Extracted from 'Save on our deal of $100.00 savings': 100.0
Extracted from 'This item costs $ 50': 50.0
Extracted from '$75': 75.0
Extracted from '$123.99 per package': 123.99
Extracted from 'It was $100.00 before the discount.': 100.0
Extracted from 'Expect to pay around $40/kg for quality goods.': 40.0
Extracted from 'Prices start at $9.99/lb for our premium product line.': 9.99
Extracted from 'Just $199.99 for a limited time!': 199.99
Extracted from 'Get it now for the low price of $250!': 250.0
Extracted from 'On sale: $150!': 150.0
Extracted from 'Originally $300, now just $150!': 300.0
Extracted from 'Special promotion: only $0.99 per lb!': 0.99
Ext

In [None]:
# Preview the product conversion table without its "category" column.
df = pd.read_csv(
    os.getcwd() + "/conversion_table/"+"product_conversion.csv"
).drop(columns=["category"])
df

In [None]:
# Load the lookup tables once for interactive inspection.
# The loader is named _readData; the bare name readData is not defined in this
# notebook and raised NameError on a fresh kernel.
warehouseDict = _readData("WAREHOUSE","LOCAL")
specDict = _readData("SPEC","LOCAL")
brandDict = _readData("BRAND","LOCAL")
productDict = _readData("PRODUCT","LOCAL")
display(productDict)

In [None]:
# Group the standard product names by category for inspection.
# NOTE(review): _readData("CATEGORY", "LOCAL") builds the same mapping from
# product_conversion.csv - confirm that category_conversion.csv really exists.
df = pd.read_csv(os.getcwd() + "/conversion_table/category_conversion.csv")
category_dict = df.groupby('category')['Standard_product'].apply(list).to_dict()
category_dict

In [17]:
# Smoke test: a Wagyu cut should resolve to the 'WAGYU' category.
getCategory("A4 和牛西冷")




  result_dict = df_new.to_dict('list')
  category_dict = df.groupby('category')['Standard_product'].apply(list).to_dict()


'WAGYU'

In [21]:
# Repeat of the Wagyu category smoke test (expected result: 'WAGYU').
getCategory("A4 和牛西冷")


  result_dict = df_new.to_dict('list')
  category_dict = df.groupby('category')['Standard_product'].apply(list).to_dict()


'WAGYU'

In [19]:
# categoryDict was only ever a local variable inside getCategory, so this cell
# depended on leftover kernel state; build it explicitly before displaying it.
categoryDict = _readData("CATEGORY","LOCAL")
categoryDict

{'Wagyu': ['和牛上腦',
  '和牛針扒',
  '和牛燴扒',
  '和牛尾扒',
  '和牛三筋',
  '和牛牛胸',
  '和牛小肉',
  '和牛肉眼',
  '和牛肋益肉',
  '和牛牛柳',
  '和牛牛上脑',
  '和牛沙巴當',
  '和牛上局胛肉',
  '和牛肩胛小排',
  '和牛牛小排',
  '和牛胸腹肉',
  '和牛内後腿肉',
  '和牛外後腿肉',
  '和牛牛冧',
  '和牛上後腰脊',
  '和牛西冷',
  '和牛肉眼',
  '和牛肋蓋肉',
  '和牛臀肉',
  '和牛帶蓋肉眼',
  '和牛牛腩',
  '和牛三筋',
  '和牛牛仔骨',
  '和牛三角'],
 'Beef': ['牛前筒骨',
  '牛腩',
  '肩胛板翼',
  '牛仔扒',
  '牛冧',
  '鯉魚管',
  '牛肋條',
  '金錢𦟌',
  '牛𦟌',
  '牛柳頭',
  '牛肉眼',
  '有骨肉眼',
  '斧頭扒',
  '西冷',
  '牛柳',
  '牛板腱',
  '牛T骨',
  '牛仔骨',
  '肩胛肥牛肉',
  '去骨牛小排',
  'Pastrami',
  'SuperPlate肥牛肉',
  '封門柳',
  '白牛仔湯骨',
  '白牛仔扒',
  '白牛仔扒',
  '牛肺',
  '牛柏葉',
  '牛金錢肚',
  '牛骨髓',
  '牛肚',
  '牛孖筋',
  '牛大腸',
  '牛尾',
  '牛舌',
  '帶骨牛腩',
  '牛草肚',
  '牛膀',
  '牛小腸',
  'Plate Eye肥牛',
  'Short Plate肥牛',
  '免治牛肉',
  'Karubi 肥牛',
  '牛下肩肉眼卷'],
 'Chicken': ['春雞',
  '雞腎',
  '雞大胸',
  '雞白髀',
  '雞上脾',
  '雞上腿肉',
  '雞扒',
  '雞柳',
  '雞軟骨',
  '雞翼鎚',
  '雞腳',
  '火雞子',
  '全雞',
  '雞全翼',
  '雞 下髀',
  '鳳爪',
  '雞中翼',
  '日本森林雞即食雞胸肉',
  '雞髀肉',
  '上脾肉'],
 'Lamb': ['白山羊帶皮',
  '羊仔肩捲肉',
  '羊仔肩

In [None]:
def use_regex(input_text):
    """Match the start of ``input_text`` against the product-name pattern.

    The pattern is: one or more ASCII letters/digits, the literal product
    name, a parenthesised group, then one or more ASCII letters
    (case-insensitive). Returns the match object, or None when the text does
    not start with that shape.
    """
    return re.match(
        r"[A-Za-z0-9]+牛下肩肉眼卷\([^)]*\)[A-Za-z]+",
        input_text,
        re.IGNORECASE,
    )

print(use_regex("dfa牛下肩肉眼卷daf"))