In [1]:
import re

def parse_formula(formula_text):
    # Tokenization using regular expressions
    tokens = re.findall(r'\b(?:[a-zA-Z]+|\d+|\S)\b', formula_text)
    
    # Parsing and building structured formula
    structured_formula = []
    for token in tokens:
        if token.isdigit():
            structured_formula.append(('constant', int(token)))
        elif token.isalpha():
            structured_formula.append(('variable', token))
        else:
            structured_formula.append(('operator', token))
    
    return structured_formula

# Example usage
formula_text = "LME + 280 eur"
structured_formula = parse_formula(formula_text)
print(structured_formula)

[('variable', 'LME'), ('constant', 280), ('variable', 'eur')]


In [1]:
import pandas as pd
import os
import regex as re
import numpy as np

path = os.path.join('../data_git', 'cart_items.csv')
cart_items = pd.read_csv(path)

In [2]:
df = pd.DataFrame(cart_items[~cart_items['ds_sale_price_formula'].isna()]['ds_purchase_price_formula'].str.lower())

# Change type to string
df['ds_purchase_price_formula'] = df['ds_purchase_price_formula'].astype(str)

# Important patterns
usd_pattern = re.compile(r'\b(?:usd|usdollar|usdollars|\$)\b')
eur_pattern = re.compile(r'\b(?:eur|euro|euros|\€)\b')
perc_pattern = re.compile(r'(?:%|\bpercent\b)')
plus_pattern = re.compile(r'\s(?:\+|plus)\s')
minus_pattern = re.compile(r'\s(?:\-|minus)\s')
mul_pattern = re.compile(r'\s(?:\*|x|times)\s')
div_pattern = re.compile(r'\s(?:\/|divided by)\s')
currency_per_mt_pattern = re.compile(r'\b(?:usd\/mt|usd per mt|usdollar per mt|usdollars per mt|eur\/mt|eur per mt|euro per mt|euros per mt)\b')
index_pattern = re.compile(r'\b(?:lme|london metal exchange|bdsv)\b')
# Find all constant values indluding with thousands or decimal separator
constant_pattern = re.compile(r'\b(?:\d{1,3}(?:,\d{3})*(?:\.\d+)*|\d+\.\d*)\b')
unit_pattern = re.compile(r'\b(?:mt|metric ton|metric tons|ton|tons|kg|kilogram|kilograms \
                          |lb|pound|pounds|oz|ounce|ounces|g|gram|grams|mg|milligram|milligrams \
                          |t|metric tonne|metric tonnes|tonne|tonnes)\b')


def detect_currency(text):
    if usd_pattern.search(text):
        return 'usd'
    elif eur_pattern.search(text):
        return 'eur'
    else:
        return None
    
def detect_operator(text):
    if plus_pattern.search(text):
        return 'addition'
    elif minus_pattern.search(text):
        return 'subtraction'
    elif mul_pattern.search(text):
        return 'multiplication'
    elif div_pattern.search(text):
        return 'division'
    elif perc_pattern.search(text):
        return 'percentage'
    else:
        return None

def detect_index(text):
    if index_pattern.search(text):
        return index_pattern.search(text).group(0)
    else:
        return None

def detect_constant(text):
    if constant_pattern.search(text):
        return constant_pattern.search(text).group(0)
    else:
        return None
    
def detect_unit(text):
    if unit_pattern.search(text):
        return unit_pattern.search(text).group(0)
    else:
        return None

def convert_constant(number_str):
    if number_str is None:
        return float('nan')
    else:
        try:
            if ',' in number_str and '.' in number_str:
                comma_pos = number_str.rfind(',')
                dot_pos = number_str.rfind('.')
                if comma_pos < dot_pos:
                    # Comma as thousand separator
                    number_str = number_str.replace(',', '')
                else:
                    # Comma as decimal separator
                    number_str = number_str.replace('.', '').replace(',', '.')
            elif ',' in number_str:
                if number_str.count(',') > 1:
                    # Comma as thousand separator
                    number_str = number_str.replace(',', '')
                # Check if there are more than three digits after the comma
                elif len(number_str[number_str.rfind(',')+1:]) > 2:
                    # Comma as thousand separator
                    number_str = number_str.replace(',', '')
                else:
                    # Comma as decimal separator
                    number_str = number_str.replace(',', '.')
            elif '.' in number_str:
                if number_str.count('.') > 1:
                    # Dot as thousand separator
                    number_str = number_str.replace('.', '')
                # Check if there are more than three digits after the dot
                elif len(number_str[number_str.rfind('.')+1:]) > 2:
                    # Dot as thousand separator
                    number_str = number_str.replace('.', '')
                else:
                # Dot as decimal separator
                    pass
            return float(number_str)
        except ValueError:
            return float('nan')


# Detect currency
df['currency'] = df['ds_purchase_price_formula'].apply(detect_currency)

# Get operator
df['operator'] = df['ds_purchase_price_formula'].apply(detect_operator)

# Get index
df['index'] = df['ds_purchase_price_formula'].apply(detect_index)

# Get constant
df['constant'] = df['ds_purchase_price_formula'].apply(detect_constant)

# Convert to float
df['constant'] = df['constant'].apply(convert_constant)

# Get unit
df['unit'] = df['ds_purchase_price_formula'].apply(detect_unit)

# Identify if the price is fixed
df['fixed_price'] = df['ds_purchase_price_formula'].str.contains('fix')

df


Unnamed: 0,ds_purchase_price_formula,currency,operator,index,constant,unit,fixed_price
225,lme + 25 usd/mt,usd,addition,lme,25.0,mt,False
55654,lme + 45 usd / mt,usd,addition,lme,45.0,mt,False
55915,81% lme,,percentage,lme,81.0,,False
56032,metaloop´s option,,,,,,False
56668,lme previous week to shipment + 450 usd/mt,usd,addition,lme,450.0,mt,False
...,...,...,...,...,...,...,...
61548,"1,460 usd/mt",usd,,,1460.0,mt,False
61562,480 usd/mt,usd,,,480.0,mt,False
61563,"10,050 usd/mt",usd,,,10050.0,mt,False
61564,"3,675 usd/mt",usd,,,3675.0,mt,False


### The constan_pattern regex follows the following rules:

1. \b: Word boundary to ensure the number is matched as a whole word.
2. (?:\d{1,3}(?:,\d{3})*(?:\.\d+)*|\d+\.\d*):
    - \d{1,3}: Matches between 1 and 3 digits at the beginning.
    - (?:,\d{3})*: Non-capturing group matching a comma followed by exactly three digits, repeated zero or more times.
    - (?:\.\d+)*: Non-capturing group matching a period followed by one or more digits, repeated zero or more times.
    - \d+\.\d*: Matches one or more digits followed by a period and zero or more digits.
3. \b: Word boundary to ensure the number is matched as a whole word.

### Plus pattern match

1. \s: Matches any whitespace character (spaces, tabs, etc.).
2. (?:\+|plus): Non-capturing group that matches either a + or the word "plus".
3. \s: Matches any whitespace character (spaces, tabs, etc.).

### Percentage pattern match
1. (?: ... ): Non-capturing group.
2. %: Matches the percentage symbol.
3. |: Alternation operator, meaning "or".
4. \bpercent\b: Matches the word "percent" as a whole word due to the word boundaries \b.

Function Definition: The function guess_and_convert takes a string number_str as input.

Comma and Dot: If the string contains both a comma , and a dot .:

comma_pos and dot_pos determine the positions of the last comma and dot.
If the comma is before the dot, it treats the comma as a thousand separator.
Otherwise, it treats the comma as a decimal separator.
Comma Only: If the string contains only a comma:

If there is more than one comma, it treats commas as thousand separators.
If there is one comma, it checks the length of the digits after the comma.
If more than two digits follow the comma, it's likely a thousand separator.
Otherwise, it treats the comma as a decimal separator.
Dot Only: If the string contains only a dot:

If there is more than one dot, it treats dots as thousand separators.
If there is one dot, it checks the length of the digits after the dot.
If more than two digits follow the dot, it's likely a thousand separator.
Otherwise, it treats the dot as a decimal separator.
Conversion: The modified string is then converted to a float. If the conversion fails, it returns NaN.

This function helps in guessing the correct numerical format and converting the string representation to a float accordingly.

## Tokenizing the formula


In [None]:
# 