In [1]:
import polars as pl

In [2]:
pl.Config.set_fmt_str_lengths(100)

df = pl.read_csv('./russian_supermarket_prices.csv') \
    .select(
        name='product_name',
        category='product_category',
        brand='brand',
        package_size='package_size',
        unit='unit',
        catalogue_name='catalogue_name',
        year='year',
    ) \
    .sort('year', descending=True) \
    .with_row_index() \
    .filter(pl.col('index').rank('ordinal').over('name') == 1) \
    .drop('year')

print(df.shape)
df.head(2)

(19341, 7)


index,name,category,brand,package_size,unit,catalogue_name
u32,str,str,str,str,str,str
0,"""Молоко «Правильное» 3,2%-4%, 900 мл""","""Молоко""","""Правильное""","""900""","""мл""","""Сезонный каталог Атак «Недельный каталог»"""
1,"""Бифилайф «Рузский» кисломолочный2,5 %, 250 г""","""Кефир, ряженка, тан, айран""","""Рузское молоко""","""250""","""г""","""Сезонный каталог Атак «Недельный каталог»"""


In [29]:
from yargy import rule, or_, and_, Parser
from yargy.interpretation import fact
from yargy.predicates import gte, lte, caseless, normalized, dictionary

from razdel import tokenize as razdel_tokenize
def tokenize(text: str) -> list[str]:
    return [t.text for t in razdel_tokenize(text)]

In [36]:
import re
rvalue = re.compile(r'^[0-9]$|^[0-9]+[.,][0-9]+$')
def is_value(token: str) -> bool:
    return rvalue.match(token) is not None

text = "Молоко «Правильное» 3,2%-4%, 900 мл"
tokens = tokenize(text)
print(tokens)
for token in tokens:
    if is_value(token):
        print(token)

['Молоко', '«', 'Правильное', '»', '3,2', '%', '-', '4', '%', ',', '900', 'мл']
3,2
4


In [37]:
import fuzzywuzzy

In [None]:
fuzzywuzzy.

In [None]:
Date = fact(
    'Date',
    ['year', 'month', 'day']
)


DAY = and_(
    gte(1),
    lte(31)
).interpretation(
    Date.day
)
MONTH = and_(
    gte(1),
    lte(12)
).interpretation(
    Date.month
)
YEAR = and_(
    gte(1),
    lte(2018)
).interpretation(
    Date.year
)
MONTH_NAME = dictionary(
    MONTHS
).interpretation(
    Date.month
)
DATE = or_(
    rule(YEAR, '-', MONTH, '-', DAY),
    rule(
        DAY,
        MONTH_NAME,
        YEAR,
        YEAR_WORDS.optional()
    )
).interpretation(Date)

match = parser.match('05 февраля 2011 года')
match.tree.as_dot