In [30]:
# %pip install pandas
# %pip install numpy
# %pip install matplotlib
# %pip install tqdm

In [31]:
import json
import multiprocessing
import time

import numpy as np
import pandas as pd

In [32]:
# Output locations. DATASETS_PATH and DICTS_PATH are joined to file names with
# plain `+` concatenation, so the trailing slash is required.
FIGURES_PATH = 'out/figures/'  # not referenced in the visible part of the notebook
DATASETS_PATH = 'out/datasets/'  # CSV outputs written by save() and the categories export
DICTS_PATH = 'out/dicts/'  # JSON dumps of the KeyDict encoders

In [33]:
# data = pd.read_csv('datasets/receipts_new.csv', nrows=5_000_000)
# data = data.rename(columns = {'line_item_id': 'product_id'})
# data
# data.head(1000).to_csv('datasets/receipts_sample.csv')

In [34]:
# data.info(memory_usage='deep')

Можно заметить, что некоторые строки совпадают по ключевому признаку transaction_key. Такие строки необходимо схлопнуть: объединить записи с одинаковыми ключами, просуммировав значения признака line_quantity.

### Сохранение обработанных данных

In [35]:
def save(data, name):
    """Print a summary of `data` and write it as CSV under DATASETS_PATH.

    data: DataFrame to persist.
    name: file name (including extension) appended to DATASETS_PATH; its
        upper-cased form is shown in the banner line.
    """
    banner = 'SAVING dataset: {}\n'.format(name.upper())
    print(banner)
    data.info(memory_usage='deep')
    target = DATASETS_PATH + name
    data.to_csv(target)

### Схлопывание одинаковых позиций в чеках

In [36]:
def collapse(data):
    """Merge repeated receipt lines by summing their quantities.

    Every row receives the total `line_quantity` of its
    (`transaction_key`, `product_id`) group; rows that become fully identical
    are then dropped. The input frame is not modified.

    data: DataFrame with at least the `transaction_key`, `product_id` and
        `line_quantity` columns.
    Returns a new DataFrame in which `line_quantity` is the last column.
    """
    group_keys = ['transaction_key', 'product_id']

    # The built-in group sum runs in compiled code; a Python-level
    # `lambda x: sum(x)` is invoked once per group and is far slower.
    # Note: pandas' sum skips missing quantities instead of propagating NaN.
    totals = data.groupby(by=group_keys)['line_quantity'].sum()

    # Removing the per-line quantity before the join avoids the
    # suffix/drop/rename sequence and keeps the summed column at the end.
    collapsed = data.drop(columns=['line_quantity']).join(totals, how='left', on=group_keys)
    return collapsed.drop_duplicates()

In [37]:
# data = collapse(data)
# data

### Преобразование transaction_key

In [38]:
def get_data(str):
    """Return the fourth `_`-separated field (index 3) of a transaction key.

    transform_transaction_key parses this field as the receipt date/time.
    Raises IndexError when the key has fewer than four fields.
    """
    fields = str.split('_')
    return fields[3]


def get_key(str):
    """Build an integer from a `_`-separated key.

    The first three fields and the last field are concatenated as text and
    the result is converted with int().
    """
    parts = str.split('_')
    digits = ''.join(parts[pos] for pos in range(3)) + parts[-1]
    return int(digits)


def transform_transaction_key(data):
    """Replace `transaction_key` with a parsed `datetime` and an integer `key_id`.

    The date part of every key is extracted with get_data() and parsed by
    pd.to_datetime. Each distinct key is registered in the module-level
    `trans_id` encoder (in order of first appearance) and its integer code is
    stored in `key_id`. The original `transaction_key` column is dropped.

    data: DataFrame with a string `transaction_key` column.
    Returns a new DataFrame; the caller's frame is left untouched.
    """
    global trans_id
    raw_keys = data['transaction_key']

    # assign() builds a new frame, so the caller's DataFrame (which may be a
    # slice of a larger one) is not mutated.
    data = data.assign(datetime=pd.to_datetime(raw_keys.apply(get_data)))

    # Encode each distinct key once. Filling the column in one step keeps the
    # ids as integers; cell-by-cell `.at` writes into a new column yield floats.
    keys = raw_keys.drop_duplicates().to_frame()
    keys['key_id'] = [trans_id.push(key) for key in keys['transaction_key']]

    data = data.join(keys.set_index('transaction_key'), how='left', on='transaction_key')
    return data.drop(columns=['transaction_key'])

In [39]:
# data = transform_transaction_key(data)
# data

### Преобразование gid

In [40]:
def transform_gid(data):
    """Replace the `gid` column with an integer `gid_id` code.

    Each distinct `gid` value is registered in the module-level `gids_id`
    encoder (in order of first appearance) and the resulting code is joined
    back onto every row. The original `gid` column is dropped.

    data: DataFrame with a `gid` column.
    Returns a new DataFrame.
    """
    global gids_id

    # Encode each distinct value once. Filling the column in one step avoids a
    # slow iterrows()/.at loop and keeps the ids as integers instead of floats.
    gids = data['gid'].drop_duplicates().to_frame()
    gids['gid_id'] = [gids_id.push(gid) for gid in gids['gid']]

    data = data.join(gids.set_index('gid'), on='gid', how='left')
    return data.drop(columns=['gid'])

In [41]:
# data = transform_gid(data)
# data

### Преобразование line_type

In [42]:
def transform_line_type(data):
    """Replace the `line_type` column with an integer `line_type_id` code.

    Each distinct `line_type` value is registered in the module-level
    `lines_id` encoder (in order of first appearance) and the resulting code
    is joined back onto every row. The original `line_type` column is dropped.

    data: DataFrame with a `line_type` column.
    Returns a new DataFrame.
    """
    global lines_id

    # Encode each distinct value once. Filling the column in one step avoids a
    # slow iterrows()/.at loop and keeps the ids as integers instead of floats.
    line_types = data['line_type'].drop_duplicates().to_frame()
    line_types['line_type_id'] = [lines_id.push(value) for value in line_types['line_type']]

    data = data.join(line_types.set_index('line_type'), on='line_type', how='left')
    return data.drop(columns=['line_type'])

In [43]:
# data = transform_line_type(data)
# data

### Подключение категорий товаров

In [44]:
# Category lookup table, loaded once and reused by include_categories().
cats = (
    pd.read_csv('datasets/ProductsModels.csv')
    .rename(columns={'model_id': 'category_id', 'name': 'category_name'})
)
# Keep the row label as an explicit column before exporting the table.
cats['new_id'] = cats.index
cats.to_csv(DATASETS_PATH + 'categories_id.csv')

In [45]:
def include_categories(data, cats_path='datasets/ProductsModels.csv'):
    """Left-join the columns of the module-level `cats` table onto `data`.

    data: DataFrame with a `product_id` column.
    cats_path: not used by the body; the lookup always comes from the
        global `cats` frame.
    Returns a new DataFrame with every `cats` column except `category_name`
    (and the `product_id` join key) added.
    """
    lookup = cats.drop(columns=['category_name']).set_index('product_id')
    return data.join(lookup, on='product_id', how='left')

In [46]:
# data = include_categories(data, 'datasets/ProductsModels.csv')

### Добавление дней недели

In [47]:
def include_weekday(data):
    """Add a `weekday` column derived from the `datetime` column.

    Codes follow the datetime convention: Monday is 0 and Sunday is 6.

    data: DataFrame whose `datetime` column has a datetime dtype.
    Returns a new DataFrame; the caller's frame is not modified.
    """
    # The vectorized .dt accessor replaces a per-row Python call, and assign()
    # returns a copy instead of writing into the caller's frame.
    return data.assign(weekday=data['datetime'].dt.weekday)

In [48]:
# data = include_weekday(data)
# data

### Обработка всего набора данных

### Класс для ключей

In [49]:
class KeyDict:
    """Assigns consecutive integer ids (starting at 0) to hashable objects.

    Attributes:
        max: the id that the next unseen object will receive.
        dict: mapping from object to its assigned id.
    """

    def __init__(self):
        self.max = 0
        self.dict = {}

    def push(self, obj):
        """Return the id of `obj`, registering it first if it is new."""
        if obj not in self.dict:
            self.dict[obj] = self.max
            self.max += 1
        return self.dict[obj]

    def get(self, obj):
        """Return the id of `obj`, or None when it was never pushed."""
        return self.dict.get(obj)

    def save(self, path):
        """Write the object-to-id mapping to `path` as JSON."""
        serialized = json.dumps(self.dict)
        with open(path, 'w') as handle:
            handle.write(serialized)


In [50]:
# Settings for the full preprocessing run.
NROWS = 25_000_000  # passed as `nrows` to pd.read_csv in process_data()
BATCH_SIZE = 500_000  # default `batch_size` of process_data()
PATH = 'datasets/receipts_new.csv'  # default input CSV of process_data()

In [51]:
def process_batch(batch):
    """Run every per-batch transformation on `batch` and print the elapsed time.

    Steps, in order: collapse, transaction-key / gid / line-type encoding,
    category join, weekday column.

    batch: DataFrame holding one slice of the receipts data.
    Returns the transformed DataFrame.
    """
    started = time.perf_counter()

    batch = collapse(batch)
    for encode in (transform_transaction_key, transform_gid, transform_line_type):
        batch = encode(batch)
    batch = include_categories(batch, 'datasets/ProductsModels.csv')
    batch = include_weekday(batch)

    elapsed = time.perf_counter() - started
    print('Batch time - {}'.format(elapsed))
    return batch


def process_data(batch_size=BATCH_SIZE, path=PATH):
    """Load the receipts CSV, transform it batch by batch and merge the result.

    batch_size: approximate number of rows per batch.
    path: CSV file to read (at most NROWS rows are loaded).
    Returns the concatenated, collapsed DataFrame in which the encoded
    `key_id` column is renamed back to `transaction_key`.

    Batches are processed sequentially in this process. The encoders used by
    the transform_* steps are plain module-level objects: worker processes of
    a multiprocessing.Pool would each update their own copy, so ids would
    collide between workers and the dictionaries in this process would stay
    empty. Running in-process also keeps the batch order deterministic and
    lets any exception raised by a batch propagate instead of being dropped.
    """
    data = pd.read_csv(path, nrows=NROWS)
    data = data.rename(columns={'line_item_id': 'product_id'}).drop(columns=['opened_date', 'line_margin'])

    # np.array_split needs a positive integer number of sections; a frame
    # shorter than one batch is processed as a single batch.
    n_batches = max(1, data.shape[0] // batch_size)
    batches = [process_batch(batch) for batch in np.array_split(data, n_batches)]

    data = pd.concat(batches, ignore_index=True, sort=False)
    data = data.rename(columns={'key_id': 'transaction_key'})
    # Second pass merges receipt lines of a transaction that was split across
    # batch boundaries.
    # NOTE(review): rows of one (transaction, product) group that differ in
    # another column survive the per-batch collapse with the group total each,
    # so this pass sums those totals again -- confirm that this is intended.
    data = collapse(data)

    return data

In [52]:
# Weekday labels, numbered the way datetime.weekday() numbers them
# (Monday = 0 ... Sunday = 6). include_weekday() stores those codes in the
# `weekday` column, so the saved dictionary has to use the same order.
weekdays_id = KeyDict()
for day_name in (
    'Понедельник',
    'Вторник',
    'Среда',
    'Четверг',
    'Пятница',
    'Суббота',
    'Воскресенье',
):
    weekdays_id.push(day_name)
weekdays_id.save(DICTS_PATH + 'weekdays.json')

In [53]:
# Wall-clock timer for the whole preprocessing run.
time_0 = time.perf_counter()

# Fresh encoders for transaction keys, gids and line types. The transform_*
# functions look these names up as module-level globals, so they must exist
# before process_data() is called.
trans_id, gids_id, lines_id = KeyDict(), KeyDict(), KeyDict()

data_processed = process_data()
time_delta = time.perf_counter() - time_0
print('Overall time - {}'.format(time_delta))

In [None]:
# Show the processed frame as the cell output.
data_processed

In [None]:
# Persist the three encoders as JSON under DICTS_PATH, then write the
# processed dataset as CSV through save().
trans_id.save(DICTS_PATH + 'transaction_keys.json')
gids_id.save(DICTS_PATH + 'gids.json')
lines_id.save(DICTS_PATH + 'type_lines.json')
save(data_processed, 'data_processed.csv')