In [52]:
# %pip install pandas
# %pip install numpy
# %pip install matplotlib
# %pip install tqdm

In [53]:

import math
import multiprocessing
import time

import numpy as np
import pandas as pd
from tqdm import tqdm

from helper import KeyDict, save

In [54]:
# Output location prefixes (relative paths, concatenated with file names via `+`).
FIGURES_PATH = 'out/figures/'  # not referenced in the visible cells of this notebook
DATASETS_PATH = 'out/datasets/'  # prefix for CSV outputs (categories_id.csv, the processed dataset)
DICTS_PATH = 'out/dicts/'  # prefix for the key-dictionary files written with `.save()`

In [55]:
# data = pd.read_csv('datasets/receipts_new.csv', nrows=5_000_000)
# data = data.rename(columns = {'line_item_id': 'product_id'})
# data
# data.head(1000).to_csv('datasets/receipts_sample.csv')

In [56]:
# data.info(memory_usage='deep')

Можно заметить, что некоторые строки совпадают по ключевому признаку `transaction_key`. Такие строки необходимо схлопнуть: для одинаковых пар (`transaction_key`, `product_id`) значения признака `line_quantity` суммируются.

### Сохранение обработанных данных

In [57]:
# def save(data, name):
#     print('SAVING dataset: {}\n'.format(name.upper()))
#     data.info(memory_usage='deep')
#     data.to_csv(DATASETS_PATH + name)

### Схлопывание одинаковых позиций в чеках

In [58]:
def collapse(data):
    """Replace per-line quantities with the total per (transaction_key, product_id).

    Every row receives the summed ``line_quantity`` of its
    (``transaction_key``, ``product_id``) group, after which fully identical
    rows are removed. The ``line_quantity`` column ends up as the last column
    of the result; the original index labels of the surviving rows are kept.

    NOTE(review): the function is not idempotent. If rows of one group differ
    in some other column they all survive ``drop_duplicates`` (each carrying
    the group total), so a second call would sum those totals again.

    :param data: frame with at least ``transaction_key``, ``product_id`` and
        ``line_quantity`` columns.
    :return: new frame with summed quantities and exact duplicates dropped.
    """
    group_keys = ['transaction_key', 'product_id']
    totals = data.groupby(by=group_keys)['line_quantity'].sum()

    # Remove the per-line quantity first, then attach the per-group total under
    # the same column name.
    without_quantity = data.drop(columns=['line_quantity'])
    with_totals = without_quantity.join(totals, on=group_keys, how='left')
    return with_totals.drop_duplicates()

In [59]:
# data = collapse(data)
# data

### Преобразование transaction_key

In [60]:
def get_data(str):
    """Return the fourth ``_``-separated field of a transaction key.

    ``transform_transaction_key`` feeds the returned text to ``pd.to_datetime``.

    :param str: raw transaction key (the name shadows the builtin; kept for
        compatibility with existing callers).
    :raises IndexError: if the key has fewer than four ``_``-separated fields.
    """
    fields = str.split('_')
    return fields[3]


def get_key(str):
    """Build an integer from the first three and the last ``_``-separated fields.

    The four fields are concatenated as text and the result is parsed with
    ``int``.

    :param str: raw transaction key (the name shadows the builtin; kept for
        compatibility with existing callers).
    :raises IndexError: if the key has fewer than three fields.
    :raises ValueError: if the concatenated text is not a valid integer.
    """
    fields = str.split('_')
    # Index explicitly (no slicing) so that short keys still raise IndexError.
    pieces = [fields[position] for position in (0, 1, 2)]
    pieces.append(fields[-1])
    return int(''.join(pieces))


def transform_transaction_key(data, trans_id):
    """Derive ``datetime`` from the raw transaction key and encode the key.

    The frame is modified in place and also returned:

    * ``datetime`` - the field extracted by ``get_data`` parsed with
      ``pd.to_datetime``;
    * ``transaction_key`` - replaced by ``trans_id.get(raw_key)``.

    :param data: frame with a ``transaction_key`` column of raw string keys.
    :param trans_id: object exposing ``get(raw_key)``; what it returns for an
        unknown key is up to that object.
    :return: the same ``data`` object.
    """
    raw_keys = data['transaction_key']
    data['datetime'] = pd.to_datetime(raw_keys.apply(get_data))
    data['transaction_key'] = raw_keys.apply(trans_id.get)
    return data

In [61]:
# data = transform_transaction_key(data)
# data

### Преобразование gid

In [62]:
def transform_gid(data, gids_id):
    """Replace every ``gid`` value with ``gids_id.get(value)``.

    The frame is modified in place and also returned.

    :param data: frame with a ``gid`` column.
    :param gids_id: object exposing ``get(value)``; what it returns for an
        unknown value is up to that object.
    :return: the same ``data`` object.
    """
    encoded = data['gid'].apply(gids_id.get)
    data['gid'] = encoded
    return data

In [63]:
# data = transform_gid(data)
# data

### Преобразование product_id

In [64]:
def transform_product_id(data, products_id):
    """Replace every ``product_id`` value with ``products_id.get(value)``.

    The frame is modified in place and also returned.

    :param data: frame with a ``product_id`` column.
    :param products_id: object exposing ``get(value)``; what it returns for an
        unknown value is up to that object.
    :return: the same ``data`` object.
    """
    encoded = data['product_id'].apply(products_id.get)
    data['product_id'] = encoded
    return data

### Преобразование line_type

In [65]:
def transform_line_type(data, lines_id):
    """Replace every ``line_type`` value with ``lines_id.get(value)``.

    The frame is modified in place and also returned.

    :param data: frame with a ``line_type`` column.
    :param lines_id: object exposing ``get(value)``; what it returns for an
        unknown value is up to that object.
    :return: the same ``data`` object.
    """
    encoded = data['line_type'].apply(lines_id.get)
    data['line_type'] = encoded
    return data

In [66]:
# data = transform_line_type(data)
# data

### Подключение категорий товаров

In [67]:
# Product/category reference table: `model_id` and `name` are renamed to
# `category_id` and `category_name`, and `new_id` holds the row's index label.
# NOTE(review): `new_id` is the row position in the CSV, so it is unique per row
# rather than per `model_id` - confirm this is the intended category id.
cats = (
    pd.read_csv('datasets/ProductsModels.csv')
    .rename(columns={'model_id': 'category_id', 'name': 'category_name'})
    .assign(new_id=lambda frame: frame.index)
)
cats.to_csv(DATASETS_PATH + 'categories_id.csv')

In [68]:
def include_categories(data):
    """Left-join the category lookup built from the module-level ``cats`` frame.

    ``cats`` loses its ``category_name`` and original ``category_id`` columns,
    ``new_id`` is renamed to ``category_id`` and the remaining columns are
    joined onto ``data`` by ``product_id``.

    NOTE(review): ``data['product_id']`` must hold the same kind of ids as
    ``cats['product_id']`` for the join to find matches.

    :param data: frame with a ``product_id`` column.
    :return: new frame with the lookup columns appended (missing where no match).
    """
    lookup = cats.drop(columns=['category_name', 'category_id'])
    lookup = lookup.rename(columns={'new_id': 'category_id'}).set_index('product_id')
    return data.join(lookup, on='product_id', how='left')

In [69]:
# data = include_categories(data, 'datasets/ProductsModels.csv')

### Добавление дней недели

In [70]:
def include_weekday(data):
    """Add a ``weekday`` column computed from the ``datetime`` column.

    Each value is the result of calling ``.weekday()`` on the corresponding
    ``datetime`` entry. The frame is modified in place and also returned.

    :param data: frame with a ``datetime`` column whose values have a
        ``weekday()`` method.
    :return: the same ``data`` object.
    """
    data['weekday'] = data['datetime'].apply(lambda moment: moment.weekday())
    return data

In [71]:
# data = include_weekday(data)
# data

### Обработка всего набора данных

### Класс для ключей

In [72]:
# class KeyDict:
#     def __init__(self, name='Default'):
#         self.name = name
#         self.max = 0
#         self.dict = {}
#
#     def push(self, obj):
#         if obj in self.dict:
#             return self.dict[obj]
#         self.dict[obj] = self.max
#         self.max += 1
#         return self.max - 1
#
#     def get(self, obj):
#         if obj not in self.dict:
#             return None
#         return self.dict[obj]
#
#     def save(self, path):
#         print(len(self.dict))
#         with open(path, 'w') as file:
#             file.write(json.dumps(self.dict))
#
#     def load(self, path):
#         with open(path) as json_file:
#             self.dict = json.load(json_file)
#


In [73]:
NROWS = 10_000_000  # maximum number of rows read from the receipts CSV (passed to pd.read_csv as nrows)
BATCH_SIZE = 250_000  # default value of process_data's batch_size argument
PATH = 'datasets/receipts_new.csv'  # default input CSV of process_data

In [74]:
def process_data(batch_size=BATCH_SIZE, path=PATH, nrows=NROWS):
    """Read the raw receipts CSV and turn it into the processed dataset.

    Pipeline:

    1. read at most ``nrows`` rows from ``path``, rename ``line_item_id`` to
       ``product_id`` and drop ``opened_date`` / ``line_margin``;
    2. build one ``KeyDict`` per id-like column (``transaction_key``, ``gid``,
       ``line_type``, ``product_id``) from its distinct values;
    3. transform the data batch by batch (datetime extraction, id encoding,
       category join, weekday);
    4. concatenate the batches and collapse duplicated receipt lines once.

    :param batch_size: number of rows per batch; must be positive.
    :param path: path of the input CSV.
    :param nrows: maximum number of rows to read (``None`` reads everything).
    :return: tuple ``(data, trans_id, gids_id, lines_id, products_id)``.
    :raises ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be positive, got {}'.format(batch_size))

    data = pd.read_csv(path, nrows=nrows)
    print('Data has been read with {} rows'.format(data.shape[0]))

    data = data.rename(columns={'line_item_id': 'product_id'}).drop(columns=['opened_date', 'line_margin'])

    print('Collecting key-dictionaries...')

    def process_dict(data_column):
        # Register every distinct value of the column in a fresh KeyDict.
        key_dict = KeyDict()
        for value in data_column.drop_duplicates():
            key_dict.push(value)
        return key_dict

    trans_id = process_dict(data['transaction_key'])
    gids_id = process_dict(data['gid'])
    lines_id = process_dict(data['line_type'])
    products_id = process_dict(data['product_id'])

    total_rows = data.shape[0]
    # Fixed-size slices instead of np.array_split(data, rows / batch_size): the
    # float section count was truncated (disagreeing with the printed ceil
    # value) and raised for inputs shorter than one batch. max(..., 1) keeps a
    # single (empty) batch for an empty input so pd.concat always gets a frame.
    batch_starts = range(0, max(total_rows, 1), batch_size)

    print('Batching with {} batch size, {} batches count'.format(batch_size, len(batch_starts)))

    def process_batch(batch):
        batch = transform_transaction_key(batch, trans_id)
        batch = transform_gid(batch, gids_id)
        batch = transform_line_type(batch, lines_id)
        # The category join must see the ORIGINAL product ids: `cats` is keyed
        # by the raw product_id, so it has to run before the ids are re-encoded.
        batch = include_categories(batch)
        batch = transform_product_id(batch, products_id)
        batch = include_weekday(batch)
        return batch

    # No multiprocessing.Pool here: the previous one was created but never
    # given any work (and was not closed if a batch raised).
    batches = []
    for start in tqdm(batch_starts):
        # Copy the slice because the transform_* helpers assign columns in place.
        batch = data.iloc[start:start + batch_size].copy()
        batches.append(process_batch(batch))

    data = pd.concat(batches, ignore_index=True, sort=False)
    print('Collapsing all batches...')
    # collapse() runs exactly once, over the whole dataset: it is not
    # idempotent, so collapsing each batch and then the concatenation summed
    # quantities twice for groups whose rows were not exact duplicates.
    data = collapse(data)

    return data, trans_id, gids_id, lines_id, products_id

In [None]:
# Run the whole pipeline with its default arguments and report the elapsed time.
time_0 = time.perf_counter()

data_processed, trans_id, gids_id, lines_id, products_id = process_data()
time_delta = time.perf_counter() - time_0
print('Overall time - {:.3f}s'.format(time_delta))

In [None]:
# Show the processed frame as the cell output.
data_processed

In [None]:
# Persist the four key dictionaries next to each other under DICTS_PATH.
trans_id.save(DICTS_PATH + 'transaction_keys.json')
gids_id.save(DICTS_PATH + 'gids.json')
lines_id.save(DICTS_PATH + 'type_lines.json')
products_id.save(DICTS_PATH + 'products.json')

# `save` comes from the local `helper` module; presumably it writes the frame
# to DATASETS_PATH + 'data_processed.csv' - confirm against helper.save.
save(data_processed, 'data_processed.csv', DATASETS_PATH)