In [1]:
# %pip install pandas
# %pip install numpy
# %pip install matplotlib
# %pip install tqdm

In [2]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import random
from tqdm.notebook import tqdm
import json
import multiprocessing
import time

In [3]:
FIGURES_PATH = 'out/figures/'
DATASETS_PATH = 'out/datasets/'
DICTS_PATH = 'out/dicts/'

In [4]:
# data = pd.read_csv('datasets/receipts_new.csv', nrows=5_000_000)
# data = data.rename(columns = {'line_item_id': 'product_id'})
# data
# data.head(1000).to_csv('datasets/receipts_sample.csv')

In [5]:
# data.info(memory_usage='deep')

Можно заметить, что совпадают некоторые строки по ключевому признаку - transaction_key, необходимо произвести схлопывание данных по одинаковым ключам с суммированием признаков line_quantity.

### Сохранение обработанных данных

In [6]:
def save(data, name):
    print('SAVING dataset: {}\n'.format(name.upper()))
    data.info(memory_usage='deep')
    data.to_csv(DATASETS_PATH + name)

### Схлопывание одинаковых позиций в чеках

In [7]:
def collapse(data):
    # print('Collapsing data')
    data_grouped = data[['transaction_key', 'product_id', 'line_quantity']].groupby(by = ['transaction_key', 'product_id'])
    grouped_series = data_grouped['line_quantity'].agg(lambda x: sum(x))
    data = data.join(grouped_series, how='left', on=['transaction_key', 'product_id'], rsuffix='LA')
    data = data.drop(columns=['line_quantity'])
    data = data.rename(columns={'line_quantityLA': 'line_quantity'})
    return data.drop_duplicates()

In [8]:
# data = collapse(data)
# data

### Преобразование transaction_key

In [9]:
def get_data(str):
    return str.split('_')[3:4][0]

def get_key(str):
    splitted = str.split('_')
    ans = ''
    for i in range(3):
        ans += splitted[i]
    ans += splitted[-1]
    return int(ans)

def transform_transaction_key(data, trans_id):
    # print('Transforming dates')
    dates = data['transaction_key'].apply(get_data)
    dates = pd.to_datetime(dates)
    data['datetime'] = dates

    # print('Transforming keys')
    keys = data['transaction_key']
    keys = keys.drop_duplicates()
    keys = keys.to_frame()

    for ind, key in keys.iterrows():
        keys.at[ind, 'key_id'] = trans_id.push(key['transaction_key'])

    data = data.join(keys.set_index('transaction_key'), how='left', on='transaction_key')
    data = data.drop(columns=['transaction_key'])
    return data, trans_id

In [10]:
# data = transform_transaction_key(data)
# data

### Преобразование gid

In [11]:
def transform_gid(data, gids_id):
    # print('Transforming gid')
    gids = data['gid']
    gids = gids.drop_duplicates()
    gids = gids.to_frame()

    for ind, gid in gids.iterrows():
        gids.at[ind, 'gid_id'] = gids_id.push(gid['gid'])

    data = data.join(gids.set_index('gid'), on='gid', how='left')
    return data.drop(columns=['gid']), gids_id

In [12]:
# data = transform_gid(data)
# data

### Преобразование line_type

In [13]:
def transform_line_type(data, lines_id):
    # print('Transforming line_type')
    line_types = data['line_type']
    line_types = line_types.drop_duplicates()
    line_types = line_types.to_frame()

    for ind, line_type in line_types.iterrows():
        line_types.at[ind, 'line_type_id'] = lines_id.push(line_type['line_type'])

    data = data.join(line_types.set_index('line_type'), on='line_type', how='left')
    return data.drop(columns=['line_type']), lines_id

In [14]:
# data = transform_line_type(data)
# data

### Подключение категорий товаров

In [15]:
def include_categories(data, cats_path='datasets/ProductsModels.csv'):
    # print('Including products categories')
    cats = pd.read_csv(cats_path)
    cats = cats.rename(columns = {'model_id': 'category_id', 'name': 'category_name'})
    cats = cats.drop(columns = ['category_name'])
    data = data.join(cats.set_index('product_id'), on='product_id', how='left')
    return data

In [16]:
# data = include_categories(data, 'datasets/ProductsModels.csv')

### Добавление дней недели

In [17]:
weekday_dict = {
    0: 'Воскресенье',
    1: 'Понедельник',
    2: 'Вторник',
    3: 'Среда',
    4: 'Четверг',
    5: 'Пятница',
    6: 'Суббота',
               }

weekday_dict.values()
weekdays_series = pd.Series(weekday_dict.values())
weekdays_series = weekdays_series.to_frame()
weekdays_series['weekday_id'] = weekdays_series.index

save(weekdays_series, 'weekdays_keys.csv')

SAVING dataset: WEEKDAYS_KEYS.CSV

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   0           7 non-null      object
 1   weekday_id  7 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 868.0 bytes


In [18]:
def include_weekday(data):
    # print('Including weekdays')
    def get_weekday(datetime):
        return datetime.weekday()

    weekdays = data['datetime'].apply(get_weekday)
    data['weekday'] = weekdays
    return data

In [19]:
# data = include_weekday(data)
# data

### Обработка всего набора данных

### Класс для ключей

In [20]:
class KeyDict:
    def __init__(self):
        self.max = 0
        self.dict = {}

    def push(self, obj):
        if obj in self.dict:
            # print(self.dict[obj])
            return self.dict[obj]
        self.dict[obj] = self.max
        self.max += 1
        # print(self.max - 1)
        return self.max - 1

    def get(self, obj):
        if obj not in self.dict:
            return None
        return self.dict[obj]

    def save(self, path):
        with open(path, 'w') as file:
            file.write(json.dumps(self.dict))


In [21]:
NROWS = 200_000
BATCH_SIZE = 200_000
PATH = 'datasets/receipts_new.csv'

In [22]:
# def process_batch(batch, trans_id, gid_id, types_id):
#     time_batch_0 = time.perf_counter()
#     batch = collapse(batch)
#     batch, trans_id = transform_transaction_key(batch, trans_id)
#     batch, gid_id = transform_gid(batch, gid_id)
#     batch, types_id = transform_line_type(batch, types_id)
#     batch = include_categories(batch, 'datasets/ProductsModels.csv')
#     batch = include_weekday(batch)
#
#     time_batch_delta = time.perf_counter() - time_batch_0
#     print('Batch time - {}'.format(time_batch_delta))
#     return batch, trans_id, gid_id, types_id
#
#
# def process_data(batch_size=BATCH_SIZE, path=PATH):
#     batches = pd.DataFrame()
#     trans_id = KeyDict()
#     gid_id = KeyDict()
#     types_id = KeyDict()
#
#     def concat_batches(batch, tr_id, gi_id, ty_id):
#         nonlocal batches, trans_id, gid_id, types_id
#
#         print('\n\n' + batch.info() + '\n\n')
#         trans_id, gid_id, types_id = tr_id, gi_id, ty_id
#         batches = pd.concat([batches, batch], ignore_index=True, sort=False)
#
#     data = pd.read_csv(path, nrows=NROWS)
#     data = data.rename(columns={'line_item_id': 'product_id'}).drop(columns=['opened_date', 'line_margin'])
#
#
#     weekdays_id = KeyDict()
#     weekdays_id.push('Воскресенье')
#     weekdays_id.push('Понедельник')
#     weekdays_id.push('Втроник')
#     weekdays_id.push('Среда')
#     weekdays_id.push('Четверг')
#     weekdays_id.push('Пятница')
#     weekdays_id.push('Суббота')
#
#
#     pool = multiprocessing.Pool(processes=multiprocessing.cpu_count() - 1)
#
#     for batch in np.array_split(data, data.shape[0] / batch_size):
#         pool.apply_async(
#             process_batch,
#             args=(batch, trans_id, gid_id, types_id),
#             callback=concat_batches
#         )
#     pool.close()
#     pool.join()
#
#     batches = collapse(batches)
#     batches, trans_id = transform_transaction_key(batches, trans_id)
#     batches, gid_id = transform_gid(batches, gid_id)
#     batches, types_id = transform_line_type(batches, types_id)
#
#     return batches, trans_id, gid_id, types_id, weekdays_id

In [23]:
def process_batch(batch):
    global trans_id, gid_id, types_id
    time_batch_0 = time.perf_counter()
    # batch = batch.rename(columns = {'line_item_id': 'product_id'})
    batch = collapse(batch)
    batch, trans_id = transform_transaction_key(batch, trans_id)
    batch, gid_id = transform_gid(batch, gid_id)
    batch, types_id = transform_line_type(batch, types_id)
    batch = include_categories(batch, 'datasets/ProductsModels.csv')
    batch = include_weekday(batch)

    time_batch_delta = time.perf_counter() - time_batch_0
    print('Batch time - {}'.format(time_batch_delta))
    return batch


def process_data(batch_size=BATCH_SIZE, path=PATH):
    global trans_id, gid_id, types_id
    batches = pd.DataFrame()

    def concat_batches(batch):
        nonlocal batches
        print(trans_id.max)
        batches = pd.concat([batches, batch], ignore_index=True, sort=False)

    data = pd.read_csv(path, nrows=NROWS)
    data = data.rename(columns={'line_item_id': 'product_id'}).drop(columns=['opened_date', 'line_margin'])

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())

    for batch in np.array_split(data, data.shape[0] / batch_size):
        pool.apply_async(
            process_batch,
            args=(batch,),
            callback=concat_batches
        )
    pool.close()
    pool.join()

    batches = batches.rename(columns = {'key_id': 'transaction_key'})
    batches = collapse(batches)

    return batches

In [24]:
time_0 = time.perf_counter()
trans_id, gid_id, types_id, weekdays_id = KeyDict(), KeyDict(), KeyDict(), KeyDict()
weekdays_id.push('Воскресенье')
weekdays_id.push('Понедельник')
weekdays_id.push('Вторник')
weekdays_id.push('Среда')
weekdays_id.push('Четверг')
weekdays_id.push('Пятница')
weekdays_id.push('Суббота')

data_processed = process_data()
time_delta = time.perf_counter() - time_0
print('Overall time - {}'.format(time_delta))

Batch time - 10.59712091900019


Exception in thread Thread-7 (_handle_results):
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.10/multiprocessing/pool.py", line 595, in _handle_results
    cache[job]._set(i, obj)
  File "/usr/lib/python3.10/multiprocessing/pool.py", line 779, in _set
    self._callback(self._value)
TypeError: process_data.<locals>.concat_batches() missing 3 required positional arguments: 'tr_id', 'gi_id', and 'ty_id'
Process ForkPoolWorker-1:
Process ForkPoolWorker-14:
Process ForkPoolWorker-5:
Process ForkPoolWorker-6:
Process ForkPoolWorker-4:
Process ForkPoolWorker-9:
Process ForkPoolWorker-3:
Process ForkPoolWorker-12:
Process ForkPoolWorker-8:
Process ForkPoolWorker-13:
Process ForkPoolWorker-7:
Process ForkPoolWorker-2:
Process ForkPoolWorker-15:
Process ForkPoolWorker-11:
Process ForkPoolW

KeyboardInterrupt: 

In [None]:
print(multiprocessing.cpu_count())

In [None]:
data_processed

In [None]:
trans_id.save(DICTS_PATH + 'transaction_keys.json')
gid_id.save(DICTS_PATH + 'gids.json')
types_id.save(DICTS_PATH + 'type_lines.json')
weekdays_id.save(DICTS_PATH + 'weekdays.json')
save(data_processed, 'data_processed.csv')