In [1]:
# %pip install pandas
# %pip install numpy
# %pip install matplotlib
# %pip install tqdm

In [2]:
import math
import multiprocessing
import time

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

tqdm.pandas()

from helper import KeyDict, save

In [3]:
FIGURES_PATH = 'out/figures/'
DATASETS_PATH = 'out/datasets/'
DICTS_PATH = 'out/dicts/'

In [4]:
data = pd.read_csv('datasets/receipts_new.csv', nrows=None)
data = data.rename(columns = {'line_item_id': 'product_id'})
data

Unnamed: 0,gid,transaction_key,store_id,opened_date,product_id,line_quantity,line_item_price,line_item_cost,line_margin,line_type
0,BipqVCA9TxmTukbRjRKJXg,009_002_74_2022-04-18 12:22:18_9058,2,2022-04-18,83591944,1.0,2990.0,2516.10,-24.43,Sales
1,b0R_ll-dSsSbcMvCgNhqyA,009_003_35_2022-08-26 18:00:29_7374,3,2022-08-26,15058563,2.0,1413.0,1206.73,412.54,Undefined
2,b0R_ll-dSsSbcMvCgNhqyA,009_003_35_2022-08-26 18:00:29_7374,3,2022-08-26,82148968,1.0,338.0,179.51,158.49,Undefined
3,b0R_ll-dSsSbcMvCgNhqyA,009_003_35_2022-08-26 18:00:29_7374,3,2022-08-26,82148968,3.0,338.0,179.51,475.47,Undefined
4,b0R_ll-dSsSbcMvCgNhqyA,009_003_35_2022-08-26 18:00:29_7374,3,2022-08-26,16042299,1.0,1634.0,1402.83,231.17,Undefined
...,...,...,...,...,...,...,...,...,...,...
27601037,DJf64XaDQSWhZgwwXld0nw,009_002_75_2022-09-08 11:14:12_4196,2,2022-09-08,12240853,21.0,43.0,44.66,-185.36,Sales
27601038,DJf64XaDQSWhZgwwXld0nw,009_002_75_2022-09-08 11:14:12_4196,2,2022-09-08,12240853,1.0,43.0,44.66,-8.83,Sales
27601039,DJf64XaDQSWhZgwwXld0nw,009_002_75_2022-09-08 11:14:12_4196,2,2022-09-08,82258971,1.0,950.0,969.86,-178.19,Sales
27601040,DJf64XaDQSWhZgwwXld0nw,009_002_75_2022-09-08 11:14:12_4196,2,2022-09-08,49000003,1.0,4.0,,,Sales


In [5]:
data.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27601042 entries, 0 to 27601041
Data columns (total 10 columns):
 #   Column           Dtype  
---  ------           -----  
 0   gid              object 
 1   transaction_key  object 
 2   store_id         int64  
 3   opened_date      object 
 4   product_id       int64  
 5   line_quantity    float64
 6   line_item_price  float64
 7   line_item_cost   float64
 8   line_margin      float64
 9   line_type        object 
dtypes: float64(4), int64(2), object(4)
memory usage: 9.0 GB


In [7]:
data.info()

In [8]:
data.describe()

Можно заметить, что совпадают некоторые строки по ключевому признаку - transaction_key, необходимо произвести схлопывание данных по одинаковым ключам с суммированием признаков line_quantity.

### Схлопывание одинаковых позиций в чеках

In [10]:
def collapse(data):
    grouped_series = data[['transaction_key', 'product_id', 'line_quantity']].groupby(
        by=['transaction_key', 'product_id'])['line_quantity'].sum()

    data = data.join(grouped_series, how='left', on=['transaction_key', 'product_id'], lsuffix='_old')
    data = data.drop(columns=['line_quantity_old'])
    return data.drop_duplicates()

### Преобразование transaction_key

In [12]:
def get_data(str):
    return str.split('_')[3:4][0]


def get_key(str):
    splitted = str.split('_')
    ans = ''
    for i in range(3):
        ans += splitted[i]
    ans += splitted[-1]
    return int(ans)


def transform_transaction_key(data, trans_id):
    dates = data['transaction_key'].apply(get_data)
    dates = pd.to_datetime(dates)
    data['datetime'] = dates

    data['transaction_key'] = data['transaction_key'].apply(lambda x: trans_id.get(x))
    return data

### Преобразование gid

In [14]:
def transform_gid(data, gids_id):
    data['gid'] = data['gid'].apply(lambda x: gids_id.get(x))
    return data

### Преобразование product_id

In [16]:
def transform_product_id(data, products_id):
    data['product_id'] = data['product_id'].apply(lambda x: products_id.get(x))
    return data

### Преобразование line_type

In [17]:
def transform_line_type(data, lines_id):
    data['line_type'] = data['line_type'].apply(lambda x: lines_id.get(x))
    return data

### Подключение категорий товаров

In [19]:
cats = pd.read_csv('datasets/ProductsModels.csv')
cats = cats.rename(columns={'model_id': 'category_id', 'name': 'category_name'})
cats['new_id'] = cats.index


cats.drop_duplicates()

Unnamed: 0,product_id,category_id,category_name,new_id
0,18843508,201658,Ручной инструмент для обработки почвы: культив...,0
1,82406527,201271,Комнатные растения,1
2,82597131,201274,Саженцы плодовых деревьев и кустарников,2
3,84738086,200990,Шторка для ванной комнаты,3
4,18816091,200340,Кабели для ТВ,4
...,...,...,...,...
540648,82192555,202275,Наконечник для карниза,540648
540649,82192571,202275,Наконечник для карниза,540649
540650,82088853,202276,Кронштейн для штанги со сверлением,540650
540651,81966519,202346,Ткани и тюли на отрез,540651


In [21]:
def include_categories(data):
    cats_new = cats.drop(columns=['category_name', 'category_id']).rename(columns={'new_id': 'category_id'})
    data = data.join(cats_new.set_index('product_id'), on='product_id', how='left')
    return data

### Добавление дней недели

In [23]:
def include_weekday(data):
    def get_weekday(datetime):
        return datetime.weekday()

    weekdays = data['datetime'].apply(get_weekday)
    data['weekday'] = weekdays
    return data

### Обработка всего набора данных

In [26]:
NROWS = None
BATCH_SIZE = 500_000
PATH = 'datasets/receipts_new.csv'

In [27]:
batches = []

def process_batch(batch, trans_id, gids_id, lines_id, products_id):
    time_batch_0 = time.perf_counter()
    batch = collapse(batch)
    batch = transform_transaction_key(batch, trans_id)
    batch = transform_gid(batch, gids_id)
    batch = transform_line_type(batch, lines_id)
    batch = include_categories(batch)
    batch = transform_product_id(batch, products_id)
    batch = include_weekday(batch)
    time_batch_delta = time.perf_counter() - time_batch_0
    print('Batch time - {:.3f}s'.format(time_batch_delta))
    return batch
    
def concat_batches(batch):
    global batches
    batches.append(batch)

def process_data(batch_size=BATCH_SIZE, path=PATH):
    global batches

    data = pd.read_csv(path, nrows=NROWS)
    print('Data has been read with {} rows'.format(data.shape[0]))

    data = data.rename(columns={'line_item_id': 'product_id'}).drop(columns=['opened_date', 'line_margin'])

    print('Collecting key-dictionaries...')

    def process_dict(data_column):
        dict = KeyDict()
        for row in data_column.drop_duplicates():
            dict.push(row)
        return dict

    trans_id = process_dict(data['transaction_key'])
    gids_id = process_dict(data['gid'])
    lines_id = process_dict(data['line_type'])
    products_id = process_dict(data['product_id'])

    print('Batching with {} batch size, {} batches count'.format(BATCH_SIZE, math.ceil(data.shape[0] / batch_size)))

    print('Running multiprocessing with {} cpu units...'.format(multiprocessing.cpu_count()))

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    for batch in tqdm(np.array_split(data, data.shape[0] / batch_size)):
        batch = process_batch(batch, trans_id, gids_id, lines_id, products_id)
        concat_batches(batch)
#         pool.apply_async(
#             process_batch,
#             args=(batch, trans_id, gids_id, lines_id, products_id,),
#             callback=concat_batches
#         )
    pool.close()
    pool.join()

    data = pd.concat(batches, ignore_index=True, sort=False)
    data = data.rename(columns={'key_id': 'transaction_key'})
    print('Collapsing all batches...')
    data = collapse(data)

    return data, trans_id, gids_id, lines_id, products_id

In [28]:
time_0 = time.perf_counter()

data_processed, trans_id, gids_id, lines_id, products_id = process_data()
time_delta = time.perf_counter() - time_0
print('Overall time - {:.3f}s'.format(time_delta))

Data has been read with 27601042 rows
Collecting key-dictionaries...
Batching with 500000 batch size, 56 batches count
Running multiprocessing with 16 cpu units...


  0%|          | 0/55 [00:00<?, ?it/s]

Batch time - 1.422s
Batch time - 1.379s
Batch time - 1.424s
Batch time - 1.360s
Batch time - 1.402s
Batch time - 1.427s
Batch time - 1.374s
Batch time - 1.411s
Batch time - 1.431s
Batch time - 1.437s
Batch time - 1.392s
Batch time - 1.407s
Batch time - 1.475s
Batch time - 1.379s
Batch time - 1.424s
Batch time - 1.448s
Batch time - 1.383s
Batch time - 1.438s
Batch time - 1.436s
Batch time - 1.393s
Batch time - 1.416s
Batch time - 1.439s
Batch time - 1.388s
Batch time - 1.435s
Batch time - 1.429s
Batch time - 1.385s
Batch time - 1.442s
Batch time - 1.421s
Batch time - 1.443s
Batch time - 1.385s
Batch time - 1.435s
Batch time - 1.424s
Batch time - 1.432s
Batch time - 1.387s
Batch time - 1.411s
Batch time - 1.420s
Batch time - 1.441s
Batch time - 1.435s
Batch time - 1.389s
Batch time - 1.439s
Batch time - 1.441s
Batch time - 1.397s
Batch time - 1.428s
Batch time - 1.452s
Batch time - 1.437s
Batch time - 1.446s
Batch time - 1.381s
Batch time - 1.437s
Batch time - 1.430s
Batch time - 1.449s


In [29]:
data_processed

Unnamed: 0,gid,transaction_key,store_id,product_id,line_item_price,line_item_cost,line_type,datetime,category_id,weekday,line_quantity
0,0,0,2,0,2990.0,2516.10,0,2022-04-18 12:22:18,49788.0,0,1.0
1,1,1,3,1,1413.0,1206.73,1,2022-08-26 18:00:29,140669.0,4,2.0
2,1,1,3,2,338.0,179.51,1,2022-08-26 18:00:29,157827.0,4,4.0
3,1,1,3,3,1634.0,1402.83,1,2022-08-26 18:00:29,,4,1.0
4,1,1,3,4,78.0,61.70,1,2022-08-26 18:00:29,484922.0,4,40.0
...,...,...,...,...,...,...,...,...,...,...,...
20672577,861563,3945281,2,70820,1925.0,1469.14,0,2022-09-08 11:14:12,381032.0,3,1.0
20672578,861563,3945281,2,3161,43.0,44.66,0,2022-09-08 11:14:12,33888.0,3,22.0
20672579,861563,3945281,2,2612,950.0,969.86,0,2022-09-08 11:14:12,13517.0,3,1.0
20672580,861563,3945281,2,51,4.0,,0,2022-09-08 11:14:12,,3,1.0


In [30]:
trans_id.save(DICTS_PATH + 'transaction_keys.json')
gids_id.save(DICTS_PATH + 'gids.json')
lines_id.save(DICTS_PATH + 'type_lines.json')
products_id.save(DICTS_PATH + 'products.json')

save(data_processed, 'data_processed.csv', DATASETS_PATH)

3945282
999188
5
103406
SAVING dataset: DATA_PROCESSED.CSV

<class 'pandas.core.frame.DataFrame'>
Index: 20662675 entries, 0 to 20672581
Data columns (total 11 columns):
 #   Column           Dtype         
---  ------           -----         
 0   gid              int64         
 1   transaction_key  int64         
 2   store_id         int64         
 3   product_id       int64         
 4   line_item_price  float64       
 5   line_item_cost   float64       
 6   line_type        int64         
 7   datetime         datetime64[ns]
 8   category_id      float64       
 9   weekday          int64         
 10  line_quantity    float64       
dtypes: datetime64[ns](1), float64(4), int64(6)
memory usage: 1.8 GB
