In [36]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import random

In [37]:
# Output directory prefixes, relative to the working directory.
# DATASETS_PATH is prepended to the file name inside save().
# FIGURES_PATH is not referenced in the visible cells (presumably for saved plots - confirm).
FIGURES_PATH = 'out/figures/'
DATASETS_PATH = 'out/datasets/'

In [38]:
# Read at most 5,000,000 receipt lines and expose the item identifier
# under the name product_id, which the later cells rely on.
data = (
    pd.read_csv('datasets/receipts_new.csv', nrows=5_000_000)
    .rename(columns={'line_item_id': 'product_id'})
)
data
# data.head(1000).to_csv('datasets/receipts_sample.csv')

Unnamed: 0,gid,transaction_key,store_id,opened_date,product_id,line_quantity,line_item_price,line_item_cost,line_margin,line_type
0,BipqVCA9TxmTukbRjRKJXg,009_002_74_2022-04-18 12:22:18_9058,2,2022-04-18,83591944,1.0,2990.0,2516.10,-24.43,Sales
1,b0R_ll-dSsSbcMvCgNhqyA,009_003_35_2022-08-26 18:00:29_7374,3,2022-08-26,15058563,2.0,1413.0,1206.73,412.54,Undefined
2,b0R_ll-dSsSbcMvCgNhqyA,009_003_35_2022-08-26 18:00:29_7374,3,2022-08-26,82148968,1.0,338.0,179.51,158.49,Undefined
3,b0R_ll-dSsSbcMvCgNhqyA,009_003_35_2022-08-26 18:00:29_7374,3,2022-08-26,82148968,3.0,338.0,179.51,475.47,Undefined
4,b0R_ll-dSsSbcMvCgNhqyA,009_003_35_2022-08-26 18:00:29_7374,3,2022-08-26,16042299,1.0,1634.0,1402.83,231.17,Undefined
...,...,...,...,...,...,...,...,...,...,...
4999995,tbW0qVpPSrWJPBa2uF-MNw,009_004_10_2022-10-02 15:20:04_7015,4,2022-10-02,13426158,1.0,75.0,43.43,19.07,Sales
4999996,tbW0qVpPSrWJPBa2uF-MNw,009_004_10_2022-10-02 15:20:04_7015,4,2022-10-02,13426158,1.0,75.0,43.43,19.07,Sales
4999997,tbW0qVpPSrWJPBa2uF-MNw,009_004_10_2022-10-02 15:20:04_7015,4,2022-10-02,82138914,1.0,140.0,68.59,48.08,Sales
4999998,tbW0qVpPSrWJPBa2uF-MNw,009_004_10_2022-10-02 15:20:04_7015,4,2022-10-02,82138914,1.0,140.0,68.59,48.08,Sales


In [39]:
# Print column dtypes and memory use; 'deep' asks pandas to measure the
# actual size of object (string) columns instead of estimating it.
data.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000000 entries, 0 to 4999999
Data columns (total 10 columns):
 #   Column           Dtype  
---  ------           -----  
 0   gid              object 
 1   transaction_key  object 
 2   store_id         int64  
 3   opened_date      object 
 4   product_id       int64  
 5   line_quantity    float64
 6   line_item_price  float64
 7   line_item_cost   float64
 8   line_margin      float64
 9   line_type        object 
dtypes: float64(4), int64(2), object(4)
memory usage: 1.6 GB


Можно заметить, что некоторые строки совпадают по ключу чека (transaction_key) и товару (product_id): одна и та же позиция встречается в чеке несколько раз. Такие строки необходимо схлопнуть в одну, просуммировав значения признака line_quantity.

### Сохранение обработанных данных

In [71]:
def save(data, name):
    """Print a summary of ``data`` and write it as CSV under ``DATASETS_PATH``.

    Args:
        data: DataFrame to persist.
        name: File name (with extension) relative to ``DATASETS_PATH``.

    The frame's index is written as the first CSV column (``to_csv`` default).
    """
    print('Dataset: {}\n'.format(name.upper()))
    data.info(memory_usage='deep')

    target = os.path.join(DATASETS_PATH, name)
    target_dir = os.path.dirname(target)
    # Create the output directory on first use so a fresh checkout does not
    # fail with "No such file or directory".
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    data.to_csv(target)

### Схлопывание одинаковых позиций в чеках

In [40]:
def collapse(data):
    """Merge repeated receipt lines for the same product within a transaction.

    For every (transaction_key, product_id) pair the ``line_quantity`` values
    are summed, ``line_margin`` is dropped, and fully identical rows are then
    removed. The input frame is not modified.

    Args:
        data: DataFrame with at least the columns ``transaction_key``,
            ``product_id``, ``line_quantity`` and ``line_margin``.

    Returns:
        A new DataFrame without ``line_margin`` whose ``line_quantity`` (moved
        to the last column) holds the per-pair total. The original index
        labels of the surviving rows are kept.

    Notes:
        * Duplicates are detected across all remaining columns, so rows of the
          same pair that differ in another column (e.g. price) stay separate
          and each carries the pair's full total.
        * The built-in group sum skips missing quantities instead of turning
          the whole total into NaN.
    """
    pair = ['transaction_key', 'product_id']
    # Vectorised per-group total broadcast back to every row; avoids calling a
    # Python-level sum once per group and the join/rename round-trip.
    totals = data.groupby(pair)['line_quantity'].transform('sum')
    collapsed = (
        data
        .drop(columns=['line_quantity', 'line_margin'])
        .assign(line_quantity=totals)
    )
    return collapsed.drop_duplicates()

In [42]:
# Replace the raw lines with one row per distinct collapsed line and show the result.
data = collapse(data)
data

Unnamed: 0,gid,transaction_key,store_id,opened_date,product_id,line_item_price,line_item_cost,line_type,line_quantity
0,BipqVCA9TxmTukbRjRKJXg,009_002_74_2022-04-18 12:22:18_9058,2,2022-04-18,83591944,2990.0,2516.10,Sales,1.0
1,b0R_ll-dSsSbcMvCgNhqyA,009_003_35_2022-08-26 18:00:29_7374,3,2022-08-26,15058563,1413.0,1206.73,Undefined,2.0
2,b0R_ll-dSsSbcMvCgNhqyA,009_003_35_2022-08-26 18:00:29_7374,3,2022-08-26,82148968,338.0,179.51,Undefined,4.0
4,b0R_ll-dSsSbcMvCgNhqyA,009_003_35_2022-08-26 18:00:29_7374,3,2022-08-26,16042299,1634.0,1402.83,Undefined,1.0
5,b0R_ll-dSsSbcMvCgNhqyA,009_003_35_2022-08-26 18:00:29_7374,3,2022-08-26,82107700,78.0,61.70,Undefined,40.0
...,...,...,...,...,...,...,...,...,...
4999993,CVDg3s84SQ6UkMm2QOLu1A,009_003_6_2022-09-10 10:31:18_7691,3,2022-09-10,82800679,103.0,47.79,Sales,1.0
4999994,tbW0qVpPSrWJPBa2uF-MNw,009_004_10_2022-10-02 15:20:04_7015,4,2022-10-02,84817570,138.0,86.00,Sales,1.0
4999995,tbW0qVpPSrWJPBa2uF-MNw,009_004_10_2022-10-02 15:20:04_7015,4,2022-10-02,13426158,75.0,43.43,Sales,2.0
4999997,tbW0qVpPSrWJPBa2uF-MNw,009_004_10_2022-10-02 15:20:04_7015,4,2022-10-02,82138914,140.0,68.59,Sales,2.0


### Преобразование transaction_key

In [44]:
def get_data(str):
    """Return the fourth underscore-separated field of a transaction key.

    For keys such as ``'009_002_74_2022-04-18 12:22:18_9058'`` that field is
    the timestamp text ``'2022-04-18 12:22:18'``.

    Raises:
        IndexError: if the key has fewer than four fields.
    """
    # The parameter name shadows the builtin ``str``; it is kept so that
    # keyword callers continue to work.
    fields = str.split('_')
    return fields[3]

def get_key(str):
    """Build an integer from the first three and the last fields of a key.

    The key is split on ``'_'``; fields 0, 1, 2 and the final field are
    concatenated as text and the result is parsed with ``int``.

    Raises:
        IndexError: if the key has fewer than three fields.
        ValueError: if the concatenated text is not a valid integer.
    """
    parts = str.split('_')
    digits = ''.join(parts[position] for position in (0, 1, 2, -1))
    return int(digits)

def transform_transaction_key(data):
    """Replace ``transaction_key`` with a parsed timestamp and an integer id.

    Adds a ``datetime`` column parsed from the timestamp field of each key
    (see ``get_data``), numbers the distinct keys in order of first appearance
    as ``key_id``, writes the key -> id table to ``transaction_key_keys.csv``
    via ``save`` and drops the original text column.

    Args:
        data: DataFrame with a ``transaction_key`` column of underscore-
            separated keys whose fourth field is a timestamp.

    Returns:
        A new DataFrame with ``datetime`` and ``key_id`` appended and without
        ``transaction_key``. The input frame is left unmodified.
    """
    timestamps = pd.to_datetime(data['transaction_key'].apply(get_data))
    # assign() builds a new frame, so the caller's object is not mutated.
    data = data.assign(datetime=timestamps)

    keys = data['transaction_key'].drop_duplicates().to_frame()
    keys = keys.assign(key_id=list(range(len(keys))))
    save(keys, 'transaction_key_keys.csv')

    # The lookup table must stay indexed by transaction_key for the join;
    # renaming or dropping that column first leaves nothing to match on.
    data = data.join(keys.set_index('transaction_key'), on='transaction_key', how='left')
    return data.drop(columns=['transaction_key'])

Unnamed: 0,gid,transaction_key,store_id,opened_date,product_id,line_item_price,line_item_cost,line_type,line_quantity,datetime
0,BipqVCA9TxmTukbRjRKJXg,009_002_74_2022-04-18 12:22:18_9058,2,2022-04-18,83591944,2990.0,2516.10,Sales,1.0,2022-04-18 12:22:18
1,b0R_ll-dSsSbcMvCgNhqyA,009_003_35_2022-08-26 18:00:29_7374,3,2022-08-26,15058563,1413.0,1206.73,Undefined,2.0,2022-08-26 18:00:29
2,b0R_ll-dSsSbcMvCgNhqyA,009_003_35_2022-08-26 18:00:29_7374,3,2022-08-26,82148968,338.0,179.51,Undefined,4.0,2022-08-26 18:00:29
4,b0R_ll-dSsSbcMvCgNhqyA,009_003_35_2022-08-26 18:00:29_7374,3,2022-08-26,16042299,1634.0,1402.83,Undefined,1.0,2022-08-26 18:00:29
5,b0R_ll-dSsSbcMvCgNhqyA,009_003_35_2022-08-26 18:00:29_7374,3,2022-08-26,82107700,78.0,61.70,Undefined,40.0,2022-08-26 18:00:29
...,...,...,...,...,...,...,...,...,...,...
4999993,CVDg3s84SQ6UkMm2QOLu1A,009_003_6_2022-09-10 10:31:18_7691,3,2022-09-10,82800679,103.0,47.79,Sales,1.0,2022-09-10 10:31:18
4999994,tbW0qVpPSrWJPBa2uF-MNw,009_004_10_2022-10-02 15:20:04_7015,4,2022-10-02,84817570,138.0,86.00,Sales,1.0,2022-10-02 15:20:04
4999995,tbW0qVpPSrWJPBa2uF-MNw,009_004_10_2022-10-02 15:20:04_7015,4,2022-10-02,13426158,75.0,43.43,Sales,2.0,2022-10-02 15:20:04
4999997,tbW0qVpPSrWJPBa2uF-MNw,009_004_10_2022-10-02 15:20:04_7015,4,2022-10-02,82138914,140.0,68.59,Sales,2.0,2022-10-02 15:20:04


In [None]:
# Apply transform_transaction_key to the collapsed frame and display the result.
data = transform_transaction_key(data)
data

### Преобразование gid

In [49]:
def transform_gid(data):
    """Swap the ``gid`` column for an integer ``gid_id``.

    Distinct ``gid`` values are numbered from 0 in order of first appearance.
    The gid -> gid_id table is written to ``gid_keys.csv`` through ``save``
    before the ids are joined back onto ``data``.

    Args:
        data: DataFrame containing a ``gid`` column.

    Returns:
        A DataFrame with ``gid_id`` appended and ``gid`` removed.
    """
    gid_table = data['gid'].drop_duplicates().to_frame()
    gid_table['gid_id'] = list(range(len(gid_table)))
    save(gid_table, 'gid_keys.csv')

    encoded = data.join(gid_table.set_index('gid'), on='gid', how='left')
    return encoded.drop(columns=['gid'])

Unnamed: 0,gid,gid_id
0,BipqVCA9TxmTukbRjRKJXg,0
1,b0R_ll-dSsSbcMvCgNhqyA,1
58,kSUiFGs-ShWInP0RKU7URw,2
59,8bhSbGSSQ-2ddsYDarsxxA,3
61,HIrEdmVmTEGL9tVVWN-FGw,4
...,...,...
4999835,8QK_Kxt_TlmdmqWreTV9IA,385917
4999935,hTXYOTZTSKK78LtPPpJv4A,385918
4999967,EP86noWCT8uxwhTMwdDyJw,385919
4999985,CVDg3s84SQ6UkMm2QOLu1A,385920


In [50]:
# Replace gid with its integer gid_id (also writes gid_keys.csv) and display the result.
data = transform_gid(data)
data

Unnamed: 0,store_id,opened_date,product_id,line_item_price,line_item_cost,line_type,line_quantity,datetime,key_id,gid_id
0,2,2022-04-18,83591944,2990.0,2516.10,Sales,1.0,2022-04-18 12:22:18,0,0
1,3,2022-08-26,15058563,1413.0,1206.73,Undefined,2.0,2022-08-26 18:00:29,1,1
2,3,2022-08-26,82148968,338.0,179.51,Undefined,4.0,2022-08-26 18:00:29,1,1
4,3,2022-08-26,16042299,1634.0,1402.83,Undefined,1.0,2022-08-26 18:00:29,1,1
5,3,2022-08-26,82107700,78.0,61.70,Undefined,40.0,2022-08-26 18:00:29,1,1
...,...,...,...,...,...,...,...,...,...,...
4999993,3,2022-09-10,82800679,103.0,47.79,Sales,1.0,2022-09-10 10:31:18,715767,385920
4999994,4,2022-10-02,84817570,138.0,86.00,Sales,1.0,2022-10-02 15:20:04,715768,385921
4999995,4,2022-10-02,13426158,75.0,43.43,Sales,2.0,2022-10-02 15:20:04,715768,385921
4999997,4,2022-10-02,82138914,140.0,68.59,Sales,2.0,2022-10-02 15:20:04,715768,385921


### Преобразование line_type

In [52]:
def transform_line_type(data):
    """Swap the ``line_type`` label for an integer ``line_type_id``.

    Each distinct label receives an id starting at 0, in order of first
    appearance. The label -> id table is written to ``line_type_keys.csv``
    through ``save`` and then joined back onto ``data``.

    Args:
        data: DataFrame containing a ``line_type`` column.

    Returns:
        A DataFrame with ``line_type_id`` appended and ``line_type`` removed.
    """
    type_table = data['line_type'].drop_duplicates().to_frame()
    type_count = len(type_table)
    type_table['line_type_id'] = list(range(type_count))
    save(type_table, 'line_type_keys.csv')

    lookup = type_table.set_index('line_type')
    with_ids = data.join(lookup, on='line_type', how='left')
    return with_ids.drop(columns=['line_type'])

Unnamed: 0,line_type,line_type_id
0,Sales,0
1,Undefined,1
58,pickedUp orders,2
240,Orders (Prepayment),3
271557,gift card,4


In [53]:
# Replace line_type with its integer line_type_id (also writes line_type_keys.csv) and display the result.
data = transform_line_type(data)
data

Unnamed: 0,store_id,opened_date,product_id,line_item_price,line_item_cost,line_quantity,datetime,key_id,gid_id,line_type_id
0,2,2022-04-18,83591944,2990.0,2516.10,1.0,2022-04-18 12:22:18,0,0,0
1,3,2022-08-26,15058563,1413.0,1206.73,2.0,2022-08-26 18:00:29,1,1,1
2,3,2022-08-26,82148968,338.0,179.51,4.0,2022-08-26 18:00:29,1,1,1
4,3,2022-08-26,16042299,1634.0,1402.83,1.0,2022-08-26 18:00:29,1,1,1
5,3,2022-08-26,82107700,78.0,61.70,40.0,2022-08-26 18:00:29,1,1,1
...,...,...,...,...,...,...,...,...,...,...
4999993,3,2022-09-10,82800679,103.0,47.79,1.0,2022-09-10 10:31:18,715767,385920,0
4999994,4,2022-10-02,84817570,138.0,86.00,1.0,2022-10-02 15:20:04,715768,385921,0
4999995,4,2022-10-02,13426158,75.0,43.43,2.0,2022-10-02 15:20:04,715768,385921,0
4999997,4,2022-10-02,82138914,140.0,68.59,2.0,2022-10-02 15:20:04,715768,385921,0


### Подключение категорий товаров

In [55]:
def include_categories(data, cats_path='datasets/ProductsModels.csv'):
    """Attach product category ids to ``data``.

    Reads the product/model CSV at ``cats_path``, renames ``model_id`` to
    ``category_id`` and ``name`` to ``category_name``, writes the distinct
    (category_id, category_name) pairs to ``categories.csv`` through ``save``
    and left-joins the remaining columns onto ``data`` by ``product_id``.

    Args:
        data: DataFrame containing a ``product_id`` column.
        cats_path: Path of the product/model CSV.

    Returns:
        ``data`` with the joined columns appended. Products that are absent
        from the CSV get missing values, because the join is a left join.
    """
    renames = {'model_id': 'category_id', 'name': 'category_name'}
    product_models = pd.read_csv(cats_path).rename(columns=renames)

    category_names = product_models[['category_id', 'category_name']].drop_duplicates()
    save(category_names, 'categories.csv')

    # Every column except the category name is joined, keyed by product_id.
    product_lookup = product_models.drop(columns=['category_name']).set_index('product_id')
    return data.join(product_lookup, on='product_id', how='left')

Unnamed: 0,product_id,category_id,category_name
0,18843508,201658,Ручной инструмент для обработки почвы: культив...
1,82406527,201271,Комнатные растения
2,82597131,201274,Саженцы плодовых деревьев и кустарников
3,84738086,200990,Шторка для ванной комнаты
4,18816091,200340,Кабели для ТВ
...,...,...,...
540648,82192555,202275,Наконечник для карниза
540649,82192571,202275,Наконечник для карниза
540650,82088853,202276,Кронштейн для штанги со сверлением
540651,81966519,202346,Ткани и тюли на отрез


In [56]:
# Join category ids from the product/model file onto the receipt lines (also writes categories.csv).
data = include_categories(data, 'datasets/ProductsModels.csv')

Unnamed: 0,category_id,category_name
0,201658,Ручной инструмент для обработки почвы: культив...
1,201271,Комнатные растения
2,201274,Саженцы плодовых деревьев и кустарников
3,200990,Шторка для ванной комнаты
4,200340,Кабели для ТВ
...,...,...
531601,201720,Моноблочный насос для бассейна
532491,201008,Прокладки для душевых кабин
532663,201015,Подставка для душевого поддона
536625,202387,Кабель с выключателем и вилкой


### Добавление дней недели

In [60]:
# Day names keyed by the number that datetime.weekday() returns:
# Monday is 0 and Sunday is 6. include_weekday() stores exactly these
# numbers, so the lookup has to start the week on Monday.
weekday_dict = {
    0: 'Понедельник',
    1: 'Вторник',
    2: 'Среда',
    3: 'Четверг',
    4: 'Пятница',
    5: 'Суббота',
    6: 'Воскресенье',
}

# Lookup table: the day name sits in the default column 0 and weekday_id is
# taken from the dictionary keys, so ids and names cannot drift apart.
weekdays_series = pd.Series(weekday_dict).to_frame()
weekdays_series['weekday_id'] = weekdays_series.index


save(weekdays_series, 'weekdays_keys.csv')

In [61]:
def include_weekday(data):
    """Add a ``weekday`` column derived from the ``datetime`` column.

    Each value is the result of calling ``.weekday()`` on the timestamp
    (Monday is 0, Sunday is 6).

    Args:
        data: DataFrame with a datetime-valued ``datetime`` column.

    Returns:
        The same DataFrame object, modified in place with the new column.
    """
    data['weekday'] = data['datetime'].apply(lambda moment: moment.weekday())
    return data

Unnamed: 0,store_id,opened_date,product_id,line_item_price,line_item_cost,line_quantity,datetime,key_id,gid_id,line_type_id,category_id,weekday
0,2,2022-04-18,83591944,2990.0,2516.10,1.0,2022-04-18 12:22:18,0,0,0,201981.0,0
1,3,2022-08-26,15058563,1413.0,1206.73,2.0,2022-08-26 18:00:29,1,1,1,202253.0,4
2,3,2022-08-26,82148968,338.0,179.51,4.0,2022-08-26 18:00:29,1,1,1,201089.0,4
4,3,2022-08-26,16042299,1634.0,1402.83,1.0,2022-08-26 18:00:29,1,1,1,,4
5,3,2022-08-26,82107700,78.0,61.70,40.0,2022-08-26 18:00:29,1,1,1,201089.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...
4999993,3,2022-09-10,82800679,103.0,47.79,1.0,2022-09-10 10:31:18,715767,385920,0,202211.0,5
4999994,4,2022-10-02,84817570,138.0,86.00,1.0,2022-10-02 15:20:04,715768,385921,0,200614.0,6
4999995,4,2022-10-02,13426158,75.0,43.43,2.0,2022-10-02 15:20:04,715768,385921,0,201286.0,6
4999997,4,2022-10-02,82138914,140.0,68.59,2.0,2022-10-02 15:20:04,715768,385921,0,200165.0,6


In [62]:
# Add the numeric weekday column computed from datetime and display the result.
data = include_weekday(data)
data

### Обработка всего набора данных

In [None]:
# Defaults for process_data(): its batch_size argument and the receipts CSV it reads.
BATCH_SIZE = 500_000
PATH = 'datasets/receipts_new.csv'

In [None]:
def process_data(batch_size=BATCH_SIZE, path=PATH):
    """Load the receipts CSV at ``path`` and run the key-encoding pipeline.

    The file is parsed in chunks of ``batch_size`` rows and concatenated.
    ``line_item_id`` is renamed to ``product_id``, then ``collapse``,
    ``transform_transaction_key``, ``transform_gid`` and
    ``transform_line_type`` are applied once to the whole frame. They are
    not applied per chunk because each transform numbers its keys from 0 and
    overwrites its key file on every call, so per-chunk ids would collide.

    Args:
        batch_size: Number of CSV rows parsed per chunk (must be >= 1). It
            only controls how the file is read.
        path: Path of the receipts CSV.

    Returns:
        The processed DataFrame.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive number of rows')

    # chunksize yields frames of at most batch_size rows each, unlike
    # np.array_split(data, n), which produces n pieces.
    chunks = list(pd.read_csv(path, chunksize=batch_size))
    # Fall back to a plain read if the reader produced no chunks at all, so
    # the column layout is still available downstream.
    data = pd.concat(chunks) if chunks else pd.read_csv(path)

    data = data.rename(columns={'line_item_id': 'product_id'})
    data = collapse(data)
    data = transform_transaction_key(data)
    data = transform_gid(data)
    data = transform_line_type(data)
    return data


In [70]:
# Persist the processed frame as data_processed.csv; save() also prints its info() summary.
save(data, 'data_processed.csv')

Dataset: DATA_PROCESSED.CSV

<class 'pandas.core.frame.DataFrame'>
Index: 3744907 entries, 147073 to 659668
Data columns (total 12 columns):
 #   Column           Dtype         
---  ------           -----         
 0   store_id         int64         
 1   opened_date      object        
 2   product_id       int64         
 3   line_item_price  float64       
 4   line_item_cost   float64       
 5   line_quantity    float64       
 6   datetime         datetime64[ns]
 7   key_id           int64         
 8   gid_id           int64         
 9   line_type_id     int64         
 10  category_id      float64       
 11  weekday          int64         
dtypes: datetime64[ns](1), float64(4), int64(6), object(1)
memory usage: 582.1 MB
