In [1]:
FIGURES_PATH = 'out/figures/'
DATASETS_PATH = 'out/datasets/'

In [2]:
%pip install multiprocesspandas

Note: you may need to restart the kernel to use updated packages.


In [29]:
pip install -U numba

Collecting numba
  Downloading numba-0.57.0-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.6 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m[36m0:00:01[0mm eta [36m0:00:01[0m
[?25hCollecting llvmlite<0.41,>=0.40.0dev0 (from numba)
  Downloading llvmlite-0.40.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (42.1 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.1/42.1 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Installing collected packages: llvmlite, numba
Successfully installed llvmlite-0.40.0 numba-0.57.0
Note: you may need to restart the kernel to use updated packages.


In [30]:
import pandas as pd
from datetime import datetime, timedelta
import os
import multiprocessing
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import random
# from tqdm import tqdm
from tqdm.notebook import tqdm
from multiprocesspandas import applyparallel
from pandarallel import pandarallel
import numba

tqdm.pandas()
from helper import *

In [4]:
NROWS = 250_000

In [5]:
data = pd.read_csv(DATASETS_PATH + 'data_processed.csv', nrows=NROWS).drop(columns=['Unnamed: 0'])
data['datetime'] = pd.to_datetime(data['datetime'])
# data

In [6]:
def get_user_purchases(data, batch_size):
    """
    :param data: receipts - pandas.DataFrame
    :return: ans: ans[i][j] = count of purchases by the user i of the product j - matrix
    """
    ans = np.zeros((data['gid'].drop_duplicates().shape[0], data['product_id'].drop_duplicates().shape[0]), dtype=np.int8)
    data = data[['gid', 'product_id']]

    def fill_arr(x, arr):
        name = x.name
        for i in x.values:
            arr[name][i] += 1

    def process_batch(batch):
        nonlocal ans
        batch.groupby(by='gid')['product_id'].apply(lambda x: fill_arr(x, ans))

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    for batch in tqdm(np.array_split(data, data.shape[0] / batch_size)):
        batch = process_batch(batch)
        # pool.apply_async(
        #     process_batch,
        #     args=(batch,),
        # )
    pool.close()
    pool.join()

    return ans

In [7]:
# %pip install pandarallel

In [8]:
def get_date_distances(data):
    """
    Считаем по каждому пользователю ближайшие (по модулю даты) покупки товаров.
    Усредняем значения по каждому пользователю

    :param data:
    :return:
    """

    # def concat_ans(ans):
        # return np.sum(ans['timedelta']), ans.shape[0]

    def data_splitting(interval):
        nonlocal data
        batches = []
        data = data.sort_values(by='datetime')
        start = data.iloc[0].at['datetime']
        end = data.iloc[-1].at['datetime']
        while start <= end:
            sub_end = start + timedelta(days=interval)
            batch = data.loc[data['datetime'] >= start].loc[data['datetime'] < sub_end]
            batches.append(batch)
            start = sub_end

        return batches


    def do_dataframe(temps):
        # t = temps.values

        # print(t.shape)
        # t = np.concatenate(t)#np.concatenate(t))
        # print(t.shape)
        ans = pd.DataFrame(data=temps, columns=['product_1', 'product_2', 'timedelta'])
        ans['count'] = pd.Series(data=[1 for _ in range(ans.shape[0])])
        return ans


    def fill_ans(x):
        product_date = x[['product_id', 'datetime']]

        temp = []

        for i1, r1 in product_date.iterrows():
            for i2, r2 in product_date.iterrows():
                if i1 != i2:
                    p1, p2 = r1['product_id'], r2['product_id']
                    timedelta = (r1['datetime'] - r2['datetime']).days

                    is_found = False
                    for t in temp:
                        if t[0] == p1 and t[1] == p2:
                            if abs(timedelta) < abs(t[2]):
                                t[2] = timedelta
                                is_found = True

                    if not is_found:
                        temp.append([p1, p2, timedelta])

        if len(temp) != 0:
            return np.array(temp)

    data = data[['gid', 'product_id', 'datetime']]
    data.loc[:, 'datetime'] = data['datetime'].dt.date

    batches = data_splitting(interval=14)

    pandarallel.initialize(progress_bar=True, use_memory_fs=True)
    temps = []
    for batch in tqdm(batches):
        grouped_by_user = batch.groupby(by='gid')
#         temp = grouped_by_user.parallel_apply(fill_ans)
        temp = grouped_by_user.apply_parallel(fill_ans, num_processes=multiprocessing.cpu_count())
        temp = temp.dropna()
        temp = np.concatenate(np.concatenate(temp.values))


        temps.append(temp)

    temps = np.concatenate(temps)

    #
    # # print(temps)
    ans = do_dataframe(temps)
    #
    # del temps
    #
    ans.groupby(by=['product_1', 'product_2']).progress_apply(np.sum)#parallel_apply(np.sum)
#     ans.groupby(by=['product_1', 'product_2']).apply_parallel(np.sum, num_processes=multiprocessing.cpu_count())
    #
    return ans





In [9]:
dists = get_date_distances(data)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


  0%|          | 0/26 [00:00<?, ?it/s]


  0%|                                                  | 0/2007 [00:00<?, ?it/s][A
 19%|███████                               | 375/2007 [00:00<00:00, 3046.92it/s][A
 34%|████████████▊                         | 680/2007 [00:00<00:00, 2974.85it/s][A
 50%|██████████████████▍                  | 1000/2007 [00:00<00:00, 2684.80it/s][A
 63%|███████████████████████▍             | 1271/2007 [00:00<00:00, 2009.32it/s][A
 74%|███████████████████████████▍         | 1489/2007 [00:00<00:00, 2027.62it/s][A
 85%|███████████████████████████████▍     | 1704/2007 [00:00<00:00, 1577.01it/s][A
100%|█████████████████████████████████████| 2007/2007 [00:01<00:00, 1584.96it/s][A

  0%|                                                  | 0/2029 [00:00<?, ?it/s][A
 19%|███████                               | 378/2029 [00:00<00:00, 3435.91it/s][A
 36%|█████████████▌                        | 722/2029 [00:00<00:00, 2624.50it/s][A
 49%|██████████████████▋                   | 995/2029 [00:00<00:00, 2576.2

 90%|█████████████████████████████████▎   | 1926/2142 [00:00<00:00, 1585.82it/s][A
100%|█████████████████████████████████████| 2142/2142 [00:01<00:00, 1853.96it/s][A

  0%|                                                  | 0/1908 [00:00<?, ?it/s][A
 25%|█████████▍                            | 476/1908 [00:00<00:00, 3708.77it/s][A
 44%|████████████████▊                     | 847/1908 [00:00<00:00, 3206.67it/s][A
 61%|██████████████████████▋              | 1168/1908 [00:00<00:00, 2628.20it/s][A
 75%|███████████████████████████▊         | 1437/1908 [00:00<00:00, 2083.12it/s][A
 87%|████████████████████████████████▎    | 1666/1908 [00:00<00:00, 2034.07it/s][A
100%|█████████████████████████████████████| 1908/1908 [00:01<00:00, 1819.94it/s][A

  0%|                                                  | 0/1876 [00:00<?, ?it/s][A
 12%|████▋                                 | 234/1876 [00:00<00:00, 1650.64it/s][A
 31%|███████████▊                          | 585/1876 [00:00<00:00, 2211.9

  0%|          | 0/2166843 [00:00<?, ?it/s]

In [11]:
dists.sort_values(by='timedelta', ascending=False)

Unnamed: 0,product_1,product_2,timedelta,count
1440681,42349,1655,13,1
2490961,44458,4127,13,1
2490971,7449,4127,13,1
2490976,2985,4127,13,1
2490981,17660,4127,13,1
...,...,...,...,...
12482,10543,7646,-13,1
12483,10543,7645,-13,1
12492,567,7646,-13,1
12493,567,7645,-13,1


In [33]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 11 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   gid              250000 non-null  int64         
 1   transaction_key  250000 non-null  int64         
 2   store_id         250000 non-null  int64         
 3   product_id       250000 non-null  int64         
 4   line_item_price  250000 non-null  float64       
 5   line_item_cost   233291 non-null  float64       
 6   line_type        250000 non-null  int64         
 7   datetime         250000 non-null  datetime64[ns]
 8   category_id      12 non-null      float64       
 9   weekday          250000 non-null  int64         
 10  line_quantity    250000 non-null  float64       
dtypes: datetime64[ns](1), float64(4), int64(6)
memory usage: 21.0 MB


In [42]:
def get_date_distances_map(data):
    """
    Считаем по каждому пользователю ближайшие (по модулю даты) покупки товаров.
    Усредняем значения по каждому пользователю

    :param data:
    :return:
    """

    # def concat_ans(ans):
        # return np.sum(ans['timedelta']), ans.shape[0]
        
    ans = dict()
    
    

    def data_splitting(interval):
        nonlocal data
        batches = []
        data = data.sort_values(by='datetime')
        start = data.iloc[0].at['datetime']
        end = data.iloc[-1].at['datetime']
        while start <= end:
            sub_end = start + timedelta(days=interval)
            batch = data.loc[data['datetime'] >= start].loc[data['datetime'] < sub_end]
            batches.append(batch)
            start = sub_end

        return batches


    def do_dataframe(temps):
        # t = temps.values

        # print(t.shape)
        # t = np.concatenate(t)#np.concatenate(t))
        # print(t.shape)
        ans = pd.DataFrame(data=temps, columns=['product_1', 'product_2', 'timedelta'])
        ans['count'] = pd.Series(data=[1 for _ in range(ans.shape[0])])
        return ans

    
    
    def fill_ans(x):
        product_date = x[['product_id', 'datetime']]

        temp = []

        for i1, r1 in product_date.iterrows():
            for i2, r2 in product_date.iterrows():
                if i1 != i2:
                    p1, p2 = r1['product_id'], r2['product_id']
                    timedelta = (r1['datetime'] - r2['datetime']).days

                    is_found = False
                    for t in temp:
                        if t[0] == p1 and t[1] == p2:
                            if abs(timedelta) < abs(t[2]):
                                t[2] = timedelta
                                is_found = True

                    if not is_found:
                        temp.append([p1, p2, timedelta])

        if len(temp) != 0:
            return np.array(temp)

    
    
    def concat_for_all_users(arrs):
        ans  = dict()
        for arr in arrs:
            if arr is not None:
                for one_arr in arr:
                    p1, p2, td = one_arr
                    if (p1, p2) in ans:
                        ans[(p1, p2)].append(td)
#                         ans[(p1, p2)] = (ans[(p1, p2)][0] + td, ans[(p1, p2)][1] + 1)
                    else:
                        ans[(p1, p2)] = [td]
#                         ans[(p1, p2)] = (td, 1)
                        
        return ans
    
    
#     def get_average(ans):
        
        
        

    data = data[['gid', 'product_id', 'datetime']]
    data.loc[:, 'datetime'] = data['datetime'].dt.date

    batches = data_splitting(interval=14)

    pandarallel.initialize(progress_bar=True, use_memory_fs=True)
    temps = []
    for batch in tqdm(batches):
        grouped_by_user = batch.groupby(by='gid')
        temp = grouped_by_user.parallel_apply(fill_ans)
#         temp = grouped_by_user.apply_parallel(fill_ans, num_processes=multiprocessing.cpu_count())
        temp = temp.dropna()
        temp = np.concatenate(np.concatenate(temp.values))


        temps.append(temp)
        
    ans = concat_for_all_users(temps)
    
    ans = pd.DataFrame(ans)

#     temps = np.concatenate(temps)

#     #
#     # # print(temps)
#     ans = do_dataframe(temps)
#     #
#     # del temps
#     #
#     ans.groupby(by=['product_1', 'product_2']).progress_apply(np.sum)#parallel_apply(np.sum)
# #     ans.groupby(by=['product_1', 'product_2']).apply_parallel(np.sum, num_processes=multiprocessing.cpu_count())
#     #
    return ans





In [43]:
dists_map = get_date_distances_map(data)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


  0%|          | 0/26 [00:00<?, ?it/s]


  0%|                                                  | 0/2007 [00:00<?, ?it/s][A
 19%|███████                               | 375/2007 [00:00<00:00, 3400.04it/s][A
 37%|██████████████▏                       | 750/2007 [00:00<00:00, 3428.68it/s][A
 56%|████████████████████▋                | 1125/2007 [00:00<00:00, 3305.67it/s][A
 75%|███████████████████████████▋         | 1500/2007 [00:00<00:00, 3190.32it/s][A
100%|█████████████████████████████████████| 2007/2007 [00:00<00:00, 2576.45it/s][A

  0%|                                                  | 0/2029 [00:00<?, ?it/s][A
 19%|███████                               | 378/2029 [00:00<00:00, 3580.12it/s][A
 37%|██████████████▏                       | 756/2029 [00:00<00:00, 3419.22it/s][A
 56%|████████████████████▋                | 1134/2029 [00:00<00:00, 2977.92it/s][A
 71%|██████████████████████████▏          | 1438/2029 [00:00<00:00, 2491.31it/s][A
 84%|██████████████████████████████▉      | 1697/2029 [00:00<00:00, 1985.0

 75%|████████████████████████████▎         | 1596/2142 [00:01<00:00, 914.62it/s][A
 81%|██████████████████████████████▋       | 1729/2142 [00:01<00:00, 822.62it/s][A
 87%|█████████████████████████████████     | 1862/2142 [00:01<00:00, 784.41it/s][A
 93%|███████████████████████████████████▍  | 1995/2142 [00:02<00:00, 804.15it/s][A
100%|██████████████████████████████████████| 2142/2142 [00:02<00:00, 994.61it/s][A

  0%|                                                  | 0/1908 [00:00<?, ?it/s][A
 12%|████▋                                 | 238/1908 [00:00<00:00, 2277.29it/s][A
 31%|███████████▊                          | 595/1908 [00:00<00:00, 2346.08it/s][A
 43%|████████████████▌                     | 829/1908 [00:00<00:00, 2329.30it/s][A
 56%|████████████████████▌                | 1062/1908 [00:00<00:00, 1688.48it/s][A
 65%|████████████████████████▏            | 1248/1908 [00:00<00:00, 1361.21it/s][A
 73%|███████████████████████████▏         | 1400/1908 [00:00<00:00, 1365.19

 74%|████████████████████████████          | 756/1023 [00:00<00:00, 1098.97it/s][A
 86%|█████████████████████████████████▌     | 882/1023 [00:00<00:00, 957.13it/s][A
100%|██████████████████████████████████████| 1023/1023 [00:01<00:00, 988.14it/s][A

  0%|                                                  | 0/1194 [00:00<?, ?it/s][A
 25%|█████████▍                            | 296/1194 [00:00<00:00, 2602.77it/s][A
 47%|█████████████████▋                    | 557/1194 [00:00<00:00, 2483.10it/s][A
 68%|█████████████████████████▋            | 806/1194 [00:00<00:00, 2312.34it/s][A
100%|█████████████████████████████████████| 1194/1194 [00:00<00:00, 1456.62it/s][A

  0%|                                                  | 0/1281 [00:00<?, ?it/s][A
 19%|███████                               | 240/1281 [00:00<00:00, 2284.66it/s][A
 37%|██████████████▏                       | 480/1281 [00:00<00:00, 2000.31it/s][A
 53%|████████████████████▎                 | 683/1281 [00:00<00:00, 1734.4

ValueError: All arrays must be of the same length

In [None]:
dists_map