In [3]:
from distances import Distances

In [4]:
# Build the helper from the `distances` module, limited to the first 2,000,000 rows.
d = Distances(nrows=2_000_000)
# NOTE(review): `get_pp` is not a method of the Distances class defined further down
# in this notebook (that class exposes `product_product`); this call only works with
# the version imported from the `distances` module -- confirm which one is intended.
pp = d.get_pp(batch_size=250_000, top_lim=50_000, field='category_id')

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


  0%|          | 0/8 [00:00<?, ?it/s]

In [3]:
# Display the pair dictionary computed above. NOTE(review): this renders the whole
# dict; consider showing only a small sample of items to keep the output readable.
pp

{(533994.0, 200386.0): [0.0, 1, 0.0],
 (533994.0, 121150.0): [0.0, 1, 0.0],
 (533994.0, 412675.0): [0.0, 1, 0.0],
 (533994.0, 39606.0): [0.0, 1, 0.0],
 (121150.0, 184657.0): [0.0, 1, 0.0],
 (121150.0, 134119.0): [0.0, 1, 0.0],
 (121150.0, 146996.0): [0.0, 1, 0.0],
 (121150.0, 79813.0): [0.0, 2, 0.0],
 (121150.0, 200386.0): [0.0, 1, 0.0],
 (121150.0, 533994.0): [0.0, 1, 0.0],
 (121150.0, 412675.0): [0.0, 4, 0.0],
 (121150.0, 267840.0): [0.0, 4, 0.0],
 (121150.0, 39606.0): [0.0, 2, 0.0],
 (412675.0, 184657.0): [0.0, 1, 0.0],
 (412675.0, 134119.0): [0.0, 1, 0.0],
 (412675.0, 146996.0): [0.0, 2, 0.0],
 (412675.0, 200386.0): [0.0, 1, 0.0],
 (412675.0, 533994.0): [0.0, 1, 0.0],
 (412675.0, 121150.0): [0.0, 4, 0.0],
 (412675.0, 267840.0): [0.0, 2, 0.0],
 (412675.0, 39606.0): [0.0, 4, 0.0],
 (267840.0, 184657.0): [0.0, 1, 0.0],
 (267840.0, 134119.0): [0.0, 1, 0.0],
 (267840.0, 146996.0): [0.0, 1, 0.0],
 (267840.0, 79813.0): [0.0, 3, 0.0],
 (267840.0, 200386.0): [0.0, 1, 0.0],
 (267840.0, 12115

In [1]:
# Output directory for figures (relative path; not referenced by the visible cells).
FIGURES_PATH = 'out/figures/'
# Directory the Distances class reads its CSV from and writes its pickles to.
DATASETS_PATH = 'out/datasets/'
import gc
import multiprocessing
import os
import pickle
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import psutil
from pandarallel import pandarallel
from tqdm.notebook import tqdm

tqdm.pandas()
from helper import *



In [5]:
class Distances:
    """
    Derive user/product statistics from a purchase log.

    The log is read from ``DATASETS_PATH + data_path + '.csv'`` and must contain
    at least the columns ``gid`` (user id), ``datetime`` and the column passed
    as ``field`` to the individual methods (``product_id`` by default).
    Every public computation also pickles its result under ``DATASETS_PATH``.
    """

    def __init__(self, data_path='data_processed', nrows=None):
        """
        Initialize Distances class with the data

        :param data_path: name of file (without the '.csv' extension) inside DATASETS_PATH
        :param nrows: number of rows to read, None reads the whole file
        """
        data = pd.read_csv(DATASETS_PATH + data_path + '.csv', nrows=nrows)
        # 'Unnamed: 0' is presumably a leftover index column from an earlier export;
        # errors='ignore' keeps loading working for files that do not have it.
        self.data = data.drop(columns=['Unnamed: 0'], errors='ignore')
        self.data['datetime'] = pd.to_datetime(self.data['datetime'])
        self.nrows = nrows

    @staticmethod
    def save_dists(file, path):
        """
        Pickle an object to ``DATASETS_PATH + path + '.pkl'``.

        :param file: object to serialize
        :param path: file name without extension
        """
        with open(DATASETS_PATH + path + '.pkl', 'wb') as f:
            pickle.dump(file, f)

    def top_users(self, top_lim, field='product_id'):
        """
        Keep only the rows of the ``top_lim`` users with the most rows.

        :param top_lim: number of users to keep, None keeps every user
        :param field: extra column to carry along with 'gid'
        :return: DataFrame with columns ['gid', field]
        """
        data = self.data[['gid', field]]
        rows_per_user = data.groupby(by='gid').size().sort_values(ascending=False)
        return data.loc[data['gid'].isin(rows_per_user.index.values[:top_lim])]

    def top_products(self, top_lim, field='product_id'):
        """
        Keep only the rows of the ``top_lim`` most frequent values of ``field``.

        :param top_lim: number of distinct values to keep, None keeps all of them
        :param field: column whose values are ranked by row count
        :return: DataFrame with columns [field, 'datetime', 'gid']
        """
        data = self.data[[field, 'datetime', 'gid']]
        rows_per_value = data.groupby(by=field).size().sort_values(ascending=False)
        return data.loc[data[field].isin(rows_per_value.index.values[:top_lim])]

    def user_product(self, top_lim=None, field='product_id'):
        """
        Count, for every top user, how many rows they have per value of ``field``.

        The result is also pickled as ``up_{field}_{nrows}``.

        :param top_lim: number of most active users to keep, None keeps all
        :param field: column whose values are counted
        :return: dict[user] = {product: count}
        """

        def process_batch(x):
            counts = dict()
            for value in x[field].values:
                counts[value] = counts.get(value, 0) + 1
            return counts

        data = self.top_users(top_lim, field)
        print(len(data['gid'].drop_duplicates()))

        # cpu_count(logical=False) may return None when undetermined; fall back to one worker.
        pandarallel.initialize(progress_bar=True, use_memory_fs=True,
                               nb_workers=psutil.cpu_count(logical=False) or 1)
        ans = data.groupby(by='gid').parallel_apply(process_batch)

        print(len(ans))
        ans = dict(ans)
        print(len(ans))
        self.save_dists(ans, f'up_{field}_{self.nrows}')
        return ans

    @staticmethod
    def _closest_gaps(group, field):
        """
        For one user's rows, find the closest signed day gap per ordered product pair.

        :param group: DataFrame with columns ``field`` and 'datetime'
        :param field: name of the product column
        :return: dict[(p1, p2)] = (date of p1 - date of p2).days with the smallest
                 absolute value over all pairs of distinct rows; ties keep the first seen
        """
        products = group[field].tolist()
        dates = group['datetime'].tolist()
        res = dict()
        for i, (p1, d1) in enumerate(zip(products, dates)):
            for j, (p2, d2) in enumerate(zip(products, dates)):
                if i == j:
                    continue
                gap = (d1 - d2).days
                key = (p1, p2)
                # Only replace a stored gap with a strictly closer one.
                if key not in res or abs(gap) < abs(res[key]):
                    res[key] = gap
        return res

    def product_product(self, top_lim=None, interval=None, batch_size=100_000, field='product_id'):
        """
        Collect per-user day gaps between all ordered pairs of products

        Rows are processed in batches; pairs are only formed between rows of the
        same user that fall into the same batch. The result is also pickled as
        ``pp_{field}_{nrows}``. Processing stops early (keeping the partial
        result) once system memory usage reaches 90%.

        :param top_lim: number of most frequent products to keep, None keeps all
        :param interval: date interval in days to split data with, default: None
        :param batch_size: data batching size used when interval is None, default: 100_000
        :param field: product column, default: 'product_id'
        :return: dict[(product_1, product_2)] = list with one closest signed day gap
                 per user and batch in which that user has both products
        :raises ValueError: if interval is given but not positive
        """
        ans = dict()

        data = self.top_products(top_lim=top_lim, field=field)

        print(len(data[field].drop_duplicates()))
        # assign() builds a new frame instead of writing into a slice of self.data.
        data = data.assign(datetime=data['datetime'].dt.date)

        if data.empty:
            self.save_dists(ans, f'pp_{field}_{self.nrows}')
            return ans

        def data_splitting(interval):
            if interval <= 0:
                # A non-positive step would never advance the window below.
                raise ValueError('interval must be a positive number of days')
            ordered = data.sort_values(by='datetime')
            dates = ordered['datetime']
            start, end = dates.iloc[0], dates.iloc[-1]
            step = timedelta(days=interval)
            batches = []
            while start <= end:
                sub_end = start + step
                batches.append(ordered.loc[(dates >= start) & (dates < sub_end)])
                start = sub_end
            return batches

        # Accessing the staticmethod yields a plain function, so the closure shipped
        # to the workers captures neither `self` nor the full dataset.
        closest_gaps = self._closest_gaps

        def fill_ans(x):
            return closest_gaps(x, field)

        def concat_dicts(res):
            for per_user in res.values:
                for key, gap in per_user.items():
                    ans.setdefault(key, []).append(gap)
            return ans

        if interval is not None:
            batches = data_splitting(interval=interval)
        else:
            # Split positional indices rather than the frame itself; the batch
            # boundaries are the same as np.array_split on the frame would give.
            n_batches = data.shape[0] // batch_size + 1
            batches = [data.iloc[idx] for idx in np.array_split(np.arange(data.shape[0]), n_batches)]

        pandarallel.initialize(progress_bar=False, use_memory_fs=True,
                               nb_workers=psutil.cpu_count(logical=False) or 1)

        for batch in tqdm(batches):
            if psutil.virtual_memory().percent >= 90:
                print('Memory usage reached 90%, stopping early with a partial result')
                break
            if batch.empty:
                # Date windows without any rows have nothing to contribute.
                continue
            grouped_by_user = batch.groupby(by='gid')
            temp = grouped_by_user.parallel_apply(fill_ans)
            temp = temp.dropna()
            ans = concat_dicts(temp)

        self.save_dists(ans, f'pp_{field}_{self.nrows}')
        return ans

    def get_up_matrix(self, field='product_id', top_lim=None, batch_size=100_000):
        """
        Build a dense user x product count matrix.

        Ids are used directly as row/column indices, so this assumes 'gid' and
        ``field`` hold non-negative integer-like values (TODO confirm); rows with
        a missing id are skipped. The matrix has (max id + 1) rows/columns and
        can be very large for sparse id ranges. The result is also pickled as
        ``up_matrix_{field}_{nrows}``.

        :param field: product column, default: 'product_id'
        :param top_lim: number of most active users to keep, None keeps all
        :param batch_size: number of rows accumulated per step, default: 100_000
        :return: 2-D integer array with ans[user, product] = number of rows
        """
        data = self.top_users(top_lim=top_lim, field=field).dropna(subset=['gid', field])

        if data.empty:
            ans = np.zeros((0, 0), dtype=np.int64)
            self.save_dists(ans, f'up_matrix_{field}_{self.nrows}')
            return ans

        users = data['gid'].to_numpy().astype(np.int64)
        products = data[field].to_numpy().astype(np.int64)

        max_user, max_product = users.max(), products.max()
        print(max_user)
        print(max_product)
        # +1 so that the largest id is itself a valid index.
        ans = np.zeros((max_user + 1, max_product + 1), dtype=np.int64)

        # +1 guarantees at least one section when there are fewer rows than batch_size.
        n_batches = len(users) // batch_size + 1
        chunks = zip(np.array_split(users, n_batches), np.array_split(products, n_batches))
        for user_chunk, product_chunk in tqdm(chunks, total=n_batches):
            # add.at accumulates correctly when the same (user, product) repeats in a chunk.
            np.add.at(ans, (user_chunk, product_chunk), 1)

        self.save_dists(ans, f'up_matrix_{field}_{self.nrows}')
        return ans



In [6]:
# Load the first 5,000,000 rows of the default 'data_processed' dataset.
d = Distances(nrows=5_000_000)

In [None]:
# Per-user day gaps between pairs of the 20,000 most frequent category_id values,
# computed in batches; product_product also pickles the result under DATASETS_PATH.
pp = d.product_product(batch_size=200_000, top_lim=20_000, field='category_id')


In [None]:
# Drop the reference to the pair dictionary before the next computation
# (the pickled copy written by product_product remains on disk).
del pp

In [None]:
# Per-user category_id counts for the 10,000 users with the most rows;
# user_product also pickles the result under DATASETS_PATH.
up = d.user_product(field='category_id', top_lim=10_000)

In [None]:
# print(len(up))