In [1]:
# Output locations, given relative to the notebook's working directory.
# DATASETS_PATH is the prefix used for the CSV read and the pickle writes below;
# FIGURES_PATH is not referenced in the cells visible here.
DATASETS_PATH = "out/datasets/"
FIGURES_PATH = "out/figures/"
import pandas as pd
from datetime import datetime, timedelta
import os
import multiprocessing
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import random
from tqdm.notebook import tqdm
from multiprocesspandas import applyparallel
from pandarallel import pandarallel
import psutil
from sys import getsizeof

import pickle
import gc

# Enable tqdm's pandas integration (progress_apply and friends).
# NOTE(review): no cell visible here calls progress_apply — confirm it is still needed.
tqdm.pandas()
from helper import *



In [2]:

# NROWS = 1_000_000
# data = pd.read_csv(DATASETS_PATH + 'data_processed.csv', nrows=NROWS).drop(columns=['Unnamed: 0'])
# data['datetime'] = pd.to_datetime(data['datetime'])
class Distances:
    """
    Purchase statistics computed from a processed purchases table.

    The table is read from ``DATASETS_PATH + data_path + '.csv'`` and must provide the
    columns the methods read: ``gid`` (user id), ``product_id`` and ``datetime``.
    Every public computation pickles its result under ``DATASETS_PATH`` and returns it.
    """

    def __init__(self, data_path='data_processed', nrows=None):
        """
        Initialize Distances class with the data

        :param data_path: name of the CSV file (without extension) inside DATASETS_PATH
        :param nrows: number of rows to read, None reads the whole file
        """
        data = pd.read_csv(DATASETS_PATH + data_path + '.csv', nrows=nrows)
        # 'Unnamed: 0' is presumably a saved index column; tolerate files that do not have it.
        data = data.drop(columns=['Unnamed: 0'], errors='ignore')
        data['datetime'] = pd.to_datetime(data['datetime'])
        self.data = data
        self.nrows = nrows

    @staticmethod
    def save_dists(file, path):
        """
        Pickle an object to ``DATASETS_PATH + path + '.pkl'``

        :param file: object to serialize
        :param path: file name (without extension) relative to DATASETS_PATH
        """
        target = DATASETS_PATH + path + '.pkl'
        # Create the output directory up front so a long computation is not lost at save time.
        os.makedirs(os.path.dirname(target) or '.', exist_ok=True)
        with open(target, 'wb') as f:
            pickle.dump(file, f)

    @staticmethod
    def _split_batches(data, batch_size):
        """
        Split ``data`` row-wise into ``len(data) // batch_size + 1`` consecutive chunks

        The chunk sizes follow the partition np.array_split produces (the first chunks get
        the extra rows), but the frame is sliced with iloc so that DataFrame.swapaxes,
        which recent pandas versions deprecate, is never involved.

        :param data: DataFrame to split
        :param batch_size: upper bound for the number of rows in one chunk
        :return: list of DataFrame slices; a single empty slice when data is empty
        """
        if batch_size <= 0:
            raise ValueError('batch_size must be a positive integer')
        n_rows = data.shape[0]
        n_sections = n_rows // batch_size + 1
        base, extra = divmod(n_rows, n_sections)
        batches = []
        start = 0
        for section in range(n_sections):
            stop = start + base + (1 if section < extra else 0)
            batches.append(data.iloc[start:stop])
            start = stop
        return batches

    @staticmethod
    def _split_by_interval(data, interval):
        """
        Split ``data`` into consecutive windows of ``interval`` days over its 'datetime' column

        :param data: DataFrame with a 'datetime' column
        :param interval: window length in days, must be positive
        :return: list of DataFrames, one per window (windows without rows are empty frames)
        """
        if interval <= 0:
            # A non-positive step would never advance the window start.
            raise ValueError('interval must be a positive number of days')
        if data.empty:
            return []
        data = data.sort_values(by='datetime')
        start = data['datetime'].iloc[0]
        end = data['datetime'].iloc[-1]
        batches = []
        while start <= end:
            sub_end = start + timedelta(days=interval)
            in_window = (data['datetime'] >= start) & (data['datetime'] < sub_end)
            batches.append(data.loc[in_window])
            start = sub_end
        return batches

    @staticmethod
    def _nearest_pair_deltas(purchases):
        """
        Find, for every ordered product pair, the smallest-magnitude day gap between purchases

        Rows are paired by position, so a purchase is never paired with itself, while two
        different purchases of the same product do form a (p, p) pair.

        :param purchases: frame with 'product_id' and 'datetime' columns (one user's purchases)
        :return: dict[(product_1, product_2)] = signed day difference (date_1 - date_2) of the
                 closest pair of purchases; on equal magnitude the first gap seen is kept
        """
        products = purchases['product_id'].tolist()
        dates = purchases['datetime'].tolist()
        nearest = dict()
        for i, (p1, d1) in enumerate(zip(products, dates)):
            for j, (p2, d2) in enumerate(zip(products, dates)):
                if i == j:
                    continue
                delta_days = (d1 - d2).days
                key = (p1, p2)
                # Store the gap only when it is the first one or strictly closer than the stored one.
                if key not in nearest or abs(nearest[key]) > abs(delta_days):
                    nearest[key] = delta_days
        return nearest

    def user_product(self):
        """
        Count how many times each user purchased each product

        The result is pickled as 'up_<nrows>'.

        :return: per-user mapping {product: count}, indexed by gid
                 (the object returned by the group-wise parallel apply)
        """

        def process_batch(x):
            counts = dict()
            for product in x['product_id'].values:
                counts[product] = counts.get(product, 0) + 1
            return counts

        data = self.data[['gid', 'product_id']]
        # psutil may not be able to determine the physical core count (returns None).
        nb_workers = psutil.cpu_count(logical=False) or 1
        pandarallel.initialize(progress_bar=True, use_memory_fs=True, nb_workers=nb_workers)
        ans = data.groupby(by='gid').parallel_apply(process_batch)

        self.save_dists(ans, 'up_' + str(self.nrows))
        return ans

    def product_product(self, interval=None, batch_size=100_000):
        """
        Get distances between all pairs of products by date differences
        For each user, the closest (by absolute date difference) purchases of every
        product pair are found inside each batch.
        No averaging is performed here: the per-user values are collected as they are,
        so any mean over users has to be taken by the consumer of the result.

        Processing stops early, with a warning, once system memory usage reaches 90%;
        the partial result is still saved (as 'pp_<nrows>') and returned.

        :param interval: date interval (in days) to split data with, default: None
        :param batch_size: data batching size used when interval is None, default: 100_000
        :return: dict[(product_1, product_2)] = list of signed day differences, one entry for
                 every (user, batch) in which both products were purchased
        """
        import warnings

        data = self.data[['gid', 'product_id', 'datetime']]
        # assign() builds a new frame, so the column slice of self.data is never written through.
        data = data.assign(datetime=data['datetime'].dt.date)

        if interval is not None:
            batches = self._split_by_interval(data, interval)
        else:
            batches = self._split_batches(data, batch_size)

        # Half of the physical cores, but never fewer than one worker
        # (cpu_count may return None or 1).
        nb_workers = max(1, (psutil.cpu_count(logical=False) or 1) // 2)
        pandarallel.initialize(progress_bar=False, use_memory_fs=True, nb_workers=nb_workers)

        ans = dict()
        for batch in tqdm(batches):
            if psutil.virtual_memory().percent >= 90:
                warnings.warn('product_product stopped early: memory usage reached 90%, '
                              'the result is partial')
                break
            if batch.empty:
                # Interval windows without purchases have nothing to pair.
                continue
            # Accessing the static method through the instance yields the plain function,
            # so self (and its data) is not captured by what is sent to the workers.
            per_user = batch.groupby(by='gid').parallel_apply(self._nearest_pair_deltas)
            for user_pairs in per_user.dropna().values:
                for pair, delta_days in user_pairs.items():
                    ans.setdefault(pair, []).append(delta_days)

        self.save_dists(ans, 'pp_' + str(self.nrows))
        return ans

    def get_up_matrix(self, num_users=50_000, num_products=10_000, batch_size=100_000):
        """
        Build a dense user x product purchase-count matrix

        Rows whose gid is outside [0, num_users) or whose product_id is outside
        [0, num_products) are skipped. The result holds num_users * num_products integer
        cells, so its memory footprint grows with the product of the two limits.
        The matrix is pickled as 'up_matrix'.

        :param num_users: number of matrix rows (gids at or above it are ignored)
        :param num_products: number of matrix columns (product ids at or above it are ignored)
        :param batch_size: number of rows processed per step, default: 100_000
        :return: np.ndarray of shape (num_users, num_products), ans[gid, product_id] = count
        """
        data = self.data[['gid', 'product_id']]
        ans = np.full((num_users, num_products), 0)

        for batch in tqdm(self._split_batches(data, batch_size)):
            users = batch['gid'].to_numpy()
            products = batch['product_id'].to_numpy()
            in_range = (users >= 0) & (users < num_users) & (products >= 0) & (products < num_products)
            row_idx = users[in_range].astype(np.intp)
            col_idx = products[in_range].astype(np.intp)
            # np.add.at accumulates repeated (user, product) pairs; a fancy-indexed += would
            # count each distinct pair only once.
            np.add.at(ans, (row_idx, col_idx), 1)

        self.save_dists(ans, 'up_matrix')
        return ans



In [3]:

# Load the complete processed dataset: nrows defaults to None, i.e. no row limit.
d = Distances()

In [None]:

# Dense purchase-count matrix limited to gids < 80_000 and product ids < 50_000.
# NOTE(review): 80_000 x 50_000 cells is roughly 32 GB if numpy's default integer is
# 64-bit on this machine — confirm the kernel has that much memory available.
up = d.get_up_matrix(num_users=80_000, num_products=50_000)

In [None]:
# Quick sanity check: show the top-left 10x10 corner of the matrix.
print(up[0:10, 0:10])