In [6]:
import pandas as pd

from distances import Distances
from clustering import Clustering, get_dists
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import numpy as np
import json

In [7]:
# Output locations, given as relative paths under `out/`.
# NOTE(review): none of these constants is referenced by the code visible in
# this part of the notebook; presumably other cells or modules use them —
# confirm before removing.
FIGURES_PATH = 'out/figures/'
DATASETS_PATH = 'out/datasets/'
CLUSTERS_PATH = 'out/clusters/'
DICTS_PATH = 'out/dicts/'

In [8]:
class Recommender:
    """
    Recommends new prices for related products inside clusters of products.

    The pipeline (see `run`) asks a `Distances` object for pairwise distances,
    "helping" coefficients and current prices, clusters the items with
    `Clustering`, and then raises the price of the "related" items of every
    cluster that also contains at least one base (KVI) item.
    """

    def __init__(self, data=None, nrows=1_000_000, data_path='data_processed'):
        """
        :param data: accepted for interface compatibility; not used here
        :param nrows: forwarded to `Distances`
        :param data_path: forwarded to `Distances`
        """
        self.d = Distances(data_path=data_path, nrows=nrows)

    @staticmethod
    def normalize(h):
        """
        Min-max scale values to the [0, 1] range.
        :param h: sequence of numbers
        :return: numpy float array of the same length; all zeros when every
                 value is identical (the range is zero), empty for empty input
        """
        values = np.asarray(h, dtype=float)
        if values.size == 0:
            return values
        low = values.min()
        span = values.max() - low
        if span == 0:
            # A constant input (e.g. a single-product cluster) would otherwise
            # evaluate 0 / 0 and yield NaN together with a RuntimeWarning.
            return np.zeros_like(values)
        return (values - low) / span

    @staticmethod
    def save(ans):
        """
        Dump `ans` as JSON to 'out/results.json'.
        :param ans: JSON-serializable object (e.g. the price dictionary)
        """
        import os  # local import: `os` is not imported at file level

        path = 'out/results.json'
        os.makedirs(os.path.dirname(path), exist_ok=True)
        # json.dump writes str, so the file must be opened in text mode.
        with open(path, 'w', encoding='utf-8') as fp:
            json.dump(ans, fp)

    @staticmethod
    def print_ans(ans, prices, field):
        """
        Print the old price, the new price and the relative change per item.
        :param ans: mapping item -> new price
        :param prices: mapping item -> current price
        :param field: label printed in front of every item id
        """
        for p, new_price in ans.items():
            print(f'{field}: {p}\n\tprice: {prices[p]} -> {new_price} ({((new_price / prices[p] - 1) * 100):.2f}%)')

    @staticmethod
    def analyze(bases: list[int],
                helps: list[int],
                data: pd.DataFrame
                ):
        """
        Check how the turnover of related products reacts to price changes of
        base products.

        The moments when the price of any base product went down or up split
        the timeline into intervals. For every related product the turnover
        (`line_quantity * line_item_price`) per interval is summed separately
        for intervals that start at a price decrease and for all others, each
        sum is divided by the total length of its intervals in days, and the
        two daily rates are converted into shares that sum to 1.

        :param bases: KVI products or categories
        :param helps: related products or categories
        :param data: preprocessed data; the columns `product_id`, `datetime`,
                     `line_item_price` and `line_quantity` are read
        :return: pair (share after decreases, share otherwise) averaged over
                 the related products; may contain NaN when an interval group
                 has zero total duration
        """
        when_decreased = []
        when_increased = []

        def fill_ans(x: pd.DataFrame, bases: list[int]):
            # Collect the timestamps at which the price of a base product
            # changed compared to its previous record.
            if x.name in bases:
                x = x.sort_values(by='datetime')
                x['changed'] = x['line_item_price'].diff()
                when_decreased.extend(x.loc[x['changed'] < 0]['datetime'].values)
                when_increased.extend(x.loc[x['changed'] > 0]['datetime'].values)

        def fill_helps(x: pd.DataFrame, helps: list[int]):
            # Turnover of one related product in every
            # [time_points[i], time_points[i + 1]) interval.
            all_cnt = []
            if x.name in helps:
                for i in range(len(time_points) - 1):
                    start, end = time_points[i], time_points[i + 1]
                    count = x.loc[x['datetime'] >= start].loc[x['datetime'] < end]
                    cnt_0 = (count['line_quantity'] * count['line_item_price']).values.sum()
                    all_cnt.append(cnt_0)
            return all_cnt

        # NOTE(review): this helper is not called anywhere in `analyze`; it is
        # kept unchanged as a debugging aid — confirm whether it is still needed.
        def plot(cnt: pd.Series, when_decreased, when_increased, time_points):
            format = '%Y-%m-%d %H:%M:%S'
            time_points = [datetime.strptime(i, format) for i in time_points]
            height = max(cnt.values, key=lambda x: max(x))

            for i in range(len(time_points) - 1):
                if time_points[i] in when_decreased:
                    w = time_points[i + 1] - time_points[i]
                    plt.bar(x=time_points[i], width=w, height=height, alpha=0.3, color='g', align='edge')

                if time_points[i] in when_increased:
                    w = time_points[i + 1] - time_points[i]
                    plt.bar(x=time_points[i], width=w, height=height, alpha=0.3, color='r', align='edge')

            for r in cnt:
                plt.plot(time_points[:len(time_points) - 1], r)

        def for_each(x, gz, rz, gcnt, rcnt):
            # gz holds the indices of intervals that start at a price decrease;
            # every other interval is counted as "red".
            gc, rc = 0, 0
            for i, v in enumerate(x):
                if i in gz:
                    gc += v
                else:
                    rc += v
            # Turnover per day for each group, then shares that sum to 1.
            g, r = gc / gcnt, rc / rcnt
            return g / (g + r), r / (g + r)

        base_rows = data.loc[np.isin(data['product_id'], bases)].sort_values(by='datetime')
        base_rows.groupby(by='product_id').apply(lambda x: fill_ans(x, bases))

        time_points = []
        time_points.extend(when_decreased)
        time_points.extend(when_increased)
        time_points = sorted(time_points)

        help_rows = data.loc[np.isin(data['product_id'], helps)].sort_values(by='datetime')
        cnt = help_rows.groupby(by='product_id').apply(lambda x: fill_helps(x, helps))

        gz, rz = [], []
        gcnt, rcnt = np.timedelta64(), np.timedelta64()
        for i in range(len(time_points) - 1):
            if time_points[i] in when_decreased:
                gz.append(i)
                gcnt += (time_points[i + 1] - time_points[i])
            else:
                rz.append(i)
                rcnt += (time_points[i + 1] - time_points[i])

        one_day = np.timedelta64(1, 'D')
        cnt2 = cnt.apply(lambda x: for_each(x, gz, rz, gcnt / one_day, rcnt / one_day))

        c1, c2 = 0, 0
        for c in cnt2:
            c1 += c[0]
            c2 += c[1]

        return c1 / len(cnt2), c2 / len(cnt2)

    def correct_prices(self, cluster, helping_cf, prices, base_cf=0.4, related_cf=0.2, aggression=0.5):
        """
        Propose increased prices for the related items of one cluster.

        An item is a base (KVI) item when its coefficient `helping_cf[item][1]`
        is at least `base_cf`, and a related item when it is below
        `base_cf / 2`. Related items get a price increase of up to
        `aggression` percent; the lower the min-max normalized coefficient,
        the closer the increase is to that maximum.

        :param cluster: sequence of item ids
        :param helping_cf: mapping item -> indexable whose element [1] is the
                           coefficient used here
        :param prices: mapping item -> current price
        :param base_cf: threshold for base items
        :param related_cf: NOTE(review): accepted but not used — the related
                           threshold is `base_cf / 2`; confirm the intent
        :param aggression: maximum price increase, %
        :return: (mapping related item -> new price, base items, related items);
                 ({}, [], []) when the cluster has no base item
        """
        res = dict()

        h = [helping_cf[product][1] for product in cluster]

        bases = [product for product, a in zip(cluster, h) if a >= base_cf]
        help_positions = [i for i, a in enumerate(h) if a < base_cf / 2]

        if len(bases) == 0:
            return res, [], []

        scaled = self.normalize(h)
        for ind in help_positions:
            related = cluster[ind]
            cf = aggression / 100 * (1 - scaled[ind])
            res[related] = prices[related] * (1 + cf)

        helps = [cluster[i] for i in help_positions]

        return res, bases, helps

    @staticmethod
    def _mean_pair(pairs):
        """
        Average both components of `pairs`, skipping entries whose first
        component is NaN; returns (nan, nan) when no entry is usable.
        """
        a, b, cnt = 0, 0, 0
        for m in pairs:
            if not np.isnan(m[0]):
                a += m[0]
                b += m[1]
                cnt += 1
        if cnt == 0:
            # Nothing to average (e.g. no cluster was analyzed): report NaN
            # instead of dividing by zero.
            return float('nan'), float('nan')
        return a / cnt, b / cnt

    @staticmethod
    def print_analysis(metrics):
        """
        Print the averaged `analyze` results.
        :param metrics: pair (results for bases -> related,
                        results for related -> bases); each element is an
                        iterable of 2-tuples, possibly empty
        """
        a, b = Recommender._mean_pair(metrics[0])
        print(f'Спрос на сопутствующие при понижении/повышении цены на основные товары:\n\t{a:.3f}\n\t{b:.3f}')

        a, b = Recommender._mean_pair(metrics[1])
        print(f'Спрос на основные при понижении/повышении цены на сопутствующие товары:\n\t{a:.3f}\n\t{b:.3f}')

    def run(self,
            method: str = 'ward',
            field: str = 'product_id',
            top_lim: int = 1000,
            interval: int = None,
            batch_size: int = 100_000,
            base_cf: float = 4,
            related_cf: float = 0.6,
            aggression: float = 2
            ):
        """
        Returns dictionary of new prices based on recommender system and KVI analysis
        :param related_cf: coefficient to count related products, from 0 to 1, quantile;
                           forwarded to `correct_prices`, which currently ignores it
        :param method: method to clusterize with
        :param field: field to clusterize on
        :param top_lim: top of [field] to cut the data; `top_lim // 5` is passed
                        to the clustering as `k`
        :param interval: to split data by date
        :param batch_size: to split data normally
        :param base_cf: coefficient to determine whether the object is KVI
        :param aggression: coefficient to change prices (maximum), %
        :return: (dictionary of new prices, (clusters_info, clusters_info_reversed));
                 both info lists stay empty while the `analyze` calls below
                 are commented out
        """
        product_product = self.d.get_pp(field=field, top_lim=top_lim, batch_size=batch_size, interval=interval)

        is_helping = self.d.get_helping(field=field)
        prices = self.d.get_prices(field=field)

        c = Clustering(get_dists=get_dists)
        clusters, _ = c.fit(method=method, top_lim=top_lim, k=top_lim // 5, dists=product_product)

        ans = dict()
        clusters_info = []
        clusters_info_reversed = []
        for cluster in clusters:
            res, bases, helps = self.correct_prices(cluster, is_helping, prices, base_cf=base_cf, related_cf=related_cf, aggression=aggression)
            # Only clusters with both base and related items produce prices.
            if len(bases) == 0 or len(helps) == 0:
                continue
            # clusters_info.append(self.analyze(bases, helps, self.d.data))
            # clusters_info_reversed.append(self.analyze(helps, bases, self.d.data))
            ans.update(res)

        self.print_ans(ans, prices, field)
        self.print_analysis((clusters_info, clusters_info_reversed))

        return ans, (clusters_info, clusters_info_reversed)

In [9]:
# Build the recommender with its constructor defaults
# (nrows=1_000_000, data_path='data_processed').
r = Recommender()
# Cluster the top 1000 products with the 'min_dist' method and compute new prices.
# `aggression` is left at the default of 2, i.e. at most a 2% increase.
# `ans` is the item -> new price dictionary; `metrics` is the pair
# (clusters_info, clusters_info_reversed), both empty while the `analyze`
# calls inside `run` are commented out.
# NOTE(review): `related_cf` is forwarded to `correct_prices`, which does not
# use it (related items are selected with `base_cf / 2`) — confirm the intent.
ans, metrics = r.run(method='min_dist', top_lim=1_000, base_cf=0.35, related_cf=0.2)

Top of dataset length: 324997
INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


  0%|          | 0/4 [00:00<?, ?it/s]

Starting counting distances between clusters...


  0%|          | 0/1000 [00:00<?, ?it/s]

Starting collapsing closest clusters...


  0%|          | 0/800 [00:00<?, ?it/s]

product_id: 15693
	price: 63.69117647058823 -> 64.965 (2.00%)
product_id: 6644
	price: 44.33519553072626 -> 45.22189944134079 (2.00%)
product_id: 5457
	price: 47.121212121212125 -> 48.06363636363637 (2.00%)
product_id: 755
	price: 78.5680473372781 -> 80.06358144068558 (1.90%)
product_id: 927
	price: 82.5925925925926 -> 84.24444444444445 (2.00%)
product_id: 1722
	price: 132.75590551181102 -> 135.39390181762394 (1.99%)
product_id: 1721
	price: 405.86567164179104 -> 412.88907731309416 (1.73%)
product_id: 1730
	price: 42.2111801242236 -> 43.05540372670808 (2.00%)
product_id: 3058
	price: 31.66860465116279 -> 32.30197674418605 (2.00%)
product_id: 1576
	price: 20.004744525547444 -> 20.392349133522547 (1.94%)
product_id: 3551
	price: 21.270646766169154 -> 21.696059701492537 (2.00%)
product_id: 11945
	price: 18.699645390070923 -> 19.073638297872343 (2.00%)
product_id: 554
	price: 48.2 -> 49.164 (2.00%)
product_id: 238
	price: 154.93449781659388 -> 157.53736975061915 (1.68%)
product_id: 3228
	p