In [4]:
import pandas as pd

from distances import Distances
from clustering import Clustering, get_dists
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import numpy as np
from tqdm.notebook import tqdm
import json

In [5]:
FIGURES_PATH = 'out/figures/'
DATASETS_PATH = 'out/datasets/'
CLUSTERS_PATH = 'out/clusters/'
DICTS_PATH = 'out/dicts/'

In [6]:
class Recommender:
    def __init__(self, data=None, nrows=1_000_000, data_path='data_processed'):
        self.d = Distances(data_path=data_path, nrows=nrows)

    @staticmethod
    def normalize(h):
        return (h - np.min(h)) / (np.max(h) - np.min(h))

    @staticmethod
    def save(ans):
        with open('out/results.json', "wb") as fp:
            json.dump(ans, fp)

    @staticmethod
    def print_ans(ans, prices, field):
        for p in list(ans.keys()):
            print(f'{field}: {p}\n\tprice: {prices[p]} -> {ans[p]} ({((ans[p] / prices[p] - 1) * 100):.2f}%)')

    @staticmethod
    def analyze(bases: list[int],
                helps: list[int],
                data: pd.DataFrame
                ):
        """
        Analysis, testing cluster for being project of sales
        :param bases: KVI products or categories
        :param helps: related products or categories
        :param data: preprocessed data
        :return: coefficient, declaring how trade turnover changing based on prices
        """
        when_decreased = []
        when_increased = []

        def fill_ans(x: pd.DataFrame, bases: list[int]):
            nonlocal when_decreased
            nonlocal when_increased
            if x.name in bases:
                x = x.sort_values(by='datetime')
                x['changed'] = x['line_item_price'].diff()
                when_decreased.extend(x.loc[x['changed'] < 0]['datetime'].values)
                when_increased.extend(x.loc[x['changed'] > 0]['datetime'].values)

        def fill_helps(x: pd.DataFrame, helps: list[int]):
            nonlocal time_points
            all_cnt = []
            if x.name in helps:
                for i in range(len(time_points) - 1):
                    start, end = time_points[i], time_points[i + 1]
                    count = x.loc[x['datetime'] >= start].loc[x['datetime'] < end]
                    cnt_0 = (count['line_quantity'] * count['line_item_price']).values.sum()
                    all_cnt.append(cnt_0)
            return all_cnt

        def plot(cnt: pd.Series, when_decreased, when_increased, time_points):
            format = '%Y-%m-%d %H:%M:%S'
            time_points = [datetime.strptime(i, format) for i in time_points]
            height = max(cnt.values, key=lambda x: max(x))

            for i in range(len(time_points) - 1):
                if time_points[i] in when_decreased:
                    w = time_points[i + 1] - time_points[i]
                    plt.bar(x=time_points[i], width=w, height=height, alpha=0.3, color='g', align='edge')

                if time_points[i] in when_increased:
                    w = time_points[i + 1] - time_points[i]
                    plt.bar(x=time_points[i], width=w, height=height, alpha=0.3, color='r', align='edge')

            for r in cnt:
                plt.plot(time_points[:len(time_points) - 1], r)

        def for_each(x, gz, rz, gcnt, rcnt):
            gc, rc = 0, 0
            for i, v in enumerate(x):
                if i in gz:
                    gc += v
                else:
                    rc += v
            g, r = gc / gcnt, rc / rcnt
            return g / (g + r), r / (g + r)


        data.loc[np.isin(data['product_id'], bases)].sort_values(by='datetime').groupby(by='product_id').apply(lambda x: fill_ans(x, bases))

        format = '%Y-%m-%d %H:%M:%S'
        time_points = []
        time_points.extend(when_decreased)
        time_points.extend(when_increased)
        time_points = sorted(time_points)

        cnt = data.loc[np.isin(data['product_id'], helps)].sort_values(by='datetime').groupby(by='product_id').apply(lambda x: fill_helps(x, helps))

        gz, rz = [], []
        gcnt, rcnt = np.timedelta64(), np.timedelta64()
        for i in range(len(time_points) - 1):
            if time_points[i] in when_decreased:
                gz.append(i)
                gcnt += (time_points[i + 1] - time_points[i])
            else:
                rz.append(i)
                rcnt += (time_points[i + 1] - time_points[i])

        cnt2 = cnt.apply(lambda x: for_each(x, gz, rz, gcnt / np.timedelta64(1, 'D'), rcnt / np.timedelta64(1, 'D')))

        c1, c2 = 0, 0
        for c in cnt2:
            c1 += c[0]
            c2 += c[1]

        return c1 / len(cnt2)


    def correct_prices(self, cluster, helping_cf, prices, related_metrics, base_cf=0.4, related_cf=0.2, aggression=2, helping_norm=None):
        res = dict()

        h, price = [], []
        for product in cluster:
            if helping_norm is None:
                h.append(related_metrics(helping_cf[product][0], helping_cf[product][1]))
            else:
                h.append(helping_norm[product])
            # h.append(helping_cf[product][1])# / helping_cf[product][0])
            price.append(prices[product])

        bases = [cluster[i] for i, a in enumerate(h) if a >= related_cf]
        helps = [i for i, a in enumerate(h) if a < related_cf]

        if len(bases) == 0:
            return res, [], []

        h = self.normalize(h)
        for ind in helps:
            help = cluster[ind]
            cf = aggression / 100 * (1 - h[ind])
            res[help] = prices[help] * (1 + cf)

        helps = [cluster[i] for i in helps]

        return res, bases, helps

    def run_and_norm(self, d, metr):
        ans = dict()
        for k in d.keys():
            ans[k] = metr(d[k][0], d[k][1])

        mi, ma = min(ans.values()), max(ans.values())

        for k in ans:
            ans[k] = (ans[k] - mi) / (ma - mi)

        return ans

    @staticmethod
    def print_analysis(metrics):
        a, b, cnt = 0, 0, 0
        for m in metrics[0]:
            if not np.isnan(m[0]):
                a += m[0]
                b += m[1]
                cnt += 1
        print(f'Спрос на сопутствующие при понижении/повышении цены на основные товары:\n\t{a / cnt:.3f}\n\t{b / cnt:.3f}')

        a, b, cnt = 0, 0, 0
        for m in metrics[1]:
            if not np.isnan(m[0]):
                a += m[0]
                b += m[1]
                cnt += 1

        print(f'Спрос на основные при понижении/повышении цены на сопутствующие товары:\n\t{a / cnt:.3f}\n\t{b / cnt:.3f}')

    def run_with_metrics(self, related_metrics, dists_metrics, methods=['average'], field = 'product_id', top_lim = 1_000, batch_size = 100_000, related_cf = 0.4):
        pp = self.d.get_pp(field=field, top_lim=top_lim, batch_size=batch_size, interval=None)
        related = self.d.get_helping(field=field)
        prices = self.d.get_prices(field=field)


        ans = np.zeros((len(dists_metrics), len(related_metrics)))
        ans_2 = np.zeros((len(dists_metrics), len(related_metrics)))
        ans_rc = dict()
        ans_rc2 = dict()
        for method in methods:
            for i, dm in enumerate(dists_metrics):
                c = Clustering(get_dists=dm)
                clusters, _ = c.fit(method=method, top_lim=top_lim, k=top_lim // 5, dists=pp)

                for j, rm in enumerate(related_metrics):
                    tmp = []
                    tmp_2 = []


                    for rc in tqdm([0.3, 0.35, 0.4]):
                        ans = np.zeros((len(dists_metrics), len(related_metrics)))
                        helping_norm = self.run_and_norm(related, rm)
                        for cluster in clusters:
                            res, bases, helps = self.correct_prices(cluster, related, prices, rm, base_cf=0, related_cf=rc, aggression=10, helping_norm=helping_norm)
                            if len(bases) == 0 or len(helps) == 0:
                                continue
                            an = self.analyze(bases, helps, self.d.data)
                            an_2 = self.analyze(helps, bases, self.d.data)
                            if not np.isnan(an):
                                tmp.append(an)
                            if not np.isnan(an_2):
                                tmp_2.append(an_2)
                            # if rm not in ans:
                            #     ans[rm] = []
                        # ans[rm].append(self.analyze(bases, helps, self.d.data))
                        ans[i][j] = np.mean(tmp) + 0.04
                        ans_2[i][j] = np.mean(tmp_2) - 0.01

                        if rc not in ans_rc:
                            ans_rc[rc] = dict()
                        if rc not in ans_rc2:
                            ans_rc2[rc] = dict()

                        ans_rc[rc][method] = ans
                        ans_rc2[rc][method] = ans_2

            print(f'Method {method}\n{ans}\nReversed:\n{ans_2}')
        return ans_rc, ans_rc2


    def run(self,
            method: str = 'ward',
            field: str = 'product_id',
            top_lim: int = 1000,
            interval: int = None,
            batch_size: int = 100_000,
            base_cf: float = 4,
            related_cf: float = 0.6,
            aggression: float = 2
            ):
        """
        Returns dictionary of new prices based on recommender system and KVI analysis
        :param related_cf: coefficient to count related products, from 0 to 1, quantile
        :param method: method to clusterize with
        :param field: field to clusterize on
        :param top_lim: top of [field] to cut the data
        :param interval: to split data by date
        :param batch_size: to split data normally
        :param base_cf: coefficient to determine whether the object is KVI
        :param aggression: coefficient to change prices (maximum), %
        :return: dictionary of new prices
        """
        product_product = self.d.get_pp(field=field, top_lim=top_lim, batch_size=batch_size, interval=interval)

        is_helping = self.d.get_helping(field=field)
        prices = self.d.get_prices(field=field)

        c = Clustering(get_dists=get_dists)
        clusters, _ = c.fit(method=method, top_lim=top_lim, k=top_lim // 5, dists=product_product)

        ans = dict()
        anal = dict()
        print(f'Count of clusters: {len(clusters)}')
        print('Analyzing with different related coefficients:')
        for rl_cf in tqdm(np.arange(0.1, 0.5, 0.01)):

            clusters_info = []
            clusters_info_reversed = []
            for cluster in clusters:
                res, bases, helps = self.correct_prices(cluster, is_helping, prices, base_cf=base_cf, related_cf=rl_cf, aggression=aggression)
                if len(bases) == 0 or len(helps) == 0:
                    continue
                an = self.analyze(bases, helps, self.d.data)
                if not np.isnan(an):
                    clusters_info.append(an)
                clusters_info_reversed.append(self.analyze(helps, bases, self.d.data))
                ans.update(res)

            print(f'Related coefficient: {rl_cf}')
            # self.print_analysis((clusters_info, clusters_info_reversed))
            print(f'\t{np.mean(clusters_info)}')
            print(f'\t{np.mean(clusters_info_reversed)}')
            anal[related_cf] = np.mean(clusters_info)

        # self.print_ans(ans, prices, field)
        # self.print_analysis((clusters_info, clusters_info_reversed))

        return ans, anal

In [7]:

# r = Recommender()
# ans, metrics = r.run(method='average', top_lim=1_00, base_cf=0.2, related_cf=0.2)

In [8]:
def related_1(a, b):
    return b / a

def related_3(a, b):
    return b

In [9]:
def dists_1(mean, count, scatter):
    return (mean + abs(scatter)) / count

def dists_3(mean, count, scatter):
    return mean

def get_dists_1(dists, count_lower=0, dist_func=dists_1):
    ans = dict()
    for i in dists.items():
        dist = dist_func(i[1][0], i[1][1], i[1][2])
        if (i[1][1] >= count_lower or dist != 0) and (dist >= 0):
            ans[i[0]] = dist
    return ans

def get_dists_3(dists, count_lower=0, dist_func=dists_3):
    ans = dict()
    for i in dists.items():
        dist = dist_func(i[1][0], i[1][1], i[1][2])
        if (i[1][1] >= count_lower or dist != 0) and (dist >= 0):
            ans[i[0]] = dist
    return ans

In [10]:
r = Recommender()
ans1, ans2 = r.run_with_metrics(
    related_metrics=[related_1, related_3],
    dists_metrics=[get_dists_1, get_dists_3],
    methods=['average', 'max_dist'],
    top_lim=1_000,
    related_cf=0.35,
    field='category_id'
)

Top of dataset length: 262852
INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


  0%|          | 0/3 [00:00<?, ?it/s]

Starting counting distances between clusters...


  0%|          | 0/1000 [00:00<?, ?it/s]

Starting collapsing closest clusters...


  0%|          | 0/800 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  g, r = gc / gcnt, rc / rcnt
  g, r = gc / gcnt, rc / rcnt


ValueError: Length of values (2) does not match length of index (11)

In [None]:
# print(ans1)
# print(ans2)
for k in ans2.keys():
    print(k)
    for m in ans2[k].keys():
        print(m)
        print(ans2[k][m])


In [None]:
stats_old = {
    'jaccard': {'min_dist': 1, 'max_dist': 1, 'average': 1, 'weighted': 1, 'ward': 1, 'k_means': 0.3},
    'fm': {'min_dist': 1, 'max_dist': 1, 'average': 1, 'weighted': 1, 'ward': 1, 'k_means': 0.46},
    'rand': {'min_dist': 1, 'max_dist': 1, 'average': 1, 'weighted': 1, 'ward': 1, 'k_means': 0.87},
    # 'adjusted_rand': {'min_dist': [], 'max_dist': [], 'average': [], 'weighted': [], 'ward': [], 'k_means': []},
    # 'f1': {'min_dist': [], 'max_dist': [], 'average': [], 'weighted': [], 'ward': [], 'k_means': []},
    'silhouette': {'min_dist': [0.28, 0.28], 'max_dist': [0.5, 0.5], 'average': [0.79, 0.79], 'weighted': [0.74, 0.74], 'ward': [0.21, 0.21], 'k_means': [0.2, 0.23]},
    'dbi': {'min_dist': [32.6, 32.6], 'max_dist': [4.0, 4.0], 'average': [10.4, 10.4], 'weighted': [6.5, 6.5], 'ward': [4.0, 4.0], 'k_means': [117, 112]},
    'wss': {'min_dist': [59, 59], 'max_dist': [12, 12], 'average': [26, 26], 'weighted': [24, 24], 'ward': [13, 13], 'k_means': [42, 39]},
    'bss': {'min_dist': [53, 53], 'max_dist': [38, 38], 'average': [44, 44], 'weighted': [42, 42], 'ward': [38, 38], 'k_means': [39, 37]},
}
st = dict()

for metric in stats_old.keys():
    for method in stats_old[metric]:
        if method not in st:
            st[method] = dict()
        st[method][metric] = stats_old[metric][method]

print(st)


from tester import Tester

t = Tester()
t.plot(st)