In [1]:
from distances import Distances
from clustering import Clustering, get_dists
import numpy as np
import json

In [2]:
FIGURES_PATH = 'out/figures/'
DATASETS_PATH = 'out/datasets/'
CLUSTERS_PATH = 'out/clusters/'
DICTS_PATH = 'out/dicts/'

In [3]:
class Recommender:
    def __init__(self, data=None, nrows=1_000_000, data_path='data_processed'):
        self.d = Distances(data_path=data_path, nrows=nrows)

    @staticmethod
    def normalize(h):
        return (h - np.min(h)) / (np.max(h) - np.min(h))

    @staticmethod
    def save(ans):
        with open('out/results.json', "wb") as fp:
            json.dump(ans, fp)

    @staticmethod
    def print_ans(ans, prices, field):
        for p in list(ans.keys()):
            print(f'{field}: {p}\n\tprice: {prices[p]} -> {ans[p]} ({ans[p] / prices[p] - 1}.1f%)')


    def correct_prices(self, cluster, helping_cf, prices, base_cf=5, aggression=0.5):
        res = dict()

        h, price = [], []
        for product in cluster:
            h.append(helping_cf[product][1] / helping_cf[product][0])
            price.append(prices[product])

        av = np.mean(h)
        bases = [cluster[i] for i, a in enumerate(h) if a >= base_cf * av]

        if len(bases) == 0:
            return res

        av = np.mean(bases)
        helps = [i for i, a in enumerate(h) if a <= av / base_cf]

        h = self.normalize(h)
        for ind in helps:
            help = cluster[ind]
            cf = aggression / 100 * (1 - h[ind])
            res[help] = prices[help] * (1 + cf)

        return res


    def run(self, method='ward', field='product_id', top_lim=1000, interval=None, batch_size=100_000, base_cf=4, aggression=2):
        product_product = self.d.get_pp(field=field, top_lim=top_lim, batch_size=batch_size, interval=interval)

        is_helping = self.d.get_helping(field=field)
        prices = self.d.get_prices(field=field)

        c = Clustering(get_dists=get_dists)
        clusters, _ = c.fit(method=method, top_lim=top_lim, k=top_lim // 10, dists=product_product)

        ans = dict()
        for cluster in clusters:
            res = self.correct_prices(cluster, is_helping, prices, base_cf=base_cf, aggression=aggression)
            ans.update(res)

        self.print_ans(ans, prices, field)
        self.save(ans)

        return ans



In [4]:
r = Recommender()
ans = r.run()

Top of dataset length: 324997
INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


  0%|          | 0/4 [00:00<?, ?it/s]

Starting counting distances between clusters...


  0%|          | 0/1000 [00:00<?, ?it/s]

Starting collapsing closest clusters...


  0%|          | 0/900 [00:00<?, ?it/s]

product_id: 2730
	price: 144.29470198675497 -> 147.12069029639594 (0.01958483763250274%)
product_id: 5822
	price: 461.8526570048309 -> 470.8283034892323 (0.019434004218162437%)
product_id: 2716
	price: 399.01271186440675 -> 406.826514563998 (0.019582841516704752%)
product_id: 3702
	price: 376.2241379310345 -> 383.65375393466087 (0.01974784511297978%)
product_id: 2919
	price: 261.1619718309859 -> 265.5830922757469 (0.01692865317934622%)
product_id: 2285
	price: 37.014084507042256 -> 37.745278280810915 (0.019754474101055797%)
product_id: 5496
	price: 28.980544747081712 -> 29.560155642023346 (0.020000000000000018%)
product_id: 3414
	price: 55.92417061611374 -> 57.03736133580711 (0.019905359479262863%)
product_id: 4710
	price: 97.87900355871886 -> 99.80699978288858 (0.019697750835939942%)
product_id: 3396
	price: 452.9655172413793 -> 460.70219567392655 (0.017080060485982873%)
product_id: 4947
	price: 81.5673076923077 -> 82.93297560634235 (0.016742834263775208%)
product_id: 2439
	price: 95.