In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import load_npz
from sklearn.metrics import pairwise_distances
import multiprocessing

In [2]:
mco = load_npz('D:/Google Drive/_hyperloop_data/recom_compl_2014_2017/_data/all/mco_f32.npz')

In [3]:
class OpExperimentsHelper:
    def __init__(self, model_csv, emb_start_slice, emb_end_slice, distances='cosine'):
        self.df = pd.read_csv(model_csv, encoding='ISO-8859-1')
        self.embeddings = np.array(self.df.loc[:, emb_start_slice:emb_end_slice], dtype=np.float32)
        self.distances = distances
        print("Loaded model with {} distances.".format(self.distances.upper()))
        return

    def DifAvgExp(self, dict_pairs):
        for key, value in dict_pairs.items():
            print(key)
            p1 = value[0]
            p2 = value[1]
            print("#1. SELECTED:\n{}".format(self.df.loc[[p1,p2],'ITEM_NAME']))
            prod_a = self.embeddings[p1]
            prod_b = self.embeddings[p2]
            if self.distances.lower() == "cosine":
                sym = prod_a.dot(prod_b)
            else:
                sym = np.sqrt(np.sum((prod_a-prod_b)**2))
            print("\n#2. {} DISTANCE BETWEEN SELECTED PRODUCTS: {:.3f}".format(self.distances.upper(), sym))

            
            if self.distances.lower() == "cosine":
                dist_a = 1 - self.embeddings.dot(prod_a)
                dist_b = 1 - self.embeddings.dot(prod_b)
            else:
                dist_a = pairwise_distances(prod_a.reshape(1,-1), self.embeddings).flatten()
                dist_b = pairwise_distances(prod_b.reshape(1,-1), self.embeddings).flatten()
            
            K = 10
            print("\n#3.1. TOP K P1")
            top_k_indexes = np.argsort(dist_a)[1 : (K + 1)]
            print("{}".format(self.df.loc[top_k_indexes,['ITEM_NAME']]))
            
            print("\n#3.2. TOP K P2")
            top_k_indexes = np.argsort(dist_b)[1 : (K + 1)]
            print("{}".format(self.df.loc[top_k_indexes,['ITEM_NAME']]))

            print("\n#4. Diff vect")
            diff_vect = prod_a - prod_b
            
            if self.distances.lower() == "cosine":
                diff_vect = diff_vect / np.sqrt(np.sum(diff_vect ** 2))
                dist = self.embeddings.dot(diff_vect)
            else:
                dist = pairwise_distances(diff_vect.reshape(1,-1), self.embeddings).flatten()
            
            top_k_indexes = np.argsort(dist)
            top_k_indexes_minus = top_k_indexes[:K]
            top_k_indexes_plus = top_k_indexes[-(K+1):]

            distances_minus = dist[top_k_indexes_minus]
            distances_plus = dist[top_k_indexes_plus]
            df_minus = self.df.loc[top_k_indexes_minus,['ITEM_NAME']]
            df_minus['COSDIST'] = distances_minus
            df_plus = self.df.loc[top_k_indexes_plus,['ITEM_NAME']]
            df_plus['COSDIST'] = distances_plus

            co_occ_scores = mco[prod_a]
            list_scores = []
            for idx in (top_k_indexes_plus + 1):
                list_scores.append(co_occ_scores[0, idx])
            
            df_plus['MCO'] = list_scores
            
            co_occ_scores = mco[prod_b]
            list_scores = []
            for idx in (top_k_indexes_minus + 1):
                list_scores.append(co_occ_scores[0, idx])
            df_minus['MCO'] = list_scores


            print("{}".format(df_minus))
            print("{}".format(df_plus))

            print("\n#5. Average vector:")
            avg_vect = (prod_a + prod_b) / 2
            dist = self.embeddings.dot(avg_vect)

            top_k_indexes = np.argsort(dist)
            top_k_indexes_minus = top_k_indexes[:K]
            top_k_indexes_plus = top_k_indexes[-(K+1):]

            distances_minus = dist[top_k_indexes_minus]
            distances_plus = dist[top_k_indexes_plus]
            df_minus = self.df.loc[top_k_indexes_minus,['ITEM_NAME']]
            df_minus['COSDIST'] = distances_minus
            df_plus = self.df.loc[top_k_indexes_plus,['ITEM_NAME']]
            df_plus['COSDIST'] = distances_plus


            print("{}".format(df_minus))
            print("{}".format(df_plus))

            print()
            print()
            print()

        return

In [4]:
dict_pairs = {
    "NUROFEN_IBUPROFEN": (7, 1638),
    "SECOM-C 1G_GNCC 1G": (871, 2125),
    "CREMAMAINI_VITA": (3317, 456)
}

In [5]:
sg_128_cos = OpExperimentsHelper('D:/Google Drive/_hyperloop_data/recom_compl_2014_2017/_models/11.02.2018/20180210_Prod2Vec_fullseasons_cosine.csv',
                                'EMB1',
                                'EMB128',
                                distances='cosine')

sg_128_euc = OpExperimentsHelper('D:/Google Drive/_hyperloop_data/recom_compl_2014_2017/_models/11.02.2018/20180210_Prod2Vec_fullseasons_euclid.csv',
                                'EMB1',
                                'EMB128',
                                distances='euclidean')

Loaded model with COSINE distances.
Loaded model with EUCLIDEAN distances.


In [6]:
sg_128_cos.DifAvgExp(dict_pairs)

NUROFEN_IBUPROFEN
#1. SELECTED:
7       NUROFEN RACEALA SI GRIPA *24CPR F
1638       IBUPROFEN 200MG CT*20CPR CIPLA
Name: ITEM_NAME, dtype: object

#2. COSINE DISTANCE BETWEEN SELECTED PRODUCTS: 0.039

#3.1. TOP K P1
                                             ITEM_NAME
15             NUROFEN RACEALA SI GRIPA *12CPR F BOOTS
152                       ASPIRIN BAYER+C CT*10CPR EFF
83                        ASPIRIN BAYER+C CT*20CPR EFF
47            IBUSINUS RACEALA & GRIPA*20 CPR SOLACIUM
160                      MODAFEN CT*24CPR FILM ZENTIVA
3464         ABOCA GRINTUSS TUSE SIROP ADULTI FL*180ML
250                       HUMAGRIP CT*12 CPR+4CPS URGO
1629          MUCOSOLVAN SIR 30MG/5ML 100ML BOEHRINGER
1428                  STREPSILS INTENSIV SPRAY FL*15ML
594   TANTUM VERDE SPRAY 0.3% 15ML CSC PHARMACEUTICALS

#3.2. TOP K P2
                                    ITEM_NAME
327            IBUPROFEN 400MG CT*20CPR CIPLA
189            IBUPROFEN 600MG CT*20CPR CIPLA
3349  IBUFEN 200MG CT* 2

In [7]:
### NUROFEN EXPRESS FORTE (8) & IBUPROFEN 600MG (189)

ibuprofen = sg_128_euc.embeddings[189]
nurofen_exp = sg_128_euc.embeddings[8]
concept1 = abs(nurofen_exp-ibuprofen)
mask_concept1 = concept1 <= 0.01
print(np.argwhere(mask_concept1 == np.amax(mask_concept1)).flatten())

[79]


In [8]:
### BRUFEN (36) & ADAGIN (133)

brufen = sg_128_euc.embeddings[36]
adagin = sg_128_euc.embeddings[133]
concept2 = abs(adagin-brufen)
mask_concept2 = concept2 <= 0.01
print(np.argwhere(mask_concept2 == np.amax(mask_concept2)).flatten())

[ 59  86 106]


In [9]:
### DR. HART VITAMINA C (37) & VITAMINA C GNC (2125)

prod1 = sg_128_cos.embeddings[37]
prod2 = sg_128_cos.embeddings[2125]
concept3 = abs(prod1-prod2)
mask_concept3 = concept3 <= 0.001
print(np.argwhere(mask_concept3 == np.amax(mask_concept3)).flatten())

[101]


In [10]:
### SECOM VITAMINA C (871) & REDOXON VIT C (6028)

prod3 = sg_128_cos.embeddings[871]
prod4 = sg_128_cos.embeddings[6028]
concept4 = abs(prod3-prod4)
mask_concept4 = concept4 <= 0.005
print(np.argwhere(mask_concept4 == np.amax(mask_concept4)).flatten())

[  2  13  19  42  48  59  65  71  79 101]


In [15]:
nurofen = sg_128_cos.embeddings[7]
vitc = sg_128_cos.embeddings[871]
crema = sg_128_cos.embeddings[3317]
cetebe = sg_128_cos.embeddings[717]

K = 10
result = vitc-cetebe+crema
result = result / np.sqrt(np.sum(result ** 2))
dist = sg_128_cos.embeddings.dot(result)
top_k_indexes = np.argsort(dist)
top_k_indexes_minus = top_k_indexes[:K]
top_k_indexes_plus = top_k_indexes[-(K+1):]

distances_minus = dist[top_k_indexes_minus]
distances_plus = dist[top_k_indexes_plus]
df_minus = sg_128_euc.df.loc[top_k_indexes_minus,['ITEM_NAME']]
df_minus['COSDIST'] = distances_minus
df_plus = sg_128_euc.df.loc[top_k_indexes_plus,['ITEM_NAME']]
df_plus['COSDIST'] = distances_plus

In [16]:
print(df_minus)

                                               ITEM_NAME   COSDIST
7353                     BROMHEXIN 8MG CT*20CPR FARMACOM -0.308306
26689       L'OCCITANE TESTER IRIS BLEU CREMA MAINI 30ML -0.295875
26605                             ARTSANA TERM VEDO EASY -0.291648
25313             MAVALA F TANOA ULEI FRANGIPANIER 125ML -0.273227
24173     K'S KIDS KA10496 ZEBRA RYAN MARE 28 ACTIVITATI -0.270940
26425               POLAROID16 PLD 1017/S 3YG LIGHT GOLD -0.268627
23690                    POLAROID16 PLD 4024/S D28 BLACK -0.268250
25798      HARTMANN TENSIOMETRU DIGITAL TENSOVAL COMPACT -0.265443
23829             POLAROID15 PLD 5008/S 000 8W ROSE GOLD -0.262004
26253  TONUS ELAST CENTURA ELASTICA HERNIE OMBILICALA... -0.258667


In [17]:
print(df_plus)

                                              ITEM_NAME   COSDIST
8995                   NEUTROGENA CR MAINI SCENTED 50ML  0.485655
8511                 NEUTROGENA CR MAINI UNSCENTED 50ML  0.504254
5691  NEUTROGENA LOTIUNE HID. P. USCATA 400 ML (OVER...  0.504904
3861           NEUTROGENA NORDIC BERRY LIP STICK 4.8 GR  0.516796
7296              NEUTROGENA ULTRA INTENSIVE BALM 300ML  0.533349
4543           NEUTROGENA NORDIC BERRY HAND CREAM 75 ML  0.555779
4541      NEUTROGENA CR MAINI UNSCENTED 75ML (OVERFILL)  0.569865
3519    NEUTROGENA BALSAM BUZE LIPCARE (BLISTER) 4.8 GR  0.596062
5098        NEUTROGENA CREMA MAINI ANTI-AGE SPF 15 50ML  0.630968
4116        NEUTROGENA CR MAINI SCENTED 75ML (OVERFILL)  0.642729
3317  NEUTROGENA CREMA MAINI CU GRAD DE ABSORTIE RID...  0.742812
