Creates article embeddings for all articles

In [1]:
import pandas as pd, os, numpy as np
import plotly.express as px
pd.options.display.max_columns = 50
import swifter, datetime, pickle as pkl
import tensorflow_hub as hub
from tqdm.notebook import tqdm

In [2]:
adf = pd.read_parquet('./data/articles.parquet')

In [3]:
adf.head(1)

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_no,department_name,index_name,index_group_name,section_no,section_name,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,Solid,Black,Dark,Black,1676,Jersey Basic,Ladieswear,Ladieswear,16,Womens Everyday Basics,Jersey Basic,Jersey top with narrow shoulder straps.


In [4]:
adf['detail_desc'].fillna('no description given', inplace=True)

In [5]:
adf['perceived_colour_value_name'].value_counts().head(20)

Dark            42706
Dusty Light     22152
Light           15739
Medium Dusty    12630
Bright           6471
Medium           5711
Undefined         105
Unknown            28
Name: perceived_colour_value_name, dtype: int64

In [6]:
adf['garment_group_name'].value_counts().head(20)

Jersey Fancy                     21445
Accessories                      11519
Jersey Basic                      8126
Knitwear                          7490
Under-, Nightwear                 7441
Trousers                          6727
Blouses                           5838
Shoes                             5145
Dresses Ladies                    4874
Outdoor                           4501
Unknown                           3873
Trousers Denim                    3100
Swimwear                          2787
Socks and Tights                  2272
Shirts                            2116
Woven/Jersey/Knitted mix Baby     1965
Shorts                            1559
Dresses/Skirts girls              1541
Skirts                            1254
Special Offers                    1061
Name: garment_group_name, dtype: int64

In [7]:
adf.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_no,department_name,index_name,index_group_name,section_no,section_name,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,Solid,Black,Dark,Black,1676,Jersey Basic,Ladieswear,Ladieswear,16,Womens Everyday Basics,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,Solid,White,Light,White,1676,Jersey Basic,Ladieswear,Ladieswear,16,Womens Everyday Basics,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,Stripe,Off White,Dusty Light,White,1676,Jersey Basic,Ladieswear,Ladieswear,16,Womens Everyday Basics,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,Solid,Black,Dark,Black,1339,Clean Lingerie,Lingeries/Tights,Ladieswear,61,Womens Lingerie,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,Solid,White,Light,White,1339,Clean Lingerie,Lingeries/Tights,Ladieswear,61,Womens Lingerie,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [8]:
adf['prod_desc_custom'] = adf['detail_desc'] + '. It has a ' + adf['graphical_appearance_name'] + ' print on it.' + ' It has a ' + \
                        adf['perceived_colour_value_name'] + ' ' +  adf['colour_group_name'] + ' colour.'

In [9]:
# adf['prod_desc_custom'] = adf['product_type_name'] + ' named as ' + adf['prod_name'] + '. It has a ' + adf['graphical_appearance_name'] + ' print on it.' + ' It has a ' + \
#                         adf['perceived_colour_value_name'] + ' ' +  adf['colour_group_name'] + ' colour'\
#                         '. It is from the ' + adf['index_name'] + ' group and ' + adf['garment_group_name'] + ', ' + \
#                         adf['section_name'] + ' section. Description: ' + adf['detail_desc']

In [10]:
adf['prod_desc_custom'].iloc[np.random.randint(low=0, high=len(adf) - 1)]

'Checked scarf in a soft wool weave with fringes along the short sides. Size 45x185 cm.. It has a Check print on it. It has a Medium Yellow colour.'

In [11]:
adf['prod_desc_custom'].isna().sum()

0

In [12]:
model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [13]:
# adf['prod_desc_custom'] = adf['prod_desc_custom'].astype(str)

In [14]:
xx = model(['testing query'])

In [15]:
texts = adf['prod_desc_custom'].values.tolist()

In [16]:
# calculates end index for a particular iteration for looping through documents in batches
def calcEndIdx(start_idx, batch_size, ndocs):
    end_idx = start_idx + batch_size
    end_idx = ndocs if end_idx > ndocs - 1 else end_idx
    return end_idx

In [17]:
embeddings = [] # empty array to store embeddings as we iterate through docs
BATCH_SIZE = 64
NUM_DOCS = len(texts)

for start_idx in tqdm(range(0, NUM_DOCS, BATCH_SIZE)):
    end_idx = calcEndIdx(start_idx, BATCH_SIZE, NUM_DOCS)
    curr_embeddings = model(texts[start_idx:end_idx]).numpy()
    embeddings.append(curr_embeddings)
    
embeddings = np.concatenate(embeddings) # convert batched arrays to shape (N, Vector Size)

  0%|          | 0/1650 [00:00<?, ?it/s]

In [18]:
len(texts)

105542

In [19]:
aids = adf['article_id'].values

In [20]:
emb_map = {}
for i in range(len(texts)):
    emb = embeddings[i]
    emb_map[aids[i]] = emb

In [21]:
len(emb_map)

105542

In [22]:
len(aids)

105542

In [23]:
with open('./data/emb_map.pkl', 'wb') as handle:
    pkl.dump(emb_map, handle, protocol=3)