In [27]:
from nltk.tokenize import word_tokenize
import warnings
import nltk
import os, re
import pandas as pd
import numpy as np
import pickle
from gensim.models import KeyedVectors

In [28]:
# Fetch the Punkt tokenizer data used by nltk's word_tokenize; the call reports
# "already up-to-date" when the package is already installed.
nltk.download('punkt')
# Suppress every warning for the rest of the session. No category is given, so this
# also hides numerical warnings raised by numpy/pandas further down.
warnings.filterwarnings(action = 'ignore')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\90539\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [29]:
# Marketplace locale to process. get_products has cleaning rules for
# 'UK', 'DE', 'JP', 'ES', 'FR' and 'IT'; the value is also used to build the
# word-vector file name and the names of the output pickle files.
LOCALE = 'IT'

In [30]:
# Per-locale pattern of characters to DELETE (everything outside the class is removed).
# Both cases of every accented letter are listed because lower-casing only happens
# after tokenisation; listing lower case alone would silently drop "È", "Á", "Ç", ...
_LOCALE_STRIP_PATTERNS = {
    'UK': r"[^a-zA-Z0-9 '.]+",
    'DE': r"[^a-zA-Z0-9 äöüÄÖÜß'.]+",
    'JP': r"",  # empty pattern: re.sub leaves the text unchanged
    'ES': r"[^a-zA-Z0-9 ñáéíóúüÑÁÉÍÓÚÜ'.]+",
    'FR': r"[^a-zA-Z0-9 àâçéèêëîïôûùüÿñæœÀÂÇÉÈÊËÎÏÔÛÙÜŸÑÆŒ'.]+",
    'IT': r"[^a-zA-Z0-9 àèéìòùÀÈÉÌÒÙ'.]+",
}


def _clean_and_tokenize(texts, strip_pattern):
    """Strip disallowed characters from each text and return its lower-cased tokens."""
    tokenized = []
    for text in texts:
        # Missing cells (NaN/None) become empty text; anything else is coerced to str
        # so that a non-string cell cannot make re.sub raise.
        text = '' if pd.isna(text) else str(text)
        text = re.sub(strip_pattern, "", text)
        tokenized.append([word.lower() for word in word_tokenize(text)])
    return tokenized


def get_products(csv_file, locale):
    """Load the products of one locale and tokenise their titles and descriptions.

    Parameters
    ----------
    csv_file : str
        Path to the products CSV, resolved relative to the current directory.
        The file must provide the columns ``locale``, ``id``, ``title`` and ``desc``.
    locale : str
        Value of the ``locale`` column to keep. It also selects the character
        whitelist; a locale without an entry (and 'JP') is not stripped at all.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``title`` and ``desc`` with a fresh 0..n-1 index.
        ``title`` and ``desc`` hold lists of lower-cased tokens; missing values
        yield empty lists.
    """
    products = pd.read_csv(os.path.join('.', csv_file)).query(f'locale == "{locale}"')
    strip_pattern = _LOCALE_STRIP_PATTERNS.get(locale, r"")

    return pd.DataFrame(data = {
        'id': list(products['id']),
        'title': _clean_and_tokenize(products['title'], strip_pattern),
        'desc': _clean_and_tokenize(products['desc'], strip_pattern),
    })

In [31]:
# Tokenised, lower-cased titles and descriptions of every product in the chosen locale
ps = get_products(
    csv_file='../data/raw-data/products_train.csv',
    locale=LOCALE,
)

In [32]:
#load Wikipedia2Vec model
# The vectors are read in word2vec text format (binary=False) from the current working directory.
# NOTE(review): the file name is built from LOCALE as-is (e.g. 'ITwiki_20180420_100d.txt');
# on a case-sensitive file system the file on disk must use exactly this casing — confirm.
wv_for_locale = KeyedVectors.load_word2vec_format(f'{LOCALE}wiki_20180420_100d.txt', binary=False)

In [33]:
def _mean_word_vector(tokens, keyed_vectors):
    """Average the vectors of the tokens known to ``keyed_vectors``.

    Returns None when no token has a vector (empty token list or only
    out-of-vocabulary tokens).
    """
    found = []
    for token in tokens:
        try:
            found.append(keyed_vectors.get_vector(token))
        except KeyError:
            # Out-of-vocabulary token: it simply does not contribute to the mean.
            continue
    if not found:
        return None
    return np.mean(found, axis=0)


# product id -> mean word vector of its title / description
title_vectors = {}
desc_vectors = {}

# ids of products for which no title / description token had a vector
missing_title_embedding_ids = []
missing_desc_embedding_ids = []

# every mean vector that could be computed; used later to impute the missing ones
available_title_values = []
available_desc_values = []

# Walk the three columns in lockstep instead of doing a Series lookup per element.
# to_numpy() keeps each id exactly as ps['id'][i] would return it.
for product_id, title_tokens, desc_tokens in zip(ps['id'].to_numpy(), ps['title'], ps['desc']):

    title_mean_for_item = _mean_word_vector(title_tokens, wv_for_locale)
    if title_mean_for_item is None:  # no title token has an embedding
        missing_title_embedding_ids.append(product_id)
    else:
        available_title_values.append(title_mean_for_item)
        title_vectors[product_id] = title_mean_for_item

    desc_mean_for_item = _mean_word_vector(desc_tokens, wv_for_locale)
    if desc_mean_for_item is None:  # no description token has an embedding
        missing_desc_embedding_ids.append(product_id)
    else:
        available_desc_values.append(desc_mean_for_item)
        desc_vectors[product_id] = desc_mean_for_item

In [34]:
def _fill_missing_with_mean(vectors_by_id, missing_ids, available_values, vector_size):
    """Give every id in ``missing_ids`` the mean of ``available_values``.

    The mean is computed once rather than once per missing id. When there are
    no available vectors at all, a zero vector of length ``vector_size`` is
    used instead of the NaN that averaging an empty list would produce.
    ``vectors_by_id`` is updated in place.
    """
    if not missing_ids:
        return
    if available_values:
        fallback = np.mean(available_values, axis=0)
    else:
        # float32 matches gensim's default datatype for loaded word2vec vectors.
        fallback = np.zeros(vector_size, dtype=np.float32)
    for missing_id in missing_ids:
        # Store an independent copy per id, as the per-id computation used to do.
        vectors_by_id[missing_id] = fallback.copy()


_fill_missing_with_mean(title_vectors, missing_title_embedding_ids,
                        available_title_values, wv_for_locale.vector_size)

_fill_missing_with_mean(desc_vectors, missing_desc_embedding_ids,
                        available_desc_values, wv_for_locale.vector_size)

In [35]:
# Persist the per-product title and description embeddings for this locale.
# Create the output directory first so a fresh checkout does not fail with
# FileNotFoundError after the embeddings have already been computed.
os.makedirs('../embeddings', exist_ok=True)

with open('../embeddings/' + LOCALE + '-title_vector_embedding.pkl', 'wb') as fp:
    pickle.dump(title_vectors, fp)

with open('../embeddings/' + LOCALE + '-desc_vector_embedding.pkl', 'wb') as fp:
    pickle.dump(desc_vectors, fp)

In [36]:
# Weight of the title embedding in the blended title/description embedding;
# the description embedding is weighted by (1 - alpha).
alpha = 0.65

In [37]:
# Read the stored title / description embeddings of this locale back from disk.
# pickle.load can execute arbitrary code: only open files written by this notebook.
with open(f'../embeddings/{LOCALE}-title_vector_embedding.pkl', 'rb') as title_file:
    title_vectors_for_locale = pickle.load(title_file)

with open(f'../embeddings/{LOCALE}-desc_vector_embedding.pkl', 'rb') as desc_file:
    desc_vectors_for_locale = pickle.load(desc_file)

In [38]:
# Blend both embeddings per product: alpha * title + (1 - alpha) * description.
weighted_title_desc_embeddings = {
    product_id: alpha * title_vector + (1 - alpha) * desc_vectors_for_locale[product_id]
    for product_id, title_vector in title_vectors_for_locale.items()
}

In [39]:
# Store the blended title/description embeddings of this locale.
with open(f'../embeddings/{LOCALE}-weighted_title_desc_embedding.pkl', 'wb') as weighted_file:
    pickle.dump(weighted_title_desc_embeddings, weighted_file)