In [1]:
import gensim
from gensim.models.word2vec import Word2Vec
import pandas as pd
import numpy as np
# Tokenize the data and add it to the data list

In [2]:
# Load the cleaned product details and keep only the product number and its
# English attribute text, discarding rows with missing values and exact
# duplicate (product number, text) pairs.
product_detail_path = "clean_data/cleaned_products_detailed.csv"
df_product_detailed = pd.read_csv(product_detail_path)
df_detailed = (
    df_product_detailed[['ctr_product_num', 'attr_value_en_sentence']]
    .dropna()
    .drop_duplicates()
)
# frac=1 shuffles every row. A fixed random_state keeps the row order (and
# therefore the previews and any "last row wins" behaviour downstream)
# identical across Restart & Run All.
df_detailed_subset = df_detailed.sample(frac=1, random_state=42)

In [6]:
# (rows, columns) of the shuffled frame after dropping NaNs and duplicate rows
df_detailed_subset.shape

(235418, 2)

# Data Preview

The input data is a list of tokens (words) for each `ctr_product_num`. Word2Vec learns a vector representation for every word; to obtain a single document-level vector per product (a simple stand-in for doc2vec), we average the vectors of all words in that product's description.

In [3]:
# Tokenize every product description on whitespace; the result is a list of
# token lists, one per row of df_detailed_subset and in the same order.
# Iterating the column directly avoids building a Series per row, which is
# what iterrows() does and is slow on a frame of this size.
data = [
    sentence.split()
    for sentence in df_detailed_subset['attr_value_en_sentence']
]

print(data[0])
df_detailed_subset.head()

['4.1', 'cm', 'maximum', 'detection', 'depth', 'Center', 'and', 'Edge', 'Finder', 'LEDs', 'track', 'the', 'center', 'and', 'edges', 'of', 'studs', 'simultaneously', 'Finds', 'wood', 'and', 'metal', 'studs', 'Deep-scan', 'mode', 'is', 'always', 'on', 'ACCURATE,', 'The', 'multiple', 'LEDs', 'track', 'the', 'location', 'of', 'studs', 'to', 'accurately', 'and', 'reliably', 'identify', 'the', 'center', 'and', 'edges', 'of', 'studs', 'simultaneously', 'EASY,', 'Just', 'press', 'and', 'hold', 'te', 'botton', 'to', 'instantly', 'detect', 'studs.', 'No', 'calibraiton', 'required.', 'HOW', 'IT', 'WORKS,', 'The', 'ProSensor', 'T11', 'has', '11', 'sensors', 'and', 'has', 'Multi-Sense', 'Technology.', 'In', 'comparison,', 'conventional', 'stud', 'finders', 'have', '1', 'or', '2', 'sensors.', 'With', 'more', 'sensors,', 'the', 'T11', 'more', 'accurately', 'identifies', 'the', 'location', 'of', 'studs']


Unnamed: 0,ctr_product_num,attr_value_en_sentence
210785,574627,4.1 cm maximum detection depth\nCenter and Edg...
1215023,8423863,Turn into an old geezer this Halloween with a ...
1193812,8421970,Fill the room with shine and color for a loved...
328004,766404,Perfect heated solution for all your camp/outd...
136326,482267,Peelable rubber coating that enables endless c...


In [4]:
# Fit Word2Vec on the tokenized descriptions. min_count=1 keeps every token
# in the vocabulary, including words that occur only once.
model = Word2Vec(sentences=data, min_count=1)

In [5]:
# Use the model to generate one embedding per product.
# Maps ctr_product_num -> 1-D vector of length model.wv.vector_size.
product_embeddings = {}
for product_num, sentence in zip(
    df_detailed_subset['ctr_product_num'],
    df_detailed_subset['attr_value_en_sentence'],
):
    tokens = sentence.split()
    if tokens:
        # The product embedding is the element-wise mean (not a
        # concatenation) of the vectors of all words in the description.
        product_embedding = np.mean([model.wv[word] for word in tokens], axis=0)
    else:
        # A whitespace-only description has no tokens; np.mean([]) would
        # return a scalar NaN and break the fixed-width embedding table,
        # so fall back to a zero vector of the right size.
        product_embedding = np.zeros(model.wv.vector_size, dtype=np.float32)
    # NOTE(review): if a ctr_product_num occurs in several rows, the last row
    # silently overwrites the earlier ones - confirm this is intended.
    product_embeddings[product_num] = product_embedding

# One row per product, one column per embedding dimension.
embeddings_dict = pd.DataFrame.from_dict(product_embeddings, orient='index')
embeddings_dict.index.names = ["ctr_product_num"]

embeddings_dict.to_csv("embeddings/prod2vec_all.csv")

In [None]:
import plotly.express as px
from sklearn.manifold import TSNE

# Stack every product vector into a single array (one row per product)
product_embeddings_list = np.array([*product_embeddings.values()])

# Project the vectors down to three dimensions with t-SNE
tsne = TSNE(n_components=3)
product_embeddings_3d = tsne.fit_transform(product_embeddings_list)

# Dictionary keys (product numbers), in the same order as the rows above
product_names = list(product_embeddings)

# One row per product: the three t-SNE coordinates plus the product key
df = pd.DataFrame(product_embeddings_3d, columns=['x', 'y', 'z'])
df['product'] = product_names

# Interactive 3-D scatter plot; each point is labelled with its product key
fig = px.scatter_3d(df, x='x', y='y', z='z', text='product')
fig.show()


In [None]:
#! pip install tensorflow-datasets

In [None]:
# Forward slashes resolve on every OS (including Windows) and avoid the
# invalid "\p" escape sequence that the backslash form contained.
path_standard = "Data/product/product_standard_attributes.csv"

# Load into a DataFrame. low_memory=False parses the file in one pass so
# column dtypes are inferred consistently instead of chunk by chunk.
df_product_standard = pd.read_csv(path_standard, low_memory=False)
# .copy() makes df_right an independent frame, so assigning to its columns
# later does not raise SettingWithCopyWarning.
df_right = df_product_standard[['ctr_product_num', 'short_desc']].copy()


In [None]:
# Display the selected columns (ctr_product_num and short_desc)
df_right

In [None]:
# Convert ctr_product_num to integers: values that cannot be parsed become
# NaN (errors='coerce') and those rows are dropped before the integer cast.
# Building a new frame with assign() instead of writing into df_right's
# columns avoids SettingWithCopyWarning and keeps the cell safe to re-run.
df_right = (
    df_right
    .assign(
        ctr_product_num=lambda frame: pd.to_numeric(
            frame['ctr_product_num'], errors='coerce'
        )
    )
    .dropna(subset=['ctr_product_num'])
    .astype({'ctr_product_num': 'int'})
)
df_right

In [None]:
# Attach short_desc to every description row; a left join keeps all rows of
# df_detailed_subset even when no matching product number exists in df_right.
df_desc = pd.merge(
    df_detailed_subset,
    df_right,
    how='left',
    on='ctr_product_num',
)
df_desc


In [None]:
# Recompute the embedding of every row in df_desc and store it in the
# existing product_embeddings dict (keyed by ctr_product_num).
for product_num, sentence in zip(
    df_desc['ctr_product_num'],
    df_desc['attr_value_en_sentence'],
):
    tokens = sentence.split()
    if tokens:
        # The product embedding is the element-wise mean (not a
        # concatenation) of the vectors of all words in the description.
        product_embedding = np.mean([model.wv[word] for word in tokens], axis=0)
    else:
        # A whitespace-only description has no tokens; np.mean([]) would
        # return a scalar NaN, so fall back to a zero vector instead.
        product_embedding = np.zeros(model.wv.vector_size, dtype=np.float32)
    product_embeddings[product_num] = product_embedding

In [None]:
# Save labels separately, one per line.
# The labels are written in the iteration order of product_embeddings, one
# line per key, so that line i of this file describes vector i when the
# vectors are exported from product_embeddings.values(). Writing one line
# per df_desc row would misalign the two files whenever a product number
# occurs in more than one row.
label_by_product = dict(zip(df_desc['ctr_product_num'], df_desc['short_desc']))

# UTF-8 is set explicitly so non-ASCII descriptions do not depend on the
# platform's default encoding.
with open('visualization/metadata.tsv', "w", encoding="utf-8") as f:
    for product_num in product_embeddings:
        label = str(label_by_product.get(product_num, ""))
        # Embedded line breaks or tabs would split one label across several
        # lines/columns and shift every following label, so flatten them.
        label = " ".join(label.splitlines()).replace("\t", " ")
        f.write("{}\n".format(label))

In [None]:
import tensorflow as tfflow
from sklearn.manifold import TSNE
import os
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorboard.plugins import projector
# Gather the per-product vectors and pack them into a float32 array
product_embeddings_list = [*product_embeddings.values()]
product_embeddings_array = np.asarray(product_embeddings_list, dtype=np.float32)

# Reduce the vectors to three dimensions with t-SNE
tsne = TSNE(perplexity=30, n_components=3)
product_embeddings_3d = tsne.fit_transform(product_embeddings_array)

# Write the reduced vectors, one tab-separated "x y z" line per product
with open('visualization/product_embeddings_3d.tsv', 'w') as f:
    f.writelines(f"{x}\t{y}\t{z}\n" for x, y, z in product_embeddings_3d)
