# BERT embeddings generation

This notebook documents the process of generating word embeddings from BERT. Code is written with the assitance of ChatGPT.

In [2]:
def augment_descriptions(data):
    augmented_descriptions = []
    
    for item in data:
        name = item.get('name', '').strip()
        color = item.get('color', '').strip()
        brand = item.get('brand', '').strip()
        description = item.get('description', '').strip()

        if name and color:
            new_description = f"{brand.capitalize()} {color.lower()} {name.lower()}."
            if description:
                new_description += f" {description}"
                item['description'] = new_description
                augmented_descriptions.append(new_description)

    return augmented_descriptions

def extract_depop_descriptions(data):
    descriptions = []

    for item in data:
        if 'structuredData' in item and 'description' in item['structuredData']:
            descriptions.append(item['structuredData']['description'])

    return descriptions

In [3]:
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast, BertModel


In [4]:
import json

with open('zarascraper/zara-data/unique-products.json', 'r') as f:
    data = json.load(f)

with open('depopscraper/output/depop-detailed-products.json', 'r') as f:
    depop_data_1 = json.load(f)

with open('depopscraper/output/depop-detailed-tops.json', 'r') as f:
    depop_data_2 = json.load(f)

with open('depopscraper/output/depop-under100.json', 'r') as f:
    depop_data_3 = json.load(f)


docs = augment_descriptions(data) + extract_depop_descriptions(depop_data_1) + extract_depop_descriptions(depop_data_2) + extract_depop_descriptions(depop_data_3)

model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [14]:
def preprocess_docs(docs):
    return [doc.replace("#", '').replace("\n", ' ') for doc in docs]

docs = remove_tags(docs)

docs[300]

'Red longsleeve north face fleece large\nPolyester'

In [6]:
with torch.no_grad():
    outputs = model(**tokenized_docs)
    token_embeddings = outputs.last_hidden_state

token_embeddings

tensor([[[-0.4350, -1.0114,  0.1051,  ...,  0.4537, -0.0800, -0.5084],
         [-1.0477, -1.0375,  0.4049,  ...,  0.2341,  0.3324, -0.0244],
         [-0.3705, -0.7926,  0.3314,  ...,  0.3742, -0.4639, -0.9471],
         ...,
         [-0.6867, -0.9967,  0.0539,  ...,  0.2823, -0.0484, -0.5245],
         [ 0.1110, -0.5638,  0.0754,  ...,  0.1041, -0.1005, -0.3362],
         [-0.2298, -0.6035,  0.3043,  ...,  0.2206, -0.1228, -0.1904]],

        [[-0.3496, -0.8891,  0.0036,  ...,  0.3803,  0.0040, -0.5172],
         [-1.1269, -1.0006,  0.4286,  ...,  0.1546,  0.2656,  0.1796],
         [-0.2593, -0.3361, -0.0223,  ...,  0.2039, -0.3239, -0.8709],
         ...,
         [-0.6706, -0.9002, -0.0285,  ...,  0.3062, -0.0299, -0.5062],
         [ 0.1470, -0.5199, -0.0224,  ...,  0.1190, -0.0333, -0.3480],
         [-0.1991, -0.5363,  0.1850,  ...,  0.1848, -0.0919, -0.1916]],

        [[-0.3552, -0.9601,  0.1010,  ...,  0.3299, -0.1149, -0.4594],
         [-1.0163, -1.1146,  0.3122,  ...,  0

In [27]:
attention_mask = tokenized_docs["attention_mask"]

filtered_embeddings = []
max_tokens = 0

for i in range(len(docs)):
    valid_tokens = attention_mask[i] == 1
    valid_embeddings = token_embeddings[i][valid_tokens]  # [num_valid_tokens, 768]
    
    # Convert each embedding to a string
    embeddings_str = [str(embedding.tolist()) for embedding in valid_embeddings]
    filtered_embeddings.append(embeddings_str)
    max_tokens = max(max_tokens, len(embeddings_str))

In [28]:
import csv

csv_filename = "bert_wordlevel_embeds.csv"
with open(csv_filename, mode="w", newline="") as file:
    writer = csv.writer(file)

    header = [f"token_{i}" for i in range(max_tokens)]
    writer.writerow(header)

    for doc_tokens in filtered_embeddings:
        padded = doc_tokens + [""] * (max_tokens - len(doc_tokens))
        writer.writerow(padded)

In [None]:
from sklearn.decomposition import PCA

all_valid_embeddings = []
embedding_index = []
attention_mask = tokenized_docs["attention_mask"]

for doc_idx in range(tokenized_docs["input_ids"].size(0)):
    valid_mask = attention_mask[doc_idx] == 1
    valid_embs = token_embeddings[doc_idx][valid_mask]
    all_valid_embeddings.append(valid_embs)
    for token_idx in range(valid_embs.size(0)):
        embedding_index.append((doc_idx, token_idx))

flat_embeddings = torch.cat(all_valid_embeddings, dim=0).cpu().numpy()

pca = PCA(n_components=64)
reduced_embeddings = pca.fit_transform(flat_embeddings)

doc_reduced_embeddings = [[] for _ in docs]
for (doc_idx, _), emb in zip(embedding_index, reduced_embeddings):
    doc_reduced_embeddings[doc_idx].append(str(emb.tolist()))

['[-0.9181731343269348, -0.7194636464118958, 2.791292667388916, -8.242653846740723, -3.1209616661071777, -6.27514123916626, 0.603439211845398, 2.718503952026367, 1.0329097509384155, 1.1649236679077148, -0.30948999524116516, -0.5575347542762756, -1.1287506818771362, 0.0022011403925716877, -0.16550517082214355, 1.137358546257019, -0.10285671800374985, -0.8078657984733582, -0.28376471996307373, 0.4951093792915344, 0.38195788860321045, 0.08170489966869354, 0.47718435525894165, 1.0315865278244019, -0.04679812118411064, 0.1729489266872406, -0.005005544517189264, 0.11138129979372025, -0.2561616003513336, 0.2980010211467743, 0.8712463974952698, -0.13740274310112, 0.4002978801727295, 0.504547655582428, -0.11889498680830002, 0.1613977551460266, -0.04462823644280434, -0.2036004513502121, -0.2827610671520233, -0.2645743489265442, 0.032265886664390564, 0.07547857612371445, 1.520215392112732, -0.20878705382347107, 0.4603743255138397, 0.2536281943321228, 0.04591186344623566, -0.5963879227638245, 0.49

In [21]:
import csv

csv_filename = "bert_tokenlevel_embeds_reduced.csv"
with open(csv_filename, mode="w", newline="") as file:
    writer = csv.writer(file)
    
    header = [f"token_{i}" for i in range(84)]
    writer.writerow(header)

    for doc_tokens in doc_reduced_embeddings:
        padded = doc_tokens + [""] * (84 - len(doc_tokens))
        writer.writerow(padded)

In [23]:
input_file = "bert/bert_tokenlevel_embeds.csv"

first_split = "bert/bert_tokenlevel_embeds_zara.csv"
second_split = "bert/bert_tokenlevel_embeds_depop.csv"

split_index = 168

with open(input_file, mode="r", newline="") as infile:
    reader = list(csv.reader(infile))
    header = reader[0]
    rows = reader[1:]

    part1_rows = rows[:split_index]
    part2_rows = rows[split_index:]

with open(first_split, mode="w", newline="") as out1:
    writer = csv.writer(out1)
    writer.writerow(header)
    writer.writerows(part1_rows)

with open(second_split, mode="w", newline="") as out2:
    writer = csv.writer(out2)
    writer.writerow(header)
    writer.writerows(part2_rows)

In [10]:
import csv

with open('descs.csv', 'w') as f:
    writer = csv.writer(f)

    for desc in [doc.replace('\n', ' ') for doc in docs]:
        writer.writerow([desc])