# Initializing Notebook

In [120]:
import pandas as pd
import numpy as np
# Tokenize the data and add it to the data list

import os
import openai
import wandb

from openai.embeddings_utils import get_embedding
# Ensure you have your API key set in your environment per the README: https://github.com/openai/openai-python#usage


from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split

In [121]:
try:
    os.mkdir("embeddings")
except:
    None

In [122]:
product_detail_detail_path = "clean_data/cleaned_products_detailed.csv"
product_standard_path = "clean_data/cleaned_products.csv"

df_product_detail = pd.read_csv(product_detail_detail_path)
df_product_standard = pd.read_csv(product_standard_path)

In [123]:
np.random.seed(42)

In [124]:
df_detailed = df_product_detail[['ctr_product_num','attr_value_en_sentence']]
df_detailed = df_detailed.drop_duplicates()
df_detailed = df_detailed.dropna()
df_detailed_subset = df_detailed.sample(frac=1)[:100000]

In [125]:
df_detailed.shape

(235418, 2)

In [126]:
suffix = "_100k"

Only a subset of 100k samples is used due to hardware and computational limitations

In [127]:
df_detailed

Unnamed: 0,ctr_product_num,attr_value_en_sentence
1,5044,Travel poker chips
2,5045,40 piece poker chips
6,22726,General Tire GMAX UHP; Features: Wide circumfe...
16,31702,Top 3 Vehicle Applications: Toyota Tercel (199...
19,31703,Top 3 Vehicle Applications: Hyundai Accent (20...
...,...,...
1559079,8997335,Reliable performance through consistency and u...
1559082,8997336,Reliable performance through consistency and u...
1559085,8997337,Made from heavy duty 600 denier water-resistan...
1559091,8997338,Strong 600D polyester exteriorRemovable divide...


## Data preview

This data is from the product detailed dataset. The team has merged all the attributes of each product to become paragraph. Sample instance of the data will be displayed below.

In [128]:
df_product_standard[df_product_standard.ctr_product_num == 73603]

Unnamed: 0.1,Unnamed: 0,ctr_product_num,ctr_style_name,short_desc,long_desc,merch_division_nm,merch_lob_nm,merch_bus_cat_nm,merch_subcat_nm,merch_fineline_nm,...,ctr_product_profile_cd,ctr_consumer_role_cd,package_depth_qty,package_height_qty,package_width_qty,package_volume_qty,package_weight_qty,national_consumer_price_amt,cold_sensitive_ind,heat_sensitive_ind
232151,233862,73603,,P235/55R17 PIRP8FSUV,235/55R17 103V XL Pirelli P8FSUV.,AUTOMOTIVE,TIRES,ALL SEASON TIRES,All Season Passenger & CUV Tires,Pirelli P8,...,JOB_JOY,DESTINATION,27.2,9.3,27.2,3.95453,29.572,265.99,N,Y


In [129]:
list(df_detailed[df_detailed.ctr_product_num == 73603].attr_value_en_sentence)

['STABLE AND PRECISE HANDLING - The reinforced shoulder blocks and transversal grooves on central ribs provide a high stability. SAFE DRIVING - The high density tread sipes and internal shoulder blocks provide and excellent all-season grip. HIGHER MILEAGE - Compound materials developed with optimized polymer blend promote a long lasting experience']

Different product but product detail is identical.

In [130]:
df_product_standard[df_product_standard.ctr_product_num == 73600]

Unnamed: 0.1,Unnamed: 0,ctr_product_num,ctr_style_name,short_desc,long_desc,merch_division_nm,merch_lob_nm,merch_bus_cat_nm,merch_subcat_nm,merch_fineline_nm,...,ctr_product_profile_cd,ctr_consumer_role_cd,package_depth_qty,package_height_qty,package_width_qty,package_volume_qty,package_weight_qty,national_consumer_price_amt,cold_sensitive_ind,heat_sensitive_ind
210813,212375,73600,,P215/45R17 PIR P8FSP,215/45R17 91V XL Pirelli P8FSP.,AUTOMOTIVE,TIRES,ALL SEASON TIRES,All Season Passenger & CUV Tires,Pirelli P8,...,JOB_JOY,DESTINATION,24.6,8.5,24.6,2.968853,21.563,240.99,N,Y


In [131]:
list(df_detailed[df_detailed.ctr_product_num == 73600].attr_value_en_sentence)

['SAFE DRIVING - The silica enhanced compound provides a responsive handling and an excellent traction PRECISE STEERING - The stiffer tire carcass structure make for an efficient handling and steering ENHANCED BRAKING PERFORMANCE AND OUTSTANDING DRIVING COMFORT - The optimized pitch sequencing provides an exceptional braking performance and a quiet drive OUTSTANDING GRIP ON WET - Improved lateral stability due to central grooves']

In [132]:
df_product_standard[df_product_standard.ctr_product_num == 73601]

Unnamed: 0.1,Unnamed: 0,ctr_product_num,ctr_style_name,short_desc,long_desc,merch_division_nm,merch_lob_nm,merch_bus_cat_nm,merch_subcat_nm,merch_fineline_nm,...,ctr_product_profile_cd,ctr_consumer_role_cd,package_depth_qty,package_height_qty,package_width_qty,package_volume_qty,package_weight_qty,national_consumer_price_amt,cold_sensitive_ind,heat_sensitive_ind
714569,720508,73601,,P225/45R17 PIR P8FSP,225/45R17 94W XL Pirelli P8FSP.,AUTOMOTIVE,TIRES,ALL SEASON TIRES,All Season Passenger & CUV Tires,Pirelli P8,...,JOB_JOY,DESTINATION,25.0,8.9,25.0,3.196676,21.999,244.99,N,Y


In [133]:
list(df_detailed[df_detailed.ctr_product_num == 73601].attr_value_en_sentence)

['SAFE DRIVING - The silica enhanced compound provides a responsive handling and an excellent traction PRECISE STEERING - The stiffer tire carcass structure make for an efficient handling and steering ENHANCED BRAKING PERFORMANCE AND OUTSTANDING DRIVING COMFORT - The optimized pitch sequencing provides an exceptional braking performance and a quiet drive OUTSTANDING GRIP ON WET - Improved lateral stability due to central grooves']

Similar description types

In [134]:
df_product_standard[df_product_standard.ctr_product_num == 31703]

Unnamed: 0.1,Unnamed: 0,ctr_product_num,ctr_style_name,short_desc,long_desc,merch_division_nm,merch_lob_nm,merch_bus_cat_nm,merch_subcat_nm,merch_fineline_nm,...,ctr_product_profile_cd,ctr_consumer_role_cd,package_depth_qty,package_height_qty,package_width_qty,package_volume_qty,package_weight_qty,national_consumer_price_amt,cold_sensitive_ind,heat_sensitive_ind
517143,521166,31703,,*175/70R13 82T CWTRK,*175/70R13 82T Certified WinterTrek,AUTOMOTIVE,TIRES,WINTER TIRES,Winter Passenger & CUV Tires,Certified WinterTrek,...,JOB_JOY,DESTINATION,22.7,7.0,22.7,2.073948,15.829,109.99,N,N


In [135]:
list(df_detailed[df_detailed.ctr_product_num == 31703].attr_value_en_sentence)

['Top 3 Vehicle Applications: Hyundai Accent (2002), Hyundai Accent (2003), Hyundai Accent (2001) Studdable option']

In [136]:
df_product_standard[df_product_standard.ctr_product_num == 31702]

Unnamed: 0.1,Unnamed: 0,ctr_product_num,ctr_style_name,short_desc,long_desc,merch_division_nm,merch_lob_nm,merch_bus_cat_nm,merch_subcat_nm,merch_fineline_nm,...,ctr_product_profile_cd,ctr_consumer_role_cd,package_depth_qty,package_height_qty,package_width_qty,package_volume_qty,package_weight_qty,national_consumer_price_amt,cold_sensitive_ind,heat_sensitive_ind
315007,317492,31702,,*155/80R13 79T CWTRK,*155/80R13 79T Certified WinterTrek,AUTOMOTIVE,TIRES,WINTER TIRES,Winter Passenger & CUV Tires,Certified WinterTrek,...,JOB_JOY,DESTINATION,22.8,6.2,22.8,1.852281,13.536,104.99,N,N


In [137]:
list(df_detailed[df_detailed.ctr_product_num == 31702].attr_value_en_sentence)

['Top 3 Vehicle Applications: Toyota Tercel (1999), Hyundai Accent (2002), Toyota Tercel (1998) Studdable option']

now these data will be fed into the transformer based language model as a paragraph to generate embedding of the product based on the description.

# GPT 3

In [138]:
'''

def get_embedding(text, model="text-embedding-ada-002"):
   text = text.replace("\n", " ")
   return openai.Embedding.create(input = [text], model=model)['data'][0]['embedding']
 
# In order to use this we have to obtain a key.
openai.api_key = os.environ.get('OpenaiKey')

'''

'\n\ndef get_embedding(text, model="text-embedding-ada-002"):\n   text = text.replace("\n", " ")\n   return openai.Embedding.create(input = [text], model=model)[\'data\'][0][\'embedding\']\n \n# In order to use this we have to obtain a key.\nopenai.api_key = os.environ.get(\'OpenaiKey\')\n\n'

In [139]:
# Conclusion is in this way this doesn't work due to the fact that my account is free account. There might be a get away by controlling the rate of the embedding generation
# But that possibility might explore later if this NLP is even possible or not.
'''
df_detailed['ada_similarity'] = df_detailed.attr_value_en_sentence.apply(lambda x: get_embedding(x, model='text-embedding-ada-002'))
df_detailed['ada_search'] = df_detailed.attr_value_en_sentence.apply(lambda x: get_embedding(x, model='text-embedding-ada-002'))

'''

"\ndf_detailed['ada_similarity'] = df_detailed.attr_value_en_sentence.apply(lambda x: get_embedding(x, model='text-embedding-ada-002'))\ndf_detailed['ada_search'] = df_detailed.attr_value_en_sentence.apply(lambda x: get_embedding(x, model='text-embedding-ada-002'))\n\n"

A limitation of the GPT model is paid services is required for our data size. To avoid paying we can try to adjust the rate but We'll try other language models like bert.

# Advanced language models

Useful websites for reference:

https://www.topbots.com/leading-nlp-language-models-2020/ 

https://medium.com/@nils_reimers/openai-gpt-3-text-embeddings-really-a-new-state-of-the-art-in-dense-text-embeddings-6571fe3ec9d9

## Sentence-Transformers Model

### all-MiniLM-L6-v2

all-MiniLM-L6-v2. An extremely small (80 MB) and fast model, with only 6 layers which producing embeddings with 384 dimensions.

In [140]:
MiniLM_L6_v2_model = SentenceTransformer('all-MiniLM-L6-v2')

In [141]:
generated_embeddings = {}
for i, row in df_detailed_subset.iterrows():
    # Generate an embedding for the entire product sentence
    generated_embedding = MiniLM_L6_v2_model.encode(row["attr_value_en_sentence"])
    generated_embeddings[row['ctr_product_num']] = generated_embedding
embeddings_dict = pd.DataFrame.from_dict(generated_embeddings, orient='index')
embeddings_dict.index.names = ["ctr_product_num"]

embeddings_dict.to_csv("embeddings/minilm"+ suffix + ".csv")

In [None]:
#df_detailed_subset['all-MiniLM-L6-v2_embedding'] = df_detailed_subset.attr_value_en_sentence.apply(lambda x: MiniLM_L6_v2_model.encode(x))

In [None]:
#df_detailed_subset

### all-mpnet-base-v2

all-mpnet-base-v2: A bert-base sized model (418 MB) with 12 layers and 768 dimensions.

In [None]:
mpnet_base_v2_model = SentenceTransformer('all-mpnet-base-v2')

In [None]:
generated_embeddings = {}
for i, row in df_detailed_subset.iterrows():
    # Generate an embedding for the entire sentence
    generated_embedding = mpnet_base_v2_model.encode(row["attr_value_en_sentence"])
    generated_embeddings[row['ctr_product_num']] = generated_embedding
embeddings_dict = pd.DataFrame.from_dict(generated_embeddings, orient='index')
embeddings_dict.index.names = ["ctr_product_num"]

embeddings_dict.to_csv("embeddings/mpnet"+ suffix + ".csv")

In [None]:
#df_detailed_subset['all-mpnet-base-v2_embedding'] = df_detailed_subset.attr_value_en_sentence.apply(lambda x: mpnet_base_v2_model.encode(x))

In [None]:
#df_detailed_subset

### all-roberta-large-v1

all-roberta-large-v1: A model based on RoBERTA-large (1.3 GB) with 24 layers and 1024 dimensions.

In [None]:
roberta_large_v1_model = SentenceTransformer('sentence-transformers/all-roberta-large-v1')

In [None]:
generated_embeddings = {}
for i, row in df_detailed_subset.iterrows():
    # Generate an embedding for the entire sentence
    generated_embedding = roberta_large_v1_model.encode(row["attr_value_en_sentence"])
    generated_embeddings[row['ctr_product_num']] = generated_embedding
embeddings_dict = pd.DataFrame.from_dict(generated_embeddings, orient='index')
embeddings_dict.index.names = ["ctr_product_num"]

embeddings_dict.to_csv("embeddings/roberta"+ suffix + ".csv")

In [None]:
#df_detailed_subset['all-roberta-large-v1_embedding'] = df_detailed_subset.attr_value_en_sentence.apply(lambda x: roberta_large_v1_model.encode(x))

In [None]:

#df_detailed_subset

## Google embedding model

### Sentence-T5
Sentence-T5: The most recent text embedding model from Google published in August 2021.

https://arxiv.org/pdf/2108.08877.pdf

In [None]:
sentence_t5_base_model = SentenceTransformer('sentence-transformers/sentence-t5-base')

In [None]:
generated_embeddings = {}
for i, row in df_detailed_subset.iterrows():
    # Generate an embedding for the entire sentence
    generated_embedding = sentence_t5_base_model.encode(row["attr_value_en_sentence"])
    generated_embeddings[row['ctr_product_num']] = generated_embedding
embeddings_dict = pd.DataFrame.from_dict(generated_embeddings, orient='index')
embeddings_dict.index.names = ["ctr_product_num"]

embeddings_dict.to_csv("embeddings/t5"+ suffix + ".csv")

In [None]:
#df_detailed_subset['sentence-t5-base_embedding'] = df_detailed_subset.attr_value_en_sentence.apply(lambda x: sentence_t5_base_model.encode(x))

In [49]:
#df_detailed_subset

# Mask Language Model task

## Trial 1

In [7]:
import multiprocessing
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import transformers


from datasets import Dataset
from transformers import AutoModelForMaskedLM
from transformers import AutoTokenizer, AutoConfig
from transformers import BertForMaskedLM, DistilBertForMaskedLM
from transformers import BertTokenizer, DistilBertTokenizer
from transformers import RobertaTokenizer, RobertaForMaskedLM
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from tokenizers import BertWordPieceTokenizer


In [8]:
# HYPERPARAMS
SEED_SPLIT = 0
SEED_TRAIN = 0

MAX_SEQ_LEN = 128
TRAIN_BATCH_SIZE = 16
EVAL_BATCH_SIZE = 16
LEARNING_RATE = 2e-5 
LR_WARMUP_STEPS = 100
WEIGHT_DECAY = 0.01

In [9]:
# load data
dtf_mlm = df_detailed
#dtf_mlm = dtf_mlm.rename(columns={"review_content": "text"})

# Train/Valid Split
df_train, df_valid = train_test_split(
    dtf_mlm, test_size=0.15, random_state=SEED_SPLIT
)

len(df_train), len(df_valid)


# Convert to Dataset object
train_dataset = Dataset.from_pandas(df_train[['attr_value_en_sentence']].dropna())
valid_dataset = Dataset.from_pandas(df_valid[['attr_value_en_sentence']].dropna())

In [10]:

'''
bert-base-uncased  # 12-layer, 768-hidden, 12-heads, 109M parameters
distilbert-base-uncased  # 6-layer, 768-hidden, 12-heads, 65M parameters
'''

MODEL = 'bert'
bert_type = 'bert-base-cased'

if MODEL == 'distilbert':
    TokenizerClass = DistilBertTokenizer 
    ModelClass = DistilBertForMaskedLM 
elif MODEL == 'bert':
    TokenizerClass = BertTokenizer
    ModelClass = BertForMaskedLM 
elif MODEL == 'roberta':
    TokenizerClass = RobertaTokenizer
    ModelClass = RobertaForMaskedLM
elif MODEL == 'scibert':
    TokenizerClass = AutoTokenizer
    ModelClass = AutoModelForMaskedLM

tokenizer = TokenizerClass.from_pretrained(
            bert_type, use_fast=True, do_lower_case=False, max_len=MAX_SEQ_LEN
            )

model = ModelClass.from_pretrained(bert_type)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
def tokenize_function(row):
    return tokenizer(
        row['attr_value_en_sentence'],
        padding='max_length',
        truncation=True,
        max_length=MAX_SEQ_LEN,
        return_special_tokens_mask=True)
  
column_names = train_dataset.column_names

train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=multiprocessing.cpu_count(),
    remove_columns=column_names,
)

valid_dataset = valid_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=multiprocessing.cpu_count(),
    remove_columns=column_names,
)

NameError: name 'tokenizer' is not defined

In [22]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)


steps_per_epoch = int(len(train_dataset) / TRAIN_BATCH_SIZE)

training_args = TrainingArguments(
    output_dir='./bert-news',
    logging_dir='./LMlogs',             
    num_train_epochs=2,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    warmup_steps=LR_WARMUP_STEPS,
    save_steps=steps_per_epoch,
    save_total_limit=3,
    weight_decay=WEIGHT_DECAY,
    learning_rate=LEARNING_RATE, 
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='loss', 
    greater_is_better=False,
    seed=SEED_TRAIN
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
)

trainer.train()
trainer.save_model("embeddings/") #save your custom model

The following columns in the training set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: attr_value_en_sentence, __index_level_0__. If attr_value_en_sentence, __index_level_0__ are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 0
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 33476
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize

## Trial 2

Followed this tutorial:
https://towardsdatascience.com/masked-language-modelling-with-bert-7d49793e5d2c 

In [34]:
from transformers import BertTokenizer, BertForMaskedLM
import torch
from transformers import AdamW

from GPUtil import showUtilization as gpu_usage
from numba import cuda

In [22]:
!pip install GPUtil

Collecting GPUtil
  Downloading GPUtil-1.4.0.tar.gz (5.5 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Installing collected packages: GPUtil
  Running setup.py install for GPUtil: started
  Running setup.py install for GPUtil: finished with status 'done'
Successfully installed GPUtil-1.4.0


  DEPRECATION: GPUtil is being installed using the legacy 'setup.py install' method, because it does not have a 'pyproject.toml' and the 'wheel' package is not installed. pip 23.1 will enforce this behaviour change. A possible replacement is to enable the '--use-pep517' option. Discussion can be found at https://github.com/pypa/pip/issues/8559

[notice] A new release of pip available: 22.3.1 -> 23.0
[notice] To update, run: C:\Users\Jihoon.DESKTOP-1HIBMQO\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [35]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Tokenization — tokenization is simple, we’ve already initialized a BertTokenizer, all we do now is tokenize our input text.

In [36]:
# load data
dtf_mlm = df_detailed[:500]

# Train/Valid Split
df_train, df_valid = train_test_split(
    dtf_mlm, test_size=0.15, random_state=0
)

In [37]:
inputs = tokenizer(df_train['attr_value_en_sentence'].tolist(), return_tensors='pt', max_length=512, truncation=True, padding='max_length')

2. Create labels — The next step is easy, all we need to do here is clone our input_ids tensor into a new labels tensor. We’ll store this within the inputs variable too.

In [38]:
inputs['labels'] = inputs.input_ids.detach().clone()

In [39]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

3. Masking — Now we need to mask a random selection of tokens in our input_ids tensor.

In [40]:
# create random array of floats with equal dimensions to input_ids tensor
rand = torch.rand(inputs.input_ids.shape)
# create mask array
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * \
           (inputs.input_ids != 102) * (inputs.input_ids != 0)

In [41]:
mask_arr

tensor([[False, False, False,  ..., False, False, False],
        [False,  True, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        ...,
        [False,  True, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]])

And now we take take the indices of each True value, within each individual vector.

In [42]:
selection = []

for i in range(inputs.input_ids.shape[0]):
    selection.append(
        torch.flatten(mask_arr[i].nonzero()).tolist()
    )

4. Getting the data ready

In [43]:
class ProductDetailDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [44]:
dataset = ProductDetailDataset(inputs)

In [45]:
loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

In [46]:
gpu_usage()       


| ID | GPU | MEM |
------------------
|  0 |  5% |  9% |


In [50]:
'''
cuda.select_device(0)
cuda.close()
cuda.select_device(0)
'''

'\ncuda.select_device(0)\ncuda.close()\ncuda.select_device(0)\n'

In [51]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# and move our model over to the selected device
model.to(device)
# activate training mode
model.train()

RuntimeError: CUDA error: invalid argument
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

4. Calculate Loss — Our final step here no different from the typical model training process.

In [30]:

# initialize optimizer
optim = AdamW(model.parameters(), lr=5e-5)

#### Train

In [33]:
from tqdm import tqdm  # for our progress bar

epochs = 2

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  0%|          | 0/27 [00:00<?, ?it/s]


RuntimeError: CUDA error: invalid argument
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [None]:
torch.save(model, 'embeddings/preliminary_nlp_model')

Load the model

In [12]:
fine_tuned_model = torch.load('embeddings/preliminary_nlp_model.pt')

In [13]:
# Saving in a form that the huggingface provided.
fine_tuned_model.save_pretrained("embeddings/preliminary_nlp_model")

In [14]:
load_st_model = BertForMaskedLM.from_pretrained('embeddings/preliminary_nlp_model')

In [16]:
generated_embeddings = {}
for i, row in df_detailed_subset.iterrows():
    # Generate an embedding for the entire product sentence
    generated_embedding = fine_tuned_model.bert(**tokenizer(row["attr_value_en_sentence"],return_tensors="pt"))[0][:,0,:].squeeze(0)
    generated_embeddings[row['ctr_product_num']] = generated_embedding
embeddings_dict = pd.DataFrame.from_dict(generated_embeddings, orient='index')
embeddings_dict.index.names = ["ctr_product_num"]

embeddings_dict.to_csv("embeddings/custom_nlp_1k.csv")

In [14]:
df_detailed_subset['iteration_1'] = df_detailed_subset.attr_value_en_sentence.apply(lambda x: fine_tuned_model.bert(**tokenizer(x,return_tensors="pt"))[0][:,0,:].squeeze(0))

KeyboardInterrupt: 

In [None]:
df_detailed_subset

#### Train using wandbd

In [None]:
'''
from transformers import TrainingArguments
from transformers import Trainer

args = TrainingArguments(
    output_dir='out',
    per_device_train_batch_size=16,
    num_train_epochs=2
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset
)

#os.environ["WANDB_MODE"] = "online"
os.environ["WANDB_DISABLED"] = "true"

trainer.train()

'''
