# Initializing Notebook

In [None]:
import pandas as pd
import numpy as np
# Tokenize the data and add it to the data list

import os
import openai
import wandb

from openai.embeddings_utils import get_embedding
# Ensure you have your API key set in your environment per the README: https://github.com/openai/openai-python#usage


from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split

In [None]:
# Create the output directory for the embedding files. `exist_ok=True` keeps
# the cell safe to re-run, while real failures (e.g. permission errors) are
# no longer silently swallowed by a bare `except`.
os.makedirs("embeddings", exist_ok=True)

In [None]:
# Locations of the cleaned product tables, relative to the working directory.
product_standard_path = "clean_data/cleaned_products.csv"
product_detail_detail_path = "clean_data/cleaned_products_detailed.csv"

# Read both CSV files into DataFrames (detailed table first, standard second).
df_product_detail, df_product_standard = (
    pd.read_csv(csv_path)
    for csv_path in (product_detail_detail_path, product_standard_path)
)

In [None]:
np.random.seed(42)

In [None]:
# Keep only the product id and its merged attribute paragraph, then drop exact
# duplicate rows and rows with a missing value in either column.
df_detailed = (
    df_product_detail[['ctr_product_num', 'attr_value_en_sentence']]
    .drop_duplicates()
    .dropna()
)

# Shuffle the rows and keep at most 100,000 of them.
# `random_state` is pinned so that re-running this cell always yields the same
# subset, instead of depending on how much of the global NumPy RNG stream has
# been consumed since it was seeded. `.copy()` makes the subset an independent
# frame, so adding columns to it later does not trigger SettingWithCopyWarning.
df_detailed_subset = (
    df_detailed
    .sample(frac=1, random_state=42)
    .head(100000)
    .copy()
)

In [None]:
df_detailed.shape

In [None]:
suffix = "_100k"

Only a random subset of 100k samples is used, due to hardware and computational limitations.

In [None]:
df_detailed

## Data preview

This data comes from the detailed product dataset. The team has merged all the attributes of each product into a single paragraph. A sample instance of the data is displayed below.

In [None]:
df_product_standard[df_product_standard.ctr_product_num == 73603]

In [None]:
list(df_detailed[df_detailed.ctr_product_num == 73603].attr_value_en_sentence)

Different product but product detail is identical.

In [None]:
df_product_standard[df_product_standard.ctr_product_num == 73600]

In [None]:
#list(df_detailed[df_detailed.ctr_product_num == 73600].attr_value_en_sentence)

In [None]:
#df_product_standard[df_product_standard.ctr_product_num == 73601]

In [None]:
#list(df_detailed[df_detailed.ctr_product_num == 73601].attr_value_en_sentence)

Similar description types

In [None]:
#df_product_standard[df_product_standard.ctr_product_num == 31703]

In [None]:
#list(df_detailed[df_detailed.ctr_product_num == 31703].attr_value_en_sentence)

In [None]:
#df_product_standard[df_product_standard.ctr_product_num == 31702]

In [None]:
#list(df_detailed[df_detailed.ctr_product_num == 31702].attr_value_en_sentence)

These paragraphs will now be fed into transformer-based language models to generate an embedding for each product based on its description.

# GPT 3

In [None]:
'''

def get_embedding(text, model="text-embedding-ada-002"):
   text = text.replace("\n", " ")
   return openai.Embedding.create(input = [text], model=model)['data'][0]['embedding']
 
# In order to use this we have to obtain a key.
openai.api_key = os.environ.get('OpenaiKey')

'''

In [None]:
# Conclusion: this approach does not work because my account is a free account. There might be a workaround by throttling the rate of embedding generation,
# but that possibility will be explored later, once it is clear whether this NLP approach is feasible at all.
'''
df_detailed['ada_similarity'] = df_detailed.attr_value_en_sentence.apply(lambda x: get_embedding(x, model='text-embedding-ada-002'))
df_detailed['ada_search'] = df_detailed.attr_value_en_sentence.apply(lambda x: get_embedding(x, model='text-embedding-ada-002'))

'''

A limitation of the GPT model is that a paid service is required for our data size. To avoid paying we could try to throttle the request rate, but instead we'll try other language models such as BERT.

# Advanced language models

Useful websites for reference:

https://www.topbots.com/leading-nlp-language-models-2020/ 

https://medium.com/@nils_reimers/openai-gpt-3-text-embeddings-really-a-new-state-of-the-art-in-dense-text-embeddings-6571fe3ec9d9

## Sentence-Transformers Model

### all-MiniLM-L6-v2

all-MiniLM-L6-v2: An extremely small (80 MB) and fast model with only 6 layers, which produces embeddings with 384 dimensions.

In [None]:
MiniLM_L6_v2_model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Encode all product paragraphs with a single batched `encode()` call instead
# of one call per row; the model then works through the sentences in
# mini-batches, which is much faster than row-by-row encoding.
product_ids = df_detailed_subset["ctr_product_num"].tolist()
sentences = df_detailed_subset["attr_value_en_sentence"].tolist()
sentence_embeddings = MiniLM_L6_v2_model.encode(sentences, show_progress_bar=True)

# Map product id -> embedding vector. If a product id occurs more than once,
# the embedding of its last row wins (same as filling the dict row by row).
generated_embeddings = dict(zip(product_ids, sentence_embeddings))

# One row per product id, one column per embedding dimension.
embeddings_dict = pd.DataFrame.from_dict(generated_embeddings, orient='index')
embeddings_dict.index.names = ["ctr_product_num"]

embeddings_dict.to_csv("embeddings/minilm"+ suffix + ".csv")

In [None]:
#df_detailed_subset['all-MiniLM-L6-v2_embedding'] = df_detailed_subset.attr_value_en_sentence.apply(lambda x: MiniLM_L6_v2_model.encode(x))

In [None]:
#df_detailed_subset

### all-mpnet-base-v2

all-mpnet-base-v2: A bert-base sized model (418 MB) with 12 layers and 768 dimensions.

In [None]:
mpnet_base_v2_model = SentenceTransformer('all-mpnet-base-v2')

In [None]:
# Encode all product paragraphs with a single batched `encode()` call instead
# of one call per row; the model then works through the sentences in
# mini-batches, which is much faster than row-by-row encoding.
product_ids = df_detailed_subset["ctr_product_num"].tolist()
sentences = df_detailed_subset["attr_value_en_sentence"].tolist()
sentence_embeddings = mpnet_base_v2_model.encode(sentences, show_progress_bar=True)

# Map product id -> embedding vector. If a product id occurs more than once,
# the embedding of its last row wins (same as filling the dict row by row).
generated_embeddings = dict(zip(product_ids, sentence_embeddings))

# One row per product id, one column per embedding dimension.
embeddings_dict = pd.DataFrame.from_dict(generated_embeddings, orient='index')
embeddings_dict.index.names = ["ctr_product_num"]

embeddings_dict.to_csv("embeddings/mpnet"+ suffix + ".csv")

In [None]:
#df_detailed_subset['all-mpnet-base-v2_embedding'] = df_detailed_subset.attr_value_en_sentence.apply(lambda x: mpnet_base_v2_model.encode(x))

In [None]:
#df_detailed_subset

### all-roberta-large-v1

all-roberta-large-v1: A model based on RoBERTa-large (1.3 GB) with 24 layers and 1024 dimensions.

In [None]:
roberta_large_v1_model = SentenceTransformer('sentence-transformers/all-roberta-large-v1')

In [None]:
# Encode all product paragraphs with a single batched `encode()` call instead
# of one call per row; the model then works through the sentences in
# mini-batches, which is much faster than row-by-row encoding.
product_ids = df_detailed_subset["ctr_product_num"].tolist()
sentences = df_detailed_subset["attr_value_en_sentence"].tolist()
sentence_embeddings = roberta_large_v1_model.encode(sentences, show_progress_bar=True)

# Map product id -> embedding vector. If a product id occurs more than once,
# the embedding of its last row wins (same as filling the dict row by row).
generated_embeddings = dict(zip(product_ids, sentence_embeddings))

# One row per product id, one column per embedding dimension.
embeddings_dict = pd.DataFrame.from_dict(generated_embeddings, orient='index')
embeddings_dict.index.names = ["ctr_product_num"]

embeddings_dict.to_csv("embeddings/roberta"+ suffix + ".csv")

In [None]:
#df_detailed_subset['all-roberta-large-v1_embedding'] = df_detailed_subset.attr_value_en_sentence.apply(lambda x: roberta_large_v1_model.encode(x))

In [None]:

#df_detailed_subset

## Google embedding model

### Sentence-T5
Sentence-T5: The most recent text embedding model from Google published in August 2021.

https://arxiv.org/pdf/2108.08877.pdf

In [None]:
sentence_t5_base_model = SentenceTransformer('sentence-transformers/sentence-t5-base')

In [None]:
# Encode all product paragraphs with a single batched `encode()` call instead
# of one call per row; the model then works through the sentences in
# mini-batches, which is much faster than row-by-row encoding.
product_ids = df_detailed_subset["ctr_product_num"].tolist()
sentences = df_detailed_subset["attr_value_en_sentence"].tolist()
sentence_embeddings = sentence_t5_base_model.encode(sentences, show_progress_bar=True)

# Map product id -> embedding vector. If a product id occurs more than once,
# the embedding of its last row wins (same as filling the dict row by row).
generated_embeddings = dict(zip(product_ids, sentence_embeddings))

# One row per product id, one column per embedding dimension.
embeddings_dict = pd.DataFrame.from_dict(generated_embeddings, orient='index')
embeddings_dict.index.names = ["ctr_product_num"]

embeddings_dict.to_csv("embeddings/t5"+ suffix + ".csv")

In [None]:
#df_detailed_subset['sentence-t5-base_embedding'] = df_detailed_subset.attr_value_en_sentence.apply(lambda x: sentence_t5_base_model.encode(x))

In [None]:
#df_detailed_subset

# Mask Language Model task

## Trial 1

In [None]:
import multiprocessing
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import transformers


from datasets import Dataset
from transformers import AutoModelForMaskedLM
from transformers import AutoTokenizer, AutoConfig
from transformers import BertForMaskedLM, DistilBertForMaskedLM
from transformers import BertTokenizer, DistilBertTokenizer
from transformers import RobertaTokenizer, RobertaForMaskedLM
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from tokenizers import BertWordPieceTokenizer


In [None]:
# HYPERPARAMS
# Constants for the Trial 1 masked-language-model fine-tuning run.
SEED_SPLIT = 0  # random_state of the train/validation split
SEED_TRAIN = 0  # seed handed to TrainingArguments

MAX_SEQ_LEN = 128  # tokenizer max_length: sequences are padded/truncated to this many tokens
TRAIN_BATCH_SIZE = 16  # per-device batch size during training
EVAL_BATCH_SIZE = 16  # per-device batch size during evaluation
LEARNING_RATE = 2e-5  # learning_rate passed to TrainingArguments
LR_WARMUP_STEPS = 100  # warmup_steps passed to TrainingArguments
WEIGHT_DECAY = 0.01  # weight_decay passed to TrainingArguments

In [None]:
# Load data: Trial 1 fine-tunes on the full de-duplicated product table.
dtf_mlm = df_detailed

# Train/validation split: 15% of the rows are held out, with a fixed seed.
df_train, df_valid = train_test_split(
    dtf_mlm, test_size=0.15, random_state=SEED_SPLIT
)

# Report the split sizes. A bare `len(...), len(...)` expression in the middle
# of a cell is evaluated and thrown away, so print them explicitly.
print(f"train rows: {len(df_train)}, validation rows: {len(df_valid)}")

# Convert to Hugging Face Dataset objects that hold only the text column.
# preserve_index=False stops the shuffled pandas index from being stored as an
# extra `__index_level_0__` column in the datasets.
train_dataset = Dataset.from_pandas(
    df_train[['attr_value_en_sentence']].dropna(), preserve_index=False
)
valid_dataset = Dataset.from_pandas(
    df_valid[['attr_value_en_sentence']].dropna(), preserve_index=False
)

In [None]:

# Supported architectures: name -> (tokenizer class, masked-LM model class).
# Checkpoint sizes for reference:
#   bert-base-uncased        12-layer, 768-hidden, 12-heads, 109M parameters
#   distilbert-base-uncased   6-layer, 768-hidden, 12-heads,  65M parameters
MODEL_CLASSES = {
    'distilbert': (DistilBertTokenizer, DistilBertForMaskedLM),
    'bert': (BertTokenizer, BertForMaskedLM),
    'roberta': (RobertaTokenizer, RobertaForMaskedLM),
    'scibert': (AutoTokenizer, AutoModelForMaskedLM),
}

MODEL = 'bert'
bert_type = 'bert-base-cased'

# Fail immediately with a clear message on an unsupported architecture name,
# rather than leaving the classes undefined and hitting a NameError below.
if MODEL not in MODEL_CLASSES:
    raise ValueError(
        f"Unsupported MODEL {MODEL!r}; expected one of {sorted(MODEL_CLASSES)}"
    )
TokenizerClass, ModelClass = MODEL_CLASSES[MODEL]

# `model_max_length` is the documented name of the legacy `max_len` argument.
tokenizer = TokenizerClass.from_pretrained(
    bert_type, use_fast=True, do_lower_case=False, model_max_length=MAX_SEQ_LEN
)

# Pretrained masked-LM weights for the same checkpoint as the tokenizer.
model = ModelClass.from_pretrained(bert_type)

In [None]:
def tokenize_function(row):
    """Tokenize the product-description text of a batch of dataset rows.

    Reads the ``attr_value_en_sentence`` field of ``row`` and passes it to the
    notebook-level ``tokenizer``. Every sequence is padded and truncated to
    ``MAX_SEQ_LEN`` tokens, and the special-tokens mask is requested as well.

    Returns:
        The tokenizer's output for the given texts.
    """
    tokenizer_options = dict(
        padding='max_length',
        truncation=True,
        max_length=MAX_SEQ_LEN,
        return_special_tokens_mask=True,
    )
    texts = row['attr_value_en_sentence']
    return tokenizer(texts, **tokenizer_options)
  
# Columns present before tokenization; they are removed from the mapped output.
column_names = train_dataset.column_names

# Both splits are tokenized with identical settings: batched calls to
# `tokenize_function`, spread over one worker process per CPU core.
map_options = dict(
    batched=True,
    num_proc=multiprocessing.cpu_count(),
    remove_columns=column_names,
)

train_dataset = train_dataset.map(tokenize_function, **map_options)
valid_dataset = valid_dataset.map(tokenize_function, **map_options)

In [None]:
# Collator that randomly masks 15% of the tokens of every batch for the
# masked-language-model objective.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

# Checkpoints are saved and the model is evaluated once per epoch. The
# checkpoint with the lowest evaluation loss is reloaded when training ends.
# `save_steps` is intentionally not set: it only applies to a step-based save
# strategy and is ignored when save_strategy='epoch'.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./bert-news',
    logging_dir='./LMlogs',
    num_train_epochs=2,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    warmup_steps=LR_WARMUP_STEPS,
    save_total_limit=3,
    weight_decay=WEIGHT_DECAY,
    learning_rate=LEARNING_RATE,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='loss',
    greater_is_better=False,
    seed=SEED_TRAIN
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
)

trainer.train()
trainer.save_model("embeddings/") #save your custom model

## Trial 2

Followed this tutorial:
https://towardsdatascience.com/masked-language-modelling-with-bert-7d49793e5d2c 

In [None]:
from transformers import BertTokenizer, BertForMaskedLM
import torch
from transformers import AdamW

from GPUtil import showUtilization as gpu_usage
from numba import cuda

In [None]:
!pip install GPUtil

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

1. Tokenization — Tokenization is simple: we’ve already initialized a BertTokenizer, so all we do now is tokenize our input text.

In [None]:
# Load data: Trial 2 works on the first 500 rows of the product table only.
dtf_mlm = df_detailed.iloc[:500]

# Train/validation split: 15% of the rows are held out, with a fixed seed.
df_train, df_valid = train_test_split(dtf_mlm, random_state=0, test_size=0.15)

In [None]:
# Tokenize the training texts into PyTorch tensors; every sequence is padded
# or truncated to exactly 512 tokens.
train_texts = df_train['attr_value_en_sentence'].tolist()
inputs = tokenizer(
    train_texts,
    padding='max_length',
    truncation=True,
    max_length=512,
    return_tensors='pt',
)

2. Create labels — The next step is easy, all we need to do here is clone our input_ids tensor into a new labels tensor. We’ll store this within the inputs variable too.

In [None]:
inputs['labels'] = inputs.input_ids.detach().clone()

In [None]:
inputs.keys()

3. Masking — Now we need to mask a random selection of tokens in our input_ids tensor.

In [None]:
# create random array of floats with equal dimensions to input_ids tensor
rand = torch.rand(inputs.input_ids.shape)
# create mask array
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * \
           (inputs.input_ids != 102) * (inputs.input_ids != 0)

In [None]:
mask_arr

And now we take the indices of each True value within each individual vector.

In [None]:
# For every sequence, collect the token positions that were chosen for masking.
selection = [
    torch.flatten(mask_arr[i].nonzero()).tolist()
    for i in range(inputs.input_ids.shape[0])
]

# Apply the mask: overwrite the chosen positions with the [MASK] token id.
# Without this step the inputs are never masked, so the model would be trained
# to reproduce tokens it can already see (labels are a copy of input_ids).
# Boolean-mask assignment also handles sequences with no selected position.
inputs.input_ids[mask_arr] = tokenizer.mask_token_id

4. Getting the data ready

In [None]:
class ProductDetailDataset(torch.utils.data.Dataset):
    """Map-style dataset over a tokenizer encoding.

    Args:
        encodings: Mapping (e.g. the tokenizer output) from field name to a
            per-example indexable container. It must provide ``.items()`` and
            contain an ``"input_ids"`` entry, which defines the dataset length.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict with one independent tensor per field."""
        item = {}
        for key, val in self.encodings.items():
            value = val[idx]
            if torch.is_tensor(value):
                # Copy existing tensors with clone(); torch.tensor(<tensor>)
                # would emit a UserWarning on every single access.
                item[key] = value.clone().detach()
            else:
                item[key] = torch.tensor(value)
        return item

    def __len__(self):
        """Number of examples, taken from the ``"input_ids"`` entry."""
        # Key access works for both plain dicts and tokenizer outputs.
        return len(self.encodings["input_ids"])

In [None]:
dataset = ProductDetailDataset(inputs)

In [None]:
loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

In [None]:
gpu_usage()       


In [None]:
'''
cuda.select_device(0)
cuda.close()
cuda.select_device(0)
'''

In [None]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the model onto the chosen device.
model.to(device)
# Switch the model to training mode.
model.train()

5. Calculate Loss — Our final step here is no different from the typical model training process.

In [None]:

# Initialize the optimizer. PyTorch's own AdamW is used in place of the
# deprecated `transformers.AdamW`. `eps` and `weight_decay` are set explicitly
# to mirror the defaults of that deprecated class, since torch's defaults differ.
optim = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

#### Train

In [None]:
from tqdm import tqdm  # progress bar

epochs = 2

for epoch in range(epochs):
    # Wrap the dataloader so that each epoch shows its own progress bar.
    progress = tqdm(loader, leave=True)
    for batch in progress:
        # Clear the gradients left over from the previous step.
        optim.zero_grad()
        # Forward pass: move the batch tensors to the training device and let
        # the model compute the loss against the labels.
        outputs = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=batch['labels'].to(device),
        )
        loss = outputs.loss
        # Backward pass followed by the parameter update.
        loss.backward()
        optim.step()
        # Show the epoch number and the current loss on the progress bar.
        progress.set_description(f'Epoch {epoch}')
        progress.set_postfix(loss=loss.item())

In [None]:
# Persist the fine-tuned model object. The ".pt" suffix matches the path the
# load cell reads from, and leaves the bare name
# "embeddings/preliminary_nlp_model" free for the directory that
# `save_pretrained` creates afterwards.
torch.save(model, 'embeddings/preliminary_nlp_model.pt')

Load the model

In [None]:
# Reload the complete model object from the file written with torch.save.
# NOTE(review): torch.load unpickles arbitrary Python objects — only load
# files that were created by this notebook, never untrusted ones.
# NOTE(review): recent PyTorch releases reportedly default to
# weights_only=True, which rejects full-model pickles — confirm against the
# installed version.
fine_tuned_model = torch.load('embeddings/preliminary_nlp_model.pt')

In [None]:
# Saving in a form that the huggingface provided.
fine_tuned_model.save_pretrained("embeddings/preliminary_nlp_model")

In [None]:
load_st_model = BertForMaskedLM.from_pretrained('embeddings/preliminary_nlp_model')

In [None]:
# Embed every product paragraph with the fine-tuned BERT encoder.
fine_tuned_model.eval()  # disable dropout so the embeddings are deterministic
model_device = next(fine_tuned_model.parameters()).device

generated_embeddings = {}
with torch.no_grad():  # inference only: do not build autograd graphs
    for i, row in df_detailed_subset.iterrows():
        # Truncate to BERT's 512-token limit and move the inputs to the
        # device the model lives on.
        encoded = tokenizer(
            row["attr_value_en_sentence"],
            return_tensors="pt",
            truncation=True,
            max_length=512,
        ).to(model_device)
        # The product embedding is the last-layer hidden state of the first
        # ([CLS]) token.
        cls_embedding = fine_tuned_model.bert(**encoded)[0][:, 0, :].squeeze(0)
        # Store a plain NumPy vector so the DataFrame/CSV holds numbers, not
        # tensor objects.
        generated_embeddings[row['ctr_product_num']] = cls_embedding.cpu().numpy()

# One row per product id, one column per embedding dimension.
embeddings_dict = pd.DataFrame.from_dict(generated_embeddings, orient='index')
embeddings_dict.index.names = ["ctr_product_num"]

embeddings_dict.to_csv("embeddings/custom_nlp_1k.csv")

In [None]:
def _cls_embedding(text):
    """Return the fine-tuned encoder's [CLS] vector for ``text`` as a CPU tensor.

    The text is truncated to BERT's 512-token limit, moved to the device the
    model lives on, and encoded without tracking gradients.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    encoded = encoded.to(next(fine_tuned_model.parameters()).device)
    with torch.no_grad():
        return fine_tuned_model.bert(**encoded)[0][:, 0, :].squeeze(0).cpu()


# Evaluation mode disables dropout, so the same text always gets the same embedding.
fine_tuned_model.eval()
df_detailed_subset['iteration_1'] = df_detailed_subset.attr_value_en_sentence.apply(_cls_embedding)

In [None]:
df_detailed_subset

#### Train using wandb

In [None]:
'''
from transformers import TrainingArguments
from transformers import Trainer

args = TrainingArguments(
    output_dir='out',
    per_device_train_batch_size=16,
    num_train_epochs=2
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset
)

#os.environ["WANDB_MODE"] = "online"
os.environ["WANDB_DISABLED"] = "true"

trainer.train()

'''
