In [None]:
!pip install beautifulsoup4

In [None]:
import torch
from transformers import BertTokenizer, BertForMaskedLM, BertModel
from torch.utils.data import DataLoader, Dataset

In [None]:
# Load the domain-specific trained weights into a stock masked-LM architecture.
# map_location='cpu' makes the tensors load on the CPU, so a checkpoint written
# from a GPU session can still be restored on a machine without CUDA.
# (The file name is spelled exactly as it exists on disk.)
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
domain_state = torch.load('Model_Domain_weigths', map_location='cpu')
model.load_state_dict(domain_state)
bert_for_masked_lm_weights = model.state_dict()

In [None]:
# Encoder that will be fine-tuned for the task, plus its tokenizer; both are
# instantiated from the same stock checkpoint name.
Task_model, tokenizer = (
    loader.from_pretrained('bert-base-uncased')
    for loader in (BertModel, BertTokenizer)
)

In [None]:
# Copy the weights of the BertModel layers from the BertForMaskedLM.
# BertForMaskedLM holds its encoder under the `bert` attribute, so its state-dict
# keys look like 'bert.embeddings.word_embeddings.weight', while the bare BertModel
# uses 'embeddings.word_embeddings.weight'. Comparing the raw keys never matches
# and would leave the stock weights untouched, so the prefix is stripped first.
ENCODER_PREFIX = 'bert.'
bert_model_dict = Task_model.state_dict()
copied_keys = []
for key, tensor in bert_for_masked_lm_weights.items():
    target_key = key[len(ENCODER_PREFIX):] if key.startswith(ENCODER_PREFIX) else key
    # Keys with no counterpart in BertModel (e.g. the masked-LM head) are skipped;
    # BertModel keys absent from the checkpoint keep their current values.
    if target_key in bert_model_dict and bert_model_dict[target_key].shape == tensor.shape:
        bert_model_dict[target_key] = tensor
        copied_keys.append(target_key)

# Fail loudly instead of silently fine-tuning the stock checkpoint.
if not copied_keys:
    raise RuntimeError('No weights were transferred from the masked-LM checkpoint')

# Load the weights into the BertModel object
Task_model.load_state_dict(bert_model_dict)

In [None]:
import pandas as pd
# Read the two raw product tables (df1: cosmetics, df2: Sephora website dump).
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in ('cosmetics.csv', 'sephora_website_dataset.csv')
)

In [None]:
# Give the second table the same column names as the first one.
df2 = df2.rename(columns={'ingredients': 'Ingredients', 'category': 'Label'})

In [None]:
# Keep only the two columns the classifier needs. The explicit .copy() makes each
# result an independent frame, so later column assignments on df1/df2 are not
# treated as writes to a selection of another frame (SettingWithCopyWarning).
KEEP_COLUMNS = ['Ingredients', 'Label']
df1 = df1[KEEP_COLUMNS].copy()
df2 = df2[KEEP_COLUMNS].copy()

In [None]:
#Converting the ingredients into a list
def preprocess_sephora(x):
    """Split a '-'-separated ingredient string and keep the first 10 entries.

    Args:
        x: Raw ingredient text. A list or tuple (a cell that was already
            processed, e.g. when the notebook cell is re-run) is truncated and
            returned as a list. Any other non-string value, such as the NaN
            pandas uses for a missing cell, yields an empty list.

    Returns:
        list: At most 10 entries, exactly as they appear between the
        separators (surrounding whitespace is not stripped).
    """
    if isinstance(x, (list, tuple)):
        return list(x)[:10]
    if not isinstance(x, str):
        return []
    return x.split('-')[:10]
def preprocess_cosmetics(x):
    """Split a ','-separated ingredient string and keep the first 10 entries.

    Args:
        x: Raw ingredient text. A list or tuple (a cell that was already
            processed, e.g. when the notebook cell is re-run) is truncated and
            returned as a list. Any other non-string value, such as the NaN
            pandas uses for a missing cell, yields an empty list.

    Returns:
        list: At most 10 entries, exactly as they appear between the
        separators (surrounding whitespace is not stripped).
    """
    if isinstance(x, (list, tuple)):
        return list(x)[:10]
    if not isinstance(x, str):
        return []
    return x.split(',')[:10]

In [None]:
# Replace each raw ingredient string with its list form, using the splitter
# that matches the separator of the respective source.
df1['Ingredients'] = [preprocess_cosmetics(raw) for raw in df1['Ingredients']]
df2['Ingredients'] = [preprocess_sephora(raw) for raw in df2['Ingredients']]

In [None]:
# Combine both sources, one-hot encode the labels and split into train / test.
SPLIT_SEED = 42
TRAIN_SIZE = 9000

# The two sources are stacked one after the other, so a positional split without
# shuffling would fill the test set exclusively with rows of the second source.
# Shuffle once with a fixed seed; drop=True keeps the old per-source index from
# being added as an extra column.
dataset = (
    pd.concat([df1, df2], ignore_index=True)
    .sample(frac=1, random_state=SPLIT_SEED)
    .reset_index(drop=True)
)

labels = pd.get_dummies(dataset['Label']).values

# Take the class count from the one-hot matrix itself so the classifier width
# always equals the number of label columns (get_dummies skips missing labels).
num_labels = labels.shape[1]

assert len(dataset) > TRAIN_SIZE, 'TRAIN_SIZE leaves no rows for the test split'

# reset_index(drop=True) returns a new frame, so the splits are independent of
# `dataset` and each one is indexed 0..n-1.
train_data = dataset[:TRAIN_SIZE].reset_index(drop=True)
train_labels = labels[:TRAIN_SIZE]

test_data = dataset[TRAIN_SIZE:].reset_index(drop=True)
test_labels = labels[TRAIN_SIZE:]

In [None]:
import requests
from bs4 import BeautifulSoup

#Function to append external knowledge to the input 
def append_external_knowledge(ingredient_list, timeout=10):
    """Fetch the names of products that contain the given ingredients.

    Sends one GET request to the INCIDecoder product search, passing every
    ingredient as an ``include`` parameter, and collects the text of all
    ``<a class="klavika simpletextlistitem">`` links in the response.

    Args:
        ingredient_list: Iterable of ingredient names. It is not modified.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        str: The product names joined by single spaces; an empty string when
        no product link is found or the request fails.
    """
    import warnings

    # Build a cleaned copy instead of editing the argument: the list is a cell
    # of the caller's DataFrame, and rewriting it would alter the training text.
    names = [' '.join(name.split()) for name in ingredient_list if isinstance(name, str)]
    names = [name for name in names if name]

    # Let requests encode the query string. Spaces inside a multi-word name
    # become '+', and characters such as '&' or '#' are escaped rather than
    # corrupting the URL.
    params = [('query', '')] + [('include', name) for name in names]

    try:
        response = requests.get(
            'https://incidecoder.com/search/product', params=params, timeout=timeout
        )
    except requests.RequestException as error:
        # Best effort: one unreachable lookup should not abort a whole
        # DataFrame.apply run, so report it and continue without extra text.
        warnings.warn(f'External knowledge lookup failed: {error}')
        return ''

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")

    # Collect the product names shown on the result page
    product_names = [
        link.text.strip()
        for link in soup.find_all("a", class_="klavika simpletextlistitem")
    ]

    return ' '.join(product_names)

In [None]:
# Look up the external knowledge text for every row and store it in the 'EK' column
dataset['EK'] = [
    append_external_knowledge(ingredients)
    for ingredients in dataset['Ingredients']
]

In [None]:
# 'Ingredients' cells are lists while 'EK' cells are strings, and adding a list
# to a str raises TypeError. Flatten each list into one space-separated string
# and append the external knowledge, so every row holds a single text for the
# tokenizer. Cells that are already strings (cell re-run) are not joined again.
dataset['Ingredients'] = [
    (items if isinstance(items, str) else ' '.join(items)) + ' ' + knowledge
    for items, knowledge in zip(dataset['Ingredients'], dataset['EK'])
]

In [None]:
class Ingredients(Dataset):
  """Torch dataset that tokenizes the 'Ingredients' text of one data split.

  Args:
    data: DataFrame with an 'Ingredients' column; row i belongs to labels[i].
    labels: Array-like of one-hot label rows, positionally aligned with `data`.
    tokenizer: Callable tokenizer applied to each text.
  """

  def __init__(self, data, labels, tokenizer):
    self.data = data
    self.labels = labels
    self.tokenizer = tokenizer

  def __getitem__(self, index):
    # Read from this split's own frame, by position. Looking the row up in the
    # global `dataset` pairs every test label with the text of a training row.
    text = self.data['Ingredients'].iloc[index]
    if not isinstance(text, str):
      # A list of ingredient names: collapse it into a single text.
      text = ' '.join(text)
    encoded = self.tokenizer(text, return_tensors='pt', add_special_tokens=True, max_length=512, padding='max_length', truncation=True)
    # Drop only the leading batch dimension that return_tensors='pt' adds.
    input_ids = encoded.input_ids.squeeze(0)
    attention_mask = encoded.attention_mask.squeeze(0)
    # Float one-hot row: argmax downstream then works whether the one-hot
    # matrix was produced as booleans or as integers.
    label = torch.as_tensor(self.labels[index], dtype=torch.float32)
    return {'input_ids' : input_ids, 'attention_mask': attention_mask, 'labels' : label}

  def __len__(self):
    return len(self.data)

# `train_data` / `test_data` were sliced before the external-knowledge text was
# written back into `dataset`, so re-slice `dataset` at the same boundary to give
# the datasets the final text for their rows.
split_at = len(train_data)

train_dataset = Ingredients(dataset.iloc[:split_at].reset_index(drop=True), train_labels, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

test_dataset = Ingredients(dataset.iloc[split_at:].reset_index(drop=True), test_labels, tokenizer)
# Accuracy does not depend on the sample order, so evaluation is not shuffled.
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [None]:
import torch.nn as nn
class BertClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder.

    Args:
        bert_model: Encoder module. Called with ``input_ids``, ``attention_mask``
            and ``return_dict=False``, it must return a tuple whose second
            element is the pooled sequence representation.
        num_classes: Number of output classes (one logit per class).
    """

    def __init__(self, bert_model, num_classes):
        super(BertClassifier, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.3)
        # Size the head from the encoder's own configuration so other encoder
        # widths work too; fall back to 768 when no config is exposed.
        hidden_size = getattr(getattr(bert_model, 'config', None), 'hidden_size', 768)
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the class logits, shape (batch, num_classes), for a batch of token ids."""
        # With return_dict=False the encoder returns a tuple. The first element
        # holds the embedding vectors of all tokens and is not needed here; the
        # second is the pooled [CLS] representation, which is enough as input
        # for a text classifier.
        _, pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)

        dropout_output = self.dropout(pooled_output)
        logits = self.classifier(dropout_output)
        return logits

In [None]:
# Wrap the weight-initialised encoder with a classification head, one logit per label.
model = BertClassifier(bert_model=Task_model, num_classes=num_labels)

In [None]:
from torch.optim import Adam
# Loss over class indices
criterion = nn.CrossEntropyLoss()

# Optimiser over every parameter of the classifier (encoder and head)
optimizer = Adam(params=model.parameters(), lr=2e-5)

In [None]:
# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

In [None]:
model.train()
from tqdm import tqdm

epochs = 15
for epoch in range(epochs):
    loop = tqdm(train_loader, leave = True)
    for batch in loop:
        # Move the inputs to the device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # One-hot rows -> class indices. Stored under its own name so the
        # notebook-level one-hot matrix `labels` is not overwritten by a batch.
        # The float cast keeps argmax usable when the one-hot values arrive as
        # a bool tensor.
        targets = batch['labels'].to(device).float().argmax(dim=1)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask = attention_mask)

        # Compute the loss
        loss = criterion(outputs, targets)

        # Backward pass and update the parameters
        loss.backward()
        optimizer.step()

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

In [None]:
# Evaluate the model on the test data
model.eval()
correct = 0
total = 0
with torch.no_grad():
    loop = tqdm(test_loader, leave = True)
    for batch in loop:
        # Move the inputs to the device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # One-hot rows -> class indices. Stored under its own name so the
        # notebook-level one-hot matrix `labels` is not overwritten by a batch.
        # The float cast keeps argmax usable when the one-hot values arrive as
        # a bool tensor.
        targets = batch['labels'].to(device).float().argmax(dim=1)

        # Forward pass
        outputs = model(input_ids = input_ids, attention_mask=attention_mask)

        # Predict the labels: index of the largest logit per sample
        predicted = outputs.argmax(dim=1)

        # Accumulate the counts for the accuracy
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

# With an empty test loader there is nothing to divide by; report NaN instead
# of raising ZeroDivisionError.
accuracy = 100 * correct / total if total else float('nan')
print(f'Test accuracy: {accuracy:.2f}%')

In [None]:
# Persist the parameters of the fine-tuned classifier (encoder and head).
torch.save(obj=model.state_dict(), f='CheMapBERT_weights')