**Loading the Model**

In [None]:
!pip install transformers

In [1]:
import torch
from transformers import BertTokenizer, BertForMaskedLM, BertModel
from torch.utils.data import DataLoader, Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#Loading the Domain Specific Trained Model 
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.load_state_dict(torch.load('Model_Domain_weigths'))
bert_for_masked_lm_weights = model.state_dict()

Task_model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.

In [3]:
# Copy the weights of the BertModel layers from the BertForMaskedLM 
bert_model_dict = Task_model.state_dict()
for key in bert_for_masked_lm_weights.keys():
    if key in bert_model_dict:
        bert_model_dict[key] = bert_for_masked_lm_weights[key]

# Load the weights into the BertModel object
Task_model.load_state_dict(bert_model_dict)

<All keys matched successfully>

**Dataset Preparation**

In [None]:
#Loading the datasets
!unzip 'cosmetics.csv.zip'
!unzip 'sephora_website_dataset.csv.zip'

In [4]:
import pandas as pd
df1 = pd.read_csv('cosmetics.csv')
df2 = pd.read_csv('sephora_website_dataset.csv')

In [None]:
df1.head()

In [None]:
df2.head()

In [5]:
df2.rename(columns = {'ingredients':'Ingredients', 'category':'Label'}, inplace = True)

In [6]:
df2 = df2[['Ingredients', 'Label']]
df1 = df1[['Ingredients', 'Label']]

In [7]:
dataset = pd.concat([df1, df2])

In [8]:
dataset.reset_index(inplace=True)

In [None]:
dataset.head()

In [None]:
len(dataset)

In [None]:
dataset['Label'].unique()

In [None]:
#Have to preprocess/combine some labels

In [9]:
num_labels = len(dataset['Label'].unique())

In [10]:
# Preprocessing 
# 1) Converting labels into numberical values
# 2) Data Split
# 3) Tokenizing - includes padding and truncation

labels = pd.get_dummies(dataset['Label']).values

train_data = dataset[:9000]
train_labels= labels[:9000]
train_data.reset_index(inplace=True)

test_data= dataset[9000:]
test_labels= labels[9000:]
test_data.reset_index(inplace=True)


class Ingredients(Dataset):
  def __init__(self, data, labels, tokenizer):
    self.data = data
    self.labels = labels
    self.tokenizer = tokenizer
  def __getitem__(self, index):
    text = dataset.loc[index, 'Ingredients']
    input = self.tokenizer(text, return_tensors='pt', add_special_tokens=True, max_length=512, padding='max_length', truncation=True)
    input_ids = input.input_ids.squeeze()
    attention_mask = input.attention_mask.squeeze()
    label = self.labels[index]
    return {'input_ids' : input_ids, 'attention_mask': attention_mask, 'labels' : label}
  def __len__(self):
        return len(self.data)

train_dataset = Ingredients(train_data, train_labels, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

test_dataset = Ingredients(test_data, test_labels, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=True)

In [None]:
labels[0]

In [None]:
print(dataset.loc[0]['Ingredients'])

In [None]:
train_dataset.__getitem__(0)

**Model Architecture Changes**

In [11]:
import torch.nn as nn

In [12]:
class BertClassifier(nn.Module):
    def __init__(self, bert_model, num_classes):
        super(BertClassifier, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(768, num_classes)

    def forward(self, input_ids, attention_mask):
        _, outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask,return_dict=False)
        
        '''
        The first variable, which we named _ in the code above, contains the embedding vectors of all of the tokens in a sequence.
The second variable, which we named outputs, contains the embedding vector of [CLS] token. For a text classification task, it is enough to use this embedding as an input for our classifier.
We then pass the outputs variable into a linear layer. At the end of the linear layer, we have a vector of size 149, each corresponds to a category of our labels
        '''
        
        pooled_output = outputs
        dropout_output = self.dropout(pooled_output)
        logits = self.classifier(dropout_output)
        return logits

In [13]:
model = BertClassifier(Task_model,num_labels)

In [None]:
print(model)

**Model Training**

In [14]:
from torch.optim import Adam
# Set up the optimizer
optimizer = Adam(model.parameters(), lr=2e-5)

# Define the loss function
criterion = nn.CrossEntropyLoss()

In [15]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

In [16]:
model.train()

BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

In [17]:
from tqdm import tqdm

epochs = 15
for epoch in range(epochs):
    loop = tqdm(train_loader, leave = True)
    for batch in loop:
        # Move the inputs and labels to the device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels']
        labels = torch.argmax(labels, dim=1).to(device)
#         labels = labels.long()
        
        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask = attention_mask)
        
        # Compute the loss
        loss = criterion(outputs, labels)
        
        # Backward pass and update the parameters
        loss.backward()
        optimizer.step()

        loop.set_description(f'Epoch {epoch}') 
        loop.set_postfix(loss=loss.item())

Epoch 0: 100%|███████████████████████████████████████████████████████████| 1125/1125 [03:54<00:00,  4.79it/s, loss=3.1]
Epoch 1: 100%|██████████████████████████████████████████████████████████| 1125/1125 [03:52<00:00,  4.84it/s, loss=2.98]
Epoch 2: 100%|██████████████████████████████████████████████████████████| 1125/1125 [03:52<00:00,  4.85it/s, loss=1.75]
Epoch 3: 100%|██████████████████████████████████████████████████████████| 1125/1125 [03:52<00:00,  4.84it/s, loss=1.97]
Epoch 4: 100%|██████████████████████████████████████████████████████████| 1125/1125 [03:52<00:00,  4.84it/s, loss=1.59]
Epoch 5: 100%|██████████████████████████████████████████████████████████| 1125/1125 [03:52<00:00,  4.83it/s, loss=1.63]
Epoch 6: 100%|██████████████████████████████████████████████████████████| 1125/1125 [03:52<00:00,  4.84it/s, loss=1.49]
Epoch 7: 100%|██████████████████████████████████████████████████████████| 1125/1125 [03:52<00:00,  4.84it/s, loss=2.37]
Epoch 8: 100%|██████████████████████████

**Model Evaluation**

In [18]:
# Evaluate the model on the test data
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    loop = tqdm(test_loader, leave = True)
    for batch in loop:
        # Move the inputs and labels to the device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        labels = torch.argmax(labels, dim=1).to(device)
        
        # Forward pass
        outputs = model(input_ids = input_ids, attention_mask=attention_mask)
        
        # Predict the labels
        _, predicted = torch.max(outputs, 1)
        
        # Compute the accuracy
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    
    accuracy = 100 * correct / total
    print(f'Test accuracy: {accuracy:.2f}%')

100%|████████████████████████████████████████████████████████████████████████████████| 205/205 [00:15<00:00, 12.87it/s]

Test accuracy: 0.18%





In [None]:
torch.save(model.state_dict(), 'Model_Task_weights')