**Loading the Model**

In [None]:
!pip install transformers

In [None]:
import torch
from transformers import BertTokenizer, BertForMaskedLM
from torch.utils.data import DataLoader, Dataset

In [None]:
# Load the domain-specific trained model: start from the stock BERT masked-LM
# architecture, then overwrite its weights with the stored checkpoint.
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
# map_location='cpu' lets a checkpoint that was saved from a GPU session load on a
# CPU-only runtime. torch.load unpickles the file, so only load checkpoints you trust.
# NOTE(review): the path below looks like a placeholder - point it at the real file.
model.load_state_dict(torch.load('Path to stored weights - Domain Training', map_location='cpu'))
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

**Dataset Preparation**

In [None]:
#Loading the datasets
!unzip '/content/drive/MyDrive/591 Project /cosmetics.csv.zip'
!unzip '/content/drive/MyDrive/591 Project /sephora_website_dataset.csv.zip'

In [None]:
import pandas as pd

# Read the two extracted CSV files into data frames (cosmetics first, Sephora second).
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/cosmetics.csv', '/content/sephora_website_dataset.csv')
)

In [None]:
# Preview the first rows of the frame read from cosmetics.csv
df1.head()

In [None]:
# Preview the first rows of the frame read from sephora_website_dataset.csv
df2.head()

In [None]:
# Give df2 the same column names that df1 uses for ingredient text and label.
df2 = df2.rename(columns={'ingredients': 'Ingredients', 'category': 'Label'})

In [None]:
# Keep only the text column and the label column in each frame.
df1 = df1.loc[:, ['Ingredients', 'Label']]
df2 = df2.loc[:, ['Ingredients', 'Label']]

In [None]:
# Stack the two frames row-wise (df1 rows first, then df2 rows).
# Each frame keeps its own row labels at this point.
dataset = pd.concat([df1, df2])

In [None]:
# Renumber rows 0..n-1 after the concat. drop=True discards the old row labels
# instead of inserting them as an extra 'index' column, and rebinding (rather than
# inplace=True) keeps the cell safe to re-run.
dataset = dataset.reset_index(drop=True)

In [None]:
# Preview the combined frame
dataset.head()

In [None]:
# Total number of rows in the combined frame
len(dataset)

In [None]:
# Distinct values found in the Label column
dataset['Label'].unique()

In [None]:
# TODO: some of the labels still need to be preprocessed/combined

In [None]:
# Number of distinct values in the Label column
len(dataset['Label'].unique())

In [None]:
# Preprocessing
# 1) Converting labels into numerical values (one indicator column per label)
# 2) Data split
# 3) Tokenizing - includes padding and truncation

labels = pd.get_dummies(dataset['Label']).to_numpy()

# Fixed positional split: first 8000 rows train, next 1000 validation, the rest test.
# NOTE(review): rows are split in concatenation order with no shuffling, so the
# test rows all come from the tail of the combined frame - consider shuffling first.
train_data, val_data, test_data = dataset.iloc[:8000], dataset.iloc[8000:9000], dataset.iloc[9000:]
train_labels, val_labels, test_labels = labels[:8000], labels[8000:9000], labels[9000:]

class Ingredients(Dataset):
    """Map-style dataset pairing tokenized ingredient text with a class-index label.

    Args:
        data: DataFrame with an 'Ingredients' text column. Rows are read by
            position, so any slice of a larger frame works regardless of its index.
        labels: Sequence aligned row-for-row with ``data``. Each entry may be a
            one-hot vector (reduced to its class index) or a plain class id.
        tokenizer: Callable tokenizer whose result exposes ``input_ids`` and
            ``attention_mask`` tensors.
    """

    def __init__(self, data, labels, tokenizer):
        self.data = data
        self.labels = labels
        self.tokenizer = tokenizer

    def __getitem__(self, index):
        """Return a dict with 'input_ids', 'attention_mask' and 'labels' for one row."""
        # Read from this split's own frame by position so the text lines up with
        # self.labels[index]; a global frame would hand back rows from another split.
        text = self.data['Ingredients'].iloc[index]
        encoded = self.tokenizer(text, return_tensors='pt', add_special_tokens=True, max_length=512, padding='max_length', truncation=True)
        # Drop the batch dimension the tokenizer adds for return_tensors='pt'.
        input_ids = encoded.input_ids.squeeze(0)
        attention_mask = encoded.attention_mask.squeeze(0)
        label = torch.as_tensor(self.labels[index])
        if label.dim() > 0:
            # One-hot row -> class index, the target form CrossEntropyLoss and the
            # accuracy comparison expect. .long() first because argmax rejects bool.
            label = label.long().argmax()
        else:
            label = label.long()
        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': label}

    def __len__(self):
        """Number of rows in this split."""
        return len(self.data)

# Wrap each split in a dataset and a batched loader.
train_dataset = Ingredients(train_data, train_labels, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Evaluation loaders are not shuffled: order does not affect the metrics and a
# fixed order keeps runs comparable.
val_dataset = Ingredients(val_data, val_labels, tokenizer)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

test_dataset = Ingredients(test_data, test_labels, tokenizer)
# This loader must be bound to test_loader: binding it to train_loader would make
# training run on the test split and leave test_loader undefined for evaluation.
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [None]:
# Show the raw ingredient text of the row labelled 0
print(dataset.loc[0, 'Ingredients'])

In [None]:
# Sanity check: fetch the first training example through the dataset's indexing
train_dataset[0]

**Model Architecture Changes**

In [None]:
import torch.nn as nn

In [None]:
class BertClassifier(nn.Module):
    """Sequence classifier: a BERT encoder followed by one linear layer.

    Args:
        bert_model: Either a bare BERT encoder or a task-head wrapper (such as a
            masked-LM model) that keeps its encoder under a ``bert`` attribute.
            It must expose ``config.hidden_size``.
        num_classes: Number of output classes.
    """

    def __init__(self, bert_model, num_classes):
        super().__init__()
        self.bert = bert_model
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return classification logits of shape (batch, num_classes)."""
        # A masked-LM wrapper returns vocabulary logits and has no pooled vector, so
        # call its underlying encoder; a bare encoder is used as-is.
        encoder = getattr(self.bert, 'bert', self.bert)
        outputs = encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Use the pooled output when the encoder provides one; otherwise fall back to
        # the hidden state of the first ([CLS]) token.
        sentence_repr = getattr(outputs, 'pooler_output', None)
        if sentence_repr is None:
            sentence_repr = outputs[0][:, 0]
        logits = self.classifier(sentence_repr)
        return logits

In [None]:
# Size the classification head from the data instead of a hard-coded count, so it
# always matches the number of distinct labels.
num_classes = dataset['Label'].nunique()
# Guard against re-running the cell, which would otherwise wrap the classifier
# inside another classifier.
if not isinstance(model, BertClassifier):
    model = BertClassifier(model, num_classes)

In [None]:
# Print the module tree of the wrapped classifier
print(model)

**Model Training**

In [None]:
from torch.optim import Adam

# Loss used for the classification objective
criterion = nn.CrossEntropyLoss()

# Adam over every parameter of the wrapped model, with learning rate 2e-5
optimizer = Adam(params=model.parameters(), lr=2e-5)

In [None]:
# Put the model into training mode before the training loop
model.train()

In [None]:
from tqdm import tqdm

epochs = 1
# Follow whichever device the model's parameters live on so inputs always match it.
device = next(model.parameters()).device
for epoch in range(epochs):
    loop = tqdm(train_loader, leave = True)
    for batch in loop:
        # Move the inputs and labels to the device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # CrossEntropyLoss is given class indices here; reduce one-hot rows if the
        # dataset still yields them (.long() first because argmax rejects bool).
        if labels.dim() > 1:
            labels = labels.long().argmax(dim=1)
        labels = labels.long()

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass: the model already returns (batch, num_classes) logits, so no
        # indexing - taking [0] would keep only the first sample of the batch.
        outputs = model(input_ids=input_ids, attention_mask = attention_mask)

        # Compute the loss over all classes (no reshape to a 2-class layout)
        loss = criterion(outputs, labels)

        # Backward pass and update the parameters
        loss.backward()
        optimizer.step()

        loop.set_description(f'Epoch {epoch}') 
        loop.set_postfix(loss=loss.item())

  0%|          | 0/205 [00:00<?, ?it/s]

**Model Evaluation**

In [None]:
# Evaluate the model on the test data
model.eval()
# Follow whichever device the model's parameters live on so inputs always match it.
device = next(model.parameters()).device
correct = 0
total = 0
with torch.no_grad():
    loop = tqdm(test_loader, leave = True)
    for batch in loop:
        # Move the inputs and labels to the device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # Compare against class indices; reduce one-hot rows if the dataset still
        # yields them (.long() first because argmax rejects bool).
        if labels.dim() > 1:
            labels = labels.long().argmax(dim=1)

        # Forward pass: the model returns (batch, num_classes) logits directly;
        # indexing with [0] would keep only the first sample of the batch.
        outputs = model(input_ids = input_ids, attention_mask=attention_mask)

        # Predict the labels
        predicted = outputs.argmax(dim=1)

        # Compute the accuracy
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# Guard against an empty loader so the cell reports 0 instead of dividing by zero.
accuracy = 100 * correct / total if total else 0.0
print(f'Test accuracy: {accuracy:.2f}%')

In [None]:
# Save the wrapped classifier's parameters (state dict only) to a file named
# 'Model_Task_weights' in the current working directory.
torch.save(model.state_dict(), 'Model_Task_weights')