In [None]:
# Import the necessary libraries
import os
import torch
import pandas as pd
import torch.nn.functional as F
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import ElectraTokenizerFast, ElectraForSequenceClassification, get_linear_schedule_with_warmup, AdamW, pipeline, AutoModelForSequenceClassification, AutoTokenizer
from torch.utils.data import TensorDataset, DataLoader

In [None]:
def preprocessText(features, tokenizer, train=True, max_length=64, batchsize=8):
  """Tokenize the comments in `features` and wrap them in a DataLoader.

  Args:
    features: DataFrame with a `Comments` column (text) and a `Sentiment`
      column holding numeric class ids (it is converted with
      `torch.from_numpy`).
    tokenizer: Tokenizer exposing `batch_encode_plus` whose result can be
      indexed by `'input_ids'` and `'attention_mask'`.
    train: If True the DataLoader shuffles the samples.
    max_length: Every comment is padded / truncated to this many tokens.
    batchsize: Number of samples per batch.

  Returns:
    DataLoader yielding `(input_ids, attention_mask, labels)` batches.
  """
  encoded = tokenizer.batch_encode_plus(
      features.Comments.tolist(),
      max_length=max_length,
      padding='max_length',
      truncation=True,
      return_tensors='pt'
  )

  # Look the tensors up by name rather than unpacking `.values()` positionally:
  # the number and order of returned fields depends on the tokenizer (some do
  # not emit token_type_ids), and a silent mix-up would feed the attention mask
  # to the model as token ids.
  input_word_ids = encoded['input_ids']
  input_mask = encoded['attention_mask']

  labels = torch.from_numpy(features.Sentiment.values)
  dataset = TensorDataset(input_word_ids, input_mask, labels)
  dataloader = DataLoader(dataset, shuffle=train, batch_size=batchsize)

  return dataloader

In [None]:
def training(model, train_dataloader, train_size, optimizer, scheduler):
  """Run one training epoch and report the average loss and accuracy.

  Args:
    model: Module called as `model(ids, attention_mask=..., labels=...)` whose
      output exposes `.loss` and `.logits`.
    train_dataloader: Iterable of `(input_ids, attention_mask, labels)` batches.
    train_size: Number of training samples; denominator of the accuracy.
    optimizer: Optimizer stepped once per batch.
    scheduler: Learning-rate scheduler stepped once per batch, after the
      optimizer.

  Returns:
    Tuple `(avg_batch_loss, acc)` of Python floats: the mean of the per-batch
    losses and the fraction of correctly classified samples.
  """
  model.train()

  # Follow the model's device instead of assuming CUDA, so the same loop works
  # on CPU-only machines and on models placed on a specific GPU.
  device = next(model.parameters()).device

  epoch_loss = 0.0
  num_correct = 0

  # Drop gradients that may be left over from an interrupted earlier run.
  optimizer.zero_grad()

  for input_words, attention_masks, labels in train_dataloader:
    input_words = input_words.to(device)
    attention_masks = attention_masks.to(device)
    labels = labels.to(device)

    outputs = model(input_words, attention_mask=attention_masks, labels=labels)
    loss = outputs.loss
    preds = torch.argmax(outputs.logits, dim=1)

    # .item() keeps the running totals as plain Python numbers.
    num_correct += (preds == labels).sum().item()
    epoch_loss += loss.item()

    loss.backward()
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  avg_batch_loss = epoch_loss / len(train_dataloader)
  acc = num_correct / train_size
  print('Training Loss: {:.6f}, \t Training Accuracy: {:.4f}'.format(avg_batch_loss, acc))

  return avg_batch_loss, acc

In [None]:
def validating(model, val_dataloader, val_size):
  """Evaluate the model on the validation set without updating weights.

  Args:
    model: Module called as `model(ids, attention_mask=..., labels=...)` whose
      output exposes `.loss` and `.logits`.
    val_dataloader: Iterable of `(input_ids, attention_mask, labels)` batches.
    val_size: Number of validation samples; denominator of the accuracy.

  Returns:
    Tuple `(avg_batch_loss, acc)` of Python floats: the mean of the per-batch
    losses and the fraction of correctly classified samples.
  """
  model.eval()

  # Follow the model's device instead of assuming CUDA.
  device = next(model.parameters()).device

  epoch_loss = 0.0
  num_correct = 0

  with torch.no_grad():
    for input_words, attention_masks, labels in val_dataloader:
      input_words = input_words.to(device)
      attention_masks = attention_masks.to(device)
      labels = labels.to(device)

      outputs = model(input_words, attention_mask=attention_masks, labels=labels)
      preds = torch.argmax(outputs.logits, dim=1)

      # .item() keeps the running totals as plain Python numbers.
      num_correct += (preds == labels).sum().item()
      epoch_loss += outputs.loss.item()

  avg_batch_loss = epoch_loss / len(val_dataloader)
  acc = num_correct / val_size
  print('Validation Loss: {:.6f}, \t Validation Accuracy: {:.4f}'.format(avg_batch_loss, acc))
  print('-----------------------------------------------------------------------------------------------------------')

  return avg_batch_loss, acc

In [None]:
def testing(modelpath, tokenizerpath, test, label2id, max_length, batchsize, savepath='./Electra/results.csv'):
  """Evaluate a saved model on the test set and write its predictions to CSV.

  Args:
    modelpath: Directory / identifier passed to
      `AutoModelForSequenceClassification.from_pretrained`.
    tokenizerpath: Directory / identifier passed to
      `ElectraTokenizerFast.from_pretrained`.
    test: DataFrame with `Comments` and integer-encoded `Sentiment` columns.
    label2id: Mapping from label name to integer class id.
    max_length: Token length every comment is padded / truncated to.
    batchsize: Number of samples per batch.
    savepath: Destination CSV file for the per-sample results.

  Returns:
    Copy of `test` with `Sentiment` mapped back to label names plus
    `Prediction` (predicted label name) and `Confidence` (softmax probability
    of the predicted class) columns.

  Raises:
    ValueError: If `test` is empty.
  """
  test_size = len(test)
  if test_size == 0:
    raise ValueError('Cannot evaluate on an empty test set.')

  # Use the GPU when there is one, otherwise fall back to CPU.
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  tokenizer = ElectraTokenizerFast.from_pretrained(tokenizerpath)
  test_dataloader = preprocessText(test, tokenizer, train=False, max_length=max_length, batchsize=batchsize)

  model = AutoModelForSequenceClassification.from_pretrained(modelpath).to(device)
  model.eval()

  # Collect per-batch arrays and concatenate once at the end, instead of
  # growing arrays with np.append behind a bare `except` that would also hide
  # genuine errors.
  pred_batches, label_batches, prob_batches = [], [], []

  with torch.no_grad():
    for input_words, attention_masks, labels in test_dataloader:
      input_words = input_words.to(device)
      attention_masks = attention_masks.to(device)

      # Only the logits are needed here, so no labels are passed to the model.
      outputs = model(input_words, attention_mask=attention_masks)
      probs, preds = torch.max(F.softmax(outputs.logits, dim=1), dim=1)

      pred_batches.append(preds.cpu().numpy())
      label_batches.append(labels.cpu().numpy())
      prob_batches.append(probs.cpu().numpy())

  all_preds = np.concatenate(pred_batches)
  all_labels = np.concatenate(label_batches)
  all_probs = np.concatenate(prob_batches)

  # Compute the accuracy
  acc = float((all_preds == all_labels).sum()) / test_size

  # Derive the report's class order from label2id so the row names cannot
  # drift out of sync with the integer encoding.
  id2label = {v: k for k, v in label2id.items()}
  class_ids = sorted(id2label)
  class_names = [id2label[i] for i in class_ids]

  # Print out model performance
  print('-----------------------------------------------------------------------------------------------------------')
  print('Test Accuracy: {:.4f}'.format(acc))
  print('-----------------------------------------------------------------------------------------------------------')
  print(classification_report(all_labels, all_preds, labels=class_ids, target_names=class_names))
  print('-----------------------------------------------------------------------------------------------------------')

  # Append results to dataframe
  results = test.copy()
  results.Sentiment = results.Sentiment.apply(lambda x: id2label[x])
  results['Prediction'] = np.array([id2label[x] for x in all_preds])
  results['Confidence'] = all_probs

  # Save results to a csv file, creating the target directory if needed
  save_dir = os.path.dirname(savepath)
  if save_dir:
    os.makedirs(save_dir, exist_ok=True)
  results.to_csv(savepath, index=False)

  print('Model predictions have been saved successfully to {}!'.format(savepath))
  print('-----------------------------------------------------------------------------------------------------------')

  return results

In [None]:
def main(num_epochs, lr, train, val, test, modelpath, tokenizerpath, resultspath, max_length=64, batchsize=8):
  """Fine-tune ELECTRA-small for 3-class sentiment and evaluate the best epoch.

  The checkpoint with the highest validation accuracy is saved to `modelpath`
  and then scored on `test` via `testing`.

  Args:
    num_epochs: Number of passes over the training data (must be >= 1).
    lr: Peak learning rate for AdamW.
    train, val, test: DataFrames with `Comments` and integer-encoded
      `Sentiment` columns.
    modelpath: Directory where the best model is saved.
    tokenizerpath: Directory where the tokenizer is saved.
    resultspath: CSV file the test predictions are written to.
    max_length: Token length every comment is padded / truncated to.
    batchsize: Number of samples per batch.

  Raises:
    ValueError: If `num_epochs` is smaller than 1.
  """
  if num_epochs < 1:
    # Without at least one epoch no checkpoint is written and the test step
    # would load whatever happens to be at `modelpath` already.
    raise ValueError('num_epochs must be at least 1, got {}'.format(num_epochs))

  pretrained_name = 'google/electra-small-discriminator'

  # Use the GPU when there is one, otherwise fall back to CPU.
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  electra_tokenizer = ElectraTokenizerFast.from_pretrained(pretrained_name)

  # Save the tokenizer up front so it is on disk even if training is interrupted.
  electra_tokenizer.save_pretrained(tokenizerpath)

  train_dataloader = preprocessText(train, electra_tokenizer, train=True, max_length=max_length, batchsize=batchsize)
  val_dataloader = preprocessText(val, electra_tokenizer, train=False, max_length=max_length, batchsize=batchsize)

  train_size = len(train)
  val_size = len(val)

  label2id = {'Positive': 0,
              'Neutral': 1,
              'Negative': 2}

  model = ElectraForSequenceClassification.from_pretrained(
      pretrained_name,
      num_labels=len(label2id),
      id2label={v:k for k,v in label2id.items()},
      label2id=label2id,
      # Follow the caller's max_length instead of a hard-coded 64.
      max_length=max_length,
      ).to(device)

  # Linear warm-up over the first 10% of all optimizer steps, then linear decay.
  num_train_steps = len(train_dataloader) * num_epochs
  num_warmup_steps = int(0.1 * num_train_steps)

  # Biases and LayerNorm weights are excluded from weight decay.
  no_decay = ['bias', 'LayerNorm.weight']
  optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
  ]
  optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
  scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_train_steps)

  best_epoch = 0
  best_val_loss = float('inf')
  # Start below any reachable accuracy so the first epoch always writes a
  # checkpoint, even when its validation accuracy is exactly 0.
  best_val_acc = float('-inf')

  for epoch in range(num_epochs):
    print('Epoch {}'.format(epoch))

    train_loss, train_acc = training(model, train_dataloader, train_size, optimizer, scheduler)
    val_loss, val_acc = validating(model, val_dataloader, val_size)

    if val_acc > best_val_acc:
      best_epoch = epoch
      best_val_loss = val_loss
      best_val_acc = val_acc
      model.save_pretrained(modelpath)

  print('Best Epoch {} \t Validation Loss: {:.6f}, \t Validation Accuracy: {:.4f}'.format(best_epoch, best_val_loss, best_val_acc))

  testing(modelpath, tokenizerpath, test, label2id, max_length, batchsize, savepath=resultspath)

In [None]:
def modelPipeline(modelpath, tokenizerpath, return_all_scores=True):
  """Build a sentiment-analysis pipeline from a saved model and tokenizer.

  Args:
    modelpath: Directory / identifier passed to
      `AutoModelForSequenceClassification.from_pretrained`.
    tokenizerpath: Directory / identifier passed to
      `ElectraTokenizerFast.from_pretrained`.
    return_all_scores: Forwarded unchanged to `pipeline`.

  Returns:
    The object returned by `transformers.pipeline` for the
    'sentiment-analysis' task.
  """
  tokenizer = ElectraTokenizerFast.from_pretrained(tokenizerpath)
  model = AutoModelForSequenceClassification.from_pretrained(modelpath)
  # 0 selects the first CUDA device; -1 makes the pipeline run on CPU, so the
  # notebook still works on machines without a GPU.
  device = 0 if torch.cuda.is_available() else -1
  return pipeline(task='sentiment-analysis', model=model, tokenizer=tokenizer, device=device, return_all_scores=return_all_scores)

In [None]:
# Training dataset
train = pd.read_csv('survey_2018_2019.csv', encoding='ISO-8859-1')

# Validation and test dataset.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
val = pd.concat([pd.read_csv('survey_2020_Jan.csv'), pd.read_csv('survey_2020_Aug.csv')])

# Strip embedded line breaks from the comments. regex=True is spelled out
# because the default changed to False in pandas 2.0, where the old r'\n'
# pattern would match a literal backslash followed by 'n' instead.
train['Comments'] = train['Comments'].str.replace(r'[\r\n]', '', regex=True)
val['Comments'] = val['Comments'].str.replace(r'[\r\n]', '', regex=True)

# Perform validation test split (stratified on the original string labels).
# The halves are copied so the label conversion below works on independent
# frames and does not raise SettingWithCopyWarning.
val, test = train_test_split(val, test_size=0.5, random_state=20210501, stratify=val.Sentiment)
val, test = val.copy(), test.copy()

# Convert labels to integers
mapping = {'Positive': 0,
           'Neutral': 1,
           'Negative': 2}

# Indexing `mapping` directly fails fast with a KeyError on an unexpected label.
train['Sentiment'] = train['Sentiment'].apply(lambda x: mapping[x])
val['Sentiment'] = val['Sentiment'].apply(lambda x: mapping[x])
test['Sentiment'] = test['Sentiment'].apply(lambda x: mapping[x])

In [None]:
# Hyperparameters for this run
num_epochs, lr = 5, 2e-5

# Where the fine-tuned model, its tokenizer and the test predictions are written
modelpath = 'Electra/model'
tokenizerpath = 'Electra/tokenizer'
resultspath = 'Electra/results.csv'

# Launch fine-tuning followed by evaluation on the test split
main(
  num_epochs=num_epochs,
  lr=lr,
  train=train,
  val=val,
  test=test,
  modelpath=modelpath,
  tokenizerpath=tokenizerpath,
  resultspath=resultspath,
)

Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier

Epoch 0
Training Loss: 0.913478, 	 Training Accuracy: 0.5773
Validation Loss: 0.630132, 	 Validation Accuracy: 0.7421
-----------------------------------------------------------------------------------------------------------
Epoch 1
Training Loss: 0.576044, 	 Training Accuracy: 0.7818
Validation Loss: 0.589571, 	 Validation Accuracy: 0.7639
-----------------------------------------------------------------------------------------------------------
Epoch 2
Training Loss: 0.425640, 	 Training Accuracy: 0.8459
Validation Loss: 0.527594, 	 Validation Accuracy: 0.7916
-----------------------------------------------------------------------------------------------------------
Epoch 3
Training Loss: 0.342950, 	 Training Accuracy: 0.8834
Validation Loss: 0.531823, 	 Validation Accuracy: 0.7916
-----------------------------------------------------------------------------------------------------------
Epoch 4
Training Loss: 0.303958, 	 Training Accuracy: 0.8940
Validation Loss: 0.543473, 	 Valida

In [None]:
# Try the saved model on a single free-text comment
sample_comment = "senior officer did a good job by ensure officer well being, but working in woodland checkpoint is tiring, and we tend to get sick easily. that's one of issue we couldn't control for the health. its not easy working on shift to maintain yourself to stay heathy and fit."

classifier = modelPipeline(modelpath, tokenizerpath, return_all_scores=True)
classifier(sample_comment)

[[{'label': 'Positive', 'score': 0.005596943665295839},
  {'label': 'Neutral', 'score': 0.050930269062519073},
  {'label': 'Negative', 'score': 0.9434728026390076}]]