In [None]:
!pip install transformers --quiet

[K     |████████████████████████████████| 2.6 MB 15.7 MB/s 
[K     |████████████████████████████████| 895 kB 53.4 MB/s 
[K     |████████████████████████████████| 636 kB 59.2 MB/s 
[K     |████████████████████████████████| 3.3 MB 58.5 MB/s 
[?25h

In [None]:
import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer, ElectraModel, ElectraTokenizer

In [None]:
from torch import cuda
# Use the GPU when PyTorch can see one; otherwise run everything on the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'

In [None]:
# Hyper-parameters and tokenizers shared by the cells below.
MAX_LEN = 50  # token length each message is padded/truncated to by the Triage dataset
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
EPOCHS = 20  # number of passes made by the training loop
LEARNING_RATE = 5e-05  # learning rate handed to the Adam optimizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# Tokenizer for the ELECTRA checkpoint; the datasets built below use `tokenizer`, not this one.
etokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')

In [None]:
class Triage(Dataset):
    """Map-style dataset that tokenizes one message and returns its five label codes.

    Args:
        dataframe: Frame with a `message` column and the integer label columns
            `encode_pn`, `encode_tense`, `encode_form`, `encode_subject` and
            `encode_primary`.
        tokenizer: Object exposing `encode_plus`; the notebook passes a
            Hugging Face tokenizer.
        max_len: Length every encoded message is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def _label(self, column, index):
        """Return the integer code stored in `column` at row position `index`."""
        return torch.tensor(self.data[column].iloc[index], dtype=torch.long)

    def __getitem__(self, index):
        """Return a dict of `ids`, `mask` and the five `*_targets` tensors.

        `index` is a row position (0 .. len-1), which is what a DataLoader
        supplies. Positional lookup keeps the dataset correct even when the
        frame's own index was not reset.
        """
        # Collapse runs of whitespace so the tokenizer sees a clean sentence.
        message = " ".join(str(self.data['message'].iloc[index]).split())
        encoded = self.tokenizer.encode_plus(
            message,
            add_special_tokens=True,
            max_length=self.max_len,
            # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
        )

        return {
            'ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
            'pn_targets': self._label('encode_pn', index),
            'tense_targets': self._label('encode_tense', index),
            'form_targets': self._label('encode_form', index),
            'subject_targets': self._label('encode_subject', index),
            'primary_targets': self._label('encode_primary', index),
        }

    def __len__(self):
        """Number of rows captured when the dataset was created."""
        return self.len

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV read and the save cells below use paths under it.
drive.mount('/content/drive')

In [None]:
# Read the pre-encoded model data from the mounted Google Drive folder.
df = pd.read_csv("/content/drive/MyDrive/Lucy Lucas Classification Model/encoded_model_data.csv")

In [None]:
# Columns that must be populated for a row to be usable.
required_columns = ['message', 'positive', 'tense', 'form', 'subject', 'primary',
                    'secondary', 'tertiary', 'classification', 'intent']
# Drop rows with a gap in any required column, but keep every other column.
# Narrowing the frame to `required_columns` would discard the `encode_*`
# label columns that the lookup tables and the Triage dataset read later.
df = df.dropna(subset=required_columns)

In [None]:
# Set aside the first row of every distinct `intent`. These rows are appended to the
# training split further down, so every intent has at least one training example.
hand_df = df.drop_duplicates(subset=["intent"])
# Everything else is the pool that gets split into train and test.
encoded_df = df.drop(hand_df.index)

In [None]:
# Creating dictionaries for inferencing
def _code_to_label_frame(label_col, code_col):
    """Return one row per distinct label, indexed and ordered by its integer code."""
    distinct = df.drop_duplicates(subset=[label_col])
    ordered = distinct[[label_col, code_col]].sort_values(by=[code_col])
    return ordered.set_index(code_col)


pn_df = _code_to_label_frame('positive', 'encode_pn')
tense_df = _code_to_label_frame('tense', 'encode_tense')
form_df = _code_to_label_frame('form', 'encode_form')
subject_df = _code_to_label_frame('subject', 'encode_subject')
primary_df = _code_to_label_frame('primary', 'encode_primary')

In [None]:
# {integer code: {column name: label}} lookups used to decode model predictions.
pn_dict = pn_df.to_dict('index')
tense_dict = tense_df.to_dict('index')
form_dict = form_df.to_dict('index')
subject_dict = subject_df.to_dict('index')
primary_dict = primary_df.to_dict('index')
# No `secondary_dict` is built here: `secondary_df` is never created in this
# notebook (the old line raised NameError) and nothing reads a secondary lookup.

In [None]:
# Creating the dataset and dataloader

train_size = 0.78
# Sample the training share from the split pool; whatever is left becomes the test split.
sampled_train = encoded_df.sample(frac=train_size, random_state=200)
test_dataset = encoded_df.drop(sampled_train.index).reset_index(drop=True)
# Append the per-intent hold-out rows so training sees every intent at least once.
train_dataset = pd.concat([sampled_train, hand_df]).reset_index(drop=True)

for split_name, split_frame in (("FULL", df), ("TRAIN", train_dataset), ("TEST", test_dataset)):
    print(f"{split_name} Dataset: {split_frame.shape}")

training_set = Triage(train_dataset, tokenizer, MAX_LEN)
testing_set = Triage(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (35784, 17)
TRAIN Dataset: (28751, 17)
TEST Dataset: (7033, 17)


In [None]:
# Keyword arguments for the two DataLoaders (single-process loading, shuffled batches).
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
class DistillBERTClass(torch.nn.Module):
    """DistilBERT encoder followed by a shared hidden layer and five classification heads.

    The heads score positive/negative (3 classes), tense (4), form (4),
    subject (4) and primary (one class per entry of `primary_dict`).
    """

    def __init__(self):
        super().__init__()
        width = 768  # feature size used by every layer stacked on the encoder
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(width, width)
        self.dropout = torch.nn.Dropout(0.3)
        self.pn_class = torch.nn.Linear(width, 3)
        self.tense_class = torch.nn.Linear(width, 4)
        self.form_class = torch.nn.Linear(width, 4)
        self.subject_class = torch.nn.Linear(width, 4)
        self.primary_class = torch.nn.Linear(width, len(primary_dict))

    def forward(self, input_ids, attention_mask):
        """Return the logits of the five heads as `(pn, tense, form, subject, primary)`."""
        encoder_out = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        # First element of the encoder output, taken at the first token of each sequence.
        first_token = encoder_out[0][:, 0]
        shared = self.dropout(torch.relu(self.pre_classifier(first_token)))
        return (
            self.pn_class(shared),
            self.tense_class(shared),
            self.form_class(shared),
            self.subject_class(shared),
            self.primary_class(shared),
        )

In [None]:
class ElectraSmallClass(torch.nn.Module):
    """ELECTRA-small encoder followed by a shared hidden layer and five classification heads.

    The heads score positive/negative (3 classes), tense (4), form (4),
    subject (4) and primary (one class per entry of `primary_dict`).
    """

    def __init__(self):
        super().__init__()
        width = 256  # feature size used by every layer stacked on the encoder
        self.l1 = ElectraModel.from_pretrained("google/electra-small-discriminator")
        self.pre_classifier = torch.nn.Linear(width, width)
        self.dropout = torch.nn.Dropout(0.3)
        self.pn_class = torch.nn.Linear(width, 3)
        self.tense_class = torch.nn.Linear(width, 4)
        self.form_class = torch.nn.Linear(width, 4)
        self.subject_class = torch.nn.Linear(width, 4)
        self.primary_class = torch.nn.Linear(width, len(primary_dict))

    def forward(self, input_ids, attention_mask):
        """Return the logits of the five heads as `(pn, tense, form, subject, primary)`."""
        encoder_out = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        # First element of the encoder output, taken at the first token of each sequence.
        first_token = encoder_out[0][:, 0]
        shared = self.dropout(torch.relu(self.pre_classifier(first_token)))
        return (
            self.pn_class(shared),
            self.tense_class(shared),
            self.form_class(shared),
            self.subject_class(shared),
            self.primary_class(shared),
        )

In [None]:
# Build the DistilBERT variant and move it to the selected device.
model = DistillBERTClass()
# Alternative backbone (its matching tokenizer is `etokenizer`):
# model = ElectraSmallClass()
model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


DistillBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_feat

In [None]:
# Setting the optimizer and loss function.
# The same cross-entropy criterion is applied to each classification head.
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [None]:
# Calculate accuracy
def calcuate_accu(big_idx, targets):
    """Return how many entries of `big_idx` equal the matching entry of `targets`.

    Despite the name, the result is a count of correct predictions, not a
    ratio; callers divide by the number of examples themselves.
    """
    is_match = big_idx == targets
    return is_match.sum().item()

In [None]:
# Defining the training function

def train(epoch):
    """Run one optimisation pass over `training_loader`, printing running metrics.

    Works on the notebook-level `model`, `training_loader`, `loss_function`,
    `optimizer` and `device`. The five head losses are summed into a single
    loss for the backward pass.

    Args:
        epoch: Epoch number; only used to label the end-of-epoch summary.

    Returns:
        None. Metrics are printed, not returned.
    """
    # (batch key prefix, periodic-log loss label, periodic-log accuracy label, summary label)
    heads = (
        ('pn', 'Positive/Negative Loss', 'Positive/Negative', 'PN'),
        ('tense', 'Tense Loss', 'Tense Accuracy', 'Tense'),
        ('form', 'Form Loss', 'Form Accuracy', 'Form'),
        ('subject', 'Subject Loss', 'Subject Accuracy', 'Subject'),
        ('primary', 'Primary Loss', 'Primary Accuracy', 'Primary'),
    )
    head_names = [head[0] for head in heads]
    running_loss = {name: 0 for name in head_names}
    n_correct = {name: 0 for name in head_names}
    nb_tr_steps = 0
    nb_tr_examples = 0

    model.train()
    for step, data in enumerate(training_loader):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        targets = {
            name: data[f'{name}_targets'].to(device, dtype=torch.long)
            for name in head_names
        }

        pn_out, tense_out, form_out, subject_out, primary_out = model(ids, mask)
        logits = {
            'pn': pn_out,
            'tense': tense_out,
            'form': form_out,
            'subject': subject_out,
            'primary': primary_out,
        }

        losses = {}
        for name in head_names:
            losses[name] = loss_function(logits[name], targets[name])
            running_loss[name] += losses[name].item()
            # Detach so the bookkeeping never becomes part of the autograd graph.
            _, predicted = torch.max(logits[name].detach(), dim=1)
            n_correct[name] += calcuate_accu(predicted, targets[name])

        nb_tr_steps += 1
        nb_tr_examples += targets['pn'].size(0)

        # Every 1000th batch (including the first) print the averages
        # accumulated since the start of this epoch.
        if step % 1000 == 0:
            for name, loss_label, accu_label, _ in heads:
                print(f"{loss_label} per 1000 steps: {running_loss[name]/nb_tr_steps}")
                print(f"{accu_label} per 1000 steps: {(n_correct[name]*100)/nb_tr_examples}")
            print()

        loss = (losses['pn'] + losses['tense'] + losses['form']
                + losses['subject'] + losses['primary'])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    for name, _, _, summary_label in heads:
        print(f'{summary_label} Accuracy for Epoch {epoch}: {(n_correct[name]*100)/nb_tr_examples}')
        print(f"Training Loss: {running_loss[name]/nb_tr_steps}")

    print()

    return

In [None]:
# Train the model: one call to train() per epoch, EPOCHS passes in total.
for epoch in range(EPOCHS):
    train(epoch)



Positive/Negative Loss per 1000 steps: 1.0436015129089355
Positive/Negative per 1000 steps: 62.5
Tense Loss per 1000 steps: 1.4297971725463867
Tense Accuracy per 1000 steps: 12.5
Form Loss per 1000 steps: 1.4056756496429443
Form Accuracy per 1000 steps: 25.0
Subject Loss per 1000 steps: 1.3669897317886353
Subject Accuracy per 1000 steps: 37.5
Primary Loss per 1000 steps: 4.5963335037231445
Primary Accuracy per 1000 steps: 0.0

Positive/Negative Loss per 1000 steps: 0.24983686149365955
Positive/Negative per 1000 steps: 91.30869130869131
Tense Loss per 1000 steps: 0.36266138952980925
Tense Accuracy per 1000 steps: 87.88711288711289
Form Loss per 1000 steps: 0.42057408248032485
Form Accuracy per 1000 steps: 86.07017982017982
Subject Loss per 1000 steps: 0.5602377463917394
Subject Accuracy per 1000 steps: 78.75874125874125
Primary Loss per 1000 steps: 2.8467230054882022
Primary Accuracy per 1000 steps: 33.21678321678322

PN Accuracy for Epoch 0: 92.79329414629056
Training Loss: 0.211147470

In [None]:
# defining validation function

def valid(model, testing_loader):
    """Evaluate `model` on `testing_loader` and print loss/accuracy per head.

    Uses the notebook-level `loss_function`, `device` and `calcuate_accu`.
    The model is switched to eval mode and gradients are disabled.

    Args:
        model: Module returning the five head logits as
            `(pn, tense, form, subject, primary)`.
        testing_loader: Iterable of batches shaped like those produced by the
            Triage dataset (`ids`, `mask` and the five `*_targets` entries).

    Returns:
        Tuple of accuracies in percent:
        `(pn, tense, form, subject, primary)`.
    """
    # (batch key prefix, heading printed above that head's metrics)
    heads = (
        ('pn', 'Positive and Negative'),
        ('tense', 'Tense'),
        ('form', 'Form'),
        ('subject', 'Subject'),
        ('primary', 'Primary'),
    )
    head_names = [name for name, _ in heads]
    running_loss = {name: 0 for name in head_names}
    n_correct = {name: 0 for name in head_names}
    nb_tr_steps = 0
    nb_tr_examples = 0

    model.eval()
    with torch.no_grad():
        for data in testing_loader:
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            targets = {
                name: data[f'{name}_targets'].to(device, dtype=torch.long)
                for name in head_names
            }

            pn_out, tense_out, form_out, subject_out, primary_out = model(ids, mask)
            logits = {
                'pn': pn_out,
                'tense': tense_out,
                'form': form_out,
                'subject': subject_out,
                'primary': primary_out,
            }

            for name in head_names:
                running_loss[name] += loss_function(logits[name], targets[name]).item()
                _, predicted = torch.max(logits[name], dim=1)
                n_correct[name] += calcuate_accu(predicted, targets[name])

            nb_tr_steps += 1
            nb_tr_examples += targets['pn'].size(0)

    accuracies = []
    for name, heading in heads:
        # Loss is averaged over batches, accuracy over individual examples.
        epoch_loss = running_loss[name]/nb_tr_steps
        epoch_accu = (n_correct[name]*100)/nb_tr_examples
        print(heading)
        print(f"Loss: {epoch_loss}")
        print(f"Accuracy: {epoch_accu}")
        accuracies.append(epoch_accu)

    return tuple(accuracies)

In [None]:
# Validate the model on the held-out split; `acc` holds the five accuracies
# in percent, ordered (pn, tense, form, subject, primary).
acc = valid(model, testing_loader)



Positive and Negative
Loss: 0.11639929681237349
Accuracy: 97.75344803071236
Tense
Loss: 0.14718686691489272
Accuracy: 97.21313806341533
Form
Loss: 0.19774747259295758
Accuracy: 96.36001706242001
Subject
Loss: 0.24057713887453824
Accuracy: 94.08502772643253
Primary
Loss: 0.63219550981144
Accuracy: 88.36911701976398


In [None]:
# The trailing "/" is required: the save cells build file paths by plain string
# concatenation, and without it files would be written next to the folder
# (e.g. ".../Lucy Lucas Classification Modeltrain_dataset.csv") instead of inside it.
drive_file_path = "/content/drive/MyDrive/Lucy Lucas Classification Model/"

In [None]:
# Store train_dataset and test_dataset so the exact split can be reloaded later.
# NOTE: the paths are formed by plain concatenation, so `drive_file_path` must end
# with a path separator for these files to land inside that folder.
train_dataset.to_csv(drive_file_path + "train_dataset.csv")
test_dataset.to_csv(drive_file_path + "test_dataset.csv")

In [None]:
# Store the code -> label tables (just in case the encoding changes), one CSV per head.
# NOTE: the paths are formed by plain concatenation, so `drive_file_path` must end
# with a path separator for these files to land inside that folder.
pn_df.to_csv(drive_file_path + 'encode_pn.csv')
tense_df.to_csv(drive_file_path + 'encode_tense.csv')
form_df.to_csv(drive_file_path + 'encode_form.csv')
subject_df.to_csv(drive_file_path + 'encode_subject.csv')
primary_df.to_csv(drive_file_path + 'encode_primary.csv')

In [None]:
# Save a checkpoint holding the model weights and the optimizer state.
# 'epoch' records the configured EPOCHS constant, not a counter read from the loop.
# NOTE: the path is formed by plain concatenation, so `drive_file_path` must end
# with a path separator for the file to land inside that folder.
torch.save({
    'epoch': EPOCHS,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict()
    }, drive_file_path + 'model_and_optimizer_state_dict.pt')

In [None]:
# inference function
def predict(text, model=model, tokenizer=tokenizer):
  """Classify a single message and return the decoded label of every head.

  Args:
    text: Message to classify.
    model: Module returning the five head logits; defaults to the notebook's
      model as it was bound when this cell ran.
    tokenizer: Tokenizer exposing `encode_plus`; defaults to the notebook's
      tokenizer as it was bound when this cell ran.

  Returns:
    Whatever `returnPrediction` produces for the model output.
  """
  encoded = tokenizer.encode_plus(text, return_tensors='pt', truncation=True)
  # Follow the model to whichever device it lives on instead of assuming a GPU.
  encoded = encoded.to(next(model.parameters()).device)

  # Inference must run in eval mode (dropout off) and without autograd
  # bookkeeping; the previous train/eval mode is restored afterwards.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      output = model(encoded['input_ids'], encoded['attention_mask'])
  finally:
    model.train(was_training)

  return returnPrediction(output)

In [None]:
# inference helper
def returnPrediction(outputTensor):
  """Decode the five head outputs of a single example into their label strings.

  For each head the highest-scoring class index of the first row is looked up
  in the matching notebook-level dictionary.

  Returns:
    Tuple `(pn, tense, form, subject, primary)` of decoded labels.
  """
  # (lookup table, column holding the label) in the order the model emits its heads.
  lookups = (
    (pn_dict, 'positive'),
    (tense_dict, 'tense'),
    (form_dict, 'form'),
    (subject_dict, 'subject'),
    (primary_dict, 'primary'),
  )

  labels = []
  for position, (code_to_row, column) in enumerate(lookups):
    scores = outputTensor[position].tolist()[0]
    best_code = scores.index(max(scores))
    labels.append(code_to_row[best_code][column])

  return tuple(labels)

In [None]:
# Smoke test with an empty message. predict() returns five labels; a recorded
# output showing six values comes from an earlier version of the notebook.
predict("")

('positive', 'present', 'declarative', 'You', 'romantic', 'name')

In [None]:
# electra-small predict
# predict("I am hungry")
# ('positive', 'past', 'declarative', 'I', 'state', 'hungry')