In [1]:
# Import libraries
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from torch.utils.data import TensorDataset, random_split, DataLoader
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load dataset
# The CSV is read into a DataFrame; later cells use its 'Essay' (text) and
# 'Grade' (label) columns. The path is relative to the notebook's working directory.
df = pd.read_csv('Dummy_Dataset.csv')

In [3]:
# Preprocess data
# Tokenizer for the 'bert-base-uncased' checkpoint; the same checkpoint name is
# used when the encoder is loaded, so vocabulary and model stay in sync.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Fixed sequence length that tokenize_answers pads every essay to.
# NOTE(review): 512 is presumably the position-embedding limit of
# bert-base-uncased — confirm before raising it.
max_len = 512

In [4]:
def tokenize_answers(answers):
    """Tokenize a collection of essays into fixed-length BERT inputs.

    Every text is encoded with the module-level ``tokenizer``, truncated to at
    most ``max_len`` tokens (special tokens included) and padded up to exactly
    ``max_len`` so that all rows can be stacked into one tensor.

    Args:
        answers: Iterable of strings, e.g. a pandas Series of essay texts.

    Returns:
        Tuple ``(input_ids, attention_masks)``; both are integer tensors with
        one row per input text and ``max_len`` columns.
    """
    texts = list(answers)
    if not texts:
        # Nothing to encode: hand back well-formed empty tensors instead of
        # failing while trying to build a batch out of zero rows.
        empty = torch.empty((0, max_len), dtype=torch.long)
        return empty, empty.clone()

    # One batched call replaces the per-essay encode_plus loop. The explicit
    # padding/truncation arguments replace the deprecated `pad_to_max_length`
    # flag and make sure over-long essays are cut to `max_len` on purpose
    # rather than through a warned-about fallback.
    encoded = tokenizer(texts,
                        add_special_tokens=True,
                        max_length=max_len,
                        padding='max_length',
                        truncation=True,
                        return_attention_mask=True,
                        return_tensors='pt')

    return encoded['input_ids'], encoded['attention_mask']


In [9]:
input_ids, attention_masks = tokenize_answers(df['Essay'])

# Convert scores to numerical values
# Grades are mapped to integer class ids via a pandas categorical.
scores = df['Grade'].astype('category').cat.codes
# cat.codes is int8, but nn.CrossEntropyLoss needs int64 class indices, so the
# labels are created as long once here instead of being cast on every batch.
labels = torch.tensor(scores.values, dtype=torch.long)

# Split dataset (80% train / 20% validation)
dataset = TensorDataset(input_ids, attention_masks, labels)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
# A dedicated, seeded generator keeps the split identical across kernel
# restarts, so reported validation metrics are comparable between runs.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(dataset, [train_size, val_size],
                                          generator=split_generator)

# Load BERT model
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Freeze BERT model parameters so only layers added on top of it are trained
for param in bert_model.parameters():
    param.requires_grad = False

score value:  0       1
1       0
2       2
3       0
4       2
       ..
2389    0
2390    0
2391    0
2392    0
2393    1
Length: 2394, dtype: int8


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# Add fully connected layer
class EssayGrader(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        bert: Encoder module to wrap. It must accept ``input_ids`` and
            ``attention_mask`` keyword arguments, expose ``config.hidden_size``
            and return its pooled output at index 1. Defaults to the
            notebook-level ``bert_model``.
        num_classes: Number of output classes (grade levels). Defaults to 3.
        dropout: Dropout probability applied to the pooled output before the
            linear layer. Defaults to 0.1.
    """

    def __init__(self, bert=None, num_classes=3, dropout=0.1):
        super(EssayGrader, self).__init__()
        # Fall back to the global encoder so existing `EssayGrader()` calls
        # keep working unchanged.
        self.bert = bert_model if bert is None else bert
        self.dropout = nn.Dropout(dropout)
        # Read the feature width from the encoder config instead of
        # hard-coding 768, so other BERT sizes work too.
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class logits, one row per input sequence.

        Args:
            input_ids: Token-id tensor passed to the encoder.
            attention_mask: Mask tensor passed to the encoder alongside
                ``input_ids``.

        Returns:
            Output of the linear head applied to the encoder's pooled output.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Positional access works for both tuple and ModelOutput returns.
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

In [19]:
# Train model
batch_size = 8
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = EssayGrader().to(device)

criterion = nn.CrossEntropyLoss()
# Only hand trainable parameters to the optimizer; the encoder is frozen.
optimizer = optim.Adam([p for p in model.parameters() if p.requires_grad], lr=2e-5)


def _run_epoch(model, dataloader, criterion, device, optimizer=None):
    """Run one pass over ``dataloader`` and return ``(mean_loss, accuracy)``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        dataloader: Yields ``(input_ids, attention_mask, labels)`` batches.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the batch tensors are moved to.
        optimizer: If given, the model is put in training mode and updated
            after every batch; if ``None`` the pass is evaluation-only and
            runs without gradient tracking.

    Returns:
        Tuple of per-sample mean loss and fraction of correct predictions.
        Both are ``nan`` when the dataloader yields no samples.
    """
    training = optimizer is not None
    model.train(training)

    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    with torch.set_grad_enabled(training):
        for batch_input_ids, batch_attention_masks, batch_labels in dataloader:
            batch_input_ids = batch_input_ids.to(device)
            batch_attention_masks = batch_attention_masks.to(device)
            batch_labels = batch_labels.to(device).long()

            if training:
                optimizer.zero_grad()

            logits = model(batch_input_ids, batch_attention_masks)
            loss = criterion(logits, batch_labels)

            if training:
                loss.backward()
                optimizer.step()

            # Weight by batch size so a smaller final batch does not count as
            # much as a full one (averaging per-batch means would skew this).
            n_batch = batch_labels.size(0)
            loss_sum += loss.item() * n_batch
            n_correct += (logits.argmax(dim=1) == batch_labels).sum().item()
            n_seen += n_batch

    if n_seen == 0:
        return float('nan'), float('nan')
    return loss_sum / n_seen, n_correct / n_seen


num_epochs = 5
for epoch in range(num_epochs):
    train_loss, train_acc = _run_epoch(model, train_dataloader, criterion, device, optimizer)
    val_loss, val_acc = _run_epoch(model, val_dataloader, criterion, device)

    print('Epoch [{}/{}], Train Loss: {:.4f}, Train Acc: {:.4f}, Val Loss: {:.4f}, Val Acc: {:.4f}'
        .format(epoch+1, num_epochs, train_loss, train_acc, val_loss, val_acc))

Epoch [1/5], Train Loss: 1.1036, Train Acc: 0.3644, Val Loss: 1.0830, Val Acc: 0.4030
Epoch [2/5], Train Loss: 1.0687, Train Acc: 0.4323, Val Loss: 1.0550, Val Acc: 0.4068
Epoch [3/5], Train Loss: 1.0422, Train Acc: 0.4767, Val Loss: 1.0363, Val Acc: 0.4860
Epoch [4/5], Train Loss: 1.0219, Train Acc: 0.5010, Val Loss: 1.0201, Val Acc: 0.4610
Epoch [5/5], Train Loss: 1.0007, Train Acc: 0.5198, Val Loss: 0.9953, Val Acc: 0.5199


In [2]:
def predict_scores(file_path, batch_size=8):
    """Print per-grade-level accuracy of the trained model on a CSV file.

    The file must have an 'Essay' column (text) and a 'Grade' column. Uses the
    notebook-level ``model``, ``device`` and ``tokenize_answers``, so those
    cells have to be executed first.

    NOTE(review): the training cell encodes 'Grade' through pandas category
    codes, while this function uses the raw 'Grade' values directly as class
    indices. That only agrees when the grades in the file are already the
    integers 0..num_classes-1 — confirm against the dataset.

    Args:
        file_path: Path of the CSV file to evaluate.
        batch_size: Number of essays sent through the model per forward pass.

    Returns:
        None. One line per score level is printed; a level with no samples in
        the file is reported as ``nan``.
    """
    model.eval()

    # Load CSV file
    df = pd.read_csv(file_path)
    score = df['Grade'].to_list()

    # Tokenize answers (kept on CPU; only the current batch is moved to device)
    input_ids, attention_masks = tokenize_answers(df['Essay'])

    predicted_scores = []
    num_classes = 3
    with torch.no_grad():
        # Mini-batches keep memory bounded; a single forward pass over every
        # essay at full sequence length can exhaust GPU/CPU memory.
        for start in range(0, input_ids.size(0), batch_size):
            stop = start + batch_size
            logits = model(input_ids[start:stop].to(device),
                           attention_masks[start:stop].to(device))
            num_classes = logits.size(1)
            # argmax of the logits equals argmax of their softmax.
            predicted_scores.extend(logits.argmax(dim=1).cpu().tolist())

    correct_score = [0] * num_classes
    total_score = [0] * num_classes

    for actual, predicted in zip(score, predicted_scores):
        total_score[actual] += 1
        if actual == predicted:
            correct_score[actual] += 1

    for j in range(len(correct_score)):
        # Guard against grade levels that do not occur in this file.
        accuracy = correct_score[j] / total_score[j] if total_score[j] else float('nan')
        print('Score Level {}, Accuracy percentage: {:.4f}'
        .format(j, accuracy))

In [3]:
# predict_scores reads the globals `model`, `device` and `tokenize_answers`;
# the model-definition and training cells must have been run in the current
# kernel session, otherwise this call fails with a NameError.
# NOTE(review): this looks like the same CSV that was loaded for training, so
# the reported accuracies would include training samples — confirm.
predict_scores('./Dummy_Dataset.csv')

NameError: name 'model' is not defined