# Load and preprocess data

In [1]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Preprocess data function which including lemmatized and POS
def genLemmatizedEssay(Lemmatized_Tokens, POS_Tags):
    tmp = map(lambda x: str(x[0]) + '_' + str(x[1][1]), zip(Lemmatized_Tokens, POS_Tags))
    return ' '.join(tmp)

def preProcess(filePath):
    # Step 1: Read the .txt file into a pandas DataFrame
    df = pd.read_csv(filePath, usecols=['Essay', 'Score'])

    # Step 2: Tokenization
    df['Tokens'] = df['Essay'].apply(word_tokenize)

    # Step 3: Lemmatization
    lemmatizer = WordNetLemmatizer()
    df['Lemmatized_Tokens'] = df['Tokens'].apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])

    # Step 4: Part-of-speech (POS) tagging
    df['POS_Tags'] = df['Lemmatized_Tokens'].apply(nltk.pos_tag)

    # Step 5: Join all word in lemmatized tokens
    df['Lemmatized_Essays'] = df.apply(lambda row: genLemmatizedEssay(row['Lemmatized_Tokens'], row['POS_Tags']), axis=1)

    return df

In [3]:
filePath = './Baseline_Filtered_Synthesized_Essays.csv'

df_processed = preProcess(filePath)

print(df_processed['Lemmatized_Essays'][:5])

0    A_DT combination_NN of_IN cost_NN and_CC struc...
1    The_DT utilization_NN of_IN three_CD slender_N...
2    Smaller_NNP wind_NN power_NN generation_NN tur...
3    There_EX are_VBP several_JJ reason_NN why_WRB ...
4    The_DT majority_NN of_IN wind_NN turbine_NN ar...
Name: Lemmatized_Essays, dtype: object


# Import BERT model and create dataset for training

In [4]:
# Load the pre-trained BERT model and modify the final layer for classification
bert_model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
    output_attentions=False,
    output_hidden_states=False,
)

# Set the optimizer and learning rate scheduler
optimizer = AdamW(bert_model.parameters(), lr=2e-5, eps=1e-8)

epochs = 2

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [5]:
from torch.utils.data import Dataset, DataLoader

class POSDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]

        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(label)
        }

# Fine-tune BERT model

In [6]:
from transformers import BertTokenizer
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 128  # Maximum sequence length

In [7]:
from sklearn.model_selection import KFold

# Define the number of folds for cross-validation
n_folds = 5

device = 'cpu'

# Initialize the cross-validator
kf = KFold(n_splits=n_folds, shuffle=True)

df_test = preProcess('./Data_Augment_Origin_2.csv')

test_dataset = POSDataset(df_test['Lemmatized_Essays'].to_list(), df_test['Score'].to_list(), bert_tokenizer, max_length)

test_loader = DataLoader(test_dataset, batch_size=8, shuffle=True)

# Iterate over the folds
for fold, (train_indices, val_indices) in enumerate(kf.split(df_processed['Essay'])):
    print(f'Fold {fold+1}')
    print('length of train indices: ', len(train_indices))

    # Initialize the data loaders for training and validation
    df_train = df_processed.loc[train_indices]
    df_val = df_processed.loc[val_indices]

    train_dataset = POSDataset(df_train['Lemmatized_Essays'].to_list(), df_train['Score'].to_list(), bert_tokenizer, max_length)
    val_dataset = POSDataset(df_val['Lemmatized_Essays'].to_list(), df_val['Score'].to_list(), bert_tokenizer, max_length)

    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

    # Train and evaluate the model for each epoch
    for epoch in range(epochs):
        print(f'Epoch {epoch+1}')
        
        # Train the model on the training data
        bert_model.train()
        train_loss = 0.0
        # print('train_loader: ', train_loader[0])
        for batch in train_loader:
            optimizer.zero_grad()
            outputs = bert_model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss /= len(train_loader)
        print(f'Training Loss: {train_loss}')
        
        # Evaluate the model on the validation data
        bert_model.eval()
        val_loss = 0.0
        val_acc = 0.0
        with torch.no_grad():
            for batch in val_loader:
                outputs = bert_model(**batch)
                loss = outputs.loss
                logits = outputs.logits
                val_loss += loss.item()
                preds = logits.argmax(dim=1)
                val_acc += (preds == batch['labels']).float().mean().item()
        val_loss /= len(val_loader)
        val_acc /= len(val_loader)
        print(f'Validation Loss: {val_loss}')
        print(f'Validation Accuracy: {val_acc}')

        # Test on original data
        bert_model.eval()
        test_loss = 0.0
        test_acc = 0.0
        with torch.no_grad():
            for batch in test_loader:
                outputs = bert_model(**batch)
                loss = outputs.loss
                logits = outputs.logits
                test_loss += loss.item()
                preds = logits.argmax(dim=1)
                test_acc += (preds == batch['labels']).float().mean().item()
        test_loss /= len(test_loader)
        test_acc /= len(test_loader)
        print(f'Test Loss: {test_loss}')
        print(f'Test Accuracy: {test_acc}')

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Fold 1
length of train indices:  720
Epoch 1




Training Loss: 0.9399851580460866
Validation Loss: 0.6798028900571491
Validation Accuracy: 0.7663043478260869
Test Loss: 0.8076926052570343
Test Accuracy: 0.65
Epoch 2
Training Loss: 0.47190672068132294
Validation Loss: 0.18401041869883952
Validation Accuracy: 0.9456521739130435
Test Loss: 0.3093110151588917
Test Accuracy: 0.875
Fold 2
length of train indices:  720
Epoch 1




Training Loss: 0.16335236931012737
Validation Loss: 0.10739617146875548
Validation Accuracy: 0.9619565217391305
Test Loss: 0.14929117411375045
Test Accuracy: 0.925
Epoch 2
Training Loss: 0.1009526490026878
Validation Loss: 0.09349371792505616
Validation Accuracy: 0.9782608695652174
Test Loss: 0.49476496372371914
Test Accuracy: 0.85
Fold 3
length of train indices:  720
Epoch 1




Training Loss: 0.04210877879522741
Validation Loss: 0.047477772935172136
Validation Accuracy: 0.9945652173913043
Test Loss: 0.2551157706417143
Test Accuracy: 0.85
Epoch 2
Training Loss: 0.05581254708684153
Validation Loss: 0.041704173230201654
Validation Accuracy: 0.9891304347826086
Test Loss: 0.24559347685426475
Test Accuracy: 0.925
Fold 4
length of train indices:  720
Epoch 1




Training Loss: 0.01852489976833264
Validation Loss: 0.006942187496663436
Validation Accuracy: 1.0
Test Loss: 0.29799986253492533
Test Accuracy: 0.875
Epoch 2
Training Loss: 0.010708240323906972
Validation Loss: 0.0029926699100305204
Validation Accuracy: 1.0
Test Loss: 0.14844702826812864
Test Accuracy: 0.95
Fold 5
length of train indices:  720
Epoch 1




Training Loss: 0.003936752118170262
Validation Loss: 0.0020212222437333803
Validation Accuracy: 1.0
Test Loss: 0.14501011944375933
Test Accuracy: 0.95
Epoch 2
Training Loss: 0.002985651951490177
Validation Loss: 0.002021580294240266
Validation Accuracy: 1.0
Test Loss: 0.13320218885783106
Test Accuracy: 0.925


In [8]:
# Test on original data
df_test = preProcess('./Data_Augment_Origin_2.csv')

test_dataset = POSDataset(df_test['Lemmatized_Essays'].to_list(), df_test['Score'].to_list(), bert_tokenizer, max_length)

import numpy as np
final_test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)
bert_model.eval()
test_loss = 0.0
test_acc = 0.0
predictions = []
with torch.no_grad():
    for batch in final_test_loader:
        outputs = bert_model(**batch)
        loss = outputs.loss
        logits = outputs.logits
        test_loss += loss.item()
        preds = logits.argmax(dim=1)
        predictions = np.append(np.array(predictions), np.array(preds.numpy()))
        print('Model predictions: ', predictions)
        print('Real labels: ', batch['labels'].numpy())
        test_acc += (preds == batch['labels']).float().mean().item()
test_loss /= len(final_test_loader)
test_acc /= len(final_test_loader)
df_test['Predictions'] = predictions
df_test.to_csv('Master_data_with_prediction_2_1_BERT.csv', index=False)
print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_acc}')



Model predictions:  [2. 1. 0. 0. 0. 0. 0. 0.]
Real labels:  [2 1 0 0 0 0 0 0]
Model predictions:  [2. 1. 0. 0. 0. 0. 0. 0. 2. 1. 1. 1. 1. 1. 1. 0.]
Real labels:  [2 1 1 1 1 1 1 0]
Model predictions:  [2. 1. 0. 0. 0. 0. 0. 0. 2. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 2. 2. 2.]
Real labels:  [0 0 0 0 0 2 2 2]
Model predictions:  [2. 1. 0. 0. 0. 0. 0. 0. 2. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 2. 2. 2.
 2. 2. 2. 1. 1. 1. 0. 1.]
Real labels:  [2 2 2 1 1 1 1 1]
Model predictions:  [2. 1. 0. 0. 0. 0. 0. 0. 2. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 2. 2. 2.
 2. 2. 2. 1. 1. 1. 0. 1. 0. 1. 1. 0.]
Real labels:  [1 1 1 0]
Test Loss: 0.11349292274098843
Test Accuracy: 0.925


In [9]:
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score, cohen_kappa_score

# Step 1: Load the CSV file and separate the "score" and "prediction" columns
df = pd.read_csv("Master_data_with_prediction_2_1_BERT.csv")
y_true = df["Score"]
y_pred = df["Predictions"]

# Step 2: Calculate accuracy for each score level
score_levels = sorted(df["Score"].unique())
accuracy_per_level = {}
for level in score_levels:
    mask = y_true == level
    accuracy_per_level[level] = accuracy_score(y_true[mask], y_pred[mask])

print("Accuracy for each score level:")
for level, accuracy in accuracy_per_level.items():
    print(f"Score {level}: {accuracy:.2f}")

print("Accuracy for all score level:")
print(f"Score: {accuracy_score(y_true, y_pred):.2f}")

# Step 3 and Step 4: Calculate the confusion matrix and QWK
conf_matrix = confusion_matrix(y_true, y_pred)
qwk= cohen_kappa_score(y_true, y_pred, weights="quadratic")

print("\nQWK:")
print(f"Score: {qwk:.2f}")

Accuracy for each score level:
Score 0: 1.00
Score 1: 0.87
Score 2: 1.00
Accuracy for all score level:
Score: 0.94

QWK:
Score: 0.95
