# Extract data from file

In [1]:
import pandas as pd
import numpy as np

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv('Data_Augment_Origin_2.csv')

# Filter the DataFrame to include only the rows where the "Score" column is 0
score_0_essays = df[df['Score'] == 0]['Essay'].tolist()
score_1_essays = df[df['Score'] == 1]['Essay'].tolist()
score_2_essays = df[df['Score'] == 2]['Essay'].tolist()

origin_essays = [score_0_essays, score_1_essays, score_2_essays]

# Synthesized paraphase data

In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

device = "cpu"

paraphrase_tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")

paraphrase_model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base").to(device)

def paraphrase(
    question,
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=5,
    repetition_penalty=7.0,
    diversity_penalty=2.0,
    no_repeat_ngram_size=2,
    temperature=0.7,
    max_length=64
):
    input_ids = paraphrase_tokenizer(
        f'paraphrase: {question}',
        return_tensors="pt", padding="longest",
        max_length=max_length,
        truncation=True,
    ).input_ids
    
    outputs = paraphrase_model.generate(
        input_ids, temperature=temperature, repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams, num_beam_groups=num_beam_groups,
        max_length=max_length, diversity_penalty=diversity_penalty
    )

    res = paraphrase_tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return res

def generate_paraphrase(origin_text, return_num):
    if return_num == 1:
        return paraphrase(origin_text, num_beams=2, num_beam_groups=2, num_return_sequences=return_num)
    else:
        return paraphrase(origin_text, num_beams=return_num, num_beam_groups=return_num, num_return_sequences=return_num)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import math
import nltk
import random
import ssl
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

SYNTHESIZED_NUMBER_PER_CLASS = 500

synthesized_essays = [[], [], []]

for i in range(3):
    print(i)
    return_num = math.ceil(SYNTHESIZED_NUMBER_PER_CLASS/len(origin_essays[i]))
    for j in range(len(origin_essays[i])):
        
        sentences = sent_tokenize(origin_essays[i][j])
        paraphrase_list = []
        for sentence in sentences:
            if len(paraphrase_list) == 0:
                paraphrase_list = generate_paraphrase(sentence, return_num)
            else:
                paraphrase_list = [a + " " + b for a, b in zip(paraphrase_list, generate_paraphrase(sentence, return_num))]
        paraphrase_and_origin_text = map(lambda x: [x, origin_essays[i][j]], paraphrase_list)
        synthesized_essays[i] += paraphrase_and_origin_text
    random.shuffle(synthesized_essays[i])
    synthesized_essays[i] = synthesized_essays[i][:SYNTHESIZED_NUMBER_PER_CLASS]

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dungbeoooiuuuthocuteephomaique/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0
1
2


In [4]:
# Check synthesized data length for each type of score

print(len(synthesized_essays[0]))
print(len(synthesized_essays[1]))
print(len(synthesized_essays[2]))

500
500
500


# Fine-tune BERT as Baseline model to filter valid data

In [5]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, GPT2LMHeadModel
from sklearn.model_selection import train_test_split

df_train_baseline = pd.read_csv('Data_Augment_Origin_2.csv', usecols=['Essay', 'Score'])

# Load the pre-trained BERT tokenizer
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def encoding(texts):
    return bert_tokenizer.batch_encode_plus(
        texts.tolist(),
        add_special_tokens=True,
        max_length=256,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

# Load the pre-trained BERT model and modify the final layer for classification
baseline_model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
    output_attentions=False,
    output_hidden_states=False,
)

# Set the optimizer and learning rate scheduler
optimizer = AdamW(baseline_model.parameters(), lr=2e-5, eps=1e-8)

epochs = 2

# Create a PyTorch DataLoader for the training and validation data
class EssayDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [6]:
from sklearn.model_selection import KFold

# Define the number of folds for cross-validation
n_folds = 5

# Initialize the cross-validator
kf = KFold(n_splits=n_folds, shuffle=True)

# Iterate over the folds
for fold, (train_indices, val_indices) in enumerate(kf.split(df_train_baseline['Essay'])):
    print(f'Fold {fold+1}')
    print('length of train indices: ', len(train_indices))

    # Initialize the data loaders for training and validation
    df_train = df_train_baseline.loc[train_indices]
    df_val = df_train_baseline.loc[val_indices]

    train_encodings = encoding(df_train['Essay'])
    val_encodings = encoding(df_val['Essay'])

    train_dataset = EssayDataset(train_encodings, df_train['Score'].to_list())
    val_dataset = EssayDataset(val_encodings, df_val['Score'].to_list())

    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=8, shuffle=False)

    # Train and evaluate the model for each epoch
    for epoch in range(epochs):
        print(f'Epoch {epoch+1}')
        
        # Train the model on the training data
        baseline_model.train()
        train_loss = 0.0
        for batch in train_loader:
            optimizer.zero_grad()
            outputs = baseline_model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss /= len(train_loader)
        print(f'Training Loss: {train_loss}')
        
        # Evaluate the model on the validation data
        baseline_model.eval()
        val_loss = 0.0
        val_acc = 0.0
        with torch.no_grad():
            for batch in val_loader:
                outputs = baseline_model(**batch)
                loss = outputs.loss
                logits = outputs.logits
                val_loss += loss.item()
                preds = logits.argmax(dim=1)
                val_acc += (preds == batch['labels']).float().mean().item()
        val_loss /= len(val_loader)
        val_acc /= len(val_loader)
        print(f'Validation Loss: {val_loss}')
        print(f'Validation Accuracy: {val_acc}')

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Fold 1
length of train indices:  28
Epoch 1




Training Loss: 1.0996016561985016
Validation Loss: 1.241928219795227
Validation Accuracy: 0.125
Epoch 2
Training Loss: 0.99744513630867
Validation Loss: 1.2876579761505127
Validation Accuracy: 0.125
Fold 2
length of train indices:  29




Epoch 1
Training Loss: 1.0269542336463928
Validation Loss: 0.881104052066803
Validation Accuracy: 0.7142857313156128
Epoch 2
Training Loss: 0.9641855508089066
Validation Loss: 0.9062682390213013
Validation Accuracy: 0.7142857313156128
Fold 3
length of train indices:  29




Epoch 1
Training Loss: 0.9048813879489899
Validation Loss: 0.7303051948547363
Validation Accuracy: 1.0
Epoch 2
Training Loss: 0.8447778522968292
Validation Loss: 0.7724858522415161
Validation Accuracy: 0.8571428656578064
Fold 4
length of train indices:  29




Epoch 1
Training Loss: 0.8393738120794296
Validation Loss: 0.6857599020004272
Validation Accuracy: 1.0
Epoch 2
Training Loss: 0.6927273869514465
Validation Loss: 0.5991281867027283
Validation Accuracy: 1.0
Fold 5
length of train indices:  29




Epoch 1
Training Loss: 0.6535292416810989
Validation Loss: 0.548404335975647
Validation Accuracy: 1.0
Epoch 2
Training Loss: 0.57933659106493
Validation Loss: 0.5044216513633728
Validation Accuracy: 1.0


# Use above baseline model to filter synthesized data

In [1]:
def baselineFilterData(essays, label_index,remain = 0.6):
    baseline_model.eval()
    essay_and_confident_score = []
    with torch.no_grad():
        for essay_and_origin in essays:
            # Tokenize the essay and convert to input format
            inputs = bert_tokenizer(essay_and_origin[0], truncation=True, max_length=512,return_tensors='pt')
            outputs = baseline_model(**inputs)
            # Get the predicted score
            logits = outputs.logits
            essay_and_confident_score.append([essay_and_origin[0], logits[0][label_index].item(), essay_and_origin[1]])
    return sorted(essay_and_confident_score, key=lambda x: x[1], reverse=True)[:math.ceil(len(essay_and_confident_score)*remain)]

In [8]:
# Filter data and save as csv file
baseline_filtered_data_0 = baselineFilterData(synthesized_essays[0], 0)
baseline_filtered_data_1 = baselineFilterData(synthesized_essays[1], 1)
baseline_filtered_data_2 = baselineFilterData(synthesized_essays[2], 2)

baseline_filtered_data = [baseline_filtered_data_0, baseline_filtered_data_1, baseline_filtered_data_2]

# Create a dictionary to store the data
data = {'Essay': [], 'Score': [], 'Confident Score': [], 'Origin': []}

# Loop through the essays and scores and append them to the dictionary
for i in range(3):
    data['Essay'].extend(map(lambda x: x[0], baseline_filtered_data[i]))
    data['Confident Score'].extend(map(lambda x: x[1], baseline_filtered_data[i]))
    data['Score'].extend([i] * len(baseline_filtered_data[i]))
    data['Origin'].extend(map(lambda x: x[2], baseline_filtered_data[i]))

# Create a Pandas DataFrame from the dictionary
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
df.to_csv('Baseline_Filtered_Synthesized_Essays.csv', index=False)