In [1]:
from transformers import AutoTokenizer
import torch

import pandas as pd
from tqdm.notebook import tqdm

In [2]:
# Shared settings read by the tokenizer, dataset and inference cells.
config = dict(
    # Hugging Face model id passed to AutoTokenizer.from_pretrained.
    backbone='microsoft/deberta-v3-base',
    # Directory handed to from_pretrained as cache_dir.
    model_path='./cache',
    # Consumed by EssayDataset via this dict; presumably the token limit -- TODO confirm.
    max_length=512,
    # Prefer the GPU when torch can see one, otherwise fall back to the CPU.
    device=('cuda' if torch.cuda.is_available() else 'cpu'),
)

In [3]:
# import model
# from Frozenbert.base_model import Frozenbert
model_path = './checkpoints/FrozenBert-epochs3-val_mcrmse0.4411.pth'
# SECURITY NOTE: torch.load unpickles the file, and unpickling can execute
# arbitrary code -- only load checkpoints that you produced or fully trust.
# map_location restores the tensors onto the device that the inference cells
# move their inputs to, so a checkpoint saved on a GPU also loads on a
# CPU-only machine.
# NOTE(review): PyTorch >= 2.6 defaults to weights_only=True, which rejects a
# fully pickled module; pass weights_only=False there if this call fails.
model = torch.load(model_path, map_location=config['device'])

# Tokenizer for the same backbone, cached under config['model_path'].
tokenizer = AutoTokenizer.from_pretrained(config['backbone'], cache_dir=config['model_path'])

'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /microsoft/deberta-v3-base/resolve/main/tokenizer_config.json (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002B89411B640>, 'Connection to huggingface.co timed out. (connect timeout=10)'))"), '(Request ID: cfe61fd4-1174-4fad-948b-b72a73c96259)')' thrown while requesting HEAD https://huggingface.co/microsoft/deberta-v3-base/resolve/main/tokenizer_config.json
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# 1. Predictions on the test dataset

In [4]:
from utils.classes import EssayDataset
from utils.func import to_scores

# data frame
# NOTE: train.csv is loaded into `df` but not used anywhere in this cell.
df = pd.read_csv('./input/feedback-prize-english-language-learning/train.csv')
test_df = pd.read_csv(
    './input/feedback-prize-english-language-learning/test.csv')
test_ds = EssayDataset(test_df, config, tokenizer=tokenizer, is_test=True)
# shuffle=False keeps the predictions in dataset order, so the "essay<i>"
# labels printed below refer to item i of test_ds instead of a random one.
test_loader = torch.utils.data.DataLoader(
    test_ds, batch_size=1, shuffle=False, num_workers=0, pin_memory=True)

model.eval()
preds = []

# Inference only: no_grad avoids building an autograd graph per forward pass.
with torch.no_grad():
    for inputs in test_loader:
        # Move every tensor of the batch to the configured device.
        inputs = {k: v.to(device=config['device']) for k, v in inputs.items()}

        outputs = model(inputs)
        preds.append(outputs.detach().cpu())

# One row of raw model outputs per essay, then converted by to_scores.
preds = torch.concat(preds)

preds = to_scores(preds)

print("Final Test Predictions:\n")
# Iterate over samples (not batches) so the loop stays correct if batch_size changes.
for i in range(len(test_ds)):
    print('essay%d---cohesion: %.2f, syntax: %.2f, vocabulary: %.2f, phraseology: %.2f, grammar: %.2f, conventions: %.2f'
          % (i, preds[i][0], preds[i][1], preds[i][2], preds[i][3], preds[i][4], preds[i][5]))

Final Test Predictions:

essay0---cohesion: 3.50, syntax: 3.50, vocabulary: 3.50, phraseology: 3.50, grammar: 3.50, conventions: 3.50
essay1---cohesion: 3.00, syntax: 2.50, vocabulary: 3.00, phraseology: 2.50, grammar: 2.00, conventions: 2.50
essay2---cohesion: 3.00, syntax: 3.00, vocabulary: 3.00, phraseology: 3.00, grammar: 2.50, conventions: 2.50


In [5]:
# Inspect what the dataset returns for one test essay (index 2), i.e. the
# tokenizer output such as `input_ids` that is fed to the model.
test_ds[2]

{'input_ids': tensor([     1,   2651,   9805,    704,   1603,    272,    307,   1632,    269,
           1496,    361,    400,    301,    295,    350,    619,    335,    301,
            281,    489,    653,    491,    261,    476,   1757,    264,    291,
           1548,    260,    879,    355,    504,    272,    278,    269,    489,
            493,    264,    489,    282,    653,    491,    354,    264,    282,
          20627,    263,    324,    942,    288,    305,    264,   9603,    385,
            277,    290,   3968,    260,    489,    653,    491,   4573,    274,
            551,    266,   1100,    265,  14285,    261,   1632,    682,    274,
            409,    264,   6525,    264,    333,    402,    310,    354,    339,
            274,    464,    261,   3606,    295,    327,    799,    491,    641,
            262,    384,    260,    279,    362,    919,    272,    489,    653,
            269,    493,    354,    653,    942,    269,    272,    278,   1360,
            274

# 2. Score any text

In [6]:
from utils.classes import EssayDataset
from utils.func import to_scores
# test result after training


def test(test_loader):
    """Run the notebook-level ``model`` over ``test_loader`` and collect its outputs.

    The function does not switch the model to eval mode; callers do that
    themselves before calling it.

    Args:
        test_loader: Iterable of batches, each a mapping of tensors. Every
            tensor is moved to ``config['device']`` before the forward pass.

    Returns:
        A single CPU tensor made by concatenating the per-batch model outputs.
        ``torch.concat`` fails if the loader yields no batches.
    """
    preds = []

    # Inference only: no_grad avoids building an autograd graph per forward pass.
    with torch.no_grad():
        for inputs in test_loader:
            inputs = {k: v.to(device=config['device'])
                      for k, v in inputs.items()}

            outputs = model(inputs)
            preds.append(outputs.detach().cpu())

    return torch.concat(preds)


def test_essay(essay, tokenizer=None):
    """Score a single essay with the notebook-level ``model`` and print the result.

    Args:
        essay: The essay text to score.
        tokenizer: Optional, already-loaded tokenizer. When omitted, one is
            loaded from ``config['backbone']`` exactly as before, which may try
            to reach the Hugging Face hub on every call; passing an existing
            tokenizer avoids that round trip.

    Returns:
        None. The six scores are printed on one line.
    """
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(config['backbone'], cache_dir=config['model_path'])

    # Wrap the text in a one-row frame with the two columns built here.
    test_df = pd.DataFrame(
        [[0, essay]], columns=['text_id', 'full_text'], dtype=object)
    test_ds = EssayDataset(test_df, config, tokenizer=tokenizer, is_test=True)

    # A single sample needs no shuffling.
    test_loader = torch.utils.data.DataLoader(test_ds,
                                              batch_size=1,
                                              shuffle=False,
                                              num_workers=0,
                                              pin_memory=True
                                              )

    model.eval()
    preds = to_scores(test(test_loader=test_loader))

    # Only the first (and only) row is reported.
    print('cohesion: %.1f, syntax: %.1f, vocabulary: %.1f, phraseology: %.1f, grammar: %.1f, conventions: %.1f'
          % (preds[0][0], preds[0][1], preds[0][2], preds[0][3], preds[0][4], preds[0][5]))

In [7]:
# Type the essay to score here; test_essay prints the six predicted scores
# (cohesion, syntax, vocabulary, phraseology, grammar, conventions).
essay = "yo aaa h ddwdjd ly good."
test_essay(essay)

'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /microsoft/deberta-v3-base/resolve/main/tokenizer_config.json (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002B89418B640>, 'Connection to huggingface.co timed out. (connect timeout=10)'))"), '(Request ID: fc790a5f-223b-4383-b2b2-38fd72bec7f2)')' thrown while requesting HEAD https://huggingface.co/microsoft/deberta-v3-base/resolve/main/tokenizer_config.json
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


cohesion: 0.5, syntax: 0.5, vocabulary: 1.0, phraseology: 0.5, grammar: 1.5, conventions: 0.5


# 3. Validation check (targets vs. model outputs)

In [8]:
# NOTE(review): the "validation" frame is read from train.csv in full; if the
# model was trained on part of this file, the numbers below are not a
# held-out estimate -- confirm against the training split.
val_df = pd.read_csv(
    './input/feedback-prize-english-language-learning/train.csv')

# Use the same cache_dir as the other tokenizer loads in this notebook so the
# files already cached under config['model_path'] are reused.
tokenizer = AutoTokenizer.from_pretrained(config['backbone'], cache_dir=config['model_path'])

# No is_test flag here: the loader built from this dataset is unpacked as
# (inputs, targets) pairs by the validation loop.
val_ds = EssayDataset(val_df, config, tokenizer=tokenizer)

val_loader = torch.utils.data.DataLoader(val_ds,
                                         batch_size=1,
                                         shuffle=False,
                                         num_workers=0,
                                         pin_memory=True
                                         )
# val_loader.dataset.df

'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /microsoft/deberta-v3-base/resolve/main/tokenizer_config.json (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002B81EB10E80>, 'Connection to huggingface.co timed out. (connect timeout=10)'))"), '(Request ID: f7162836-e8d6-479f-af73-121b43dd1d05)')' thrown while requesting HEAD https://huggingface.co/microsoft/deberta-v3-base/resolve/main/tokenizer_config.json
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:

def valid(model, val_loader):
    """Compare model outputs with targets over ``val_loader`` and print averages.

    The model is put into eval mode and the whole pass runs under
    ``torch.no_grad()``. Without it, the running sum of outputs would keep the
    autograd graph of every batch alive, and memory would grow with each
    iteration.

    While iterating, up to seven batches are printed in full: those whose
    first sample has a target of 4 in column 0 or column 1 (cohesion / syntax
    according to the header line printed here).

    Args:
        model: Callable taking the batch's input mapping and returning a
            tensor of six values per sample.
        val_loader: Iterable of ``(inputs, targets)`` batches, where
            ``targets['labels']`` holds the target tensor.

    Returns:
        None. The per-column means of targets and outputs are printed.
    """
    model.eval()
    device = config['device']

    shown = 0      # number of example batches printed so far
    n_samples = 0  # samples seen; the divisor for the averages
    progress = tqdm(val_loader, total=len(val_loader))

    sum_targets = torch.zeros((1, 6)).to(device=device)
    sum_outputs = torch.zeros((1, 6)).to(device=device)
    print('cohesion: , syntax: , vocabulary: , phraseology: , grammar: , conventions: ')

    with torch.no_grad():
        # iterate val_loader
        for (inputs, targets) in progress:

            inputs = {k: v.to(device=device) for k, v in inputs.items()}

            targets = targets['labels'].to(device=device)

            outputs = model(inputs)
            # outputs=to_scores(outputs)

            first_sample = targets[0].tolist()
            if shown < 7 and (first_sample[1] == 4 or first_sample[0] == 4):
                print("targets: ", targets.tolist())
                print("outputs: ", outputs.tolist())
                print()
                shown += 1

            # Sum over the batch dimension so the averages stay correct for
            # any batch size, not only batch_size=1.
            sum_targets = sum_targets + targets.sum(dim=0, keepdim=True)
            sum_outputs = sum_outputs + outputs.sum(dim=0, keepdim=True)
            n_samples += targets.shape[0]

    avg_targets = sum_targets / n_samples
    avg_outputs = sum_outputs / n_samples

    print("avg_targets: ", avg_targets.tolist())
    print("avg_outputs: ", avg_outputs.tolist())


valid(model, val_loader)

  0%|          | 0/3911 [00:00<?, ?it/s]

cohesion: , syntax: , vocabulary: , phraseology: , grammar: , conventions: 
targets:  [[3.5, 4.0, 4.0, 3.5, 3.5, 4.0]]
outputs:  [[3.189666509628296, 3.175337791442871, 3.2942020893096924, 3.1032185554504395, 3.2570724487304688, 3.431267023086548]]

targets:  [[3.5, 4.0, 3.5, 3.5, 4.0, 4.0]]
outputs:  [[3.3855955600738525, 3.5182673931121826, 3.52046275138855, 3.3622946739196777, 3.489351511001587, 3.573880910873413]]

targets:  [[4.0, 3.5, 3.0, 4.0, 3.5, 4.0]]
outputs:  [[3.5180983543395996, 3.6222243309020996, 3.8053174018859863, 3.699758768081665, 3.693772077560425, 3.711016893386841]]

targets:  [[3.0, 4.0, 3.0, 4.0, 3.5, 3.0]]
outputs:  [[3.5345354080200195, 3.6554787158966064, 3.741503953933716, 3.804649591445923, 3.7657315731048584, 3.47724986076355]]

targets:  [[4.0, 4.0, 4.0, 4.0, 4.5, 3.5]]
outputs:  [[4.054250717163086, 4.097082614898682, 4.196408748626709, 4.205399990081787, 4.119227409362793, 4.012894153594971]]

targets:  [[4.0, 4.5, 5.0, 4.0, 5.0, 4.5]]
outputs:  [[4.26