In [None]:
from peft import PeftModel
from transformers import RwkvModel, DefaultDataCollator, Trainer
from torch.utils.data import DataLoader
import datasets
from models import RWKV_TOKENIZER, RwkvModelForSequenceClassification
import torch

In [None]:
import os

# Configure the PyTorch CUDA allocator through its environment variable.
# NOTE(review): this presumably has to be in place before the first CUDA
# allocation to have any effect — keep this cell ahead of the model load.
os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:128"})
# Alternative knob that was tried and left disabled:
#torch.cuda.set_per_process_memory_fraction(0.7, 0)

In [None]:
# Prefix prepended to every model / vocab / data path used in this notebook.
# Empty string means all paths are resolved relative to the working directory.
PATH = ""

In [None]:
# Load the base RWKV sequence classifier from the local checkpoint directory.
# Three output classes; token id 0 is declared as the padding id, matching the
# `pad_token_id` used by the tokenization helper in this notebook.
model = RwkvModelForSequenceClassification.from_pretrained(
    PATH + 'model/raven-0.4b-world',
    num_labels=3,
    pad_token_id=0,
)

In [None]:
# Wrap the base model with the adapter stored under `output_dir`.
# `is_trainable=False` because this notebook only runs inference.
clf = PeftModel.from_pretrained(model, PATH + 'output_dir', is_trainable=False)

In [None]:
# Replace the adapter-wrapped model with whatever merge_and_unload() returns.
# NOTE(review): per PEFT this is expected to fold the adapter weights into the
# base model and hand back the plain model — confirm for this adapter type.
# Not idempotent: `clf` is rebound, so re-running this cell on its own operates
# on the already-merged object rather than on the PeftModel.
clf = clf.merge_and_unload()

In [None]:
# Module-level settings read by `tokenization_rwkv` at call time.
pad_token_id = 0      # id used to right-pad short sequences
max_length = 2048     # every sequence is padded / truncated to this many ids
tokenizer = RWKV_TOKENIZER(PATH + 'rwkv_vocab_v20230424.txt')

def tokenization_rwkv(example):
    """Tokenize a batch whose text lives in ``"data"`` and labels in ``"labels"``.

    Each text is encoded with the module-level ``tokenizer`` and then forced to
    exactly ``max_length`` ids: shorter sequences are right-padded with
    ``pad_token_id``, longer ones are truncated.

    NOTE: a later cell of this notebook defines ``tokenization_rwkv`` again for
    the ``news`` / ``label`` schema; when the notebook is run top to bottom that
    later definition is the one downstream cells end up calling.

    Args:
        example: batch mapping (column name -> list of values).

    Returns:
        The same mapping, mutated in place, with ``input_ids`` added and
        ``labels`` replaced by a copy of itself.
    """
    def _fit(ids):
        # Pad on the right when too short, otherwise cut to max_length.
        if len(ids) < max_length:
            return ids + [pad_token_id] * (max_length - len(ids))
        return ids[:max_length]

    encoded = [tokenizer.encode(text) for text in example["data"]]
    fixed_length = [_fit(ids) for ids in encoded]
    label_copy = example['labels'].copy()
    example['input_ids'] = fixed_length
    example['labels'] = label_copy
    return example

In [None]:
# Same tokenizer settings as the earlier tokenization cell, re-established here
# so this cell can be run on its own.
max_length, pad_token_id = 2048, 0
tokenizer = RWKV_TOKENIZER(PATH + 'rwkv_vocab_v20230424.txt')

def tokenization_rwkv(example, text_column="news", label_column="label"):
    """Tokenize a batch of texts to fixed-length id lists and expose its labels.

    Intended for ``Dataset.map(..., batched=True)``: ``example`` maps column
    names to lists of values. Each text is encoded with the module-level
    ``tokenizer`` and forced to exactly ``max_length`` ids — right-padded with
    ``pad_token_id`` when shorter, truncated when longer.

    The column names are parameters so the same helper serves datasets with a
    different schema (e.g. ``text_column="data", label_column="labels"``)
    without copy-pasting the function; the defaults keep the ``news`` /
    ``label`` behaviour callers in this notebook rely on.

    Args:
        example: batch mapping (column name -> list of values).
        text_column: name of the column holding the raw text.
        label_column: name of the column holding the integer labels.

    Returns:
        The same mapping, mutated in place, with ``input_ids`` set to the
        padded/truncated id lists and ``labels`` set to a copy of the label
        column.

    Raises:
        KeyError: if ``text_column`` or ``label_column`` is missing.
    """
    encoded = [tokenizer.encode(text) for text in example[text_column]]

    fixed_length = []
    for ids in encoded:
        ids = ids[:max_length]
        # After truncation the pad count is never negative; it is zero for
        # sequences that were already at or above max_length.
        fixed_length.append(ids + [pad_token_id] * (max_length - len(ids)))

    # Read the labels before writing any key so a missing label column fails
    # without leaving a half-updated batch behind.
    labels = example[label_column].copy()
    example['input_ids'] = fixed_length
    example['labels'] = labels
    return example

In [None]:
# Read the small test split from CSV; the single split is exposed as "test".
ds = datasets.load_dataset(
    'csv',
    data_files={"test": PATH + 'data/test_small.csv'},
)

# Tokenize in batches and drop the raw columns so that only `input_ids` and
# `labels` remain for the collator.
ds = ds.map(
    tokenization_rwkv,
    batched=True,
    remove_columns=['news', 'length', 'label'],
)
test_ds = ds['test']

# Batched tensor loader over the tokenized split.
# NOTE(review): no later cell shown in this notebook reads `test_dataloader`;
# predictions are produced through `Trainer.predict` instead.
test_dataloader = DataLoader(
    test_ds,
    batch_size=32,
    num_workers=10,
    pin_memory=True,
    collate_fn=DefaultDataCollator(return_tensors='pt'),
)

In [None]:
# Switch the merged classifier to evaluation mode, then wrap it in a Trainer
# that is used purely for its predict() helper (no training arguments, no
# datasets are passed here, so library defaults apply).
clf.eval()
predictor = Trainer(model=clf)

In [None]:
import pandas as pd

In [None]:
# Turn off the `datasets` progress bars. The prediction loop further down calls
# Dataset.map once per chunk, which would presumably print a bar every time.
from datasets.utils.logging import disable_progress_bar
disable_progress_bar()

In [None]:
# Evaluation set as a DataFrame. Later cells read its `labels`, `news` and
# `length` columns, so the CSV is expected to provide all three.
df = pd.read_csv(PATH+'data/test_2.csv')

In [None]:
# Integer class id for each string category found in the `labels` column.
mapping = {'non-hate':0, 'offensive':1, 'hate':2}
df['label'] = df['labels'].map(mapping)

# Series.map yields NaN for any value that is not a key of `mapping` (and the
# column silently becomes float). Fail here with the offending values instead
# of letting NaN labels reach the metrics cells.
unmapped_labels = df.loc[df['label'].isna(), 'labels'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Values in 'labels' with no entry in `mapping`: {list(unmapped_labels)}"
    )

In [None]:
import numpy as np

In [None]:
# Show the frame to eyeball the new integer `label` column next to `labels`.
df

In [None]:
# Number of rows tokenized and sent through predict() per iteration; kept small
# to bound GPU memory use at max_length-token inputs.
PREDICT_CHUNK_ROWS = 26

y_preds = []   # predicted class id per row, in df order
probas = []    # per-row score vector as returned by the model (predictions[0])

with torch.no_grad():
    # Walk the frame by its real length rather than a fixed row count so every
    # row gets exactly one prediction even when the CSV size changes; the last
    # chunk may simply be shorter.
    for start in range(0, len(df), PREDICT_CHUNK_ROWS):
        chunk = df.iloc[start:start + PREDICT_CHUNK_ROWS]
        ds = datasets.Dataset.from_pandas(chunk)
        ds = ds.map(tokenization_rwkv, remove_columns=['news', 'length', 'label'], batched=True)
        y_pred = predictor.predict(ds)
        # The model output is a tuple; its first element holds the per-class
        # scores with one row per example, hence argmax over axis 1.
        scores = y_pred.predictions[0]
        y_preds.extend(np.argmax(scores, axis=1))
        probas.extend(scores)
        # Release cached GPU blocks between chunks to limit fragmentation.
        torch.cuda.empty_cache()

# Guard the column assignment that follows: one prediction per input row.
assert len(y_preds) == len(df) and len(probas) == len(df), (
    f"expected {len(df)} predictions, got {len(y_preds)}"
)

In [None]:
# Attach the predicted class id and the score vector collected by the
# prediction loop. Both lists need exactly len(df) entries, in row order.
df['preds'] = y_preds
# One array-like per row, so this becomes an object-dtype column.
df['probas'] = probas

In [None]:
# Imported in this cell so the notebook survives Restart & Run All: the only
# other `sklearn.metrics` import in the notebook sits in a later cell.
from sklearn.metrics import confusion_matrix

# Rows are true classes, columns are predicted classes.
confusion_matrix(df['label'], df['preds'])

In [None]:
# Imported in this cell so the notebook survives Restart & Run All: the only
# other `sklearn.metrics` import in the notebook sits in a later cell.
from sklearn.metrics import classification_report

# zero_division=0 reports 0 (without a warning) for classes that were never predicted.
print(classification_report(df['label'], df['preds'], zero_division=0))

In [None]:
# Show the frame again, now including the `preds` and `probas` columns.
df

In [None]:
# Persist inputs, labels and predictions without the row index.
# NOTE(review): `probas` holds one array-like per row, so the CSV will contain
# its string form — confirm downstream readers can parse that.
df.to_csv(PATH+'data/test_small_preds_2.csv', index=False)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Same confusion matrix as computed earlier in the notebook, re-evaluated after
# the `sklearn.metrics` import cell directly above.
confusion_matrix(df['label'], df['preds'])


In [None]:
# Same per-class precision / recall / F1 report as earlier in the notebook;
# zero_division=0 reports 0 for classes that were never predicted.
print(classification_report(df['label'], df['preds'], zero_division=0))