In [None]:
import pandas as pd
import tqdm
from bert_score import score
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
from transformers import AutoTokenizer
from IPython.display import display
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Tokenizer loaded from the "model" checkpoint; it is reused by the ROUGE and
# BLEU sections further down.
tokenizer = AutoTokenizer.from_pretrained("model")
# Raise the length cap to a very large value
# (presumably so long answers are never limited by the checkpoint default — TODO confirm).
tokenizer.model_max_length = 1_000_000_000

In [None]:
# trt: load the base and fine-tuned result files and pair them per sample.

df_base = pd.read_csv('fttrt/trt-base.csv', encoding='utf-8')
df_ir = pd.read_csv('fttrt/trt-ft.csv', encoding='utf-8')

# Inner join: keep only rows whose key columns appear in both files.
df = df_base.merge(
    df_ir,
    how='inner',
    on=['question', 'answer', 'knowledges', 'prompt'],
)
print(len(df))

# Cell output: mean of the `time_base` and `time_finetune` columns.
(df['time_base'].mean(), df['time_finetune'].mean())

In [None]:
# ft: load the base and fine-tuned result files and pair them per sample.

df_base = pd.read_csv('fttrt/ft-base.csv', encoding='utf-8')
df_ir = pd.read_csv('fttrt/ft-ft.csv', encoding='utf-8')

# Inner join: keep only rows whose key columns appear in both files.
df = df_base.merge(
    df_ir,
    how='inner',
    on=['question', 'answer', 'knowledges', 'prompt'],
)
print(len(df))

# Cell output: mean of the `time_base` and `time_finetune` columns.
(df['time_base'].mean(), df['time_finetune'].mean())

In [None]:
def _trimmed_mean(ordered, cut):
    """Return the mean of ``ordered`` after dropping ``cut`` items from each end.

    Args:
        ordered: Already-sorted list of numbers.
        cut: Number of items to discard at both the low and the high end.

    Returns:
        The mean of the remaining items, or ``nan`` if nothing remains
        (e.g. ``ordered`` is empty).
    """
    # Use an explicit end index: ``ordered[cut:-cut]`` is an EMPTY slice when
    # cut == 0 (fewer than 20 samples), which would raise ZeroDivisionError.
    kept = ordered[cut:len(ordered) - cut]
    if not kept:
        return float('nan')
    return sum(kept) / len(kept)


# Sorted timing lists; also consumed by the histogram cell.
bt = sorted(df['time_base'].to_list())
ftt = sorted(df['time_finetune'].to_list())
# Drop 5% of the samples at each end before averaging.
trim = int(0.05*len(bt))
_trimmed_mean(bt, trim), _trimmed_mean(ftt, trim)

In [None]:
# Overlaid histograms of the timing lists for the two models.
fig, ax = plt.subplots(figsize=(10, 2))
# Keep each label matched to its data: `ftt` is built from `time_finetune`
# and `bt` from `time_base`.
ax.hist(ftt, bins=10, alpha=0.7, label='Finetuned Model Time')
ax.hist(bt, bins=10, alpha=0.7, label='Base Model Time')

ax.set_xlabel('Time (seconds)')
ax.set_ylabel('Frequency')
ax.set_title('Time Distribution for Base and Fine-tuned Models')
ax.legend()
plt.show()

In [None]:
# trt from ir: load the two result files and pair them per sample.

df_base = pd.read_csv('fttrt/trt-ir-base.csv', encoding='utf-8')
df_ir = pd.read_csv('fttrt/trt-ir-ir.csv', encoding='utf-8')

# Inner join: keep only rows whose key columns appear in both files.
df = df_base.merge(
    df_ir,
    how='inner',
    on=['question', 'answer', 'knowledges', 'prompt'],
)
print(len(df))

# Cell output: mean of the `time_base` and `time_finetune` columns.
(df['time_base'].mean(), df['time_finetune'].mean())

## bert score trt

In [None]:
# Backbone passed to bert_score.score() in the next cell.
model_type = 'microsoft/deberta-xlarge-mnli'

# Reference answers plus the base / fine-tuned responses, taken from whichever
# `df` is currently loaded.
labels, b, ft = (
    df[column].to_list()
    for column in ("answer", "response_base", "response_finetune")
)

In [None]:
# Score each system's responses against the reference answers and print the
# mean precision / recall / F1. After the loop, P / R / F1 hold the
# fine-tuned results.
for _heading, _candidates in (('inference base', b), ('inference finetune', ft)):
    print(_heading)
    P, R, F1 = score(_candidates, labels, model_type=model_type, verbose=False)
    for _tag, _values in (('precision\t', P), ('recall\t', R), ('f1\t', F1)):
        print(_tag, _values.mean().item())

## bert score ft

In [None]:
def _report_bertscore(heading, candidates, references, backbone):
    """Print mean BERTScore precision/recall/F1 and return the raw score triple.

    Args:
        heading: Line printed before the three metrics.
        candidates: Model responses to evaluate.
        references: Reference answers, aligned with ``candidates``.
        backbone: Value forwarded as ``model_type`` to ``score``.
    """
    print(heading)
    precision, recall, f1 = score(candidates, references, model_type=backbone, verbose=False)
    print('precision\t', precision.mean().item())
    print('recall\t', recall.mean().item())
    print('f1\t', f1.mean().item())
    return precision, recall, f1


model_type = 'microsoft/deberta-xlarge-mnli'

# NOTE(review): reads whichever `df` is currently loaded; this cell does not load one itself.
labels = df["answer"].to_list()
b = df["response_base"].to_list()
ft = df["response_finetune"].to_list()

P, R, F1 = _report_bertscore('inference base', b, labels, model_type)
# P / R / F1 end up holding the fine-tuned scores, as the last call wins.
P, R, F1 = _report_bertscore('inference finetune', ft, labels, model_type)

## bert score trt (ir)

In [None]:
# NOTE(review): `model_type` is assigned here but the score() calls below do
# not pass it; they select the backbone via lang="th" instead. Confirm which
# backbone is intended for this section.
model_type = 'microsoft/deberta-xlarge-mnli'

# Reference answers plus the base / fine-tuned responses from the current `df`.
labels = df["answer"].to_list()
b = df["response_base"].to_list()
ft = df["response_finetune"].to_list()

# Score each system's responses against the reference answers and print the
# mean precision / recall / F1. After the loop, P / R / F1 hold the
# fine-tuned results.
for _heading, _candidates in (('inference base', b), ('inference finetune', ft)):
    print(_heading)
    P, R, F1 = score(_candidates, labels, lang="th", verbose=False, nthreads=8)
    for _tag, _values in (('precision\t', P), ('recall\t', R), ('f1\t', F1)):
        print(_tag, _values.mean().item())

## rouge score trt

In [None]:
# Inputs for the ROUGE comparison: reference answers and both systems'
# responses, taken from the current `df`.
labels = df["answer"].to_list()
bs, fts = df["response_base"].to_list(), df["response_finetune"].to_list()

In [None]:
def _mean_rouge_f(rouge, references, candidates, kinds=('rouge1', 'rouge2', 'rougeL')):
    """Average the ROUGE F-measures of ``candidates`` against ``references``.

    Args:
        rouge: Object exposing ``score(reference, candidate)`` whose result is
            indexable by the names in ``kinds`` and has ``.fmeasure``.
        references: Reference texts.
        candidates: Texts to evaluate, aligned with ``references``.
        kinds: ROUGE variants to average, in output order.

    Returns:
        A list with one mean F-measure per entry of ``kinds``; every entry is
        ``nan`` when ``references`` is empty (instead of a ZeroDivisionError).
    """
    totals = [0] * len(kinds)
    # Iteration names are local to this function, so the notebook-level
    # `label` / `b` / `ft` variables are not overwritten by the loop.
    for reference, candidate in zip(references, candidates):
        result = rouge.score(reference, candidate)
        for slot, kind in enumerate(kinds):
            totals[slot] += result[kind].fmeasure
    count = len(references)
    if count == 0:
        return [float('nan')] * len(kinds)
    return [total / count for total in totals]


# NOTE(review): with a custom `tokenizer` supplied, rouge_score appears to skip
# its default tokenizer, in which case `use_stemmer=True` has no effect — confirm.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True, tokenizer=tokenizer)

# Mean [rouge1, rouge2, rougeL] F-measure for the base and fine-tuned responses.
fb = _mean_rouge_f(scorer, labels, bs)
fft = _mean_rouge_f(scorer, labels, fts)

fb, fft

## rouge score ft

In [None]:
# Load the two inference dumps and pair them per sample.
df1 = pd.read_csv("tmps/inference_base.csv", encoding='utf-8')
df2 = pd.read_csv("tmps/inference_finetune.csv", encoding='utf-8')

# Inner join: keep only rows whose key columns appear in both files.
df = df1.merge(
    df2,
    how='inner',
    on=['question', 'answer', 'references', 'knowledges', 'source'],
)
# Cell output: number of paired rows.
len(df)

In [None]:
# Reference answers and both systems' responses from the current `df`.
labels = df["answer"].to_list()
bs = df["response_base"].to_list()
fts = df["response_finetune"].to_list()

# NOTE(review): with a custom `tokenizer` supplied, rouge_score appears to skip
# its default tokenizer, in which case `use_stemmer=True` has no effect — confirm.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True, tokenizer=tokenizer)

rouge_kinds = ('rouge1', 'rouge2', 'rougeL')
# Running sums of the F-measure per ROUGE variant (base / fine-tuned).
fb = [0, 0, 0]
fft = [0, 0, 0]

# Distinct loop names: reusing `label` / `b` / `ft` here would overwrite the
# notebook-level lists that the BERTScore cells read.
for reference, base_response, finetune_response in zip(labels, bs, fts):
    base_result = scorer.score(reference, base_response)
    finetune_result = scorer.score(reference, finetune_response)
    for slot, kind in enumerate(rouge_kinds):
        fb[slot] += base_result[kind].fmeasure
        fft[slot] += finetune_result[kind].fmeasure

# Turn the sums into means; an empty frame yields nan instead of raising
# ZeroDivisionError.
sample_count = len(labels)
if sample_count:
    fb = [total / sample_count for total in fb]
    fft = [total / sample_count for total in fft]
else:
    fb = [float('nan')] * len(rouge_kinds)
    fft = [float('nan')] * len(rouge_kinds)

fb, fft

## bleu score trt

In [None]:
# (Re)load the tokenizer used to split texts into pieces for BLEU.
tokenizer = AutoTokenizer.from_pretrained("model")
# Raise the length cap to a very large value
# (presumably so long answers are never limited by the checkpoint default — TODO confirm).
tokenizer.model_max_length = 1_000_000_000

# Reference answers and both systems' responses from the current `df`.
labels = df["answer"].to_list()
bs, fts = df["response_base"].to_list(), df["response_finetune"].to_list()

In [None]:
def _token_pieces(text):
    """Return the decoded string of each individual tokenizer id in ``text``.

    The text is encoded without special tokens, and every id is decoded on its
    own so that BLEU n-grams are built over tokenizer pieces.
    """
    ids = tokenizer.encode(text, add_special_tokens=False)
    return [tokenizer.decode(token_id, skip_special_tokens=True) for token_id in ids]


# Per-sample sentence-level BLEU (default weights) for the base / fine-tuned
# responses.
bb = []
bft = []

# Distinct loop names: reusing `label` / `b` / `ft` here would overwrite the
# notebook-level lists that the BERTScore cells read.
for reference, base_response, finetune_response in zip(labels, bs, fts):
    reference_pieces = _token_pieces(reference)
    bb.append(sentence_bleu([reference_pieces], _token_pieces(base_response)))
    bft.append(sentence_bleu([reference_pieces], _token_pieces(finetune_response)))


In [None]:
# Cell output: average per-sentence BLEU as (base, fine-tuned).
tuple(sum(per_sample) / len(per_sample) for per_sample in (bb, bft))

## bleu score ft

In [None]:
tokenizer = AutoTokenizer.from_pretrained("model")
tokenizer.model_max_length = 1000000000


def _unique_token_pieces(text):
    """Return the distinct decoded tokenizer pieces of ``text`` (order not meaningful).

    The text is encoded without special tokens, every id is decoded on its
    own, and duplicates are removed via a set.
    """
    ids = tokenizer.encode(text, add_special_tokens=False)
    return list(set(tokenizer.decode(token_id, skip_special_tokens=True) for token_id in ids))


# Reference answers and both systems' responses from the current `df`.
labels = df["answer"].to_list()
bs = df["response_base"].to_list()
fts = df["response_finetune"].to_list()

# Per-sample unigram-only BLEU (weights=[1,0,0,0]) over de-duplicated pieces.
bb = []
bft = []

# Distinct loop names: reusing `label` / `b` / `ft` here would overwrite the
# notebook-level lists that the BERTScore cells read.
for reference, base_response, finetune_response in zip(labels, bs, fts):
    reference_pieces = _unique_token_pieces(reference)
    bb.append(sentence_bleu([reference_pieces], _unique_token_pieces(base_response), weights=[1,0,0,0]))
    bft.append(sentence_bleu([reference_pieces], _unique_token_pieces(finetune_response), weights=[1,0,0,0]))

# Cell output: average per-sentence score as (base, fine-tuned).
sum(bb)/len(bb), sum(bft)/len(bft)