In [None]:
import torch
import pandas as pd
from transformers import T5Tokenizer, AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_metric
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm_notebook
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from sentence_transformers import SentenceTransformer, util
import copy
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
import spacy
from nltk.corpus import stopwords
from nltk import word_tokenize
from bert_score import score
import textdescriptives as td
import torch.quantization
from transformers import BitsAndBytesConfig

In [3]:
# Checkpoint directory of the fine-tuned T5 model. The tokenizer is loaded from
# the same directory, so both names carry the same path.
trained_model_path = '/home/qiyu/Dev/ziqing/T5/T5_newcombinedsquad_once'
trained_tokenizer_path = '/home/qiyu/Dev/ziqing/T5/T5_newcombinedsquad_once'

# Prefer the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:

# class QuestionGeneration:

#     def __init__(self, model_dir=None):
#         self.model = T5ForConditionalGeneration.from_pretrained(trained_model_path)
#         self.tokenizer = T5Tokenizer.from_pretrained(trained_tokenizer_path)
#         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#         self.model = self.model.to(self.device)
#         self.model.eval()

#     def generate(self, topic: str, context: str):
#         input_text = '<topic> %s <context> %s ' % (topic, context)
#         encoding = self.tokenizer.encode_plus(
#             input_text,
#             return_tensors='pt'
#         ).to(self.device)
#         input_ids = encoding['input_ids']
#         attention_mask = encoding['attention_mask']
#         outputs = self.model.generate(
#             input_ids=input_ids,
#             attention_mask=attention_mask,
#             num_beams = 10,
#             num_return_sequences = 8
#         )
#         question_list = []
#         for output in outputs:
#             question = self.tokenizer.decode(
#                 output,
#                 skip_special_tokens=True,
#                 clean_up_tokenization_spaces=True
#             )
#             question_list.append({'question': question, 'topic': topic, 'context': context})
#         return question_list



In [4]:

# class QuestionGeneration:
#     def __init__(self):
#         self.model = T5ForConditionalGeneration.from_pretrained(trained_model_path)
#         self.tokenizer = T5Tokenizer.from_pretrained(trained_tokenizer_path)
#         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#         self.model = self.model.to(self.device)
#         self.model.eval()

#     def process_context(self, topic, context):
#         sentences = sent_tokenize(context)
#         indices = [i for i, sentence in enumerate(sentences) if topic in sentence]
        
#         if len(indices) == 0:
#          
#             return context
        
#         
#         start_index = max(0, indices[0] - 2)
#         end_index = min(len(sentences), indices[-1] + 3) 
#         return " ".join(sentences[start_index:end_index])

#     def generate(self, topic: str, context: str):
#         processed_context = self.process_context(topic, context)
#         input_text = f'{topic}<sep>{context}'
#         encoding = self.tokenizer.encode_plus(
#             input_text,
#             return_tensors='pt'
#         ).to(self.device)
#         input_ids = encoding['input_ids']
#         attention_mask = encoding['attention_mask']
#         outputs = self.model.generate(
#             input_ids=input_ids,
#             attention_mask=attention_mask,
#             num_beams=10,
#             num_return_sequences=8
#         )
#         question_list = []
#         for output in outputs:
#             question = self.tokenizer.decode(
#                 output,
#                 skip_special_tokens=True,
#                 clean_up_tokenization_spaces=True
#             )
#             question_list.append({'question': question, 'topic': topic, 'context': processed_context})
#         return question_list



In [4]:

class QuestionGeneration:
    """Topic-conditioned question generator backed by a 4-bit quantized T5 checkpoint.

    The prompt given to the model has the form
    ``'<topic> {topic} <context> {context} '``; the context is first narrowed
    to the sentences surrounding the topic mentions (see ``process_context``).
    """

    def __init__(self):
        """Load the quantized model and its tokenizer from the configured checkpoint paths."""
        quantization_config = BitsAndBytesConfig(load_in_4bit=True)
        self.model = T5ForConditionalGeneration.from_pretrained(trained_model_path, quantization_config=quantization_config)
        self.tokenizer = T5Tokenizer.from_pretrained(trained_tokenizer_path)
        # Kept for backward compatibility. The model itself is NOT moved with
        # `.to(self.device)`, so `generate` follows the model's own device.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.eval()

    def _input_device(self):
        """Device the prompt tensors must live on: the model's device when it exposes one."""
        model_device = getattr(self.model, 'device', None)
        return self.device if model_device is None else model_device

    def process_context(self, topic, context):
        """Narrow ``context`` to the sentences around the mentions of ``topic``.

        The window runs from two sentences before the first sentence containing
        ``topic`` to two sentences after the last one. Matching is a plain,
        case-sensitive substring test.

        Args:
            topic: string looked for inside each sentence.
            context: passage to narrow down.

        Returns:
            The joined sentence window, or ``context`` unchanged when no
            sentence contains ``topic``.
        """
        sentences = sent_tokenize(context)
        indices = [i for i, sentence in enumerate(sentences) if topic in sentence]

        if not indices:
            return context

        start_index = max(0, indices[0] - 2)
        end_index = min(len(sentences), indices[-1] + 3)  # +3: slice end is exclusive
        return " ".join(sentences[start_index:end_index])

    def generate(self, topic: str, context: str):
        """Generate candidate questions about ``topic`` from ``context``.

        Args:
            topic: topic the questions should be about.
            context: source passage; it is narrowed with ``process_context``.

        Returns:
            A list with one dict per returned beam (8 beams out of a beam
            search of width 10), each with the keys ``'question'``,
            ``'topic'`` and ``'context'`` (the narrowed context).
        """
        processed_context = self.process_context(topic, context)
        input_text = '<topic> {} <context> {} '.format(topic, processed_context)
        # Place the inputs on the device the model actually lives on.
        encoding = self.tokenizer.encode_plus(
            input_text,
            return_tensors='pt'
        ).to(self._input_device())
        outputs = self.model.generate(
            input_ids=encoding['input_ids'],
            attention_mask=encoding['attention_mask'],
            num_beams=10,
            num_return_sequences=8
        )
        question_list = []
        for output in outputs:
            question = self.tokenizer.decode(
                output,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True
            )
            question_list.append({'question': question, 'topic': topic, 'context': processed_context})
        return question_list


In [5]:

class QGDataset(Dataset):
    """Tokenized (topic, context) -> question pairs read from a CSV file.

    Each CSV row provides a passage (``text``), a ``topic`` and the target
    ``question``. All rows are tokenized eagerly in the constructor; the
    encoder prompt has the form ``'<topic> {topic} <context> {context} '``.

    NOTE(review): this notebook defines ``QGDataset`` in two cells; whichever
    cell is executed last is the definition in effect.
    """

    def __init__(self, tokenizer, file_path, max_len_input=512, max_len_output=128):
        """
        Args:
            tokenizer: object exposing ``batch_encode_plus`` (and, optionally,
                ``pad_token_id``).
            file_path: CSV path with ``text``, ``topic`` and ``question`` columns.
            max_len_input: length every encoded prompt is padded/truncated to.
            max_len_output: length every encoded question is padded/truncated to.
        """
        self.tokenizer = tokenizer
        self.data = pd.read_csv(file_path)
        self.max_len_input = max_len_input
        self.max_len_output = max_len_output
        self.context_column = 'text'
        self.topic = 'topic'
        self.question_column = 'question'
        self.inputs = []
        self.targets = []
        self._load_data()

    def __len__(self):
        """Number of tokenized examples."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return the tensors of one example.

        ``labels`` is a copy of ``target_ids`` in which padding positions are
        replaced by -100 (the value Hugging Face seq2seq losses ignore).
        """
        # squeeze(0) removes only the batch axis added by batch_encode_plus, so
        # a sequence of length 1 keeps its 1-D shape.
        source_ids = self.inputs[index]['input_ids'].squeeze(0)
        target_ids = self.targets[index]['input_ids'].squeeze(0)
        source_mask = self.inputs[index]['attention_mask'].squeeze(0)
        target_mask = self.targets[index]['attention_mask'].squeeze(0)

        # Mask with the tokenizer's real pad id; 0 (T5's pad id) stays the
        # fallback for tokenizers that do not expose one.
        pad_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_id is None:
            pad_id = 0
        labels = target_ids.clone()
        labels[labels == pad_id] = -100
        return {'source_ids': source_ids, 'source_mask': source_mask, 'target_ids': target_ids, 'target_mask': target_mask, 'labels': labels}

    def _encode(self, text, max_length):
        """Tokenize one string, padded/truncated to ``max_length``, as a batch of size 1."""
        return self.tokenizer.batch_encode_plus(
            [text],
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    def _load_data(self):
        """Tokenize every CSV row and cache the encodings in ``self.inputs`` / ``self.targets``."""
        for idx in tqdm_notebook(range(len(self.data))):
            context = self.data.loc[idx, self.context_column]
            topic = self.data.loc[idx, self.topic]
            target = str(self.data.loc[idx, self.question_column])
            input_text = '<topic> %s <context> %s ' % (topic, context)

            self.inputs.append(self._encode(input_text, self.max_len_input))
            self.targets.append(self._encode(target, self.max_len_output))


In [5]:

class QGDataset(Dataset):
    """Tokenized (topic, context) -> question pairs read from a CSV file.

    Each CSV row provides a passage (``text``), a ``topic`` and the target
    ``question``. All rows are tokenized eagerly in the constructor; the
    encoder prompt has the form ``'{topic}<sep>{context}'``.

    NOTE(review): this notebook defines ``QGDataset`` in two cells; whichever
    cell is executed last is the definition in effect. The ``<sep>`` prompt
    built here differs from the ``'<topic> ... <context> ... '`` prompt used by
    ``QuestionGeneration.generate`` — confirm which format the checkpoint was
    trained with.
    """

    def __init__(self, tokenizer, file_path, max_len_input=512, max_len_output=128):
        """
        Args:
            tokenizer: object exposing ``batch_encode_plus`` (and, optionally,
                ``pad_token_id``).
            file_path: CSV path with ``text``, ``topic`` and ``question`` columns.
            max_len_input: length every encoded prompt is padded/truncated to.
            max_len_output: length every encoded question is padded/truncated to.
        """
        self.tokenizer = tokenizer
        self.data = pd.read_csv(file_path)
        self.max_len_input = max_len_input
        self.max_len_output = max_len_output
        self.context_column = 'text'
        self.topic = 'topic'
        self.question_column = 'question'
        self.inputs = []
        self.targets = []
        self._load_data()

    def __len__(self):
        """Number of tokenized examples."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return the tensors of one example.

        ``labels`` is a copy of ``target_ids`` in which padding positions are
        replaced by -100 (the value Hugging Face seq2seq losses ignore).
        """
        # squeeze(0) removes only the batch axis added by batch_encode_plus, so
        # a sequence of length 1 keeps its 1-D shape.
        source_ids = self.inputs[index]['input_ids'].squeeze(0)
        target_ids = self.targets[index]['input_ids'].squeeze(0)
        source_mask = self.inputs[index]['attention_mask'].squeeze(0)
        target_mask = self.targets[index]['attention_mask'].squeeze(0)

        # Mask with the tokenizer's real pad id; 0 (T5's pad id) stays the
        # fallback for tokenizers that do not expose one.
        pad_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_id is None:
            pad_id = 0
        labels = target_ids.clone()
        labels[labels == pad_id] = -100
        return {'source_ids': source_ids, 'source_mask': source_mask, 'target_ids': target_ids, 'target_mask': target_mask, 'labels': labels}

    def _encode(self, text, max_length):
        """Tokenize one string, padded/truncated to ``max_length``, as a batch of size 1."""
        return self.tokenizer.batch_encode_plus(
            [text],
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    def _load_data(self):
        """Tokenize every CSV row and cache the encodings in ``self.inputs`` / ``self.targets``."""
        for idx in tqdm_notebook(range(len(self.data))):
            context = self.data.loc[idx, self.context_column]
            topic = self.data.loc[idx, self.topic]
            target = str(self.data.loc[idx, self.question_column])
            input_text = f'{topic}<sep>{context}'

            self.inputs.append(self._encode(input_text, self.max_len_input))
            self.targets.append(self._encode(target, self.max_len_output))


In [6]:
# trained_model = 'sentence-transformers/sentence-t5-base'

# class SentenceEmbeddings:
#     def __init__(self):
#         self.embedder = SentenceTransformer(trained_model)
#         self.nlp = spacy.load('en_core_web_sm')  

#     def encode(self, text):
#         return self.embedder.encode(text, convert_to_tensor=True)

#     def split_into_sentence_groups(self, context):
    
#         doc = self.nlp(context)
#         grouped_sentences = []
#         temp_sentences = []
        
       
#         for i, sentence in enumerate(doc.sents):
#             temp_sentences.append(sentence.text)
#             if (i + 1) % 3 == 0:
#                 grouped_sentences.append(' '.join(temp_sentences))
#                 temp_sentences = []
        
#         if temp_sentences:
#             grouped_sentences.append(' '.join(temp_sentences))

#         return grouped_sentences

#     def get_most_similar(self, context: str, qa_list: list, text_weight=0.2, topic_weight=0.8):
    
#         paragraphs = self.split_into_sentence_groups(context)
        
#         paragraph_embeddings = {idx: self.encode(paragraph) for idx, paragraph in enumerate(paragraphs)}

#         top1 = {'idx': None, 'score': float('-inf')}
#         for i in range(len(qa_list)):
#             topic_embeddings = self.encode(qa_list[i]['topic'])

#             paragraph_scores = {idx: util.pytorch_cos_sim(topic_embeddings, par_emb) for idx, par_emb in paragraph_embeddings.items()}
#             most_relevant_paragraph = max(paragraph_scores, key=paragraph_scores.get)

#             question_embeddings = self.encode(qa_list[i]['question'])
#             text_sim = util.pytorch_cos_sim(paragraph_embeddings[most_relevant_paragraph], question_embeddings)
#             topic_sim = util.pytorch_cos_sim(topic_embeddings, question_embeddings)
#             combined_score = (text_sim[0][0].item() * text_weight) + (topic_sim[0][0].item() * topic_weight)

#             if combined_score > top1['score']:
#                 top1['score'] = combined_score
#                 top1['idx'] = i

#         if top1['idx'] is not None:
#             return qa_list[top1['idx']]
#         else:
#             return None


In [6]:

trained_model = 'sentence-transformers/sentence-t5-base'

class SentenceEmbeddings:
    """Re-ranks generated questions by embedding similarity to the context and the topic."""

    def __init__(self):
        """Load the sentence-embedding model named by ``trained_model``."""
        self.embedder = SentenceTransformer(trained_model)

    def encode(self, text):
        """Embed ``text`` and return the embedding as a tensor."""
        return self.embedder.encode(text, convert_to_tensor=True)

    def get_most_similar(self, context: str, qa_list: list, text_weight=0.2, topic_weight=0.8):
        """Pick the candidate whose question best matches the context and its topic.

        Each candidate is scored as
        ``text_weight * cos(context, question) + topic_weight * cos(topic, question)``.

        Args:
            context: passage the questions were generated from.
            qa_list: candidates, each a dict with at least ``'question'`` and
                ``'topic'`` keys.
            text_weight: weight of the context/question similarity.
            topic_weight: weight of the topic/question similarity.

        Returns:
            The highest-scoring element of ``qa_list`` (the first one on ties),
            or ``None`` when ``qa_list`` is empty.
        """
        if not qa_list:
            return None

        text_embeddings = self.encode(context)
        # Candidates usually share one topic: embed each distinct topic once.
        topic_cache = {}
        best_item = None
        best_score = float('-inf')
        for candidate in qa_list:
            topic = candidate['topic']
            if topic not in topic_cache:
                topic_cache[topic] = self.encode(topic)
            topic_embeddings = topic_cache[topic]
            question_embeddings = self.encode(candidate['question'])

            text_sim = util.pytorch_cos_sim(text_embeddings, question_embeddings)
            topic_sim = util.pytorch_cos_sim(topic_embeddings, question_embeddings)
            combined_score = (text_sim[0][0].item() * text_weight) + (topic_sim[0][0].item() * topic_weight)

            # Strict '>' keeps the earliest candidate on equal scores.
            if combined_score > best_score:
                best_score = combined_score
                best_item = candidate

        return best_item

In [None]:
# Load the KhanQ CSV as a Hugging Face dataset; later cells read it through its 'train' split.
KhanQ_dataset = load_dataset('csv', data_files = '/home/qiyu/Dev/ziqing/T5/combined_KhanQ.csv')
# Instantiating these loads the quantized T5 generator and the sentence-embedding
# model used to re-rank its candidates.
QG = QuestionGeneration()
SE = SentenceEmbeddings()

In [None]:
print('Generating questions...')
references = []
predictions = []

# Read each column once up front instead of re-reading the full column on every
# iteration, and iterate over every row rather than a hard-coded row count.
# NOTE(review): the original loop was fixed at 653 rows; a later cell asserts
# that the CSV has as many rows as there are predictions, so this should be the
# same set — confirm.
train_split = KhanQ_dataset['train']
topics = train_split['topic2']
questions = train_split['question2']
contexts = train_split['text']

for topic, question, context in tqdm_notebook(zip(topics, questions, contexts), total=len(train_split)):
    references.append(question)
    # Generate the candidate questions, then keep the one closest to the
    # context/topic according to the sentence-embedding ranker.
    qa_pair_list = QG.generate(topic, context)
    generated_question = SE.get_most_similar(context, qa_pair_list)
    if generated_question is None:
        raise RuntimeError(f'No candidate question was generated for topic {topic!r}')
    predictions.append(generated_question['question'])


In [9]:
# Persist the in-memory references/predictions of the generation loop.
# NOTE(review): this cell and the following one save the SAME `references` /
# `predictions` variables under two different file names. The generation loop
# in this notebook reads the `topic2` / `question2` columns, so these
# non-"topic2" files were presumably written after a run with other columns —
# confirm before running the notebook top to bottom, otherwise both file pairs
# will hold identical data.
np.save('/home/qiyu/Dev/ziqing/T5/train/eval_newcsQGSE/T5_Q4/references_T5_Q4_newcs_once.npy', references)
np.save('/home/qiyu/Dev/ziqing/T5/train/eval_newcsQGSE/T5_Q4/predictions_T5_Q4_newcs_once.npy', predictions)

In [12]:
# Persist the references/predictions of the `topic2` generation run.
# NOTE(review): the previous cell saves the same `references` / `predictions`
# variables under the non-"topic2" file names; the two cells only produce
# different files if the generation loop is re-run with other columns in
# between — confirm the intended execution order.
np.save('/home/qiyu/Dev/ziqing/T5/train/eval_newcsQGSE/T5_Q4/referencestopic2_T5_Q4_newcs_once.npy', references)
np.save('/home/qiyu/Dev/ziqing/T5/train/eval_newcsQGSE/T5_Q4/predictionstopic2_T5_Q4_newcs_once.npy', predictions)

In [3]:
# Reload the saved reference questions and predictions as plain Python lists.
# This rebinds `references` / `predictions`, replacing whatever the generation loop left in memory.
references = np.load('/home/qiyu/Dev/ziqing/T5/train/eval_newcsQGSE/T5_Q4/references_T5_Q4_newcs_once.npy').tolist()
predictions = np.load('/home/qiyu/Dev/ziqing/T5/train/eval_newcsQGSE/T5_Q4/predictions_T5_Q4_newcs_once.npy').tolist()

In [18]:
# Reload the saved `topic2` references and predictions as plain Python lists.
references_topic2 = np.load('/home/qiyu/Dev/ziqing/T5/train/eval_newcsQGSE/T5_Q4/referencestopic2_T5_Q4_newcs_once.npy').tolist()
predictions_topic2 = np.load('/home/qiyu/Dev/ziqing/T5/train/eval_newcsQGSE/T5_Q4/predictionstopic2_T5_Q4_newcs_once.npy').tolist()

In [None]:
##### Prediction vs. reference, pair by pair
# The `topic2` predictions are listed next to the `references` loaded above.
assert len(references) == len(predictions_topic2), "The number of references and predictions must be the same."

for number, (ref, pred) in enumerate(zip(references, predictions_topic2), start=1):
    print(
        f"Comparison {number}:",
        f"Reference: {ref}",
        f"Prediction: {pred}",
        '-'*50,
        sep="\n",
    )

In [23]:
def compute_bleu(predictions, references, weights, tokenize=None):
    """Sentence-level BLEU of every prediction against its single reference.

    NLTK's ``sentence_bleu`` works on token sequences. Handing it raw strings
    makes it count character n-grams, so both sides are tokenized into words
    first.

    Args:
        predictions: generated sentences (strings).
        references: reference sentences (strings), aligned with ``predictions``.
        weights: n-gram weights forwarded to ``sentence_bleu``, e.g.
            ``[0.25, 0.25, 0.25, 0.25]``.
        tokenize: optional callable turning a string into a list of tokens;
            defaults to NLTK's ``word_tokenize``.

    Returns:
        A list with one BLEU score per prediction (smoothing method 2).
    """
    if tokenize is None:
        tokenize = word_tokenize
    smoothing = SmoothingFunction().method2

    scores = []
    for i in range(len(predictions)):
        hypothesis_tokens = tokenize(predictions[i])
        reference_tokens = tokenize(references[i])
        scores.append(
            sentence_bleu(
                [reference_tokens],
                hypothesis_tokens,
                weights=weights,
                smoothing_function=smoothing,
            )
        )
    return scores

# Weight-vector selector: 1-4 isolate a single n-gram order, 0 is the uniform mix.
# `weights` is only set here; the calls below pass their own vectors.
n_gram = 0
weights = {
    1: [1, 0, 0, 0],
    2: [0, 1, 0, 0],
    3: [0, 0, 1, 0],
    4: [0, 0, 0, 1],
    0: [0.25, 0.25, 0.25, 0.25],  # Represent mix
}.get(n_gram, [])

# One array of per-sentence scores for each n-gram order, plus the mixed score.
bs1, bs2, bs3, bs4, bsmix = (
    np.array(compute_bleu(predictions, references, weights=order_weights))
    for order_weights in (
        [1, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 1],
        [0.25, 0.25, 0.25, 0.25],
    )
)


In [None]:

# Plain-text BLEU report: the mean of the per-sentence scores for each n-gram
# order and for the mixed weights.
result_content = f"""
BLEU Score Comparison:
-------------------------------------------------
1-gram BLEU Score:
    Prediction = {bs1.mean()}
-------------------------------------------------
2-gram BLEU Score:
    Prediction = {bs2.mean()}
-------------------------------------------------
3-gram BLEU Score:
    Prediction = {bs3.mean()}
-------------------------------------------------
4-gram BLEU Score:
    Prediction = {bs4.mean()}
-------------------------------------------------
Mixed BLEU Score:
    Prediction = {bsmix.mean()}
-------------------------------------------------
"""

# Write the report to a text file (an existing file is overwritten).
file_path = "/home/qiyu/Dev/ziqing/T5/train/eval_newcsQGSE/T5_Q4/bleu_T5_Q4_newcs_once.txt"
with open(file_path, "w") as file:
    file.write(result_content)

# Last expression of the cell: echo the output path in the cell output.
file_path


In [None]:
# Load the Hugging Face metric objects used by the following cells.
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package — confirm against the pinned version.
rouge = load_metric('rouge')
bleu = load_metric('bleu')  # not used by any cell visible in this notebook
meteor = load_metric('meteor')

# ROUGE between the generated and the reference questions; the report cell
# reads the low/mid/high estimates of each ROUGE variant from this result.
rouge_score = rouge.compute(predictions=predictions, references=references)
print(rouge_score)


In [None]:

# Plain-text ROUGE report: low / mid / high estimates of precision, recall and
# F-measure for ROUGE-1, ROUGE-2 and ROUGE-L, formatted as percentages.
result_content = f"""
ROUGE Score Comparison:
-------------------------------------------------
**ROUGE-1 Scores:**
    Precision - Low: {rouge_score['rouge1'].low.precision:.2%}, Mid: {rouge_score['rouge1'].mid.precision:.2%}, High: {rouge_score['rouge1'].high.precision:.2%}    
    Recall - Low: {rouge_score['rouge1'].low.recall:.2%}, Mid: {rouge_score['rouge1'].mid.recall:.2%}, High: {rouge_score['rouge1'].high.recall:.2%}    
    F-measure - Low: {rouge_score['rouge1'].low.fmeasure:.2%}, Mid: {rouge_score['rouge1'].mid.fmeasure:.2%}, High: {rouge_score['rouge1'].high.fmeasure:.2%}
    -------------------------------------------------
**ROUGE-2 Scores:**
    Precision - Low: {rouge_score['rouge2'].low.precision:.2%}, Mid: {rouge_score['rouge2'].mid.precision:.2%}, High: {rouge_score['rouge2'].high.precision:.2%}
    Recall - Low: {rouge_score['rouge2'].low.recall:.2%}, Mid: {rouge_score['rouge2'].mid.recall:.2%}, High: {rouge_score['rouge2'].high.recall:.2%}
    F-measure - Low: {rouge_score['rouge2'].low.fmeasure:.2%}, Mid: {rouge_score['rouge2'].mid.fmeasure:.2%}, High: {rouge_score['rouge2'].high.fmeasure:.2%}
-------------------------------------------------
**ROUGE-L Scores:**
    Precision - Low: {rouge_score['rougeL'].low.precision:.2%}, Mid: {rouge_score['rougeL'].mid.precision:.2%}, High: {rouge_score['rougeL'].high.precision:.2%}
    Recall - Low: {rouge_score['rougeL'].low.recall:.2%}, Mid: {rouge_score['rougeL'].mid.recall:.2%}, High: {rouge_score['rougeL'].high.recall:.2%}
    F-measure - Low: {rouge_score['rougeL'].low.fmeasure:.2%}, Mid: {rouge_score['rougeL'].mid.fmeasure:.2%}, High: {rouge_score['rougeL'].high.fmeasure:.2%}
-------------------------------------------------
"""
# Print the report so it also appears in the cell output.
print(result_content)

# Keep the report text under a second name as well.
# NOTE(review): the "T5squad46" suffix does not match the file names used in this notebook — confirm it is still wanted.
rouge_score_T5squad46 = result_content
# with open("rouge_score_T5squad46.txt", "w") as file:
#     file.write(result_content)


# Write the report to a text file (an existing file is overwritten).
file_path = "/home/qiyu/Dev/ziqing/T5/train/eval_newcsQGSE/T5_Q4/rouge_T5_Q4_newcs_once.txt"
with open(file_path, "w") as file:
    file.write(result_content)

# Last expression of the cell: echo the output path in the cell output.
file_path


In [None]:
# METEOR between the generated and the reference questions, using the metric object loaded earlier.
meteor_results = meteor.compute(predictions=predictions, references=references)
print(meteor_results)

In [None]:

# Plain-text METEOR report; `meteor_results` is interpolated as returned by `meteor.compute`.
result_content = f"""
METEOR Score Comparison:
-------------------------------------------------
METEOR Score: {meteor_results}
-------------------------------------------------
"""

# Print the results to check output
print(result_content)

# Write the report to a text file (an existing file is overwritten).
file_path = "/home/qiyu/Dev/ziqing/T5/train/eval_newcsQGSE/T5_Q4/meteor_T5_Q4_newcs_once.txt"
with open(file_path, "w") as file:
    file.write(result_content)


# Last expression of the cell: echo the output path in the cell output.
file_path


In [None]:
def calculate_f1_score(generated, labels):
    """Token-overlap precision, recall and F1 between generated and reference texts.

    Every text is lower-cased, tokenized with ``word_tokenize`` and reduced to
    its set of distinct tokens. Pairs in which either side has no tokens are
    skipped. Precision and recall are averaged over the remaining pairs, and
    F1 is the harmonic mean of those two averages (not the average of
    per-pair F1 values).

    Args:
        generated: iterable of generated strings.
        labels: iterable of reference strings, aligned with ``generated``.

    Returns:
        ``(mean_precision, mean_recall, f1_score)``; ``(0, 0, 0)`` when no
        pair could be scored.
    """
    per_pair = []
    for hypothesis, reference in zip(generated, labels):
        hypothesis_vocab = set(word_tokenize(hypothesis.lower()))
        reference_vocab = set(word_tokenize(reference.lower()))

        # Nothing to compare when one side is empty.
        if not hypothesis_vocab or not reference_vocab:
            continue

        shared = len(hypothesis_vocab & reference_vocab)
        per_pair.append((shared / len(hypothesis_vocab), shared / len(reference_vocab)))

    if not per_pair:
        return 0, 0, 0  # No valid data to calculate scores

    mean_precision = sum(p for p, _ in per_pair) / len(per_pair)
    mean_recall = sum(r for _, r in per_pair) / len(per_pair)

    if mean_precision + mean_recall == 0:
        return mean_precision, mean_recall, 0
    f1_score = 2 * (mean_precision * mean_recall) / (mean_precision + mean_recall)
    return mean_precision, mean_recall, f1_score

# Token-overlap scores of the predictions against the references.
precision, recall, f1 = calculate_f1_score(predictions, references)

print(
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    sep="\n",
)


In [None]:

# Plain-text report of the token-overlap F1, precision and recall computed in the previous cell.
result_content = f"""
F1 Score Comparison:
-------------------------------------------------
F1 Score: {f1}

Precision: {precision}

Recall: {recall}
-------------------------------------------------
"""

# Print the results to check output
print(result_content)

# Write the report to a text file (an existing file is overwritten).
file_path = "/home/qiyu/Dev/ziqing/T5/train/eval_newcsQGSE/T5_Q4/f1_T5_Q4_newcs_once.txt"
with open(file_path, "w") as file:
    file.write(result_content)

# Last expression of the cell: echo the output path in the cell output.
file_path


In [None]:

# spaCy pipeline extended with the textdescriptives information-theory component,
# which provides the `doc._.perplexity` value read by `calculate_perplexity`.
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe("textdescriptives/information_theory")

# Make sure every entry is a plain string before it goes through the pipeline.
prediction_texts = list(map(str, predictions))
reference_texts = list(map(str, references))


def calculate_perplexity(texts):
    """Return the ``doc._.perplexity`` value of every text, in input order.

    The texts are run through the module-level ``nlp`` pipeline.
    """
    return [parsed._.perplexity for parsed in nlp.pipe(texts)]

# Per-text perplexities and their averages, for the generated questions ...
prediction_perplexities = calculate_perplexity(prediction_texts)
average_prediction_perplexity = np.mean(prediction_perplexities)

# ... and for the reference questions.
reference_perplexities = calculate_perplexity(reference_texts)
average_reference_perplexity = np.mean(reference_perplexities)

print(
    f"Average Prediction Perplexity: {average_prediction_perplexity}",
    f"Average Reference Perplexity: {average_reference_perplexity}",
    sep="\n",
)

In [None]:

# Plain-text report of the average perplexity of the generated questions.
# The label was previously misspelled as "Perplexiyu" in the written file.
result_content = f"""
Perplexity(spaCy):
-------------------------------------------------
Perplexity: {average_prediction_perplexity.item()}
-------------------------------------------------
"""

# Print the results to check output
print(result_content)

# Write the report to a text file (an existing file is overwritten).
file_path = "/home/qiyu/Dev/ziqing/T5/train/eval_newcsQGSE/T5_Q4/PPL_T5_Q4_newcs_once.txt"
with open(file_path, "w") as file:
    file.write(result_content)

# Last expression of the cell: echo the output path in the cell output.
file_path

In [33]:

def clean_sentences(sentence_list):
    """Strip English stop words and question marks from every sentence.

    Each sentence is tokenized with ``nltk.word_tokenize``; the kept tokens are
    joined back with single spaces. Stop-word matching is case-insensitive.

    Args:
        sentence_list: iterable of strings.

    Returns:
        A list of filtered strings, one per input sentence.
    """
    english_stop_words = set(stopwords.words('english'))

    def keep(token):
        return token.lower() not in english_stop_words and token != '?'

    return [
        ' '.join(filter(keep, nltk.word_tokenize(text)))
        for text in sentence_list
    ]

# Stop-word filtered copies of all four text lists, used by the BERTScore cells.
clean_references, clean_predictions, clean_references_topic2, clean_predictions_topic2 = (
    clean_sentences(texts)
    for texts in (references, predictions, references_topic2, predictions_topic2)
)


In [None]:
######## Example
# Spot check: the first five references before and after cleaning.
print("Original vs Cleaned:")
for original, cleaned in zip(references[:5], clean_references[:5]):
    print(
        f"Original: {original}",
        f"Cleaned: {cleaned}",
        '-'*50,
        sep="\n",
    )

In [None]:

# BERTScore of the cleaned predictions against the cleaned references.
assert len(clean_references) == len(clean_predictions), "The number of references and predictions must be the same."

P, R, F1 = score(clean_predictions, clean_references, lang="en", verbose=True)

print(
    f"Average Precision: {P.mean().item()}",
    f"Average Recall: {R.mean().item()}",
    f"Average F1 Score: {F1.mean().item()}",
    sep="\n",
)


In [None]:
#### Individual scores, one block per pair
# The raw (uncleaned) texts are shown next to the scores currently held in P, R and F1.
for idx, pair_scores in enumerate(zip(P, R, F1)):
    p, r, f1 = pair_scores
    print("\n".join([
        f"Comparison {idx+1}:",
        f"Reference: {references[idx]}",
        f"Prediction: {predictions[idx]}",
        f"Precision: {p:.4f}, Recall: {r:.4f}, F1 Score: {f1:.4f}\n",
    ]))

In [None]:
# Per-pair table: the raw reference/prediction texts next to the BERTScore
# values currently held in P, R and F1.
results_df = pd.DataFrame({
    'Reference': references,
    'Prediction': predictions,
    'Precision': P.tolist(),
    'Recall': R.tolist(),
    'F1 Score': F1.tolist()
})

# Save to CSV.
# NOTE(review): this is a relative path (resolved against the current working
# directory), unlike the absolute paths of the text reports — confirm.
results_filename = 'BERT30_gq1_lq.csv'
results_df.to_csv(results_filename, index=False)

# Print the path to the saved file
print(f"Results have been saved to {results_filename}")

In [None]:

# Plain-text report of the mean BERTScore precision / recall / F1 currently held in P, R and F1.
result_content = f"""
BERT Score Comparison(cleaned):
-------------------------------------------------
BERT Precision: {P.mean().item()}

BERT Recall: {R.mean().item()}

BERT F1: {F1.mean().item()}
-------------------------------------------------
"""

# Print the results to check output
print(result_content)

# Write the report to a text file (an existing file is overwritten).
file_path = "/home/qiyu/Dev/ziqing/T5/train/eval_newcsQGSE/T5_Q4/BERT_gq1_lq.txt"
with open(file_path, "w") as file:
    file.write(result_content)


# Last expression of the cell: echo the output path in the cell output.
file_path

In [None]:

# BERTScore of the cleaned `topic2` predictions against the cleaned references.
# This rebinds P, R and F1.
assert len(clean_references) == len(clean_predictions_topic2), "The number of references and predictions must be the same."

P, R, F1 = score(clean_predictions_topic2, clean_references, lang="en", verbose=True)

print(
    f"Average Precision: {P.mean().item()}",
    f"Average Recall: {R.mean().item()}",
    f"Average F1 Score: {F1.mean().item()}",
    sep="\n",
)


In [None]:
#### Individual scores, one block per pair
# The cleaned texts are shown next to the scores currently held in P, R and F1.
for idx, pair_scores in enumerate(zip(P, R, F1)):
    p, r, f1 = pair_scores
    print("\n".join([
        f"Comparison {idx+1}:",
        f"Reference: {clean_references[idx]}",
        f"Prediction: {clean_predictions_topic2[idx]}",
        f"Precision: {p:.4f}, Recall: {r:.4f}, F1 Score: {f1:.4f}\n",
    ]))

In [None]:

# Per-pair table: the raw references and `topic2` predictions next to the
# BERTScore values currently held in P, R and F1. Rebinds `results_df`.
results_df = pd.DataFrame({
    'Reference': references,
    'Prediction': predictions_topic2,
    'Precision': P.tolist(),
    'Recall': R.tolist(),
    'F1 Score': F1.tolist()
})


# NOTE(review): relative path, resolved against the current working directory — confirm.
results_filename = 'BERT30_gq2_lq.csv'
results_df.to_csv(results_filename, index=False)


print(f"Results have been saved to {results_filename}")

In [None]:

# Plain-text report of the mean BERTScore precision / recall / F1 currently held in P, R and F1.
result_content = f"""
BERT Score Comparison(cleaned):
-------------------------------------------------
BERT Precision: {P.mean().item()}

BERT Recall: {R.mean().item()}

BERT F1: {F1.mean().item()}
-------------------------------------------------
"""

# Print the results to check output
print(result_content)

# Write the report to a text file (an existing file is overwritten).
file_path = "/home/qiyu/Dev/ziqing/T5/train/eval_newcsQGSE/T5_Q4/BERT_gq2_lq.txt"
with open(file_path, "w") as file:
    file.write(result_content)


# Last expression of the cell: echo the output path in the cell output.
file_path

In [39]:

# Read the two context columns of the KhanQ CSV as plain lists.
df = pd.read_csv('/home/qiyu/Dev/ziqing/T5/combined_KhanQ.csv')
context1, context2 = (df[column].tolist() for column in ('context1', 'context2'))

In [None]:

# BERTScore of the cleaned `topic2` predictions against the `context1` column.
# This rebinds P, R and F1. The assertion message now names what is actually
# compared (contexts, not references).
assert len(context1) == len(clean_predictions_topic2), "The number of contexts (context1) and predictions must be the same."


P, R, F1 = score(clean_predictions_topic2, context1, lang="en", verbose=True)


print(f"Average Precision: {P.mean().item()}")
print(f"Average Recall: {R.mean().item()}")
print(f"Average F1 Score: {F1.mean().item()}")


In [None]:

# Plain-text report of the mean BERTScore precision / recall / F1 currently held in P, R and F1.
result_content = f"""
BERT Score Comparison(cleaned):
-------------------------------------------------
BERT Precision: {P.mean().item()}

BERT Recall: {R.mean().item()}

BERT F1: {F1.mean().item()}
-------------------------------------------------
"""

# Print the results to check output
print(result_content)

# Write the report to a text file (an existing file is overwritten).
file_path = "/home/qiyu/Dev/ziqing/T5/train/eval_newcsQGSE/T5_Q4/BERT_gq2_ctx1.txt"
with open(file_path, "w") as file:
    file.write(result_content)


# Last expression of the cell: echo the output path in the cell output.
file_path

In [None]:

# BERTScore of the cleaned `topic2` predictions against the `context2` column.
# This rebinds P, R and F1. The assertion message now names what is actually
# compared (contexts, not references).
assert len(context2) == len(clean_predictions_topic2), "The number of contexts (context2) and predictions must be the same."


P, R, F1 = score(clean_predictions_topic2, context2, lang="en", verbose=True)


print(f"Average Precision: {P.mean().item()}")
print(f"Average Recall: {R.mean().item()}")
print(f"Average F1 Score: {F1.mean().item()}")


In [None]:

# Plain-text report of the mean BERTScore precision / recall / F1 currently held in P, R and F1.
result_content = f"""
BERT Score Comparison(cleaned):
-------------------------------------------------
BERT Precision: {P.mean().item()}

BERT Recall: {R.mean().item()}

BERT F1: {F1.mean().item()}
-------------------------------------------------
"""

# Print the results to check output
print(result_content)

# Write the report to a text file (an existing file is overwritten).
# NOTE(review): this file is named "BERT_qg2_..." while the context1 report is
# named "BERT_gq2_..." — the "qg2"/"gq2" difference looks unintentional; confirm
# before anything depends on the file name.
file_path = "/home/qiyu/Dev/ziqing/T5/train/eval_newcsQGSE/T5_Q4/BERT_qg2_ctx2.txt"
with open(file_path, "w") as file:
    file.write(result_content)


# Last expression of the cell: echo the output path in the cell output.
file_path

In [None]:
from scipy import stats

# Hand-entered scores of the two systems; position k in one list is paired with
# position k in the other.
# NOTE(review): presumably each position is a different evaluation metric
# (values range from about 0.18 to 1.35) — confirm that pairing them in one
# t-test is intended.
scores_topicqg = [0.551, 0.343, 0.236, 0.191, 0.33,0.233,1.323,0.2295]
scores_topicqgedu = [0.536, 0.328, 0.221, 0.177, 0.321,0.22,1.345,0.2159]

# Paired (related-samples) t-test between the two score lists.
t_stat, p_value = stats.ttest_rel(
    scores_topicqgedu,
    scores_topicqg,
)

print(f't : {t_stat}, p : {p_value}')
