In [None]:
!pip install transformers datasets

In [None]:
# Load the tokenizer and the causal-LM weights from one pretrained checkpoint identifier.
from transformers import AutoTokenizer, AutoModelForCausalLM

# Single source of truth for the checkpoint so tokenizer and model cannot drift apart.
MODEL_ID = "joyfine/llama2-7b-fine-tuning_QA_Google_20"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)

In [None]:
import csv
import os
import re

import pandas as pd
import torch
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [None]:
# Name of the benchmark to evaluate. The dispatch cell that follows compares this value
# against exactly these strings: "truthfulqa", "sst2", "finance", "twitter", "google".
test = "truthfulqa" # truthfulqa, sst2, finance, twitter, google

In [None]:
# Evaluation driver.
#
# For the benchmark selected by `test`, every example is turned into a prompt that asks the
# model for an answer letter plus a self-reported confidence (0-100). The decoded generation
# is parsed with a regex and one row per example is appended to a CSV file.
#
# Relies on names bound by earlier cells: `model`, `tokenizer`, `test`, and the imports
# (`load_dataset`, `train_test_split`, `tqdm`, `torch`, `pd`, `csv`, `os`, `re`).

_KNOWN_TESTS = ("truthfulqa", "sst2", "finance", "twitter", "google")

_CSV_HEADER = ["Question", "Options", "Predicted Answer", "Correct Answer", "Confidence Level", "output"]

# Patterns are kept exactly as used in earlier runs so results stay comparable.
_TRUTHFULQA_ANSWER_RE = re.compile(r'\[Answer\]: (\w)')
_TRUTHFULQA_CONFIDENCE_RE = re.compile(r'\[Confidence \(0\-100\)\]: (\d+)')
_SENTIMENT_RE = re.compile(r'Answer and Confidence(?: \(0-100\))?: (.+), (\d{1,3})%')

# NOTE: the leading spaces on the continuation lines of the prompt literals below are INSIDE
# the string and therefore part of the text sent to the model. They are reproduced verbatim;
# do not re-indent them.
_BINARY_SENTIMENT_PREFIX = '''Read this sentence, select the correct sentiment for it and give the option letter: A: positive and B: negative \
    Use the following format to provide your answer and confidence level: \n
    Answer and Confidence (0-100): [Your answer, please only include the capital letter], \
    [Your confidence level, please only include the numerical number]% \n
    Note: The confidence level indicates the degree of certainty you have about your answer and is represented as a percentage. \
    For instance, if your confidence level is 80%, it means you are 80% certain that your answer is correct and there is a 20% chance that it may be incorrect.'''

_TERNARY_SENTIMENT_PREFIX = '''Read this sentence, select the correct sentiment for it and give the option letter: A: positive, B: negative and C: neutral. \
    Use the following format to provide your answer and confidence level. \n
    Answer and Confidence (0-100): [Your answer, please only include the capital letter], \
    [Your confidence level, please only include the numerical number]% \n
    Note: The confidence level indicates the degree of certainty you have about your answer and is represented as a percentage. \
    For instance, if your confidence level is 80%, it means you are 80% certain that your answer is correct and there is a 20% chance that it may be incorrect.'''

# The TruthfulQA chat template ends with two newlines followed by eight spaces.
_TRUTHFULQA_TEMPLATE_SUFFIX = "\n\n" + " " * 8


def _truthfulqa_prompt(question_block):
    """Return the TruthfulQA instruction prompt wrapping `question_block` (question + options)."""
    return f"Question: [{question_block}] Please answer the following multiple-choice question and only give me the selected option and provide your confidence level. \
        Note that the confidence level indicates the degree of certainty you have about your answer and is represented as a percentage. Make sure you answer in the following structure: \n \
        [Answer]: , \n[Confidence (0-100)]: \n \
        Note: The confidence level indicates the degree of certainty you have about your answer and is represented as a percentage. \
        For instance, if your confidence level is 80%, it means you are 80% certain that your answer is correct and there is a 20% chance that it may be incorrect. "


def _chat_template(prompt, suffix=""):
    """Wrap `prompt` in the USER/ASSISTANT chat preamble, optionally followed by `suffix`."""
    return f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: {prompt} ASSISTANT:{suffix}"


def _generate_text(prompt_template, top_k):
    """Sample one generation for `prompt_template` and return the decoded first sequence.

    Uses the notebook-level `tokenizer`, `model` and `device`. The token ids are moved with
    `.to(device)` so the CPU fallback chosen for `device` actually works (a hard-coded
    `.cuda()` raises when no GPU is present).
    NOTE(review): the decoded text presumably contains the prompt as well as the
    continuation, as is usual for decoder-only `generate()` — confirm.
    """
    input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.to(device)
    generated = model.generate(inputs=input_ids, temperature=0.7, do_sample=True, top_p=0.95, top_k=top_k, max_new_tokens=512)
    return tokenizer.decode(generated[0])


def _parse_truthfulqa_output(output):
    """Extract (answer letter, int confidence); each falls back to the raw `output` if unmatched."""
    answer_match = _TRUTHFULQA_ANSWER_RE.search(output)
    confidence_match = _TRUTHFULQA_CONFIDENCE_RE.search(output)
    answer = answer_match.group(1) if answer_match else output
    confidence_level = int(confidence_match.group(1)) if confidence_match else output
    return answer, confidence_level


def _parse_sentiment_output(output):
    """Extract (answer, int confidence); both fall back to the raw `output` if unmatched."""
    match = _SENTIMENT_RE.search(output)
    if not match:
        return output, output
    return match.group(1).strip(), int(match.group(2))


def _append_result(csv_path, row):
    """Append `row` to `csv_path`, writing the header first when the file is still empty.

    The file is reopened for every row on purpose: results already written survive an
    interrupted run.
    """
    with open(csv_path, 'a+', newline='', encoding = 'utf-8') as file:
        writer = csv.writer(file)
        if os.path.getsize(csv_path) == 0:
            writer.writerow(_CSV_HEADER)
        writer.writerow(row)


def _binary_letter(label):
    """Map a binary sentiment label to its option letter (1 -> 'A', anything else -> 'B')."""
    return 'A' if label == 1 else 'B'


def _ternary_letter(label):
    """Map a three-way label to its option letter (2 -> 'A', 0 -> 'B', anything else -> 'C')."""
    return 'A' if label == 2 else ('B' if label == 0 else 'C')


def _run_truthfulqa_eval(questions, targets, csv_path):
    """Evaluate multiple-choice questions; `targets[k]` holds 'choices' and 'labels' for `questions[k]`."""
    for question, target in tqdm(zip(questions, targets), total=len(questions)):
        options = [f"{chr(65 + idx)}. {choice}" for idx, choice in enumerate(target['choices'])]
        correct_answer = [chr(65 + idx) for idx, label in enumerate(target['labels']) if label == 1]
        question_block = question + '\n' + '\n'.join(options)
        prompt_template = _chat_template(_truthfulqa_prompt(question_block), _TRUTHFULQA_TEMPLATE_SUFFIX)
        output = _generate_text(prompt_template, top_k=40)
        answer, confidence_level = _parse_truthfulqa_output(output)
        _append_result(csv_path, [question, "\n".join(options), answer, correct_answer[0], confidence_level, output])


def _run_sentiment_eval(sentences, labels, to_letter, prefix, options, csv_path):
    """Evaluate sentiment examples.

    sentences / labels: parallel sequences of texts and gold labels.
    to_letter: callable mapping a gold label to its option letter.
    prefix: instruction text placed before the sentence.
    options: option strings recorded in the CSV "Options" column.
    """
    # `total` is given explicitly because tqdm cannot infer a length from zip().
    for sentence, label in tqdm(zip(sentences, labels), total=len(sentences)):
        correct_answer = to_letter(label)
        sentence_block = sentence + '\n'
        prompt = prefix + f"Sentence: [{sentence_block}]"
        output = _generate_text(_chat_template(prompt), top_k=5)
        answer, confidence_level = _parse_sentiment_output(output)
        _append_result(csv_path, [sentence, "\n".join(options), answer, correct_answer, confidence_level, output])


# Fail loudly on a typo instead of silently evaluating nothing.
if test not in _KNOWN_TESTS:
    raise ValueError(f"Unknown test {test!r}; expected one of {_KNOWN_TESTS}")

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

if test == "truthfulqa":
    dataset = load_dataset("truthful_qa","multiple_choice")
    train_set, test_set = train_test_split(dataset['validation'], test_size=0.2, random_state=42)
    _run_truthfulqa_eval(
        test_set['question'],
        test_set['mc1_targets'],
        '/content/drive/MyDrive/1011_truthfulQA/truthfulQA_256.csv',
    )

elif test == "sst2":
    dataset = load_dataset("sst2")
    test_set = dataset['validation']

    start = 0
    stop = len(test_set['sentence'])  # named `stop` so the builtin `max` is not shadowed
    _run_sentiment_eval(
        test_set['sentence'][start:stop],
        test_set['label'][start:stop],
        _binary_letter,
        _BINARY_SENTIMENT_PREFIX,
        ['A. Positive', 'B. Negative'],
        '/content/drive/MyDrive/output_1011_parameter/SST2_16_again.csv',
    )

elif test == "finance":
    random_seed = 42
    dataset = load_dataset("financial_phrasebank", 'sentences_allagree')
    df = pd.DataFrame.from_dict(dataset["train"])
    train = df.sample(n=754, random_state=random_seed)
    df_remaining = df.drop(train.index)
    test_set = df_remaining.sample(frac=0.2, random_state=random_seed)

    start = 0
    # Only the first 90 sampled rows are evaluated; use len(test_set['sentence']) for all of them.
    stop = 90
    _run_sentiment_eval(
        test_set['sentence'].iloc[start:stop],
        test_set['label'].iloc[start:stop],
        _ternary_letter,
        _TERNARY_SENTIMENT_PREFIX,
        ['A. Positive', 'B. Negative', 'C. Neutral'],
        '/content/drive/MyDrive/output_1011_parameter/Finance_64.csv',
    )

elif test == "twitter":
    dataset = load_dataset("carblacac/twitter-sentiment-analysis")
    random_seed = 42
    df = pd.DataFrame.from_dict(dataset["test"])
    test_set = df.sample(n=319, random_state=random_seed)

    start = 0
    stop = len(test_set['text'])
    _run_sentiment_eval(
        test_set['text'].iloc[start:stop],
        test_set['feeling'].iloc[start:stop],
        _binary_letter,
        _BINARY_SENTIMENT_PREFIX,
        ['A. Positive', 'B. Negative'],
        '/content/drive/MyDrive/output_1011_parameter/Twitter_64.csv',
    )

elif test == "google":
    random_seed = 42
    df = pd.read_csv('/content/drive/MyDrive/google_play_comments.csv', encoding='latin-1')
    train = df.sample(n=650, random_state=random_seed)
    df_remaining = df.drop(train.index)
    test_set = df_remaining.sample(n=306, random_state=random_seed)

    start = 0
    stop = len(test_set['comment'])
    _run_sentiment_eval(
        test_set['comment'].iloc[start:stop],
        test_set['label'].iloc[start:stop],
        _binary_letter,
        _BINARY_SENTIMENT_PREFIX,
        ['A. Positive', 'B. Negative'],
        '/content/drive/MyDrive/output_1011_parameter/GooglePlay_512.csv',
    )