In [1]:
from time import time
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from IPython.display import display, Markdown

In [2]:
from huggingface_hub import login
login(token = "")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
model = "/kaggle/input/llama-3/transformers/8b-chat-hf/1"

pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    torch_dtype=torch.float16,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [59]:
from time import time
def query_model(
        system_message,
        user_message,
        temperature=0.1,
        max_length=50,
        top_p = 0.3
        ):
    user_message = "Question: " + user_message + " Answer:"
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
        ]
    prompt = pipeline.tokenizer.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=True
        )
    terminators = [
        pipeline.tokenizer.eos_token_id,
        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]
    sequences = pipeline(
        prompt,
        do_sample=True,
        top_p=top_p,
        temperature=temperature,
        #num_return_sequences=1,
        eos_token_id=terminators,
        max_new_tokens=max_length,
        return_full_text=False,
        pad_token_id=pipeline.model.config.eos_token_id
    )
    #answer = f"{sequences[0]['generated_text'][len(prompt):]}\n"
    answer = sequences[0]['generated_text']

    return user_message + " " + answer


system_message = """
You are an AI assistant designed to answer simple questions.
Please restrict your answer to the exact question asked.
"""

In [6]:
def colorize_text(text):
    for word, color in zip(["Reasoning", "Question", "Answer", "Total time"], ["blue", "red", "green", "magenta"]):
        text = text.replace(f"{word}:", f"\n\n**<font color='{color}'>{word}:</font>**")
    return text

In [85]:
import pandas as pd
df = pd.read_csv('/kaggle/input/cwi-dataset/cwi_training.txt', delimiter = "\t", header = None)
df.columns = ["sentence", "token", "index", "score"]

In [70]:
from sklearn.metrics import accuracy_score, recall_score, f1_score

def g_score(labels, predicted):

  accuracy = accuracy_score(labels, predicted)
  recall = recall_score(labels, predicted)

  return (2 * accuracy * recall) / (accuracy + recall)

In [67]:
from tqdm.notebook import tqdm

df = df.iloc[:35]
result = []
labels = df.score.values

for index, row in tqdm(df.iterrows()):
     response = query_model(
        system_message,
        user_message=f"""Question: Considering that even common words can be complex in contexts, is the word '{row['token']}' in the following sentence considered complex for a non-native English speaker? Please answer with 'Yes' or 'No'.
            Context: '{row['sentence']}' 
        """,
        temperature = 0.1,
        max_length=25)
     result.append(response[-3:])

0it [00:00, ?it/s]

In [68]:
results = [1 if i.rstrip() == "Yes" else 0 for i in result]

In [78]:
print("accuracy", accuracy_score(labels, results) * 100)
print("recall", recall_score(labels, results) * 100)
print("F1-Score", f1_score(labels, results) * 100)
print("G-Score", g_score(labels, results) * 100)

accuracy 65.71428571428571
recall 76.0
F1-Score 76.0
G-Score 70.48387096774194


In [86]:
from tqdm.notebook import tqdm

df = df.iloc[-50:]
result = []
labels = df.score.values

for index, row in tqdm(df.iterrows()):
     response = query_model(
        system_message,
        user_message=f"""Question: Considering that even common words can be complex in contexts, is the word '{row['token']}' in the following sentence considered complex for a non-native English speaker? Please answer with 'Yes' or 'No'.
            Context: '{row['sentence']}' 
        """,
        temperature = 0.1,
        max_length=25)
     result.append(response[-3:])

0it [00:00, ?it/s]

In [87]:
results = [1 if i.rstrip() == "Yes" else 0 for i in result]

In [88]:
print("accuracy", accuracy_score(labels, results) * 100)
print("recall", recall_score(labels, results) * 100)
print("F1-Score", f1_score(labels, results) * 100)
print("G-Score", g_score(labels, results) * 100)

accuracy 86.0
recall 68.75
F1-Score 75.86206896551724
G-Score 76.41357027463651
