In [1]:
import os
from time import time

import torch
import transformers
from IPython.display import display, Markdown
from transformers import AutoTokenizer, AutoModelForCausalLM

In [2]:
from huggingface_hub import login
# Authenticate against the Hugging Face Hub without embedding a secret in the
# notebook: the token is read from the HF_TOKEN environment variable.
# When HF_TOKEN is unset, token=None is passed and login() asks for the token
# interactively instead of failing on an empty string.
login(token=os.environ.get("HF_TOKEN"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
# Local Kaggle copy of the Llama 3 8B chat checkpoint (Transformers format).
model = "/kaggle/input/llama-3/transformers/8b-chat-hf/1"

# Build the text-generation pipeline once; every later cell reuses it.
# Weights are loaded in float16 and device placement is left to device_map="auto".
pipeline = transformers.pipeline(
    task="text-generation",
    model=model,
    torch_dtype=torch.float16,
    device_map="auto",
)

2024-06-20 09:37:52.246041: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-20 09:37:52.246176: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-20 09:37:52.362443: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
def query_model(
        system_message,
        user_message,
        temperature=0.1,
        max_length=50,
        top_p = 0.9
        ):
    """Ask the chat model one question and return a formatted transcript.

    The user message is wrapped as "Question: <user_message> Answer:", rendered
    through the tokenizer's chat template together with the system message, and
    sent to the module-level ``pipeline``.

    Args:
        system_message: Content of the "system" chat turn.
        user_message: The raw question text (without the "Question:" prefix).
        temperature: Sampling temperature. A value that is ``None`` or not
            strictly positive switches to greedy decoding, because sampling
            with a temperature of 0 is rejected by the generation code.
        max_length: Maximum number of NEW tokens to generate (forwarded as
            ``max_new_tokens``; the prompt length is not counted).
        top_p: Nucleus-sampling threshold, only used when sampling.

    Returns:
        A single string of the form
        "Question: <user_message> Answer: <generated text> Total time: <s> sec.".
    """
    start_time = time()
    user_message = "Question: " + user_message + " Answer:"
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
        ]
    prompt = pipeline.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
        )
    # Stop on either the regular EOS token or the end-of-turn marker.
    terminators = [
        pipeline.tokenizer.eos_token_id,
        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]
    # Only sample when the temperature is usable; otherwise decode greedily and
    # do not forward the sampling-only arguments.
    if temperature is not None and temperature > 0:
        sampling_kwargs = {
            "do_sample": True,
            "top_p": top_p,
            "temperature": temperature,
        }
    else:
        sampling_kwargs = {"do_sample": False}
    sequences = pipeline(
        prompt,
        eos_token_id=terminators,
        max_new_tokens=max_length,
        # return_full_text=False yields only the continuation, not the prompt.
        return_full_text=False,
        pad_token_id=pipeline.model.config.eos_token_id,
        **sampling_kwargs
    )
    answer = sequences[0]['generated_text']
    end_time = time()
    ttime = f"Total time: {round(end_time-start_time, 2)} sec."

    return user_message + " " + answer  + " " +  ttime


# System prompt shared by every query_model call in this notebook: it asks the
# model to answer only the exact question that was posed.
system_message = """
You are an AI assistant designed to answer simple questions.
Please restrict your answer to the exact question asked.
"""

In [5]:
def colorize_text(text):
    """Highlight the transcript section labels for Markdown rendering.

    Every occurrence of "Reasoning:", "Question:", "Answer:" and "Total time:"
    is replaced by the same label preceded by a blank line and wrapped in a
    bold, coloured ``<font>`` tag. Text without any of these labels is returned
    unchanged.
    """
    label_colors = {
        "Reasoning": "blue",
        "Question": "red",
        "Answer": "green",
        "Total time": "magenta",
    }
    highlighted = text
    for label, shade in label_colors.items():
        marker = label + ":"
        styled = "\n\n**<font color='" + shade + "'>" + marker + "</font>**"
        highlighted = highlighted.replace(marker, styled)
    return highlighted

In [8]:
# Sanity check: ask the model whether it can perform the task at all before
# running it over the dataset. The wording is kept exactly as it was sent.
capability_question = "Can you tell if word in sentence is lexically complex or no? I will give you sentence and you predict the lexical complexity between 0 and 1, outputting inly one number, can you do it?"

response = query_model(
    system_message,
    user_message=capability_question,
    temperature=0.1,
    max_length=256,
)
display(Markdown(colorize_text(response)))



**<font color='red'>Question:</font>** Can you tell if word in sentence is lexically complex or no? I will give you sentence and you predict the lexical complexity between 0 and 1, outputting inly one number, can you do it? 

**<font color='green'>Answer:</font>** Yes, I can predict the lexical complexity of a word in a sentence. Please provide the sentence, and I'll output a number between 0 and 1, where 0 represents a simple word and 1 represents a complex word. 

**<font color='magenta'>Total time:</font>** 31.16 sec.

In [6]:
# Download the single-word train split (sub-task 1) from the
# neilrs123/Lexical-Complexity-Prediction GitHub repository into train.tsv.
!wget -O train.tsv "https://raw.githubusercontent.com/neilrs123/Lexical-Complexity-Prediction/master/Dataset/Sub-task%201/lcp_single_train.tsv"

# Download the matching trial split from the same repository into trial.tsv.
!wget -O trial.tsv "https://raw.githubusercontent.com/neilrs123/Lexical-Complexity-Prediction/master/Dataset/Sub-task%201/lcp_single_trial.tsv"

  pid, fd = os.forkpty()


--2024-06-18 23:26:24--  https://raw.githubusercontent.com/neilrs123/Lexical-Complexity-Prediction/master/Dataset/Sub-task%201/lcp_single_train.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1746979 (1.7M) [text/plain]
Saving to: 'train.tsv'


2024-06-18 23:26:24 (71.8 MB/s) - 'train.tsv' saved [1746979/1746979]

--2024-06-18 23:26:25--  https://raw.githubusercontent.com/neilrs123/Lexical-Complexity-Prediction/master/Dataset/Sub-task%201/lcp_single_trial.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 97235 (95K) [text/pl

In [11]:
import pandas as pd
# Read both tab-separated splits downloaded above.
train = pd.read_table('train.tsv')
trial = pd.read_table('trial.tsv')
# `df` is kept as the (train, trial) pair, exactly as the chained assignment produced it.
df = (train, trial)

In [12]:
# Keep only the rows from the bible corpus and drop the bookkeeping columns.
# The corpus column is named "corpus" in the train split and "subcorpus" in the trial split.
# NOTE: this cell overwrites `train`/`trial`, so it can only be run once per load.
train = train.loc[train["corpus"] == "bible"].drop(columns=["corpus", "id"])
trial = trial.loc[trial["subcorpus"] == "bible"].drop(columns=["subcorpus", "id"])

In [65]:
import re
import warnings

from datasets import Dataset
from tqdm import tqdm
from transformers import set_seed

# Evaluation subset: the first 35 rows of the filtered trial split.
data = Dataset.from_pandas(trial.head(35))

# Silence all Python warnings for the remaining cells.
warnings.filterwarnings('ignore')

# Fix the random seed so sampled generations are repeatable.
set_seed(9)

In [None]:
# Note: Llama 3 at temperature 0.5 with the best prompt so far reached 34%.

In [79]:
# Any non-negative decimal number, e.g. "0.25", ".3" or "1".
_NUMBER_RE = re.compile(r"\d*\.\d+|\d+")


def _parse_complexity(response, question):
    """Return the first number in [0, 1] from the model's answer, or None.

    query_model returns "Question: <question> Answer: <answer> Total time: <t> sec.".
    The echoed question (which itself contains numbers such as 0, 1 and 0.371)
    and the timing suffix are stripped first so that only the generated answer
    is searched.
    """
    prefix = "Question: " + question + " Answer:"
    if response.startswith(prefix):
        answer = response[len(prefix):]
    else:
        # Unexpected layout: fall back to whatever follows the last "Answer:".
        answer = response.rpartition("Answer:")[2]
    head, sep, _ = answer.rpartition(" Total time:")
    if sep:
        answer = head
    for number in _NUMBER_RE.findall(answer):
        value = float(number)
        if 0.0 <= value <= 1.0:
            return value
    return None


# One entry per row of `data`: the predicted complexity, or None when the
# model's answer contained no usable number.
results = []
for row in tqdm(data):
    token, sentence = row["token"], row["sentence"]

    # Prompt text is sent verbatim (including its original wording and layout).
    question = f"""Estimate the lexical complexity of the token in the context, give one single float value between 0 and 1
            where 0 means the lowest lexical complexity and 1 means the highest lexical complexity (very complex).
            the third quartile at 0.371 meaning most of the tokens complexity below 0.371.
            use all possible features to accuratly estimate it:
            The Context: {sentence}
            Token: {token}"""
    response = query_model(
        system_message,
        user_message=question,
        temperature=0.5,
        max_length=25,
    )
    results.append(_parse_complexity(response, question))

100%|██████████| 35/35 [01:53<00:00,  3.23s/it]


In [80]:
from scipy.stats import pearsonr
# Correlate predictions with the gold complexity scores. Rows whose answer
# could not be parsed are stored as None in `results`; they are dropped here
# (together with their gold score) because pearsonr cannot handle None.
scored_pairs = [
    (predicted, gold)
    for predicted, gold in zip(results, data["complexity"])
    if predicted is not None
]
if len(scored_pairs) < 2:
    raise ValueError("Need at least two parsed predictions to compute a correlation.")
predicted_scores, gold_scores = zip(*scored_pairs)
pearsonr(predicted_scores, gold_scores)

PearsonRResult(statistic=0.22951723108012925, pvalue=0.18473522973917983)

In [None]:
    user_message=f"""Estimate the lexical complexity of the token in the context, give one single float value between 0 and 1
            the third quartile at 0.371 meaning most of the tokens complexity below 0.371.
            use all possible features to accuratly estimate it:
            The Context: {row['sentence']}
            Token: {row['token']}""",
    temperature = 0.5,
    max_length=25)
results

In [None]:
# Best result so far: 34%