In [16]:
import warnings
import os
import sys

# Silence all warnings for the rest of the session so library deprecation
# notices do not flood the cell outputs.
warnings.filterwarnings('ignore')

# os.getcwd() yields the same value as the %pwd magic, but is plain Python and
# therefore also works when the notebook is exported and run as a script.
current_dir = os.getcwd()

# Make the repository root (two levels above this notebook) importable so that
# `src.*` modules can be resolved. The membership test keeps sys.path free of
# duplicate entries when this cell is executed more than once.
parent_dir = os.path.abspath(os.path.join(current_dir, '../..'))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [17]:
from transformers import pipeline
import torch
import outlines
import pandas as pd
from tqdm import tqdm

# Never hardcode credentials in a notebook: read the Hugging Face token from
# the HF_TOKEN environment variable instead. If the variable is unset this is
# None, which is passed on as `token=None`; presumably the Hugging Face
# libraries then fall back to credentials stored by `huggingface-cli login`
# (confirm against the installed transformers version).
# NOTE(review): the token previously committed here must be treated as leaked
# and revoked in the Hugging Face account settings.
access_token = os.environ.get("HF_TOKEN")

# Gated checkpoint used by every model-loading cell below.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

In [18]:
from src.main.utilities.utils import get_dataset

# Ask the project helper for the dataset with one_hot disabled and unpack the
# pair it returns into the texts to classify and their targets.
inputs, targets = get_dataset(one_hot=False)

# Category names offered to the model in the single-label prompts.
choices = "Entertainment Life Politics Sports Voices".split()


def make_prompt(sentence, choices):
    """Build the single-label classification prompt for one news item.

    Args:
        sentence: Text of the news item; it is quoted verbatim in the prompt.
        choices: Iterable of category names (strings), listed comma-separated.

    Returns:
        A three-line prompt string ending in ``"Answer: "``. The second and
        third lines start with four spaces, matching the original layout.
    """
    category_list = ", ".join(choices)
    return (
        f"Classify this news in one of the category: '{sentence}'.\n"
        f"    Choose between the following categories: {category_list}.\n"
        "    Answer: "
    )

In [19]:
# Text-generation pipeline for `model_id`: weights requested in float16,
# device placement delegated to device_map="auto", sampling disabled, and the
# Hub token forwarded for authentication.
pipe = pipeline(
    "text-generation",
    model=model_id,
    token=access_token,
    device_map="auto",
    model_kwargs={"torch_dtype": torch.float16},
    do_sample=False,
)

Downloading shards:   0%|          | 0/4 [03:05<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Query the pipeline once per input and keep only the newly generated text
# (return_full_text=False leaves the prompt out of the result).
model_replies = []
for news in inputs:
    generations = pipe(
        make_prompt(news, choices),
        max_new_tokens=20,
        return_full_text=False,
    )
    first_generation = generations[0]
    model_replies.append(first_generation['generated_text'])

In [None]:
import pandas as pd
# Pair every input with the reply the model produced for it.
# The "sentence" column must come from the full `inputs` collection: the loop
# variable `news` only holds the last item, which pandas would broadcast to
# every row.
d = {"sentence": inputs, "model reply": model_replies}
df = pd.DataFrame(d)

# Show the first raw reply in full, then the whole table.
print(df['model reply'][0])
df

### Use [Outlines](https://outlines-dev.github.io/outlines/) to constrain generation to specific tokens

In [None]:
# Drop the reference to the transformers pipeline so its model can be freed
# before the same checkpoint is loaded again through Outlines.
del pipe

# Load the checkpoint through Outlines, in float16, on the CUDA device.
model = outlines.models.transformers(
    model_id,
    device="cuda",
    model_kwargs={"torch_dtype": torch.float16},
)

# Generator whose reply is restricted to one of the entries in `choices`.
generator = outlines.generate.choice(model, choices)

In [None]:
# Classify every input with the constrained generator; each reply will be one
# of the entries in `choices`.
outlines_replies = []
for news in inputs:
    outlines_replies.append(generator(make_prompt(news, choices)))

In [None]:
# Put the constrained replies next to the free-form ones and show the table.
df = df.assign(**{'outlines replies': outlines_replies})
df

## Multilabel classification with an LLM prompt and Outlines

In [None]:
import outlines
from tqdm import tqdm

# Release every model object left over from earlier cells *before* loading a
# fresh copy. Rebinding `model` directly would evaluate the right-hand side
# first, so the old and the new weights would both be referenced during the
# load. pop() with a default tolerates names that were never created or were
# already deleted, which a plain `del` would not.
for _stale_name in ("pipe", "model", "generator"):
    globals().pop(_stale_name, None)

# empty_cache has to be *called*; a bare `torch.cuda.empty_cache` expression
# only references the function and does nothing.
torch.cuda.empty_cache()

# load the model with outlines
model = outlines.models.transformers(
    model_id,
    model_kwargs={"torch_dtype": torch.float16},
    device="cuda",
)

# load a generator with the model and possible choices
# NOTE: this rebinds the notebook-level `choices` (previously the category
# names) to the two answers allowed in the multilabel setting.
choices = ['True', 'False']
generator = outlines.generate.choice(model, choices)

# Categories that are each judged True/False for every news item.
labels = ["Entertainment", "Life", "Politics", "Sports", "Voices"]

def make_prompt(news, choices, answers=None):
    """Build classification prompt(s) for one news item.

    The multilabel cell calls this with three arguments (news, category
    names, allowed answers), which the two-parameter version could not accept.

    Args:
        news: Text of the news item; it is quoted verbatim in the prompt.
        choices: Iterable of category names (strings).
        answers: Optional iterable of allowed answer strings, e.g.
            ``['True', 'False']``. When given, one prompt per category is
            produced, asking whether the news belongs to that category.

    Returns:
        With ``answers``: a list of prompt strings, one per entry of
        ``choices`` and in the same order.
        Without ``answers``: the single prompt string the two-argument form
        has always returned, where ``choices`` doubles as the answer list.
    """
    if answers is None:
        # Two-argument form: output kept byte-identical for existing callers.
        listed = ", ".join(choices)
        alternatives = " or ".join(choices)
        return (
            f"Classify this news in one of the category: '{news}'.\n"
            f"    Choose between the following categories: {listed}.\n"
            f"    Answer {alternatives}."
        )

    answer_hint = " or ".join(answers)
    return [
        f"Classify this news: '{news}'.\n"
        f"Does it belong to the category '{label}'?\n"
        f"Answer {answer_hint}."
        for label in choices
    ]

# Collect the generator output for every news item.
# Iterate over `inputs`, the full collection of texts: `news` is only the
# leftover loop variable of an earlier cell (a single string), so looping over
# it would walk through that string character by character.
outlines_replies = []
for sentence in tqdm(inputs):
    prompts = make_prompt(sentence, labels, choices)
    # generate reply with generator (will be one of choices)
    predictions = generator(prompts)
    outlines_replies.append(predictions)

In [None]:
import pandas as pd
from IPython.display import display, Markdown, Latex

# Section title rendered as centered Markdown.
display(Markdown("# <center>LlaMA-3</center>"))

# Show cell contents in full instead of truncating long strings.
pd.set_option('display.max_colwidth', None)

# One row per news item and one column per label; this requires every entry of
# `outlines_replies` to hold exactly len(labels) predictions.
df = pd.DataFrame(outlines_replies, columns=labels)

# Attach the source texts from `inputs`; `news` is just the last item left
# over from an earlier loop and would be broadcast to every row.
df['sentence'] = inputs

# Display only the per-label predictions, reusing `labels` so the column list
# cannot drift from the one used to build the frame.
display(df[labels])