In [1]:
from dotenv import load_dotenv
from hydra import compose, initialize
from omegaconf import OmegaConf

from supertrainer import StrictDict
from supertrainer.utils.helpers import login_hf

load_dotenv()
login_hf()

[32m2024-10-13 18:30:20.301[0m | [34m[1mDEBUG   [0m | [36msupertrainer.utils.helpers[0m:[36mlogin_hf[0m:[36m42[0m - [34m[1mUse token from environment variable HUGGINGFACE_API_KEY[0m


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/erland.fuadi/.cache/huggingface/token
Login successful


In [11]:
with initialize(config_path="../configs", version_base=None):
    cfg = compose(config_name="train", overrides=["+evaluation=arabicbert_factcheck"])
    cfg = OmegaConf.to_container(cfg, resolve=True)
    cfg = StrictDict(cfg)
    print(cfg)

{'testing': False, 'evaluation': {'evaluation': {'model_kwargs': {'attn_implementation': 'sdpa', 'torch_dtype': 'auto', 'use_cache': False, 'device_map': None}, 'class_name': 'supertrainer.inferences.bert.BERTInference', 'classes': ['REFUTES', 'SUPPORTS', 'NOT_ENOUGH_INFO'], 'model_name': 'asafaya/bert-base-arabic'}}}


In [3]:
import torch
from supertrainer.inference.base import BaseInference
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import LoraConfig, get_peft_model
from supertrainer import types, logger

class BERTInference(BaseInference):
    def __init__(self, config: types.Config) -> None:
        super().__init__(config)

    def load_model(self) -> AutoModelForSequenceClassification:
        model = AutoModelForSequenceClassification.from_pretrained(
            self.config.inference.model_name, **self.config.inference.model_kwargs
        )
        model.to(self.device)
        model.eval()
        logger.debug("BERT model loaded and ready for inference.")
        return model

    def load_tokenizer(self) -> AutoTokenizer:
        tokenizer = AutoTokenizer.from_pretrained(self.config.inference.model_name)
        if tokenizer.pad_token is None:
            tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})
        if tokenizer.model_max_length > 100_000:
            tokenizer.model_max_length = 2048
        return tokenizer

    def preprocess(self, text: str):
        return self.tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(self.device)

    def postprocess(self, outputs):
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=-1).item()
        label = self.config.inference.id2class.get(predicted_class, "Unknown")
        return label

    def predict(self, text: str) -> str:
        inputs = self.preprocess(text)
        with torch.no_grad():
            outputs = self.model(**inputs)
        prediction = self.postprocess(outputs)
        return prediction


In [4]:
id2class = {k: v for k, v in enumerate(cfg.inference.classes)}
class2id = {v: k for k, v in id2class.items()}

with cfg.allow_modification():
    cfg.inference.id2class = id2class
    cfg.inference.class2id = class2id

In [20]:
cfg

{'testing': False,
 'inference': {'model_kwargs': {'attn_implementation': 'sdpa',
   'torch_dtype': 'auto',
   'use_cache': False,
   'device_map': None},
  'class_name': 'supertrainer.trainers.inference.bert.BERTInference',
  'classes': ['REFUTES', 'SUPPORTS', 'NOT_ENOUGH_INFO'],
  'model_name': 'FacebookAI/xlm-roberta-base',
  'id2class': {0: 'REFUTES', 1: 'SUPPORTS', 2: 'NOT_ENOUGH_INFO'},
  'class2id': {'REFUTES': 0, 'SUPPORTS': 1, 'NOT_ENOUGH_INFO': 2}}}

In [5]:
bert_inference = BERTInference(cfg)
bert_inference.predict("This is a test sentence.")

[32m2024-10-13 15:31:56.475[0m | [1mINFO    [0m | [36msupertrainer.inference.base[0m:[36mtokenizer[0m:[36m37[0m - [1mLoading tokenizer[0m
[32m2024-10-13 15:31:57.597[0m | [34m[1mDEBUG   [0m | [36msupertrainer.inference.base[0m:[36mmodel[0m:[36m28[0m - [34m[1mLoading model[0m
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[32m2024-10-13 15:32:09.072[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mload_model[0m:[36m17[0m - [34m[1mBERT model loaded and ready for inference.[0m


'SUPPORTS'

# Database

In [11]:
from datasets import load_dataset

dataset = load_dataset("Yoonseong/climatebert_factcheck")
dataset

DatasetDict({
    train: Dataset({
        features: ['claim_id', 'claim', 'evidence', 'evidence_label', 'label', 'category'],
        num_rows: 4298
    })
    test: Dataset({
        features: ['claim_id', 'claim', 'evidence', 'evidence_label', 'label', 'category'],
        num_rows: 1535
    })
    valid: Dataset({
        features: ['claim_id', 'claim', 'evidence', 'evidence_label', 'label', 'category'],
        num_rows: 1842
    })
})

In [12]:
# Select random 200 datasets
dataset = dataset["train"].shuffle(seed=42).select(range(200))

# export to csv
# dataset.to_csv("data.csv")

# Transform Dataset

In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("asafaya/bert-base-arabic")



In [4]:
tokenizer.chat_template = "{{ messages['claim'] }}. Evidence: {{ messages['evidence'] }}"

In [28]:
tokenizer.chat_template

"{{ claim + '. Evidence: ' + evidence}}"

In [35]:
tokenizer.apply_chat_template(dataset, tokenize=False)



In [30]:
dataset[0]

{'claim_id': 216,
 'claim': '[The 1990 IPCC report said] that the Antarctic ice sheets were stable',
 'evidence': 'The West Antarctic Ice Sheet is stable so long as the Ross Ice Shelf is constrained by drag along its lateral boundaries and pinned by local grounding.',
 'evidence_label': 'NOT_ENOUGH_INFO',
 'label': 'neutral',
 'category': 'Cryosphere'}

In [13]:
dataset

Dataset({
    features: ['claim_id', 'claim', 'evidence', 'evidence_label', 'label', 'category'],
    num_rows: 200
})

In [22]:
def format_dataset(examples):
    texts = [f"{claim}. Evidence: {evidence}" for claim, evidence in zip(examples["claim"], examples["evidence"])]
    labels = examples["evidence_label"]
    return {"text": texts, "labels": labels}

In [24]:
# Transform dataset to text and labels

formatted_dataset = dataset.map(format_dataset, batched=True, remove_columns=dataset.column_names)
formatted_dataset
    

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'labels'],
    num_rows: 200
})

In [27]:
from datasets import DatasetDict

formatted_dataset_dict = DatasetDict({"test": formatted_dataset})
formatted_dataset.save_to_disk("/home/erland.fuadi/Python_Project/supertrainer/assets_local/formatted_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/200 [00:00<?, ? examples/s]

In [1]:
from huggingface_hub import whoami

whoami()

{'type': 'user',
 'id': '6548ca34bab28a482e69c24a',
 'name': 'Masa-Erland',
 'fullname': 'Erland Hilman Fuadi',
 'email': 'erland@joinmasa.ai',
 'emailVerified': True,
 'canPay': False,
 'periodEnd': 1730419199,
 'isPro': False,
 'avatarUrl': '/avatars/d6f733991b3011ce53c9055f3083332f.svg',
 'orgs': [{'type': 'org',
   'id': '659e04966dc6b399d29d03e2',
   'name': 'joinmasa-ai',
   'fullname': 'Masa Research',
   'email': None,
   'canPay': False,
   'periodEnd': None,
   'avatarUrl': 'https://www.gravatar.com/avatar/cfb1ee41354e06b84514ed7af5519c6b?d=retro&size=100',
   'roleInOrg': 'write',
   'isEnterprise': False},
  {'type': 'org',
   'id': '6605a972330c04c62fa17a8e',
   'name': 'masa-research',
   'fullname': 'masa research',
   'email': None,
   'canPay': False,
   'periodEnd': None,
   'avatarUrl': 'https://www.gravatar.com/avatar/abe396516d4115bad5849f256d9eb86c?d=retro&size=100',
   'roleInOrg': 'write',
   'isEnterprise': False}],
 'auth': {'type': 'access_token',
  'accessTo

# LLM

In [3]:
import torch
from unsloth import FastLanguageModel

max_seq_length = 2048
dtype = torch.bfloat16
load_in_4bit = True

model_name = "unsloth/Llama-3.2-3B-Instruct"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

==((====))==  Unsloth 2024.9.post4: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.647 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

In [4]:
tokenizer.chat_template

'{{- bos_token }}\n{%- if custom_tools is defined %}\n    {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n    {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n    {%- if strftime_now is defined %}\n        {%- set date_string = strftime_now("%d %b %Y") %}\n    {%- else %}\n        {%- set date_string = "26 Jul 2024" %}\n    {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n    {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0][\'role\'] == \'system\' %}\n    {%- set system_message = messages[0][\'content\']|trim %}\n    {%- set messages = messages[1:] %}\n{%- else %}\n    {%- set system_message = "" %}\n{%- endif %}\n\n{#- System message #}\n{{- "<|start_header_id|>system<|end_header_id|>\\n\\n" }}\n{%- if tools is not none %}\n    {{- "Environment: ipython\\n" }}\n{%- endif %}\n{{- "Cutting

In [7]:
from unsloth import get_chat_template

tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")
tokenizer.chat_template

'{{- bos_token }}\n{%- if custom_tools is defined %}\n    {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n    {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n    {%- set date_string = "26 July 2024" %}\n{%- endif %}\n{%- if not tools is defined %}\n    {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0][\'role\'] == \'system\' %}\n    {%- set system_message = messages[0][\'content\'] %}\n    {%- set messages = messages[1:] %}\n{%- else %}\n    {%- set system_message = "" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}\n{%- if builtin_tools is defined or tools is not none %}\n    {{- "Environment: ipython\n" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n    {{- "Tools: " + builtin_tools | reject(\'equalto\', \'code_interpreter\'

In [15]:
FastLanguageModel.for_inference(model)

messages = [
    {"role": "user", "content": "Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

In [24]:
inputs["input_ids"].shape

IndexError: too many indices for tensor of dimension 2

In [26]:
outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True,
                         temperature = 1.5, min_p = 0.1)
tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens = True)

'The Fibonacci sequence is a series of numbers where a number is the sum of the two preceding ones, usually starting with 0 and 1. \n\nHowever, you provided the sequence 1, 1, 2, 3, 5, 8, which seems to start with the traditional starting numbers '

# Anthropic

In [1]:
import anthropic
from dotenv import load_dotenv

load_dotenv()

client = anthropic.Anthropic()

message = client.messages.create(
    model="claude-3-5-sonnet-20240620",
    max_tokens=1000,
    temperature=0,
    system="You are a world-class poet. Respond only with short poems.",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Why is the ocean salty?"
                }
            ]
        }
    ]
)
print(message.content)

[TextBlock(text="Waves crash and foam,\nEarth's tears flow free,\nEons of minerals\nDissolve in the sea.\n\nSalt from the land,\nCarried by streams,\nAccumulates where\nThe blue ocean gleams.", type='text')]


In [2]:
from datasets import Dataset

dataset = Dataset.load_from_disk("/home/erland.fuadi/Python_Project/supertrainer/assets_local/formatted_dataset")
dataset

Dataset({
    features: ['text', 'labels'],
    num_rows: 200
})

In [3]:
subset_dataset = dataset.select(range(5))
subset_dataset.save_to_disk("/home/erland.fuadi/Python_Project/supertrainer/assets_local/subset_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/5 [00:00<?, ? examples/s]

In [9]:
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()
client = OpenAI()

client.batches.list(limit=10)

SyncCursorPage[Batch](data=[Batch(id='batch_6711e2276af88190a92b90b5ed197cdf', completion_window='24h', created_at=1729225255, endpoint='/v1/chat/completions', input_file_id='file-EcWg38OHkKs4W73bEP3gUs5y', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1729311655, failed_at=None, finalizing_at=None, in_progress_at=1729225255, metadata={'description': 'GPT-4o-mini fact-checking'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=5))], object='list', first_id='batch_6711e2276af88190a92b90b5ed197cdf', last_id='batch_6711e2276af88190a92b90b5ed197cdf', has_more=False)

In [6]:
from instructor.batch import BatchJob
from pydantic import BaseModel 
from typing import Literal

class ClassificationResponse(BaseModel):
    label: Literal["REFUTES", "SUPPORTS", "NOT_ENOUGH_INFO"]

parsed, unparsed = BatchJob.parse_from_file(
    file_path="../output2.jsonl",
    response_model=ClassificationResponse,
)


In [7]:
parsed

[]

In [8]:
unparsed

[{'id': 'batch_req_6711ef207c808190aa8701bc5ecacd6e',
  'custom_id': 'request-1',
  'response': {'status_code': 200,
   'request_id': 'a2c2d4f3dbd835842cc26d03ea0e69cb',
   'body': {'id': 'chatcmpl-AJZXCBOzIhOn238la2SLHSoZpnvKC',
    'object': 'chat.completion',
    'created': 1729228190,
    'model': 'gpt-4o-mini-2024-07-18',
    'choices': [{'index': 0,
      'message': {'role': 'assistant',
       'content': 'The claim regarding the 1990 IPCC report stating that the Antarctic ice sheets were stable is misleading. While the 1990 report did present findings that suggested some aspects of the Antarctic ice sheets were stable at that time, it did not comprehensively state that the entire Antarctic ice sheet was stable or would remain so indefinitely',
       'refusal': None},
      'logprobs': None,
      'finish_reason': 'length'}],
    'usage': {'prompt_tokens': 81,
     'completion_tokens': 64,
     'total_tokens': 145,
     'prompt_tokens_details': {'cached_tokens': 0},
     'comple