In [1]:
from datasets import Dataset, DatasetDict
import pandas as pd
import os
from sklearn.model_selection import train_test_split




  from .autonotebook import tqdm as notebook_tqdm


In [None]:
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

path = os.path.join('data', 'All_capped_keywords.csv')

df = pd.read_csv(path)



In [8]:
# Split into train/test (90% train, 10% test)
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

# Convert to Hugging Face Dataset objects
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))

# Create a DatasetDict like in the SFT notebook
dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

dataset

DatasetDict({
    train: Dataset({
        features: ['paperID', 'venue', 'year', 'openAccessPdf', 'url', 'authors', 'referenceCount', 'title', 'abstract', 'conclusion', 'Chatgpt Response', 'Key Takeaways', 'Importance', 'Model/Method Proposed', 'Performance', 'Effectiveness', 'Future Works', 'Sentiment', 'Sentiment Score', 'combined', 'combined_keywords', 'response_keywords', 'future_work_keywords', 'capped_keywords', 'field'],
        num_rows: 82727
    })
    test: Dataset({
        features: ['paperID', 'venue', 'year', 'openAccessPdf', 'url', 'authors', 'referenceCount', 'title', 'abstract', 'conclusion', 'Chatgpt Response', 'Key Takeaways', 'Importance', 'Model/Method Proposed', 'Performance', 'Effectiveness', 'Future Works', 'Sentiment', 'Sentiment Score', 'combined', 'combined_keywords', 'response_keywords', 'future_work_keywords', 'capped_keywords', 'field'],
        num_rows: 9192
    })
})

### an example

In [9]:
example = dataset["train"][0]
print(example.keys())

dict_keys(['paperID', 'venue', 'year', 'openAccessPdf', 'url', 'authors', 'referenceCount', 'title', 'abstract', 'conclusion', 'Chatgpt Response', 'Key Takeaways', 'Importance', 'Model/Method Proposed', 'Performance', 'Effectiveness', 'Future Works', 'Sentiment', 'Sentiment Score', 'combined', 'combined_keywords', 'response_keywords', 'future_work_keywords', 'capped_keywords', 'field'])


### print the abstract from this example

In [10]:
abstract = example["abstract"]
print(abstract)

Ambiguity is a major obstacle to providing services based on sentence classification. However, because of the structural limitations of the service, there may not be sufficient contextual information to resolve the ambiguity. In this situation, we focus on ambiguity detection so that service design considering ambiguity is possible. We utilize similarity in a semantic space to detect ambiguity in service scenarios and training data. In addition, we apply task-specific embedding to improve performance. Our results demonstrate that ambiguities and resulting labeling errors in training data or scenarios can be detected. Additionally, we confirm that it can be used to debug services


### load the tokenizer

In [11]:
from transformers import AutoTokenizer


In [12]:

model_id = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# required for training
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Set reasonable max length for abstracts + titles
if tokenizer.model_max_length > 100_000:
    tokenizer.model_max_length = 600

# Define a simple instruction template for supervised fine-tuning
# The model input will be the abstract, output will be the title
DEFAULT_TITLE_TEMPLATE = "Abstract: {abstract}\nTitle:"

tokenizer.title_template = DEFAULT_TITLE_TEMPLATE

In [13]:
import re
import random
from multiprocessing import cpu_count

In [14]:
def apply_article_template(example, tokenizer):
      
    example["description"] = f"Abstract: {example['abstract']}\nTitle: {example['title']}"
    return example


# Remove other columns after creating the text
column_names = list(dataset['train'].features)
dataset = dataset.map(apply_article_template,
                                num_proc=cpu_count(),
                                fn_kwargs={"tokenizer": tokenizer},
                                remove_columns=column_names,
                                desc="Applying article template",)

# create the splits
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

for index in random.sample(range(len(dataset["train"])), 3):
  print(f"Sample {index} of the processed training set:\n\n{dataset['train'][index]['description']}")

Applying article template (num_proc=8):   0%|          | 0/82727 [03:28<?, ? examples/s]


RuntimeError: One of the subprocesses has abruptly died during map operation.To debug the error, disable multiprocessing.

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['description'],
        num_rows: 82727
    })
    test: Dataset({
        features: ['description'],
        num_rows: 9192
    })
})

### define the model arguments

In [None]:
from transformers import BitsAndBytesConfig
import torch

# specify how to quantize the model
# quantization_config = BitsAndBytesConfig(
#             load_in_4bit=True,
#             bnb_4bit_quant_type="nf4",
#             bnb_4bit_compute_dtype=torch.float16,
# )

model_kwargs = dict(
    torch_dtype="auto",      # Auto will pick float32 on CPU
    use_cache=False,         # Required for gradient checkpointing
    device_map=None,         # CPU only
)

### Define SFTTrainer

In [None]:
from trl import SFTTrainer
from peft import LoraConfig
from transformers import TrainingArguments

# path where the Trainer will save its checkpoints and logs
output_dir = 'data/article-t5-small'

# based on config

training_args = TrainingArguments(
    fp16=False,                    # CPU 
    do_eval=True,
    gradient_accumulation_steps=1, # Reduce accumulation on CPU
    gradient_checkpointing=False,  
    learning_rate=2e-5,
    log_level="info",
    logging_steps=5,
    logging_strategy="steps",
    lr_scheduler_type="cosine",
    max_steps=-1,
    num_train_epochs=1,
    output_dir=output_dir,
    overwrite_output_dir=True,
    per_device_train_batch_size=1, 
    per_device_eval_batch_size=1,
    save_strategy="no",
    save_total_limit=None,
    seed=42,
    push_to_hub=False,
)



# based on config
peft_config = LoraConfig(
        r=64,
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)


trainer = SFTTrainer(
    model=model_id,
    model_init_kwargs=model_kwargs,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    tokenizer=tokenizer,
    packing=True,
    peft_config=peft_config,
    max_seq_length=tokenizer.model_max_length,
)




[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


NameError: name 'model_id' is not defined

### Train

In [None]:
train_result = trainer.train()

### save the model

In [None]:
metrics = train_result.metrics
max_train_samples = training_args.max_train_samples if training_args.max_train_samples is not None else len(train_dataset)
metrics["train_samples"] = min(max_train_samples, len(train_dataset))
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

### use the saved model


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = AutoModelForCausalLM.from_pretrained(output_dir, load_in_4bit=True, device_map="auto")

In [None]:
import torch

title = "Are Individual's political ideologies impacted by the surrounding environment"

# prepare the messages for the model
input_ids = tokenizer.apply_article_template(title, truncation=True, add_generation_prompt=True, return_tensors="pt").to("cuda")

# inference
outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])