<a href="https://colab.research.google.com/github/Canonik/Aluminum-Trump-assignments-/blob/main/trump_2_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Install or upgrade the Hugging Face packages used below. The leading "!" is
# an IPython shell escape, so this line only works inside a notebook cell.
!pip install -q --upgrade transformers datasets peft accelerate

import random
import time

import numpy as np
import torch

# Seed every random number generator in use (PyTorch, NumPy, stdlib) with the
# same value so repeated runs start from the same state.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from google.colab import files


# Ask for the training file through the Colab upload widget, then read it as
# a JSON dataset and show its structure.
# NOTE(review): the load path is hard-coded and the `uploaded` result is not
# consulted -- confirm the uploaded file really ends up at this location.
DATA_FILE = "/content/trump_phrases.json"
uploaded = files.upload()
dataset = load_dataset("json", data_files=DATA_FILE)
print(dataset)

# Checkpoint that is fine-tuned below.
MODEL_NAME = "google/flan-t5-large"


# Load the matching tokenizer; fall back to the EOS token for padding if the
# tokenizer does not define a pad token of its own.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the base seq2seq model in half precision and let accelerate place it.
# `dtype` replaces the `torch_dtype` keyword, which the upgraded transformers
# release installed at the top of this notebook reports as deprecated.
# NOTE(review): T5-family checkpoints are often reported to be numerically
# unstable in float16 -- verify the training loss stays finite; bfloat16 may
# be a safer choice on hardware that supports it.
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    dtype=torch.float16,
    device_map="auto",
)


# LoRA hyper-parameters: rank-8 adapters (alpha 16, dropout 0.05) attached to
# the modules named "q" and "v", with no bias terms trained.
lora_settings = {
    "r": 8,
    "lora_alpha": 16,
    "target_modules": ["q", "v"],
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "SEQ_2_SEQ_LM",
}
lora_config = LoraConfig(**lora_settings)

# Wrap the base model with the adapters and put it in training mode.
model = get_peft_model(base_model, lora_config)
model.train()

def format_example(example):
    """Turn one dataset row into model inputs and labels.

    The row's ``"text"`` field is tokenized (truncated and padded to exactly
    100 tokens) and used both as the encoder input and as the target. The
    labels are a copy of the input ids in which every padding position is
    replaced by -100, the conventional "ignore" index for the loss.

    Args:
        example: Mapping with a ``"text"`` entry holding the phrase.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``labels``, each a
        1-D ``torch.long`` tensor.
    """
    phrase = example["text"]

    # Input and target are the same phrase tokenized with the same settings,
    # so a single tokenizer call serves both.
    encoded = tokenizer(
        phrase,
        truncation=True,
        padding="max_length",
        max_length=100,
    )
    input_ids = encoded["input_ids"]

    # Look the pad id up once instead of once per token.
    pad_id = tokenizer.pad_token_id
    labels = [-100 if tid == pad_id else tid for tid in input_ids]

    return {
        "input_ids": torch.tensor(input_ids, dtype=torch.long),
        "attention_mask": torch.tensor(encoded["attention_mask"], dtype=torch.long),
        "labels": torch.tensor(labels, dtype=torch.long),
    }


# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)

# Tokenize every row of the train split; no evaluation split is used.
train_data = dataset["train"].map(format_example)
eval_data = None

# Training hyper-parameters, grouped by concern before being handed to
# Seq2SeqTrainingArguments.
optimisation_options = {
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 4,  # 1 x 4 = 4 examples per optimizer step per device
    "learning_rate": 2e-4,
    "num_train_epochs": 2,
    "fp16": True,
}
bookkeeping_options = {
    "output_dir": "./results",
    "logging_steps": 20,
    "save_steps": 200,
    "save_total_limit": 2,
    "predict_with_generate": True,
    "report_to": "none",
}
training_args = Seq2SeqTrainingArguments(
    **bookkeeping_options,
    **optimisation_options,
)

# Build the trainer. The tokenizer is passed as `processing_class`: the
# `tokenizer=` keyword is deprecated on the Trainer classes in recent
# transformers releases (this notebook upgrades to the latest one), and the
# captured cell output contains a warning pointing at this constructor call.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    processing_class=tokenizer,
    data_collator=data_collator,
)


# Run the fine-tuning and report how long it took in seconds.
start_time = time.time()
trainer.train()
end_time = time.time()
elapsed_seconds = end_time - start_time
print(f"Total training time: {elapsed_seconds:.2f}s")



# Save the PEFT-wrapped model and the tokenizer into the same directory.
# NOTE(review): the directory name says "flan-t5-small" while MODEL_NAME is
# "google/flan-t5-large" -- confirm the name is intentional before renaming,
# since other code may load from this path.
ADAPTER_DIR = "/content/neutralizer-lora-flan-t5-small"
for artifact in (model, tokenizer):
    artifact.save_pretrained(ADAPTER_DIR)



# Prompts used for a quick qualitative check of the fine-tuned model: each
# topic appears once as a short question and once as a detailed instruction.
sample_texts = [
    "What about COVID-19?",
    "Summarize Trump’s statements on the COVID-19 pandemic, focusing on his views on the government’s response, the role of China, and measures he promoted.",
    "How’s the economy?",
    "Summarize Trump’s position on the U.S. economy, including his claims about growth, unemployment, and stock market performance, with references to transcript passages.",
    "What about tariffs?",
    "Summarize Trump’s stance on tariffs, especially toward China, including his stated goals, justifications, and expected outcomes, citing transcript passages when possible.",
    "What about immigration?",
    "Summarize Trump’s position on immigration policy, highlighting his views on border security, the wall, and restrictions on entry, with supporting quotes.",
    "What about national security?",
    "Summarize Trump’s views on national security, focusing on military strength, terrorism, and foreign policy challenges, and reference transcript evidence when relevant."
]

# The model was put in training mode earlier and training leaves it there, so
# dropout (including the LoRA dropout) would still be active while generating.
# Switch to evaluation mode so the beam-search outputs are deterministic.
model.eval()

for text in sample_texts:
    # Tokenize the prompt and move the tensors to the model's device.
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    with torch.no_grad():  # no gradients are needed for inference
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            min_length=20,
            num_beams=4,
            do_sample=False,
        )
    print("Query:", text)
    print("Answer:", tokenizer.decode(outputs[0], skip_special_tokens=True))
    print("------")



[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m42.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m54.1 MB/s[0m eta [36m0:00:00[0m
[?25h

Saving trump_phrases.json to trump_phrases.json


Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 4359
    })
})


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/4359 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


# Nuova sezione