In [1]:
import os
import pandas as pd

base_path = "42bin_haber/news"

rows = []
for category in os.listdir(base_path):
    category_path = os.path.join(base_path, category)
    if os.path.isdir(category_path):
        for file_name in os.listdir(category_path):
            if file_name.endswith(".txt"):
                file_path = os.path.join(category_path, file_name)
                with open(file_path, "r", encoding="utf-8") as f:
                    text = f.read().strip()
                    if len(text) > 30:  # çok kısa metinleri filtrele
                        rows.append({"kategori": category, "metin": text})

df = pd.DataFrame(rows)
print(df.head(), len(df))


  kategori                                              metin
0  turkiye  Kılıçdaroğlu'ndan önemli açıklama\nCHP Genel B...
1  turkiye  Ambulansa alınmayınca öldü mü?\nAksaray'da kaz...
2  turkiye  Kaçırılan kamu görevlilerini alacak heyet yola...
3  turkiye  Patriot'lar geldi, eylemler patladı\n \n Alman...
4  turkiye  İzmir'de işkence mağduruna yeniden Ağır Ceza y... 41988


In [2]:
from datasets import Dataset

dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.1)
train_data = dataset["train"]
test_data = dataset["test"]


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="meta-llama/Llama-2-7b-chat-hf",
    max_seq_length=1024,
    dtype=None,
    load_in_4bit=True # GPU belleğini yarı yarıya azaltır
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


Skipping import of cpp extensions due to incompatible torch version 2.8.0+cu128 for torchao version 0.14.0         Please see GitHub issue #2919 for more info


Switching to PyTorch attention since your Xformers is broken.

('unterminated string literal (detected at line 1122)', (1122, 1))
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.10.4: Fast Llama patching. Transformers: 4.56.2.
   \\   /|    NVIDIA GeForce RTX 4060 Ti. Num GPUs = 1. Max memory: 15.585 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [4]:
model = FastLanguageModel.get_peft_model(
    model,
    r=8,
    target_modules=["q_proj", "v_proj"],
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    # task_type="CAUSAL_LM",  <-- kaldırıldı
)


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.10.4 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [None]:
from transformers import TrainingArguments, Trainer , DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token


def tokenize(batch):
    return tokenizer(
        batch["metin"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )

train_data = train_data.map(tokenize, batched=True)
test_data = test_data.map(tokenize, batched=True)

def add_labels(example):
    example["labels"] = example["input_ids"].copy()
    return example

train_data = train_data.map(add_labels)
test_data = test_data.map(add_labels)

args = TrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    logging_steps=50,
    save_strategy="epoch",
    fp16=True,
    learning_rate=2e-4,
    warmup_steps=100,
)

trainer = Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=test_data,
    tokenizer=tokenizer,
    args=args,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

trainer.train()

Map: 100%|██████████| 37789/37789 [00:05<00:00, 6980.00 examples/s]
Map: 100%|██████████| 4199/4199 [00:00<00:00, 7028.32 examples/s]
Map: 100%|██████████| 37789/37789 [00:06<00:00, 5760.43 examples/s]
Map: 100%|██████████| 4199/4199 [00:00<00:00, 5784.51 examples/s]
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 37,789 | Num Epochs = 3 | Total steps = 7,086
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16
 "-____-"     Trainable parameters = 4,194,304 of 6,742,609,920 (0.06% trained)


Unsloth: Will smartly offload gradients to save VRAM!


In [8]:
from transformers import pipeline

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

prompt = "Dışişleri Bakanı Davutoğlu, Yunanistan ile Türkiye  ne dedi"

output = pipe(prompt,max_length=150,do_sample=True,temperature=0.8)
print(output[0]["generated_text"])

Device set to use cuda:0


Dışişleri Bakanı Davutoğlu, Yunanistan ile Türkiye  ne dedi?
Dışişleri Bakanı Ahmet Davutoğlu, Türkiye-Yunanistan ilişkilerinin arttığı günlerde önemli bir gelişme olduğunu belirterek, Türkiye ve Yunanistan arasında yeni bir dünyasının kurulmasının ve gelişmesinin ihtiyaç duyulduğunu söyledi.


In [7]:
model.save_pretrained("unsloth_llama_news")
tokenizer.save_pretrained("unsloth_llama_news")

('unsloth_llama_news/tokenizer_config.json',
 'unsloth_llama_news/special_tokens_map.json',
 'unsloth_llama_news/chat_template.jinja',
 'unsloth_llama_news/tokenizer.model',
 'unsloth_llama_news/added_tokens.json',
 'unsloth_llama_news/tokenizer.json')