In [None]:
import os
os.chdir("..")
from datasets import load_dataset
import os
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from glob import glob
import re

In [None]:
def load_dataset_from_folder(data_folder):
    """Collect text samples from every ``*.txt`` file directly inside a folder.

    Each file is read as UTF-8 and split on the literal ``<s>`` marker. Every
    piece is stripped of surrounding whitespace, and pieces of 10 characters
    or fewer are discarded.

    Files are visited in sorted path order so the returned list is identical
    on every run and platform. ``glob`` on its own gives no ordering
    guarantee, which would make a later seeded shuffle/split of the samples
    non-reproducible.

    Args:
        data_folder: Path of the directory to scan (not recursive).

    Returns:
        list[str]: The cleaned samples, ordered by file path and then by
        position inside each file. Empty when no file matches or no piece is
        long enough.
    """
    texts = []

    # Sorted for a deterministic sample order (see docstring).
    for file_path in sorted(glob(os.path.join(data_folder, "*.txt"))):
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        for piece in content.split('<s>'):
            piece = piece.strip()
            # The length test also rejects empty pieces, e.g. the text before
            # a leading <s> marker.
            if len(piece) > 10:
                texts.append(piece)

    return texts

In [None]:
def preprocess_function(examples, tokenizer, max_length=512, padding="max_length",
                        label_pad_token_id=None):
    """Tokenize texts and attach labels for causal language modeling.

    Args:
        examples: Mapping with a ``"text"`` entry (a list of strings when used
            with ``Dataset.map(..., batched=True)``, or a single string).
        tokenizer: Callable tokenizer; it is invoked with ``truncation=True``,
            the given ``padding`` and ``max_length``, and ``return_tensors=None``.
        max_length: Maximum sequence length passed to the tokenizer.
        padding: Padding strategy forwarded to the tokenizer. Defaults to
            ``"max_length"`` so every row has the same length regardless of
            which ``map`` batch it was tokenized in. With ``True`` each batch
            is padded only to its own longest sequence, so rows from different
            batches can differ in length and cannot be stacked together with
            their (unpadded) ``labels`` column without further padding.
            The trade-off is more padding per row.
        label_pad_token_id: When not ``None``, label positions whose
            ``attention_mask`` is 0 are replaced by this value (``-100`` is the
            index ignored by PyTorch's cross-entropy loss). The default keeps
            labels as plain copies of ``input_ids`` so they remain decodable.

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding=padding,
        max_length=max_length,
        return_tensors=None
    )

    input_ids = tokenized["input_ids"]
    # A batch yields a list of rows; a single text yields one flat row.
    is_batch = bool(input_ids) and isinstance(input_ids[0], (list, tuple))
    rows = input_ids if is_batch else [input_ids]

    # Copy row by row: a shallow copy of the outer list would leave labels
    # sharing the inner lists with input_ids.
    labels = [list(row) for row in rows]

    if label_pad_token_id is not None and "attention_mask" in tokenized:
        masks = tokenized["attention_mask"]
        masks = masks if is_batch else [masks]
        labels = [
            [tok if keep else label_pad_token_id for tok, keep in zip(row, mask)]
            for row, mask in zip(labels, masks)
        ]

    tokenized["labels"] = labels if is_batch else labels[0]

    return tokenized

In [None]:
# Notebook-wide settings.
DATA_FOLDER = "data"  # directory scanned for *.txt sample files
MAX_LENGTH = 512  # max_length handed to the tokenizer during preprocessing
OUTPUT_DIR = "./qwen-vietnamese-finetuned"  # NOTE(review): presumably the final save location; confirm where it is used
MODEL_NAME = "Qwen/Qwen2-0.5B"  # Qwen2-0.5B chosen because it is more readily available

In [None]:
# Load the pretrained tokenizer and causal-LM weights for MODEL_NAME.
# NOTE(review): trust_remote_code=True allows the hub repository to run its own
# Python code while loading; keep MODEL_NAME pointed at a trusted repository.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# Do not load trainable weights in float16. The TrainingArguments in this
# notebook enable no mixed-precision mode, so float16 weights would be
# optimized directly in half precision, which is prone to overflow and NaN
# losses. bfloat16 keeps the float32 exponent range at the same memory cost;
# hardware without bfloat16 support (and CPU) falls back to float32.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    torch_dtype=(
        torch.bfloat16
        if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
        else torch.float32
    ),
)

In [None]:
# Read every sample from DATA_FOLDER and stop right here when nothing usable
# was found, instead of failing later with a less obvious error when the
# empty list is turned into a dataset and split.
texts = load_dataset_from_folder(DATA_FOLDER)
if not texts:
    raise ValueError(f"No usable text samples found in {DATA_FOLDER!r}")
print(f"Loaded {len(texts)} text samples")

In [None]:
# Wrap the raw texts in a Dataset and hold out 20% for validation; the fixed
# seed keeps the split stable for a given sample order.
dataset = Dataset.from_dict({"text": texts}).train_test_split(test_size=0.2, seed=42)


def _tokenize_batch(batch):
    """Tokenize one batch of rows with the notebook's tokenizer and MAX_LENGTH."""
    return preprocess_function(batch, tokenizer, MAX_LENGTH)


# Tokenize both splits the same way, dropping the raw "text" column.
train_dataset = dataset["train"].map(
    _tokenize_batch,
    batched=True,
    remove_columns=["text"],
)
eval_dataset = dataset["test"].map(
    _tokenize_batch,
    batched=True,
    remove_columns=["text"],
)

print(f"Training samples: {len(train_dataset)}")
print(f"Evaluation samples: {len(eval_dataset)}")

In [None]:
# Bare expression: the notebook shows the tokenized training split's repr as this cell's output.
train_dataset

In [None]:
# Show the token ids of the first training example and the text they decode to.
first_input_ids = train_dataset[0]["input_ids"]
print("Sample input_ids:", first_input_ids)
print("Original text:", tokenizer.decode(first_input_ids))

In [None]:
# Show the labels of the first training example and the text they decode to.
# The captions now name what is actually printed: the labels, not input_ids.
print("Sample labels:", train_dataset[0]["labels"])
print("Decoded labels:", tokenizer.decode(train_dataset[0]["labels"]))

In [None]:
# Check whether the first example's labels are identical to its input_ids.
first_example = train_dataset[0]
print(first_example["labels"] == first_example["input_ids"])

In [None]:
# DataCollatorForLanguageModeling is already imported in the notebook's first
# cell, so it is not imported again here.

# Fall back to EOS only when the tokenizer has no pad token of its own. The
# datasets above were already padded with the tokenizer's existing pad token;
# overwriting it here could make the collator treat a different id as padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# mlm=False selects causal (next-token) language modeling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# Trainer configuration; every option not listed keeps the library default.
training_args = TrainingArguments(
    output_dir="checkpoints",  # where Trainer writes its output
    push_to_hub=False,  # keep results local
    eval_strategy="epoch",  # evaluate once per epoch
    weight_decay=0.01,
    learning_rate=2e-5,
)


In [None]:
# Assemble the Trainer from the model, datasets, collator and arguments
# prepared in the earlier cells, then run fine-tuning.
# NOTE(review): newer transformers releases deprecate `tokenizer=` in favor of
# `processing_class=`; confirm against the installed version.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)
trainer.train()


In [None]:
# Training hyperparameters
# NOTE(review): none of these are referenced by the TrainingArguments created
# earlier in the notebook; presumably a later cell consumes them. Confirm.
BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = 4
LEARNING_RATE = 5e-5
NUM_EPOCHS = 3
WARMUP_STEPS = 100
SAVE_STEPS = 500
EVAL_STEPS = 500

print("Loading tokenizer and model...")
# NOTE(review): trust_remote_code=True allows the hub repository to run its own
# Python code while loading; keep MODEL_NAME pointed at a trusted repository.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# Do not load trainable weights in float16: optimizing half-precision weights
# directly (without a mixed-precision mode that keeps float32 master weights)
# is prone to overflow and NaN losses. bfloat16 keeps the float32 exponent
# range at the same memory cost; hardware without bfloat16 support (and CPU)
# falls back to float32.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    torch_dtype=(
        torch.bfloat16
        if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
        else torch.float32
    ),
)

In [None]:
# Add padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Resize token embeddings only if needed, i.e. when the tokenizer holds more
# entries than the embedding matrix has rows. A checkpoint may ship a matrix
# larger than the tokenizer vocabulary; resizing unconditionally would shrink
# it and change the model's vocabulary size for no benefit.
if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
    model.resize_token_embeddings(len(tokenizer))

print("Loading and preprocessing dataset...")
texts = load_dataset_from_folder(DATA_FOLDER)
# Stop here when nothing usable was found, instead of failing later with a
# less obvious error when the empty list is turned into a dataset and split.
if not texts:
    raise ValueError(f"No usable text samples found in {DATA_FOLDER!r}")
print(f"Loaded {len(texts)} text samples")

In [None]:
 # Create dataset
# Wrap the raw texts and hold out 20% for validation (fixed seed).
dataset = Dataset.from_dict({"text": texts}).train_test_split(test_size=0.2, seed=42)

# Tokenize the train split first, then the held-out split, with identical
# settings; the raw "text" column is dropped from both.
tokenized_splits = {
    split_name: dataset[split_name].map(
        lambda batch: preprocess_function(batch, tokenizer, MAX_LENGTH),
        batched=True,
        remove_columns=["text"],
    )
    for split_name in ("train", "test")
}
train_dataset = tokenized_splits["train"]
eval_dataset = tokenized_splits["test"]

print(f"Training samples: {len(train_dataset)}")
print(f"Evaluation samples: {len(eval_dataset)}")