In [None]:
import os
import shutil

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,EarlyStoppingCallback
)

# Startup banner: a 60-character rule above and below the notebook title.
print(
    "=" * 60,
    "ARABIC NAME SEGMENTATION TRAINER - WORKING VERSION",
    "=" * 60,
    sep="\n",
)

In [None]:

# Load the raw (input, target) name pairs from CSV.
# NOTE(review): '/content/...' looks like a Colab upload location; change
# DATA_PATH when running the notebook elsewhere.
DATA_PATH = '/content/arabic_name_segmentation_dataset_20k_modified.csv'
df = pd.read_csv(DATA_PATH)

# Fail fast with a clear message if the columns the later cells read
# ('input' and 'target') are not present in the file.
missing_columns = {'input', 'target'} - set(df.columns)
if missing_columns:
    raise ValueError(
        f"{DATA_PATH} is missing required column(s): {sorted(missing_columns)}"
    )



In [None]:
import pandas as pd

# Count exact duplicate rows and remember the size before cleaning.
original_length = len(df)
duplicates_count = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates_count}")

# Drop the duplicates and renumber the index in a single chained step.
df = df.drop_duplicates().reset_index(drop=True)

# Report how much the frame shrank.
new_length = len(df)
rows_removed = original_length - new_length
print(
    f"\nOriginal rows: {original_length}",
    f"Rows after removing duplicates: {new_length}",
    f"Rows removed: {rows_removed}",
    sep="\n",
)
print("\n✓ Duplicates removed and index reset")

# Preview the cleaned frame.
print("\nFirst few rows after cleaning:")
print(df.head())

In [None]:
# Pull the two text columns out of the frame as plain Python lists,
# keyed by column name.
data = {column: df[column].tolist() for column in ('input', 'target')}
print(f"Total samples: {len(data['input'])}")

In [None]:
# Drop pairs where either side is missing before splitting, so the tokenizer
# never receives NaN. A new frame is built each run (no in-place mutation),
# which keeps the cell idempotent.
clean_df = pd.DataFrame(data).dropna(subset=['input', 'target'])

# Hold out 10% for validation; for very small datasets hold out 20% instead
# so the validation set is not reduced to a single example. The size check
# uses the cleaned row count because that is what actually gets split.
if len(clean_df) > 10:
    validation_fraction = 0.1
else:
    print("WARNING: Small dataset detected. Using 80/20 split.")
    validation_fraction = 0.2

# Fixed random_state keeps the split reproducible across runs.
train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    clean_df['input'].tolist(),
    clean_df['target'].tolist(),
    test_size=validation_fraction,
    random_state=42,
)

print(f"Training samples: {len(train_inputs)}")
print(f"Validation samples: {len(val_inputs)}")

In [None]:
# ============= 2. LOAD MODEL (Use T5-small instead) =============
print("\nLoading T5-small model (more stable than ByT5)...")
model_name = "t5-small"

# The two loads are independent of each other; both resolve the same
# pretrained identifier.
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=False)

print(f"✓ Model loaded: {model_name}")

In [None]:
# ============= 3. CREATE DATASETS =============
def create_dataset(inputs, targets, max_length=128, prefix="segment arabic name: "):
    """Tokenize aligned (input, target) texts into a `Dataset` for seq2seq training.

    Uses the notebook-level `tokenizer` for both sides. Sequences are
    truncated to `max_length` tokens and left unpadded.

    Args:
        inputs: Iterable of raw input strings (any iterable is accepted and
            materialised into a list).
        targets: Iterable of target strings, aligned one-to-one with `inputs`.
        max_length: Truncation limit, in tokens, applied to inputs and targets.
        prefix: Task prefix prepended to every input before tokenization.

    Returns:
        A `Dataset` with columns `input_ids`, `attention_mask` and `labels`,
        where `labels` holds the token ids of the targets.

    Raises:
        ValueError: If `inputs` and `targets` have different lengths.
    """
    inputs = list(inputs)
    targets = list(targets)
    if len(inputs) != len(targets):
        raise ValueError(
            f"inputs and targets must have the same length "
            f"(got {len(inputs)} inputs and {len(targets)} targets)"
        )

    formatted_inputs = [f"{prefix}{text}" for text in inputs]

    # Both sides are tokenized without padding and returned as plain lists.
    input_encodings = tokenizer(
        formatted_inputs,
        padding=False,
        truncation=True,
        max_length=max_length,
        return_tensors=None
    )

    target_encodings = tokenizer(
        targets,
        padding=False,
        truncation=True,
        max_length=max_length,
        return_tensors=None
    )

    # The target token ids become the labels the model is trained to emit.
    dataset_dict = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }

    return Dataset.from_dict(dataset_dict)

print("\nCreating datasets...")
train_dataset = create_dataset(train_inputs, train_targets)
val_dataset = create_dataset(val_inputs, val_targets)

print(
    f"✓ Train dataset: {len(train_dataset)} samples",
    f"✓ Val dataset: {len(val_dataset)} samples",
    sep="\n",
)

# Sanity check: show one raw pair next to the start of its token ids.
print("\n" + "=" * 60, "SAMPLE DATA VERIFICATION:", "=" * 60, sep="\n")
sample_idx = 0
sample_row = train_dataset[sample_idx]
print(f"Input text: {train_inputs[sample_idx]}")
print(f"Target text: {train_targets[sample_idx]}")
print(f"Input tokens: {sample_row['input_ids'][:15]}...")
print(f"Label tokens: {sample_row['labels'][:10]}...")
print("=" * 60 + "\n")


In [None]:
# ============= 4. DATA COLLATOR =============
# Batches are padded at collation time (the datasets themselves are stored
# unpadded). Label positions added by padding are filled with -100.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    label_pad_token_id=-100,
)

In [None]:
# ============= 5. TRAINING ARGUMENTS =============
training_args = Seq2SeqTrainingArguments(
    # --- Where checkpoints go and how many are kept ---
    output_dir="./t5-name-segmenter",
    save_strategy="epoch",
    save_total_limit=3,

    # --- Optimisation schedule ---
    num_train_epochs=25,
    learning_rate=3e-4,
    warmup_steps=100,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Mixed precision only when a CUDA device is present.
    fp16=torch.cuda.is_available(),

    # --- Evaluation and best-checkpoint selection (by validation loss) ---
    eval_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",

    # --- Generation settings used when predicting ---
    predict_with_generate=True,
    generation_max_length=128,
    generation_num_beams=4,

    # --- Logging ---
    logging_steps=10,
    report_to="none",
)

In [None]:
# ============= 6. INITIALIZE TRAINER =============
# Training stops early once the monitored metric has failed to improve for
# three consecutive evaluations.
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)


In [None]:
# ============= 7. TRAIN =============
print("=" * 60, "STARTING TRAINING", "=" * 60, sep="\n")
print("This may take 15-30 minutes depending on your hardware...")
print()

# Run the full training loop; the returned object carries the loss summary.
train_result = trainer.train()

print("\n" + "=" * 60, "TRAINING COMPLETED!", "=" * 60, sep="\n")
print(f"Final train loss: {train_result.training_loss:.4f}")


STARTING TRAINING
This may take 15-30 minutes depending on your hardware...



Epoch,Training Loss,Validation Loss
1,0.0338,0.020765
2,0.0173,0.00655
3,0.0089,0.00614
4,0.004,0.003694
5,0.0042,0.003191
6,0.003,0.002667
7,0.0069,0.003095
8,0.0078,0.003007
9,0.0022,0.002862


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].



TRAINING COMPLETED!
Final train loss: 0.0371


In [None]:
# ============= 8. SAVE MODEL =============
print("\nSaving model...")
# Model weights and tokenizer files go into the same directory so the pair
# can be reloaded together from one path.
for saveable in (model, tokenizer):
    saveable.save_pretrained("./t5-name-segmenter-final")
print("✓ Model saved to './t5-name-segmenter-final'")

In [None]:
# Create zip file
# Requires `os` and `shutil`, which are imported in the notebook's import cell.
print("\nCreating zip file...")
zip_filename = "t5-name-segmenter-final"
# make_archive appends the '.zip' suffix itself and returns the path of the
# archive it wrote; use that path rather than rebuilding the name by hand.
archive_path = shutil.make_archive(zip_filename, 'zip', './t5-name-segmenter-final')
print(f"✓ Model zipped to '{zip_filename}.zip'")

# Get zip file size
zip_size = os.path.getsize(archive_path) / (1024 * 1024)  # Convert to MB
print(f"✓ Zip file size: {zip_size:.2f} MB")

In [None]:
# ============= 9. EVALUATION =============
print("\n" + "=" * 60, "EVALUATING MODEL", "=" * 60, sep="\n")

# Evaluate on the trainer's eval dataset and report the loss it returns.
eval_results = trainer.evaluate()
print(f"Validation Loss: {eval_results['eval_loss']:.4f}")

In [None]:

# ============= 10. INFERENCE FUNCTION =============
def segment_name(name, model, tokenizer):
    """Segment a concatenated name into its parts using the trained model.

    Args:
        name: The unsegmented name string, e.g. 'mohamedaliahmed'.
        model: Seq2seq model; must provide `eval()`, `parameters()` and
            `generate()`.
        tokenizer: Tokenizer matching `model`; used to encode the prompt and
            decode the generated ids.

    Returns:
        The decoded model output with special tokens stripped.
    """
    model.eval()

    # Same task prefix and truncation limit as used when building the
    # training data.
    input_text = f"segment arabic name: {name}"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=128, truncation=True)

    # Move to whichever device the model's parameters live on.
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # No `no_repeat_ngram_size` / `repetition_penalty` here: segmentation is
    # essentially a copy task, and names legitimately repeat parts
    # (e.g. "Mohamed Ali Mohamed"). Penalising or banning repeated n-grams
    # would force the decoder away from the correct output.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=128,
            num_beams=4,
            early_stopping=True,
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# ============= 11. TEST ON VALIDATION DATA =============
print("\n" + "="*60)
print("TESTING ON VALIDATION SAMPLES")
print("="*60 + "\n")

# Spot-check only the first few validation pairs; this is a quick smoke test,
# not a full evaluation.
correct = 0
total = min(len(val_inputs), 5)

for original, expected in zip(val_inputs[:total], val_targets[:total]):
    predicted = segment_name(original, model, tokenizer)

    # Comparison ignores surrounding whitespace and letter case.
    is_correct = predicted.strip().lower() == expected.strip().lower()
    if is_correct:
        correct += 1

    print(f"Input:     {original}")
    print(f"Expected:  {expected}")
    print(f"Predicted: {predicted}")
    print(f"Status:    {'✓ CORRECT' if is_correct else '✗ INCORRECT'}")
    print()

# Guard against an empty validation set, which would otherwise divide by zero.
if total:
    print(f"Accuracy on sample: {correct}/{total} ({100*correct/total:.1f}%)")
else:
    print("No validation samples available; accuracy not computed.")


In [None]:
# ============= 12. TEST ON NEW NAMES =============
print(
    "\n" + "=" * 60,
    "TESTING ON NEW UNSEEN NAMES",
    "=" * 60 + "\n",
    sep="\n",
)

# Hand-written concatenated names to run through the model.
test_names = [
    'mohamedaliahmed', 'hassanibrahimkhalid', 'fatimamohamedsaid',
    'abdullahomarhassan', 'khaledyoussefali',
]

# Print each raw name (left-aligned in a 25-character column) next to the
# model's segmentation.
for name in test_names:
    segmented = segment_name(name, model, tokenizer)
    print(f"{name:25s} → {segmented}")

In [None]:
# ============= 13. USAGE INSTRUCTIONS =============
# Prints a copy-pasteable snippet showing how to reload the saved model and
# run it; the snippet is emitted as text only, it is not executed here.
print("\n" + "=" * 60, "HOW TO USE THE TRAINED MODEL", "=" * 60, sep="\n")
print("""
# Load the model later:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained('./t5-name-segmenter-final')
model = T5ForConditionalGeneration.from_pretrained('./t5-name-segmenter-final')

def segment_name(name):
    inputs = tokenizer(f"segment arabic name: {name}", return_tensors="pt")
    outputs = model.generate(**inputs, max_length=128, num_beams=4)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Use it:
result = segment_name('mohamedaliahmed')
print(result)  # Mohamed Ali Ahmed
""")

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Reload the saved artefacts from disk, rebinding the notebook-level `model`
# and `tokenizer` names to the freshly loaded objects. The two loads are
# independent of each other.
model = T5ForConditionalGeneration.from_pretrained('./t5-name-segmenter-final')
tokenizer = T5Tokenizer.from_pretrained('./t5-name-segmenter-final')

def segment_name(name, model=None, tokenizer=None):
    """Segment a concatenated name with the reloaded model.

    This definition replaces the earlier three-argument `segment_name` in the
    notebook namespace, so it accepts both call styles: `segment_name(name)`
    uses the notebook-level `model` and `tokenizer`, while
    `segment_name(name, model, tokenizer)` uses the objects passed in.

    Args:
        name: The unsegmented name string, e.g. 'mohamedaliahmed'.
        model: Optional seq2seq model; defaults to the notebook-level `model`.
        tokenizer: Optional tokenizer; defaults to the notebook-level
            `tokenizer`.

    Returns:
        The decoded model output with special tokens stripped.
    """
    # The parameters shadow the notebook-level names, so the fallback has to
    # be looked up through the module globals explicitly.
    if model is None:
        model = globals()["model"]
    if tokenizer is None:
        tokenizer = globals()["tokenizer"]

    # Make sure dropout is off even if a freshly trained model is passed in.
    model.eval()

    # Same prefix and truncation limit as used when building the training data.
    inputs = tokenizer(
        f"segment arabic name: {name}",
        return_tensors="pt",
        max_length=128,
        truncation=True,
    )

    # Put the inputs on the same device as the model's parameters.
    device = next(model.parameters()).device
    inputs = {key: tensor.to(device) for key, tensor in inputs.items()}

    outputs = model.generate(**inputs, max_length=128, num_beams=4)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Quick smoke test of the reloaded model on a single concatenated name.
result = segment_name(name='ahmedmohamedabdelsalam')
print(result)