In [None]:
import pandas as pd
import random

# Read the formal-Thai corpus from disk. The "utf-8-sig" codec is used so a
# leading byte-order mark, if the file has one, is not read as data.
file_path = "./formal_thai.csv"  # <-- Adjust your path if needed
df = pd.read_csv(file_path, encoding="utf-8-sig")

# Retain only the rows in which both "text" and "summary" are present, then
# renumber the index from zero so it is contiguous again.
df_clean = (
    df[df["text"].notna() & df["summary"].notna()]
    .reset_index(drop=True)
)

# Formal Thai opening phrases that augment_text() prepends to a text.
# Every entry ends with a space so it can be concatenated to the text directly.
# All entries spell SARA AM as the single code point U+0E33; the fifth entry
# previously used the decomposed pair U+0E4D + U+0E32, which renders the same
# but is a different string and would put inconsistently encoded text into the
# augmented dataset.
rephrasing_templates = [
    "ขอเรียนเชิญร่วม ",
    "มีแผนจะดำเนินการ ",
    "กำหนดจัด ",
    "เพื่อพิจารณาเกี่ยวกับ ",
    "ได้กำหนดการประชุมเพื่อ ",
    "ด้วยความเคารพ ขอเชิญเข้าร่วม "
]

# Augmentation function
def augment_text(text):
    """Return ``text`` with one randomly chosen rephrasing template in front.

    The template is drawn with ``random.choice`` from the module-level
    ``rephrasing_templates`` list and concatenated directly to ``text``;
    no separator is inserted here.
    """
    return random.choice(rephrasing_templates) + text

# Flat list of {"text", "summary"} records that becomes the augmented dataset.
augmented_rows = []

# Every cleaned row contributes three records that share the same summary.
for _, record in df_clean.iterrows():
    source_text = record["text"]
    target_summary = record["summary"]

    # Variant 1: the text exactly as loaded.
    augmented_rows.append({"text": source_text, "summary": target_summary})

    # Variant 2: the text behind a randomly chosen template prefix.
    augmented_rows.append({
        "text": augment_text(source_text),
        "summary": target_summary,
    })

    # Variant 3: the first 25 whitespace-separated tokens, followed by "..."
    # and labelled with the "สรุปย่อ: " prefix.
    leading_tokens = source_text.split()[:25]
    augmented_rows.append({
        "text": "สรุปย่อ: " + ' '.join(leading_tokens) + "...",
        "summary": target_summary,
    })

# Build the frame once from the collected records.
df_augmented = pd.DataFrame(augmented_rows)

# Write the augmented dataset next to the input file.
augmented_file_path = "./formal_thai_augmented.csv"
df_augmented.to_csv(augmented_file_path, encoding="utf-8-sig", index=False)

print(f"✅ Augmentation Complete. Total records: {len(df_augmented)}")
print(f"📂 Saved to: {augmented_file_path}")


✅ Augmentation Complete. Total records: 150
📂 Saved to: ./formal_thai_augmented.csv


In [None]:
import random

# Simple Thai rephrasing templates
def augment_text(text):
    """Prepend a randomly selected formal Thai opening phrase to ``text``.

    One of six fixed phrases is drawn with ``random.choice`` and joined to
    ``text`` with a single space. ``text`` itself is returned unmodified
    after the separator.

    NOTE(review): an earlier cell also defines ``augment_text``; once this
    cell has run, this definition is the one in effect.
    """
    openers = (
        "ขอเรียนเชิญร่วม",
        "มีการนัดหมายเพื่อ",
        "กำหนดการประชุมเกี่ยวกับ",
        "เพื่อพิจารณาเรื่อง",
        "กำหนดพิธีการในเรื่องของ",
        "ด้วยความเคารพขอเชิญเข้าร่วม",
    )
    # A single draw per call; the space separator is added here because the
    # phrases above carry no trailing space.
    return random.choice(openers) + " " + text

# Augment the dataset: every cleaned row yields three records (original,
# prefixed, shortened) that all keep the row's summary.
augmented_rows = []

# Upper bound on the whitespace-separated tokens kept in the shortened variant.
MAX_SHORT_TOKENS = 20

for idx, row in df_clean.iterrows():
    # 1. Original
    augmented_rows.append({
        "text": row["text"],
        "summary": row["summary"]
    })

    # 2. Light rephrase
    augmented_rows.append({
        "text": augment_text(row["text"]),
        "summary": row["summary"]
    })

    # 3. Focused version (cut long details)
    # Slice the token list instead of passing maxsplit: split(' ', 20) keeps
    # the entire unsplit remainder as its final element, so joining the pieces
    # rebuilds the full text and nothing is actually cut. split() with no
    # separator also ignores runs of spaces and newlines when counting tokens.
    # NOTE(review): tokens here are whitespace-delimited chunks, which may not
    # correspond to individual words in Thai text — confirm this is intended.
    short_text = ' '.join(row["text"].split()[:MAX_SHORT_TOKENS]) + "..."
    augmented_rows.append({
        "text": "สรุปย่อ: " + short_text,
        "summary": row["summary"]
    })

# Create new DataFrame in one step from the collected records
df_augmented = pd.DataFrame(augmented_rows)

print(f"✅ After augmentation: {len(df_augmented)} rows")


✅ After augmentation: 150 rows


In [None]:
# Persist the augmented frame as CSV. The row index is left out, and the
# "utf-8-sig" codec writes a leading byte-order mark into the file.
df_augmented.to_csv(
    path_or_buf="./formal_thai_augmented.csv",
    encoding="utf-8-sig",
    index=False,
)

print("✅ Saved as: ./formal_thai_augmented.csv")

✅ Saved as: ./formal_thai_augmented.csv
