In [None]:
!pip install -q transformers
!pip install -q Faker

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.7/2.0 MB[0m [31m22.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import json
import random
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from faker import Faker
from tqdm import tqdm

# Checkpoint used by gen_normal_txt() to write the non-PII sentences.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Seed every random source the generators draw from (stdlib `random`, Faker,
# torch sampling) so a fresh "Restart & Run All" produces a repeatable dataset.
# NOTE(review): GPU sampling may still not be bit-for-bit reproducible across
# hardware/driver versions — confirm if exact reproducibility matters.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)
torch.manual_seed(SEED)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # Half precision only when a GPU is available; full precision on CPU.
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto"
)

# Faker instance (US English locale) that supplies all synthetic PII values.
fake = Faker("en_US")

# Output path, number of generated lines, and the probability that a given
# line is a PII line rather than an LLM-written "normal" line.
OUTPUT_FILE = "my_data.json"
TOTAL_LINES = 2000
PII_RATIO = 0.5

class ContextGenerator:
    """Produces random filler text that surrounds a PII key/value pair.

    Each intro and each outro is assembled from three phrase pools: one phrase
    is drawn at random from every pool, in order, and the three picks are
    joined with single spaces.
    """

    def __init__(self):
        # Intro, slot 1: sentence opener.
        self.intro_part_1 = [
            "Please be advised that",
            "It is important to note that",
            "For security reasons,",
            "Regarding the recent update,",
            "In response to your request,",
            "We recently noticed that",
            "Upon checking the system logs,",
            "Urgent reminder:",
            "Automatic Notification:",
            "During the last audit,",
            "As per the new policy,",
            "To proceed with verification,",
        ]

        # Intro, slot 2: subject + verb clause.
        self.intro_part_2 = [
            "we have updated the record for",
            "the system has flagged",
            "user verification failed for",
            "we successfully retrieved the details of",
            "you are required to confirm",
            "an administrator accessed the file containing",
            "database entry #4421 shows",
            "the confidential report included",
            "our monitoring tool detected",
        ]

        # Intro, slot 3: determiner placed right before the key label.
        self.intro_part_3 = [
            "your primary",
            "the registered",
            "a suspicious",
            "the backup",
            "the default",
            "an encrypted",
            "the associated",
            "customer's",
        ]

        # Outro, slot 1: status clause placed right after the value.
        self.outro_part_1 = [
            "is currently active",
            "has been compromised",
            "needs immediate attention",
            "was successfully validated",
            "is missing from the archive",
            "cannot be verified",
            "will be permanently deleted",
            "has been synchronized",
        ]

        # Outro, slot 2: consequence / instruction clause.
        self.outro_part_2 = [
            "so please update it immediately",
            "which might cause login issues",
            "and will be used for future billing",
            "to prevent unauthorized access",
            "before the session expires",
            "as part of the 2FA process",
            "requiring manual override",
        ]

        # Outro, slot 3: closing phrase; every entry ends with a period.
        self.outro_part_3 = [
            "within the next 24 hours.",
            "by the end of the business day.",
            "in our secure database.",
            "on the main server.",
            "via the admin panel.",
            "during the next reboot cycle.",
            "as soon as possible.",
        ]

    @staticmethod
    def _compose(*pools):
        """Draw one phrase per pool (left to right) and join them with spaces."""
        return " ".join(random.choice(pool) for pool in pools)

    def get_intro(self):
        """Return a random three-part lead-in ending just before the key label."""
        return self._compose(self.intro_part_1, self.intro_part_2, self.intro_part_3)

    def get_outro(self):
        """Return a random three-part closing that follows the PII value."""
        return self._compose(self.outro_part_1, self.outro_part_2, self.outro_part_3)


# Shared instance used by gen_pii_txt().
ctx_gen = ContextGenerator()

def gen_normal_txt():
    """Generate one PII-free descriptive passage with the language model.

    A topic is picked at random, the model is sampled for at most 60 new
    tokens, and only the newly generated tokens are decoded, so the prompt
    never has to be stripped out of the text afterwards.

    Returns:
        str: The stripped model output. If generation raises an ordinary
        exception, or the output is 10 characters or shorter, a two-sentence
        Faker paragraph is returned instead.

    Notes:
        Only ``Exception`` subclasses trigger the fallback; ``KeyboardInterrupt``
        and ``SystemExit`` propagate so a long-running loop can be interrupted.
    """
    import warnings

    topics = [
        "technology", "weather", "food", "travel", "history", "science", "daily routine",
        "sports", "music", "movies", "finance", "education", "health & fitness",
        "gaming", "fashion", "business", "art & culture", "relationships", "nature"
    ]
    chosen_topic = random.choice(topics)

    # NOTE(review): the chat markers are written by hand here; the checkpoint's own
    # chat template (tokenizer.apply_chat_template) may format turns differently —
    # confirm before changing, since it would alter the generated text.
    prompt = (
        "<|user|>\n"
        f"Write ONE long, complex sentence (at least 25 words) about {chosen_topic}. "
        "It should be descriptive and detailed. Do not include personal names or data.\n"
        "<|assistant|>\n"
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    try:
        outputs = model.generate(
            **inputs, max_new_tokens=60, do_sample=True, temperature=0.9, top_p=0.95,
            pad_token_id=tokenizer.eos_token_id
        )
        # The returned sequence starts with the prompt tokens; drop them by length
        # rather than by searching the decoded string for the prompt text.
        prompt_len = inputs["input_ids"].shape[1]
        generated_ids = outputs[0][prompt_len:]
        text = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
    except Exception as exc:
        # Best-effort: keep the dataset loop going, but do not hide the failure.
        warnings.warn(f"Text generation failed ({exc!r}); using Faker fallback text.")
        return fake.paragraph(nb_sentences=2)

    # Very short generations are treated as failures and replaced.
    return text if len(text) > 10 else fake.paragraph(nb_sentences=2)

def gen_pii_txt():
    """Build one sentence that embeds a synthetic PII key/value pair.

    A PII category, one of its labels and a separator are drawn at random, a
    matching fake value is produced with Faker, and the resulting
    ``label + separator + value`` string is placed between a random intro and
    a random outro from ``ctx_gen``.

    Returns:
        str: ``"<intro> <label><separator><value> <outro>"``.
    """
    label_pool = {
        "email": ["Email", "User Mail", "Contact"],
        "phone": ["Phone", "Mobile", "Hotline"],
        "ip": ["IP Address", "Host IP", "Source IP"],
        "ssn": ["SSN", "Social ID", "Tax ID"],
        "card": ["Credit Card", "Payment No", "Visa"]
    }
    joiners = [": ", " = ", " -> ", " - "]

    # Faker method that yields a value for each category.
    value_makers = {
        "email": fake.email,
        "phone": fake.phone_number,
        "ip": fake.ipv4,
        "ssn": fake.ssn,
        "card": fake.credit_card_number,
    }

    # Draw order (category, label, separator, value, intro, outro) is kept fixed
    # so a seeded run consumes the random stream in the same sequence.
    pii_kind = random.choice(list(label_pool.keys()))
    label = random.choice(label_pool[pii_kind])
    joiner = random.choice(joiners)
    pii_value = value_makers[pii_kind]()

    pair_text = f"{label}{joiner}{pii_value}"

    lead_in = ctx_gen.get_intro()
    wrap_up = ctx_gen.get_outro()

    return " ".join((lead_in, pair_text, wrap_up))

# Build the mixed corpus: each line is a PII sentence with probability
# PII_RATIO, otherwise an LLM-written sentence.
dataset = []

for _ in tqdm(range(TOTAL_LINES)):
    line = gen_pii_txt() if random.random() < PII_RATIO else gen_normal_txt()
    # Flatten to a single physical line before storing.
    flattened = line.replace("\n", " ")
    dataset.append(flattened.strip())

# Persist the full list as pretty-printed UTF-8 JSON.
with open(OUTPUT_FILE, "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(dataset, ensure_ascii=False, indent=4))

# Preview the first five entries.
preview = dataset[:5]
print(json.dumps(preview, indent=4))

100%|██████████| 2000/2000 [31:22<00:00,  1.06it/s]

[
    "The gusting winds brought with it a chill, tearing at our clothes and making our faces burn. We huddled together, shivering and shaking, as the snow began to fall, making the world a stark and unfamiliar place. The crispness",
    "Automatic Notification: we successfully retrieved the details of the default User Mail: thomaschristopher@example.net has been synchronized which might cause login issues via the admin panel.",
    "Regarding the recent update, the confidential report included the associated Visa - 4142464839402855 was successfully validated so please update it immediately within the next 24 hours.",
    "The journey towards education is an arduous one, one that requires a lifetime of dedication and hard work. A young child, eager to explore the world around them, begins their formal education, their world stretching before them like a vast and intricate puzzle. Teachers help them",
    "As per the new policy, the system has flagged your primary Tax ID: 489-63-0460 wi




In [None]:
# Settings for the context-free "Key<separator>Value" dataset.
OUTPUT_FILE_KV, TOTAL_LINES_KV = "my_data_key_value.json", 2000

# Filled by the generation loop that follows.
dataset_kv = list()

# Labels that may be used for each PII category.
key_variations = dict(
    email=["Email", "User Mail", "Contact"],
    phone=["Phone", "Mobile", "Hotline"],
    ip=["IP Address", "Host IP", "Source IP"],
    ssn=["SSN", "Social ID", "Tax ID"],
    card=["Credit Card", "Payment No", "Visa"],
)

# Separator styles, with and without surrounding spaces.
separators = [
    ": ",
    " : ",
    "=",
    " = ",
    " -> ",
    ":",
]

def gen_key_value_line():
    """Return one bare ``"<label><separator><value>"`` string with fake PII.

    The category and label are drawn from the global ``key_variations``, the
    separator from the global ``separators``, and the value from the Faker
    method that matches the category.
    """
    # Faker method that yields a value for each category.
    value_makers = {
        "email": fake.email,
        "phone": fake.phone_number,
        "ip": fake.ipv4,
        "ssn": fake.ssn,
        "card": fake.credit_card_number,
    }

    # Draw order: category, label, separator, then the value itself.
    pii_kind = random.choice(list(key_variations.keys()))
    label = random.choice(key_variations[pii_kind])
    joiner = random.choice(separators)
    pii_value = value_makers[pii_kind]()

    return "".join((label, joiner, pii_value))

print(f"Generating {TOTAL_LINES_KV} Key-Value lines...")

# Generate every line, flatten it to one physical line, and collect it.
dataset_kv.extend(
    gen_key_value_line().replace("\n", " ").strip()
    for _ in tqdm(range(TOTAL_LINES_KV))
)

# Persist the full list as pretty-printed UTF-8 JSON.
with open(OUTPUT_FILE_KV, "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(dataset_kv, ensure_ascii=False, indent=4))

# Preview the first five entries.
preview_kv = dataset_kv[:5]
print(json.dumps(preview_kv, indent=4))

Generating 2000 Key-Value lines...


100%|██████████| 2000/2000 [00:00<00:00, 3927.40it/s]

[
    "IP Address=105.19.237.208",
    "Host IP:92.64.229.67",
    "Hotline -> (837)264-5105x6159",
    "SSN : 567-06-7681",
    "User Mail: phoffman@example.com"
]





In [None]:
# Settings for the simplest dataset: one canonical label per category and a
# fixed ": " separator.
OUTPUT_FILE_SIMPLE = "my_data_key_value_simple.json"
TOTAL_LINES_SIMPLE = 2000

simple_dataset = []

# Kept under its own name on purpose: gen_key_value_line() from the previous
# cell reads the global `key_variations` when it is called, so rebinding that
# name here would silently change what that function generates afterwards.
SIMPLE_KEY_VARIATIONS = {
    "email": ["Email"],
    "phone": ["Phone"],
    "ip":    ["IP Address"],
    "ssn":   ["SSN"],
    "card":  ["Credit Card"]
}

# The only separator used by the simple dataset.
SIMPLE_SEPARATOR = ": "


def gen_simple_kv_line():
    """Return one ``"<label>: <value>"`` string with a fake PII value.

    The category is drawn at random from ``SIMPLE_KEY_VARIATIONS``, its label
    from that category's list, and the value from the Faker method that
    matches the category.
    """
    # Faker method that yields a value for each category.
    value_makers = {
        "email": fake.email,
        "phone": fake.phone_number,
        "ip": fake.ipv4,
        "ssn": fake.ssn,
        "card": fake.credit_card_number,
    }

    data_type = random.choice(list(SIMPLE_KEY_VARIATIONS.keys()))
    key_label = random.choice(SIMPLE_KEY_VARIATIONS[data_type])
    value = value_makers[data_type]()

    return f"{key_label}{SIMPLE_SEPARATOR}{value}"

# Generate every line, flatten it to one physical line, and collect it.
simple_dataset.extend(
    gen_simple_kv_line().replace("\n", " ").strip()
    for _ in tqdm(range(TOTAL_LINES_SIMPLE))
)

# Persist the full list as pretty-printed UTF-8 JSON.
with open(OUTPUT_FILE_SIMPLE, "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(simple_dataset, ensure_ascii=False, indent=4))

# Preview the first five entries.
preview_simple = simple_dataset[:5]
print(json.dumps(preview_simple, indent=4))

100%|██████████| 2000/2000 [00:00<00:00, 23212.13it/s]

[
    "Email: sharon40@example.net",
    "Email: judystevens@example.net",
    "IP Address: 65.243.147.1",
    "IP Address: 129.154.247.14",
    "Credit Card: 4966338299336337"
]



