In [4]:
# Install necessary libraries
!pip install transformers datasets accelerate
from datasets import load_dataset
import pandas as pd
import re



In [5]:
# Load the "train" split of the D&D character backstories dataset and
# convert it into a pandas DataFrame for the regex processing that follows.
dataset = load_dataset(
    "MohamedRashad/dnd_characters_backstories",
    split="train",
)
df = pd.DataFrame(data=dataset)

In [6]:
# Extract weapon-related information using regex.
# One alternation of weapon names wrapped in word boundaries, so a name only
# matches as a whole word (e.g. "bow" does not match inside "elbow").
# Each name is listed exactly once; the pattern is split across adjacent raw
# string literals purely for readability.
weapon_keywords = (
    r"\b("
    r"sword|blade|axe|bow|staff|dagger|mace|hammer|lance|spear|crossbow|flail|"
    r"scimitar|halberd|rapier|whip|club|morningstar|trident|glaive|sling|"
    r"quarterstaff|pick|polearm|katana|nunchaku|shuriken|chakram|scythe|sabre|"
    r"cutlass|dirk|stiletto|kris|bolo|kukri|tomahawk|boomerang|cudgel|bludgeon|"
    r"warhammer|battleaxe|longsword|shortsword|greatsword|broadsword|claymore|"
    r"zweihander|falchion|estoc|tanto|wakizashi|yari|naginata|kanabo|tetsubo|"
    r"guisarme|bec de corbin|fauchard|billhook|manriki-gusari|kusarigama"
    r")\b"
)

In [7]:
# Function to extract sentences containing weapon keywords
def extract_weapon_sentences(text, keywords):
    """Return the sentences of ``text`` that match the ``keywords`` regex.

    Args:
        text: The passage to scan. Anything that is not a non-blank string
            (e.g. ``None`` or a NaN from a missing DataFrame cell) yields an
            empty list instead of raising.
        keywords: A regular-expression pattern string; it is matched
            case-insensitively against each sentence.

    Returns:
        A list of the matching sentences, in their original order.
    """
    # Missing or blank rows have no sentences to offer.
    if not isinstance(text, str) or not text.strip():
        return []

    # Compile once instead of re-parsing the pattern for every sentence.
    pattern = re.compile(keywords, re.IGNORECASE)

    # Split after sentence-ending punctuation followed by ANY whitespace run,
    # so sentences separated by newlines or tabs are not glued together.
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    return [sentence for sentence in sentences if pattern.search(sentence)]


In [8]:
# Run the sentence extractor over every backstory; each row of the new
# 'weapon_sentences' column holds the list returned for that row's text.
df['weapon_sentences'] = df['text'].apply(
    extract_weapon_sentences,
    args=(weapon_keywords,),
)


In [9]:
# Flatten the list of weapon sentences and remove duplicates.
# dict.fromkeys keeps the first occurrence of each sentence in encounter
# order, so the result (and therefore the training file built from it) is the
# same on every run; iterating a set of strings gives no such guarantee.
weapon_data = list(dict.fromkeys(
    sentence
    for sublist in df['weapon_sentences']
    for sentence in sublist
))


In [10]:
# Create a DataFrame and save to a text file for training
weapon_df = pd.DataFrame(weapon_data, columns=["text"])

# Write one sentence per line as plain text. DataFrame.to_csv would apply CSV
# quoting here: any sentence containing a comma or a double quote would be
# wrapped in "..." with inner quotes doubled, and those artefacts would end
# up in the language-model training data.
with open("weapon_descriptions.txt", "w", encoding="utf-8") as text_file:
    text_file.writelines(f"{sentence}\n" for sentence in weapon_df["text"])


In [11]:
# Load tokenizer and model
from transformers import AutoTokenizer, AutoModelForCausalLM, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments


In [12]:
# Checkpoint identifier shared by the tokenizer and the model so the two
# always come from the same pretrained weights.
model_name = "distilgpt2"

# Causal language model to be fine-tuned, and its matching tokenizer.
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# Tokenize dataset
# NOTE(review): this definition rebinds the name `load_dataset`, which was
# imported from `datasets` at the top of the notebook. Re-running the
# dataset-loading cell after this one will call this function instead.
def load_dataset(tokenizer, file_path, block_size=128):
    """Build a ``TextDataset`` over a text file.

    Args:
        tokenizer: Tokenizer handed to ``TextDataset``.
        file_path: Path of the text file to read.
        block_size: Block size handed to ``TextDataset`` (default 128).

    Returns:
        The constructed ``TextDataset`` instance.
    """
    dataset_kwargs = {
        "tokenizer": tokenizer,
        "file_path": file_path,
        "block_size": block_size,
    }
    return TextDataset(**dataset_kwargs)

In [None]:
# Training examples built from the sentences file written earlier.
train_dataset = load_dataset(
    tokenizer=tokenizer,
    file_path="weapon_descriptions.txt",
)

# mlm=False turns off masked-language-model mode, matching the causal LM
# loaded above.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)


In [None]:
# Training setup
training_args = TrainingArguments(
    # Where outputs go; an existing directory is overwritten.
    output_dir="./results",
    overwrite_output_dir=True,
    # Optimisation schedule.
    per_device_train_batch_size=2,
    num_train_epochs=4,
    # Checkpointing: save every 500 steps, keep at most 2 checkpoints.
    save_steps=500,
    save_total_limit=2,
    # Reporting: log every 100 steps, loss only.
    logging_steps=100,
    prediction_loss_only=True,
)


In [None]:
# Wire the model, hyper-parameters, training data and batch collator together.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Run fine-tuning; as the cell's last expression, the return value of
# train() is displayed in the cell output.
trainer.train()

In [None]:
# Persist the fine-tuned weights and the tokenizer side by side, so the
# directory can later be loaded as a single checkpoint.
fine_tuned_dir = "./fine-tuned-model"
trainer.save_model(fine_tuned_dir)
tokenizer.save_pretrained(fine_tuned_dir)

In [None]:
# Sample comparison
# Generate from the same prompt with the stock checkpoint and with the
# fine-tuned one, then print both continuations.
from transformers import pipeline, set_seed

original_model = pipeline("text-generation", model="distilgpt2", tokenizer=tokenizer)
fine_tuned_model = pipeline("text-generation", model="./fine-tuned-model", tokenizer=tokenizer)

prompt = "Flame"

for label, generator in (
    ("🔹 Original:", original_model),
    ("\n🔸 Fine-tuned:", fine_tuned_model),
):
    # Re-seed before every generation so both models sample from the same
    # random state; differences in the output then come from the weights,
    # not from sampling noise, and the cell gives the same text on re-run.
    set_seed(42)
    print(label)
    print(generator(prompt, max_length=20)[0]['generated_text'])