In [4]:
# Install necessary libraries
!pip install transformers datasets accelerate
from datasets import load_dataset
import pandas as pd
import re



In [5]:
# Load the D&D character backstories dataset
dataset = load_dataset("MohamedRashad/dnd_characters_backstories", split="train")
df = pd.DataFrame(dataset)

In [6]:
# Extract weapon-related information using regex
weapon_keywords = r"\b(sword|blade|axe|bow|staff|dagger|mace|hammer|lance|spear|crossbow|flail|scimitar|halberd|rapier|whip|club|morningstar|trident|glaive|sling|quarterstaff|pick|polearm|katana|nunchaku|shuriken|chakram|scythe|sabre|cutlass|dirk|stiletto|kris|bolo|kukri|tomahawk|boomerang|cudgel|bludgeon|warhammer|battleaxe|longsword|shortsword|greatsword|broadsword|claymore|zweihander|falchion|estoc|tanto|wakizashi|yari|naginata|kanabo|tetsubo|guisarme|bec de corbin|fauchard|billhook|manriki-gusari|kusarigama|yari|naginata|kanabo|tetsubo|guisarme|bec de corbin|fauchard|billhook|manriki-gusari|kusarigama)\b"

In [7]:
# Function to extract sentences containing weapon keywords
def extract_weapon_sentences(text, keywords):
    sentences = re.split(r'(?<=[.!?]) +', text)
    weapon_sentences = [sentence for sentence in sentences if re.search(keywords, sentence, re.IGNORECASE)]
    return weapon_sentences


In [8]:
# Apply extraction function to the dataset
df['weapon_sentences'] = df['text'].apply(lambda x: extract_weapon_sentences(x, weapon_keywords))


In [9]:
# Flatten the list of weapon sentences and remove duplicates
weapon_data = list(set([sentence for sublist in df['weapon_sentences'] for sentence in sublist]))


In [10]:
# Create a DataFrame and save to a text file for training
weapon_df = pd.DataFrame(weapon_data, columns=["text"])
weapon_df.to_csv("weapon_descriptions.txt", index=False, header=False)


In [2]:
# Load tokenizer and model
from transformers import AutoTokenizer, AutoModelForCausalLM, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments


In [3]:
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [4]:
# Tokenize dataset
def load_dataset(tokenizer, file_path, block_size=128):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size
    )

In [6]:
train_dataset = load_dataset(tokenizer, "weapon_descriptions.txt")

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
)
import warnings
warnings.filterwarnings("ignore")


In [7]:
# Training setup
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=4,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    prediction_loss_only=True
)


In [9]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)
import warnings
warnings.filterwarnings("ignore")

# Train the model
trainer.train()

Step,Training Loss


TrainOutput(global_step=8, training_loss=2.3799827098846436, metrics={'train_runtime': 11.0722, 'train_samples_per_second': 1.445, 'train_steps_per_second': 0.723, 'total_flos': 522593501184.0, 'train_loss': 2.3799827098846436, 'epoch': 4.0})

In [10]:
# Save model
trainer.save_model("./fine-tuned-model")
tokenizer.save_pretrained("./fine-tuned-model")

('./fine-tuned-model\\tokenizer_config.json',
 './fine-tuned-model\\special_tokens_map.json',
 './fine-tuned-model\\vocab.json',
 './fine-tuned-model\\merges.txt',
 './fine-tuned-model\\added_tokens.json',
 './fine-tuned-model\\tokenizer.json')

In [12]:
# Sample comparison
from transformers import pipeline

original_model = pipeline("text-generation", model="distilgpt2", tokenizer=tokenizer)
fine_tuned_model = pipeline("text-generation", model="./fine-tuned-model", tokenizer=tokenizer)

prompt = "Flame"

print("🔹 Original:")
print(original_model(prompt, max_length=20)[0]['generated_text'])

print("\n🔸 Fine-tuned:")
print(fine_tuned_model(prompt, max_length=20)[0]['generated_text'])
import warnings
warnings.filterwarnings("ignore")

Device set to use cpu
Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


🔹 Original:


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Flame, as well as her father, was present alongside her.
She and her father were

🔸 Fine-tuned:
Flamelee) in the form of a child(doughnut or pearls of sea fl
