In [None]:
pip install torch transformers accelerate sentence-transformers faiss-cpu pandas datasets peft trl bitsandbytes


In [None]:
import pandas as pd
import random
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, PeftModel, get_peft_model
from trl import SFTTrainer
import torch

In [None]:
df = pd.read_csv("./data/kjv.csv")

In [None]:
themes = {
    "love": ["love", "charity"],
    "faith": ["faith", "believe", "trust"],
    "sin": ["sin", "iniquity", "transgression", "forbidden"],
    "creation": ["create", "made", "beginning"],
    "wisdom": ["wisdom", "understanding", "knowledge"],
    "forgiveness": ["forgive", "forgiveness", "pardon", "mercy"],
    "prayer": ["pray", "prayer", "ask", "supplication"],
    "hope": ["hope", "promise", "wait", "salvation"],
    "justice": ["justice", "correct", "righteous"],
    "unity": ["unity", "community", "choosen"]
}

In [None]:
def collect_theme_examples(theme, keywords, max_verses=3):
    matches = df[df["Text"].str.contains("|".join(keywords), case=False)]
    if len(matches) == 0:
        return None
    matches = matches.sample(min(max_verses, len(matches)))  # random sample
    verses = [f"{row['Book Name']} {row['Chapter']}:{row['Verse']} - {row['Text']}"
              for _, row in matches.iterrows()]
    question = f"What does the Bible say about {theme}?"
    answer = " ".join(verses)
    return {"prompt": question, "completion": answer}

In [None]:
qa_thematic = []
for theme, keywords in themes.items():
    for _ in range(100):  # 30 examples per theme
        example = collect_theme_examples(theme, keywords)
        if example:
            qa_thematic.append(example)

In [None]:
qa_lookup = [
    {
        "prompt": f"What does {row['Book Name']} {row['Chapter']}:{row['Verse']} say?",
        "completion": row["Text"]
    }
    for _, row in df.sample(15000, random_state=42).iterrows()
]

qa_all = qa_lookup + qa_thematic
random.shuffle(qa_all)

In [None]:
dataset = Dataset.from_list(qa_all)
dataset = dataset.train_test_split(test_size=0.1)

def format_example(example):
    return {
        "text": f"### Question:\n{example['prompt']}\n\n### Answer (in KJV style):\n{example['completion']}"
    }

train_dataset = dataset["train"].map(format_example)
eval_dataset = dataset["test"].map(format_example)

In [None]:
model_id = "dphn/Dolphin3.0-Qwen2.5-0.5B"

bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_compute_dtype = torch.float16
)


tokenizer = AutoTokenizer.from_pretrained(model_id)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config = bnb_config,
    offload_folder="offload",
    torch_dtype=torch.float16
)

In [None]:
peft_config = LoraConfig(
    r=4,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

In [None]:
model.gradient_checkpointing_enable()
model.config.use_cache = False
model.enable_input_require_grads()


In [None]:
model = get_peft_model(model, peft_config)

for name, param in model.named_parameters():
    if "lora_" in name:
        param.requires_grad = True
    else:
        param.requires_grad = False

In [20]:
training_args = TrainingArguments(
    output_dir="./christAin-uncensored",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    logging_steps=50,
    num_train_epochs=3,
    save_strategy="epoch",
    fp16=True,
    push_to_hub=False
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset.shuffle().select(range(5000)),
    eval_dataset=eval_dataset.shuffle().select(range(500)),
    processing_class=tokenizer,
)

Adding EOS to train dataset: 100%|██████████| 5000/5000 [00:00<00:00, 15941.23 examples/s]
Tokenizing train dataset: 100%|██████████| 5000/5000 [00:01<00:00, 3910.19 examples/s]
Truncating train dataset: 100%|██████████| 5000/5000 [00:00<00:00, 1030035.36 examples/s]
Adding EOS to eval dataset: 100%|██████████| 500/500 [00:00<00:00, 15304.89 examples/s]
Tokenizing eval dataset: 100%|██████████| 500/500 [00:00<00:00, 3189.36 examples/s]
Truncating eval dataset: 100%|██████████| 500/500 [00:00<00:00, 192930.27 examples/s]


In [21]:
for name, param in model.named_parameters():
    param.requires_grad = "lora_" in name

# quick check
trainable = [n for n, p in model.named_parameters() if p.requires_grad]
print("Trainable parameters:", trainable)


Trainable parameters: ['base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight', 'base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight', 'base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight', 'base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight', 'base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight', 'base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight', 'base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight', 'base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight', 'base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight', 'base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight', 'base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight', 'base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight', 'base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight', 'base_model.mod

In [24]:
trainer.train()

# Save model
trainer.save_model("./christAin-uncensored")

Step,Training Loss
10,3.6229
20,3.6068
30,3.6711
40,3.513
50,3.5635
60,3.4767
70,3.4541
80,3.3851
90,3.3099
100,3.2727


In [27]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

model_id = "dphn/Dolphin3.0-Qwen2.5-0.5B"
base_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
model = PeftModel.from_pretrained(base_model, "./christAin-uncensored")
tokenizer = AutoTokenizer.from_pretrained(model_id)

def ask(question):
    inputs = tokenizer(f"### Question:\n{question}\n\n### Answer (in KJV style):", return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=500, temperature=0.7)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

print(ask("What does the Bible say about love?"))
print(ask("What was the original sin?"))

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


### Question:
What does the Bible say about love?

### Answer (in KJV style):  
Love is the foundation of all true religion. It is love which guides the poor to the needy, and the sinner to the sinner. Love is the fulness of the Spirit, and the fulfillment of the Holy Ghost. There is no fear without love. Love is life. He who loves is begotten of God; he who is begotten of God, lives. Love is the voice of God. Love is the voice of God, and the voice of the Father is in the Son. We love, and ye shall be loved, even as God is in us. We love Him and ye shall love Him, for He who loves is like the Father. Love is the pillar of the house, the head of the corner. Love is the pillar of the house, the head of the corner. Love is the pillar of the house, the head of the corner. Love is the pillar of the house, the head of the corner. Love is the pillar of the house, the head of the corner. Love is the pillar of the house, the head of the corner. Love is the pillar of the house, the head of the 

In [None]:
def ask_kjv(question, max_tokens=500):
    inputs = tokenizer(
        f"### Question:\n{question}\n\n### Answer (in KJV style):",
        return_tensors="pt"
    ).to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=0.5,
        do_sample=True,
        top_p=0.5
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

print(ask_kjv("What does the Bible say about love?"))
print(ask_kjv("What was the original sin?"))

In [None]:
print(ask_kjv("Are there dinosaurs?"))

In [30]:
base_model_name = "dphn/Dolphin3.0-Qwen2.5-0.5B"  # Original base model
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Load LoRA adapter
lora_model = PeftModel.from_pretrained(model, "./christAin-uncensored/")

# Merge and save
merged_model = lora_model.merge_and_unload()
merged_model.save_pretrained("./christAin-uncensored-merged/")
tokenizer.save_pretrained("./christAin-uncensored-merged/")

print("Model merged and saved!")

Model merged and saved!
