In [60]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset, Dataset

In [61]:
tokenizer = AutoTokenizer.from_pretrained('google-t5/t5-small')
model = AutoModelForSeq2SeqLM.from_pretrained('google-t5/t5-small')

In [62]:
import pandas as pd

df = pd.read_csv('bible_verses_2.csv')
print(df.head())


                                                GenZ  \
0  Gen Z: Genesis 11. In the very start, God made...   
1  Gen Z: Genesis 21. So, like, the sky and the e...   
2  Gen Z: Genesis 31. So, this snake, right, it w...   
3  Gen Z: Genesis 41. Adam and Eve did the deed, ...   
4  Gen Z: Genesis 51. Yo, this is the story of Ad...   

                                                Norm  
0  KJV: Genesis 11. In the beginning God created ...  
1  KJV: Genesis 21. Thus the heavens and the eart...  
2  KJV: Genesis 31. Now the serpent was more subt...  
3  KJV: Genesis 41. And Adam knew Eve his wife; a...  
4  KJV: Genesis 51. This {is} the book of the gen...  


In [63]:
df_chapterlist = pd.read_csv('./slangs_datasets/bible.csv')
book_chapter_list = df_chapterlist['Chapter'].tolist()
#replace / with space and strip
book_chapter_list = [x.replace("/", " ").strip() for x in book_chapter_list]
#add the string "Gen Z: " to the list 
book_chapter_list_genZ = ["Gen Z: " + x for x in book_chapter_list]
book_chapter_list_kjv = ["KJV: " + x for x in book_chapter_list]

In [71]:
import re
def splitter(genz_text, norm_text, repatterngenz, repatternkjv, book_chapter_list_genZ, book_chapter_list_kjv):
    # Process Gen Z text
    match_genz = re.search(repatterngenz, genz_text)
    if match_genz:
        book_chapter_genz = match_genz.group(1).strip()
        # Split the book and chapter
        if ' ' in book_chapter_genz:
            book_name_genz, chapter_number_genz = book_chapter_genz.rsplit(' ', 1)
            book_genz = book_name_genz.strip()
            chapter_genz = int(chapter_number_genz.strip())
        else:
            # Handle cases where the chapter number might be missing or formatted differently
            book_genz = book_chapter_genz.strip()
            chapter_genz = 1  # Default to chapter 1 if not specified
    else:
        book_genz = None
        chapter_genz = None
        print("Book and chapter not found in Gen Z text.")
        return None, None  # Exit if book and chapter are not found

    # Remove the matched book and chapter part from the text
    text_body_genz = genz_text[match_genz.end():].strip()

    # Split Gen Z text into verses
    verse_pattern_genz = r'(\d+)\.\s*(.*?)(?=(\d+\.\s)|$)'
    verses_genz = re.findall(verse_pattern_genz, text_body_genz, re.DOTALL)

    # Create DataFrame for Gen Z text
    data_genz = []
    for verse_num, verse_text, _ in verses_genz:
        data_genz.append({
            'Book': book_genz,
            'Chapter': chapter_genz,
            'Verse': int(verse_num),
            'GenZ': verse_text.strip()
        })
    df_genz = pd.DataFrame(data_genz)

    # Process KJV text
    match_kjv = re.search(repatternkjv, norm_text)
    if match_kjv:
        book_chapter_kjv = match_kjv.group(1).strip()
        # Split the book and chapter
        if ' ' in book_chapter_kjv:
            book_name_kjv, chapter_number_kjv = book_chapter_kjv.rsplit(' ', 1)
            book_kjv = book_name_kjv.strip()
            chapter_kjv = int(chapter_number_kjv.strip())
        else:
            # Handle cases where the chapter number might be missing or formatted differently
            book_kjv = book_chapter_kjv.strip()
            chapter_kjv = 1  # Default to chapter 1 if not specified
    else:
        book_kjv = None
        chapter_kjv = None
        print("Book and chapter not found in KJV text.")
        return None, None  # Exit if book and chapter are not found

    # Remove the matched book and chapter part from the text
    text_body_kjv = norm_text[match_kjv.end():].strip()

    # Split KJV text into verses
    verse_pattern_kjv = r'(\d+)\.\s*(.*?)(?=(\d+\.\s)|$)'
    verses_kjv = re.findall(verse_pattern_kjv, text_body_kjv, re.DOTALL)

    # Create DataFrame for KJV text
    data_kjv = []
    for verse_num, verse_text, _ in verses_kjv:
        data_kjv.append({
            'Book': book_kjv,
            'Chapter': chapter_kjv,
            'Verse': int(verse_num),
            'Norm': verse_text.strip()
        })
    df_kjv = pd.DataFrame(data_kjv)

    # Check if the number of verses match
    verses_match = len(df_genz) == len(df_kjv)
    if not verses_match:
        print(f"Number of verses do not match: Gen Z has {len(df_genz)} verses, KJV has {len(df_kjv)} verses.")
    else:
        print(f"Number of verses match: {len(df_genz)} verses.")

    # Optionally, check if the verse numbers match
    genz_verses_set = set(df_genz['Verse'])
    kjv_verses_set = set(df_kjv['Verse'])
    if genz_verses_set != kjv_verses_set:
        print("Verse numbers do not match between Gen Z and KJV.")
    else:
        # Return concat of book and chapter side by side
        df_genz['Norm'] = df_kjv['Norm']
        return df_genz
    return 

In [72]:
book_chapter_pattern_genz = r'Gen Z:\s*(' + '|'.join(re.escape(bc) for bc in book_chapter_list) + r')\.\s*'
book_chapter_pattern_kjv = r'KJV:\s*(' + '|'.join(re.escape(bc) for bc in book_chapter_list) + r')\.\s*'

cleanedDf = pd.DataFrame()

for i in range(len(df)):
    df_genz = splitter(df.iloc[i]['GenZ'],df.iloc[i]['Norm'],book_chapter_pattern_genz,book_chapter_pattern_kjv,book_chapter_list_genZ,book_chapter_list_kjv)
    if df_genz is not None:
        cleanedDf = cleanedDf._append(df_genz)

print(cleanedDf.head())
cleanedDf.to_csv('bible_verses_cleaned.csv',index=False)
dataset = Dataset.from_pandas(cleanedDf)

Number of verses match: 30 verses.
Number of verses match: 24 verses.
Number of verses match: 23 verses.
Number of verses match: 25 verses.
Book and chapter not found in Gen Z text.
Book and chapter not found in Gen Z text.
Book and chapter not found in Gen Z text.
Book and chapter not found in Gen Z text.
Book and chapter not found in Gen Z text.
Book and chapter not found in Gen Z text.
Book and chapter not found in Gen Z text.
Book and chapter not found in Gen Z text.
Book and chapter not found in Gen Z text.
Book and chapter not found in Gen Z text.
Book and chapter not found in Gen Z text.
Book and chapter not found in Gen Z text.
Book and chapter not found in Gen Z text.
Book and chapter not found in Gen Z text.
Book and chapter not found in Gen Z text.
Book and chapter not found in Gen Z text.
Book and chapter not found in Gen Z text.
Book and chapter not found in Gen Z text.
Book and chapter not found in Gen Z text.
Book and chapter not found in Gen Z text.
Book and chapter not

In [73]:
def processtokens(df):
    inputs = df['GenZ']
    targets = df['Norm']
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding='max_length')
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=256, truncation=True, padding='max_length')
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

tokenized_dataset = dataset.map(processtokens, batched=True)

Map:   0%|          | 0/2020 [00:00<?, ? examples/s]

Map: 100%|██████████| 2020/2020 [00:00<00:00, 5041.07 examples/s]


In [74]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(tokenized_dataset, batch_size=4, shuffle=True)

In [75]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='. results',
    num_train_epochs=5,
    per_device_train_batch_size=4,
    save_steps=10,
    save_total_limit=2,
    eval_strategy="no",
    logging_steps=10,
)




In [76]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

In [77]:
trainer.train()

  0%|          | 10/2525 [00:36<2:25:05,  3.46s/it]

{'loss': 12.9046, 'grad_norm': 102.7292709350586, 'learning_rate': 4.98019801980198e-05, 'epoch': 0.02}


  1%|          | 14/2525 [00:54<2:50:25,  4.07s/it]

KeyboardInterrupt: 

In [None]:
def translate(text):
    inputs = tokenizer(text, return_tensors='pt', max_length=128, truncation=True)
    outputs = model.generate(**inputs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

test_sentence = "Harry Potter was a highly unusual boy in many ways."
translated_sentence = translate(test_sentence)
print(translated_sentence)



Harry Potter Potter hat Harry Potter in vielen  ways a boy  
