Project: Bible-Related Keyword Extraction Model
Data source: https://github.com/christos-c/bible-corpus (providing Bible in different languages)
Objective:
    (1) Develop a supervised learning model to identify and extract words relevant to the Bible.
    (2) Create highlights or keywords from sermons or other text related to biblical content.
Steps:
    (1) Data Cleansing and Parsing: 
        i.Gather relevant text data (sermons and biblical texts)
        ii.Clean and preprocess the data (remove noise, special characters)
        iii.Tokenize the text into words or subword units
    (2) Label Assignment: Assign each word a binary label, 0: irrelevant to the Bible, 1: relevant to the Bible
    (3) Model Training: Transformer and SVM
    (4) Keyword extraction
Outcome: The resulting model can automatically identify and highlight Bible-related terms in sermons or other religious content





EDA Step 1: Parse the original xml file: the Bible in English and save the texts into a txt file

In [None]:
import xml.etree.ElementTree as ET
import re
tree = ET.parse('English.xml')
root = tree.getroot()
texts = []
all_punctuation = "!\"#$%&'()*+,./:;<=>?@[\\]^_`{|}~"
for seg in root.findall(".//seg"):
    text = seg.text
    text = text.replace('\n','').replace('\t','')
    text =  re.sub(rf"[{all_punctuation}]", '', text)
    texts.append(text)

with open('bible_eng.txt', 'w+') as f:
    f.writelines(texts)   

EDA Step 2: Categorize the data with Named Entity Recognition
1. tokenize each word with AutoTokenizer
2. categorize word tokens with TFAutoModelForTokenClassification
3. Add labels 0 and 1 (potential notes) with tf


In [None]:
import pandas as pd
from transformers import TFAutoModelForTokenClassification, AutoTokenizer
import tensorflow as tf

model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
model = TFAutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to process text batch by batch
def process_batch(batch):
    all_tokens = []
    all_labels = []
    all_notes = []
    
    # Tokenize the batch into words
    words = batch.split()
    
    # Encode the words using the tokenizer
    tokens = tokenizer(words, is_split_into_words=True, return_tensors="tf", truncation=True, padding=True)
    
    # Get model predictions
    outputs = model(tokens)
    predictions = tf.argmax(outputs.logits, axis=-1)
    
    # Convert input_ids to tokens and align labels
    tokens = tokenizer.convert_ids_to_tokens(tokens["input_ids"][0])
    token_labels = [model.config.id2label[prediction.numpy()] for prediction in predictions[0]]
    
    aligned_tokens = []
    aligned_labels = []
    notes_labels = []
    
    for token, label in zip(tokens, token_labels):
        if token not in ["[CLS]", "[SEP]", "[PAD]"]:
            if token.startswith("##"):
                # Append to the last token if it is a subword token
                aligned_tokens[-1] = aligned_tokens[-1] + token[2:]
            else:
                aligned_tokens.append(token)
                aligned_labels.append(label)
                if label in {'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'}:
                    notes_labels.append(1)  # Potential note
                else:
                    notes_labels.append(0)  # Not a note
    
    all_tokens.extend(aligned_tokens)
    all_labels.extend(aligned_labels)
    all_notes.extend(notes_labels)
    
    return all_tokens, all_labels, all_notes

# Initialize lists to store the tokens, labels, and notes
all_tokens = []
all_labels = []
all_notes = []

# Read the text file line by line to handle large files
max_token_length = 512  # Maximum token length for BERT models
current_batch = ""

with open('bible_eng_2.txt', 'r', encoding='utf-8') as file:
    for line in file:
        current_batch += line.strip() + " "
        
        # Check if the current batch exceeds the maximum token length
        if len(tokenizer(current_batch.split(), is_split_into_words=True)["input_ids"]) > max_token_length:
            tokens, labels, notes = process_batch(current_batch)
            all_tokens.extend(tokens)
            all_labels.extend(labels)
            all_notes.extend(notes)
            current_batch = ""
    
    # Process any remaining lines in the current batch
    if current_batch:
        tokens, labels, notes = process_batch(current_batch)
        all_tokens.extend(tokens)
        all_labels.extend(labels)
        all_notes.extend(notes)

# Create a DataFrame and save to CSV
df = pd.DataFrame({'Token': all_tokens, 'Label': all_labels, 'Notes': all_notes})
df.to_csv('bible_tokens.csv', index=False)


EDA step 3: Modify the labels value based on the buisness needs

In [1]:
import pandas as pd
df = pd.read_csv('bible_tokens.csv')
tokens_to_change = {'gospel', 'saint'}
df['Notes'] = df.apply(lambda row: 1 if row['Token'] in tokens_to_change else row['Notes'], axis=1)
df.to_csv('modified_bible_tokens.csv', index=False)

Model training: SVC
With classifier 0 (not highlighted) and 1 (highlighted) from the bible text, use SVM to train a model that can extract meaningful words related to Bible from speeches.

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import re
import string

# Load the CSV document with Bible related tokens
bible_words_df = pd.read_csv('modified_bible_tokens.csv')

# Load the sermons text
with open('sermons.txt', 'r', encoding='utf-8') as file:
    large_document = file.read()

# Extract the tokens marked as 1 from the bible token document
marked_words = bible_words_df[bible_words_df['Marker'] == 1]['Token'].tolist()
marked_words = list(set(marked_words)) 

# Segment the sermons file to be paragraphs with length of 300
def segment_document(text, segment_size=300):
    words = text.split()
    segments = [' '.join(words[i:i+segment_size]) for i in range(0, len(words), segment_size)]
    return segments

segments = segment_document(large_document)

# Use TfidfVectorizer to calculate TF-IDF 
vectorizer = TfidfVectorizer(vocabulary=marked_words)
X = vectorizer.fit_transform(segments)
feature_names = vectorizer.get_feature_names_out()

# Create labels
y = [(1 if any(word in segment for word in marked_words) else 0) for segment in segments]

# Check labels 
print(f"Class distribution: {pd.Series(y).value_counts()}")

# If this paragraph only has either 1 or 0, create dummy words
if len(set(y)) < 2:
    dummy_segment_with_keywords = ' '.join(marked_words[:10])  
    dummy_segment_without_keywords = ' '.join(['dummyword']*10)  
    segments.extend([dummy_segment_with_keywords, dummy_segment_without_keywords])
    y.extend([1, 0])

# Create train and test data
X = vectorizer.fit_transform(segments)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train SVC
model = SVC(kernel='sigmoid', degree=5)
model.fit(X_train, y_train)

# Predict on all paragraphs
predictions = model.predict(X)

# Only extract one markable word in each paragraph
notes = []
for segment, prediction in zip(segments, predictions):
    if prediction == 1 and "dummyword" not in segment.lower():
        sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', segment)
        for sentence in sentences:
            if any(word.lower() in sentence.lower() for word in marked_words):
                words = sentence.split()
                for i, word in enumerate(words):
                    if word.lower() in marked_words:
                        start = max(0, i - 1)
                        end = min(len(words), i + 2)
                        context_words = words[start:end]
                        notes.append(' '.join(context_words))
                        break  

with open('extracted_notes.txt', 'w', encoding='utf-8') as file:
    for note in notes:
        file.write(note + '\n')




Clean up the notes. Remove those that are marked as 0 in bible_tokens and repetitions

In [8]:
with open('extracted_notes.txt', 'r', encoding='utf-8') as file:  
    all_notes = file.readlines()  
    print(all_notes)
    cleaned_notes = []
    for note in all_notes:
        cleaned_words = [word for word in note.split() if word in marked_words]
        cleaned_notes.extend(cleaned_words)

# Remove duplicated notes
cleaned_notes = list(set(cleaned_notes))


with open('extracted_notes_2.txt', 'w', encoding='utf-8') as file:
    for note in cleaned_notes:
        file.write(note + '\n')


['about the glory\n', 'heard of it,\n', 'day of the\n', 'All of\n', 'The images\n', 'The statistics\n', 'tons of plastic\n', 'in the oceans\n', 'species of marine\n', 'The statistics\n', 'to the reality\n', 'on the One\n', 'Gove, the\n', 'save the planet\n', 'stewards of it\n', 'create the world\n', 'The answer\n', 'much of our\n', 'has the greatest\n', 'itself; the created\n', 'clearly the sky\n', 'clearly the sky\n', 'Whatever the season\n', 'The language\n', 'when the apostle\n', 'evidence of his\n', 'created the world\n', 'The Bible\n', 'account of creation:\n', 'in the 6th-C\n', 'created the world\n', 'sort of ‘counter-claim’:\n', 'in the style\n', 'create the world\n', 'But the purpose\n', 'to the greatness\n', 'part of Scripture\n', 'to the rest\n', 'with the rest\n', 'says of human\n', 'within the created\n', 'expect the people\n', 'not the centre\n', 'All the\n', 'not all about\n', 'all about\n', 'is all about\n', 'on the fact\n', 'is the oldest\n', 'is! The oldest\n', 'The Bi

Sermons collections:
https://www.thekingdomcollective.com/spurgeon/list/
https://www.standrewsenfield.com
