In [3]:
def determine_paragraph_boundaries(outputs):
    predictions = torch.argmax(outputs.logits, dim=-1) # get the predicted labels
    paragraph_boundaries = [] # initialize an empty list
    for i, pred in enumerate(predictions): # loop over the predictions
        if pred == 1: # if the prediction is 1 (a boundary)
            paragraph_boundaries.append(i) # append the index to the list
    return paragraph_boundaries


In [4]:
def reconstruct_paragraphs(sentences, paragraph_boundaries):
    paragraphs = [] # initialize an empty list
    start = 0 # initialize the start index
    for end in paragraph_boundaries: # loop over the end indices
        paragraph = " ".join(sentences[start:end+1]) # join the sentences from start to end
        paragraphs.append(paragraph) # append the paragraph to the list
        start = end + 1 # update the start index
    return paragraphs


In [6]:
from nltk.tokenize import sent_tokenize

def tokenize_into_sentences(text):
    sentences = sent_tokenize(text) # split text into sentences using nltk
    return sentences


In [9]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Initialize the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

def segment_text_with_bert(text):
    # Preprocess the text: Split into sentences or chunks
    sentences = tokenize_into_sentences(text)  # You need to define this function

    # Convert sentences to BERT's input format
    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

    # Predict with the model
    with torch.no_grad():
        outputs = model(**inputs)

    # Post-processing to determine paragraph boundaries
    # This part depends on how your model was trained and how it outputs predictions
    paragraph_boundaries = determine_paragraph_boundaries(outputs)  # You need to define this function

    # Reconstruct the text into paragraphs based on the predicted boundaries
    paragraphs = reconstruct_paragraphs(sentences, paragraph_boundaries)  # You need to define this function

    return paragraphs

# Example usage
text = """
The Ice Queen reigns in splendor and grace
Her kingdom is a land of snow and frost
She rules with wisdom, justice, and a trace
Of magic that can charm or turn to cost

She is the fairest of them all, they say
Her beauty is as pure as winter's light
She is the strongest of them all, they pray
Her power is as fierce as winter's might

She swore to protect her realm from harm
From enemies who seek to melt her throne
She leads her loyal army, brave and calm
And makes her icy palace feel like home

Miles away, The Fire King rules with a blazing hand
His kingdom is a land of ash and flame
He burns with courage, glory, and command
And anyone who dares to cross his name

He is the bravest of them all, he boasts
His valor is as hot as summer's sun
He is the hungriest of them all, he toasts
His greed is as relentless as his fun

He swore to conquer all the realms he sees
From foes who try to quench his fiery reign
He leads his fiery army, wild and free
And makes his burning palace only remain

The Fire King is a tyrant and a king
He is the terror of the summer's sting

The ice kingdom is a realm of wonder
Where snowflakes dance and glaciers gleam
The cold is not a curse, but a splendor
That shapes the land into a dream

The people of the ice are brave and wise
They know the secrets of the frost and wind
They live in harmony with the skies
And share their stories with their kin

The ice kingdom is a place of beauty
Where crystal castles sparkle in the light
The silence is not lonely, but a duty
That guards the peace throughout the night

The ice kingdom is a home and a treasure
It is the source of joy and pleasure

 But The fire kingdom is a realm of horror
Where flames devour and smoke obscures
The heat is not a blessing, but a terror
That tortures the land into a blur

The people of the fire are cruel and mad
They worship the inferno as their god
They revel in the chaos and the sad
And spread their misery with their rod

The fire kingdom is a place of fear
Where burning hells await the unwary
The screams are not a music, but a jeer
That mocks the pain throughout the fiery

The fire kingdom is a hell and a nightmare
It is the source of dread and despair
"""
paragraphs = segment_text_with_bert(text)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
paragraphs

[]