# SENTENCE BOUNDARY DETECTION

# CRF

In [19]:
import nltk
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn.metrics import classification_report
import re
from sklearn.preprocessing import MultiLabelBinarizer

# Defining Feature and Label Functions
def extract_features(sentence, i):  #extract_features takes a sentence and an index i as input and extracts 
    #various features of the word at that index.
    features = {
        'word': sentence[i],
        'is_first': i == 0,
        'is_last': i == len(sentence) - 1,
        'is_capitalized': sentence[i][0].upper() == sentence[i][0],
        'is_all_caps': sentence[i].upper() == sentence[i],
        'is_all_lower': sentence[i].lower() == sentence[i],
        'is_alphanumeric': int(bool((len(sentence[i]) > 1) & sentence[i].isalnum()))
    }
    return features

# define label function
def get_label(sentence, i):#determines whether the last character of a word is a punctuation mark.
    last_char = sentence[i][-1]
    return str(last_char == ',' or last_char == ';' or last_char == ':' or last_char == '.' or last_char == '?')

# read text from file
with open('C:/Users/aardr/Downloads/bva.txt', 'r', encoding='utf-8') as f:
    text = f.read()

text = text.replace('\\r', '').replace('\\n', '').replace('\\t', '')

# perform sentence boundary detection
# using the sent_tokenize function from nltk. 
#It splits the text into sentences and stores them in the sentences variable.

sentences = nltk.sent_tokenize(text)
X = [nltk.word_tokenize(sentence) for sentence in sentences]
y = [[get_label(sentence, i) for i in range(len(sentence))] for sentence in X]

# extract features from sentences
# It creates a nested list X_feats where each element corresponds to a sentence and contains a
#dictionary of features for each word in that sentence.
X_feats = [[extract_features(X[i], j) for j in range(len(X[i]))] for i in range(len(X))]

# Convert labels to list of sequences
#converts the label sequences in y to a list of sequences y_seq.

y_seq = [sentence_labels for sentence_labels in y]

# train CRF model
crf = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=3)
crf.fit(X_feats, y_seq)

# test CRF model on training data
mlb = MultiLabelBinarizer()
y_bin = mlb.fit_transform(y_seq)

# test CRF model on training data
y_pred = crf.predict(X_feats)

# Convert y_pred to binary array format (if needed)
y_pred_bin = mlb.transform(y_pred)

# Generate classification report
report = classification_report(y_bin, y_pred_bin)
print(report)

for sentence, labels in zip(X, y_pred):
    boundary_detected_text = " ".join([token for token, label in zip(sentence, labels) if label])
    print(boundary_detected_text)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4101
           1       1.00      1.00      1.00      4118

   micro avg       1.00      1.00      1.00      8219
   macro avg       1.00      1.00      1.00      8219
weighted avg       1.00      1.00      1.00      8219
 samples avg       1.00      1.00      1.00      8219

{ `` 59bd4ac35116540935ee6851 '' : { `` text '' : `` Citation Nr : 1510760Decision Date : 03/13/15 Archive Date : 03/24/15DOCKET NO .
10-10 851 ) DATE ) ) On appeal from theDepartment of Veterans Affairs Regional Office in Portland , OregonTHE ISSUES1 .
Entitlement to service connection for an acquired psychiatric disability , to include posttraumatic stress disorder ( PTSD ) and major depression.2 .
Entitlement to total rating for compensation purposes based on individual unemployability ( TDIU ) .REPRESENTATIONAppellant represented by : Harold H. Hoffman III , AttorneyWITNESS AT HEARING ON APPEALAppellantATTOR

In [20]:
from sklearn.metrics import accuracy_score
# Calculate accuracy
accuracy = accuracy_score(y_bin, y_pred_bin)
print("Accuracy:", accuracy)


Accuracy: 0.9987858183584264


In [37]:
import nltk
import sklearn_crfsuite
from sklearn.metrics import multilabel_confusion_matrix, classification_report
from sklearn import metrics
from sklearn.preprocessing import MultiLabelBinarizer


# Defining Feature and Label Functions
def extract_features(sentence, i):
    features = {
        'word': sentence[i],
        'is_first': i == 0,
        'is_last': i == len(sentence) - 1,
        'is_capitalized': sentence[i][0].upper() == sentence[i][0],
        'is_all_caps': sentence[i].upper() == sentence[i],
        'is_all_lower': sentence[i].lower() == sentence[i],
        'is_alphanumeric': int(bool((len(sentence[i]) > 1) & sentence[i].isalnum()))
    }
    return features


def get_label(sentence, i):
    last_char = sentence[i][-1]
    return str(last_char == ',' or last_char == ';' or last_char == ':' or last_char == '.' or last_char == '?')


# read text from file
with open('C:/Users/aardr/Downloads/bva.txt', 'r', encoding='utf-8') as f:
    text = f.read()

text = text.replace('\\r', '').replace('\\n', '').replace('\\t', '')

# perform sentence boundary detection
sentences = nltk.sent_tokenize(text)
X = [nltk.word_tokenize(sentence) for sentence in sentences]
y = [[get_label(sentence, i) for i in range(len(sentence))] for sentence in X]

# extract features from sentences
X_feats = [[extract_features(X[i], j) for j in range(len(X[i]))] for i in range(len(X))]

# Convert labels to list of sequences
y_seq = [sentence_labels for sentence_labels in y]

# train CRF model
crf = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=3)
crf.fit(X_feats, y_seq)

# test CRF model on training data
mlb = MultiLabelBinarizer()
y_bin = mlb.fit_transform(y_seq)

# test CRF model on training data
y_pred = crf.predict(X_feats)

# Convert y_pred to binary array format (if needed)
y_pred_bin = mlb.transform(y_pred)

# Generate classification report
report = classification_report(y_bin, y_pred_bin)
print("classification report")
print(report)

# Calculate confusion matrix
confusion_matrix = multilabel_confusion_matrix(y_bin, y_pred_bin)
print("")
print("confusion matrix")
print(confusion_matrix)

# Calculate F1 score
f1_score = metrics.f1_score(y_bin, y_pred_bin, average='weighted')
print("")
print('F1 score:', f1_score)

# Print boundary-detected text
for sentence, labels in zip(X, y_pred):
    boundary_detected_text = " ".join([token for token, label in zip(sentence, labels)])


classification report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4101
           1       1.00      1.00      1.00      4118

   micro avg       1.00      1.00      1.00      8219
   macro avg       1.00      1.00      1.00      8219
weighted avg       1.00      1.00      1.00      8219
 samples avg       1.00      1.00      1.00      8219


confusion matrix
[[[  17    0]
  [   0 4101]]

 [[   0    0]
  [   5 4113]]]

F1 score: 0.9996956419699465


In [38]:
import nltk
import sklearn_crfsuite
from sklearn.metrics import multilabel_confusion_matrix, classification_report, accuracy_score
from sklearn import metrics
from sklearn.preprocessing import MultiLabelBinarizer


# Defining Feature and Label Functions
def extract_features(sentence, i):
    features = {
        'word': sentence[i],
        'is_first': i == 0,
        'is_last': i == len(sentence) - 1,
        'is_capitalized': sentence[i][0].upper() == sentence[i][0],
        'is_all_caps': sentence[i].upper() == sentence[i],
        'is_all_lower': sentence[i].lower() == sentence[i],
        'is_alphanumeric': int(bool((len(sentence[i]) > 1) & sentence[i].isalnum()))
    }
    return features


def get_label(sentence, i):
    last_char = sentence[i][-1]
    return str(last_char == ',' or last_char == ';' or last_char == ':' or last_char == '.' or last_char == '?')


# read text from file
with open('C:/Users/aardr/Downloads/bva.txt', 'r', encoding='utf-8') as f:
    text = f.read()

text = text.replace('\\r', '').replace('\\n', '').replace('\\t', '')

# perform sentence boundary detection
sentences = nltk.sent_tokenize(text)
X = [nltk.word_tokenize(sentence) for sentence in sentences]
y = [[get_label(sentence, i) for i in range(len(sentence))] for sentence in X]

# extract features from sentences
X_feats = [[extract_features(X[i], j) for j in range(len(X[i]))] for i in range(len(X))]

# Convert labels to list of sequences
y_seq = [sentence_labels for sentence_labels in y]

# train CRF model
crf = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=3)
crf.fit(X_feats, y_seq)

# test CRF model on training data
mlb = MultiLabelBinarizer()
y_bin = mlb.fit_transform(y_seq)

# test CRF model on training data
y_pred = crf.predict(X_feats)

# Convert y_pred to binary array format (if needed)
y_pred_bin = mlb.transform(y_pred)

# Generate classification report
report = classification_report(y_bin, y_pred_bin)
print(report)

# Calculate confusion matrix
confusion_matrix = multilabel_confusion_matrix(y_bin, y_pred_bin)
print(confusion_matrix)

# Calculate accuracy
accuracy = accuracy_score(y_bin, y_pred_bin)
print('Accuracy:', accuracy)

# Calculate F1 score
f1_score = metrics.f1_score(y_bin, y_pred_bin, average='weighted')
print('F1 score:', f1_score)

# Print boundary-detected text
for sentence, labels in zip(X, y_pred):
    boundary_detected_text = " ".join([token for token, label in zip(sentence, labels)])


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4101
           1       1.00      1.00      1.00      4118

   micro avg       1.00      1.00      1.00      8219
   macro avg       1.00      1.00      1.00      8219
weighted avg       1.00      1.00      1.00      8219
 samples avg       1.00      1.00      1.00      8219

[[[  17    0]
  [   0 4101]]

 [[   0    0]
  [   5 4113]]]
Accuracy: 0.9987858183584264
F1 score: 0.9996956419699465


# punkt model

In [21]:
import nltk
from nltk.tokenize import PunktSentenceTokenizer
import re

# Read the contents of the file
file_path = "C:/Users/aardr/Downloads/bva.txt"
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Remove newline characters and tab characters
text = text.replace('\\r', '').replace('\\n', '').replace('\\t', '')

# Define custom sentence boundaries
custom_sent_tokenize = PunktSentenceTokenizer()
custom_sent_tokenize.INCLUDE_ALL_COLLOCS = True

# Customize the punctuation and capital letters as boundaries
custom_sent_tokenize._params.abbrev_types.update(['.', ',', ';', 'e.g.', 'i.e.', 'etc.', 'mr.', 'mrs.', 'ms.', 'dr.'])

# Add more custom boundaries specific to legal text, if needed
custom_sent_tokenize._params.abbrev_types.update(['v.', 'vs.', 'art.', 'no.', 'jr.', 'sr.', 'inc.', 'co.'])

# Tokenize the text into sentences
predicted_sentences = custom_sent_tokenize.tokenize(text)

# Manually evaluate the tokenizer
for sentence in predicted_sentences:
    print(sentence)


{"59bd4ac35116540935ee6851": {"text": "Citation Nr: 1510760Decision Date: 03/13/15    Archive Date: 03/24/15DOCKET NO.
10-10 851)DATE))On appeal from theDepartment of Veterans Affairs Regional Office in Portland, OregonTHE ISSUES1.
Entitlement to service connection for an acquired psychiatric disability, to include posttraumatic stress disorder (PTSD) and major depression.2.
Entitlement to total rating for compensation purposes based on individual unemployability (TDIU).REPRESENTATIONAppellant represented by:Harold H. Hoffman III, AttorneyWITNESS AT HEARING ON APPEALAppellantATTORNEY FOR THE BOARDK.
Hughes, CounselINTRODUCTIONThe Veteran had active military service from May 1973 to October 1976.These matters come before the Board of Veterans' Appeals (Board) from a February 2012 rating decision of the Department of Veterans Affairs (VA), Regional Office (RO) in Portland, Oregon.
In December 2013, the Veteran testified at a videoconference hearing before the undersigned Veterans Law Jud

In [22]:
import nltk.data
import pandas as pd
from nltk.tokenize import PunktSentenceTokenizer

# Read the contents of the file
file_path = "C:/Users/aardr/Downloads/bva.txt"
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Remove newline characters and tab characters
text = text.replace('\\r', '').replace('\\n', '').replace('\\t', '')

# Define custom sentence boundaries
custom_sent_tokenize = PunktSentenceTokenizer()
custom_sent_tokenize.INCLUDE_ALL_COLLOCS = True
custom_sent_tokenize._params.abbrev_types.update(['.', ',', ';', 'e.g.', 'i.e.', 'etc.', 'mr.', 'mrs.', 'ms.', 'dr.'])
custom_sent_tokenize._params.abbrev_types.update(['v.', 'vs.', 'art.', 'no.', 'jr.', 'sr.', 'inc.', 'co.'])

# Tokenize the text into sentences
predicted_sentences = custom_sent_tokenize.tokenize(text)

# Read the labeled dataset for evaluation
labeled_dataset_path = 'sentences_labels.csv'
df = pd.read_csv(labeled_dataset_path)

# Get the expected sentences from the labeled dataset
expected_sentences = df['Sentence'].tolist()

# Calculate accuracy by comparing predicted and expected sentences
correct_predictions = sum(1 for sent in predicted_sentences if sent in expected_sentences)
total_sentences = len(expected_sentences)
accuracy = correct_predictions / total_sentences



# Print the expected sentences
print("Sentences:")
for sentence in expected_sentences:
    print(sentence)

# Print the accuracy
print("Accuracy: ", accuracy)


Sentences:
97}, {"start"
If the preponderance of the evidence weighs against the claim, it must be denied.
H.V., has diagnosed the Veteran with PTSD and major depression, which she has attributed to the Veteran's claimed in service stressors
18847}, {"start":
\u00a7 5107;
447}, {"start":
43169}, {"start"
The Veteran should be requested to indicate if he has received any VA or non-VA medical treatment for his psychiatric disorders, lumbar spine disability, brain aneurysm and stroke, left eye impaired vision, a bilateral hip disability, bilateral hearing loss, tinnitus, and right and left degenerative joint disease that is not evidenced by the current record
16416}, {"start":
1628}, {"start":
19206}, {"start"
18828}, {"start"
1092, "end"
These reports do not provide a rationale explaining the basis for providing such a diagnosis.
20}, {"start"
1632746\t\nDecision Date
See 38 U.S.C.A
see also Colvin v
38 C.F.R.
The Board finds, however, that the Veteran has provided sufficient corroborati

In [23]:

# Print the accuracy
print("Accuracy: ", accuracy)

Accuracy:  0.10680727907068945


In [24]:
import numpy as np
from sklearn.metrics import confusion_matrix, f1_score, classification_report

# Convert labels to flat list format
y_flat = [label for sentence_labels in y for label in sentence_labels]
y_pred_flat = [label for sentence_labels in y_pred for label in sentence_labels]

# Map labels to integer values
label_mapping = {'False': 0, 'True': 1}
y_flat_int = [label_mapping[label] for label in y_flat]
y_pred_flat_int = [label_mapping[label] for label in y_pred_flat]

# Calculate confusion matrix
cm = confusion_matrix(y_flat_int, y_pred_flat_int)
print("Confusion Matrix:")
print(cm)

# Calculate F1 score
f1 = f1_score(y_flat_int, y_pred_flat_int)
print("F1 Score:", f1)

# Generate classification report
target_names = ['False', 'True']
report = classification_report(y_flat_int, y_pred_flat_int, target_names=target_names)
print("Classification Report:")
print(report)

Confusion Matrix:
[[114781     65]
 [ 20752   4126]]
F1 Score: 0.2838762943341704
Classification Report:
              precision    recall  f1-score   support

       False       0.85      1.00      0.92    114846
        True       0.98      0.17      0.28     24878

    accuracy                           0.85    139724
   macro avg       0.92      0.58      0.60    139724
weighted avg       0.87      0.85      0.80    139724



In [40]:
import nltk.data
import pandas as pd
from nltk.tokenize import PunktSentenceTokenizer
from sklearn.metrics import confusion_matrix, classification_report, f1_score

# Import the Punkt tokenizer from the nltk.data module
nltk.data.path.append("nltk_data")  # Specify the path to the nltk_data directory if necessary
custom_sent_tokenize = nltk.data.load('tokenizers/punkt/english.pickle')

# Read the contents of the file
file_path = "C:/Users/aardr/Downloads/bva.txt"
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Remove newline characters and tab characters
text = text.replace('\\r', '').replace('\\n', '').replace('\\t', '')

# Define custom sentence boundaries
custom_sent_tokenize._params.abbrev_types.update(['.', ',', ';', 'e.g.', 'i.e.', 'etc.', 'mr.', 'mrs.', 'ms.', 'dr.'])
custom_sent_tokenize._params.abbrev_types.update(['v.', 'vs.', 'art.', 'no.', 'jr.', 'sr.', 'inc.', 'co.'])

# Tokenize the text into sentences
predicted_sentences = custom_sent_tokenize.tokenize(text)

# Read the labeled dataset for evaluation
labeled_dataset_path = 'sentences_labels.csv'
df = pd.read_csv(labeled_dataset_path)

# Get the expected sentences and labels from the labeled dataset
expected_sentences = df['Sentence'].tolist()
expected_labels = df['Label'].tolist()

# Filter out sentences from the labeled dataset that are not present in the predicted sentences
filtered_expected_labels = []
filtered_expected_sentences = []
for sent, label in zip(expected_sentences, expected_labels):
    if sent in predicted_sentences:
        filtered_expected_sentences.append(sent)
        filtered_expected_labels.append(label)

# Create a list to store the predicted labels
predicted_labels = []
for sent in filtered_expected_sentences:
    idx = predicted_sentences.index(sent)
    predicted_labels.append(expected_labels[idx])

# Generate classification report
report = classification_report(filtered_expected_labels, predicted_labels)
print("Classification Report:")
print(report)

# Calculate confusion matrix
confusion_mat = confusion_matrix(filtered_expected_labels, predicted_labels)
print("Confusion Matrix:")
print(confusion_mat)

# Calculate F1 score
f1 = f1_score(filtered_expected_labels, predicted_labels, average='weighted')
print("F1 Score:", f1)


Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.33      0.00         6
           1       1.00      0.49      0.66      2707

    accuracy                           0.49      2713
   macro avg       0.50      0.41      0.33      2713
weighted avg       0.99      0.49      0.66      2713

Confusion Matrix:
[[   2    4]
 [1381 1326]]
F1 Score: 0.6554769920604203


In [41]:
import nltk.data
import pandas as pd
from nltk.tokenize import PunktSentenceTokenizer
from sklearn.metrics import confusion_matrix, classification_report, f1_score

# Import the Punkt tokenizer from the nltk.data module
nltk.data.path.append("nltk_data")  # Specify the path to the nltk_data directory if necessary
custom_sent_tokenize = nltk.data.load('tokenizers/punkt/english.pickle')

# Read the contents of the file
file_path = "C:/Users/aardr/Downloads/bva.txt"
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Remove newline characters and tab characters
text = text.replace('\\r', '').replace('\\n', '').replace('\\t', '')

# Define custom sentence boundaries
custom_sent_tokenize._params.abbrev_types.update(['.', ',', ';', 'e.g.', 'i.e.', 'etc.', 'mr.', 'mrs.', 'ms.', 'dr.'])
custom_sent_tokenize._params.abbrev_types.update(['v.', 'vs.', 'art.', 'no.', 'jr.', 'sr.', 'inc.', 'co.'])

# Tokenize the text into sentences
predicted_sentences = custom_sent_tokenize.tokenize(text)

# Read the labeled dataset for evaluation
labeled_dataset_path = 'sentences_labels.csv'
df = pd.read_csv(labeled_dataset_path)

# Get the expected sentences and labels from the labeled dataset
expected_sentences = df['Sentence'].tolist()
expected_labels = df['Label'].tolist()

# Filter out sentences from the labeled dataset that are not present in the predicted sentences
filtered_expected_labels = []
filtered_expected_sentences = []
for sent, label in zip(expected_sentences, expected_labels):
    if sent in predicted_sentences:
        filtered_expected_sentences.append(sent)
        filtered_expected_labels.append(label)

# Create a list to store the predicted labels
predicted_labels = []
for sent in filtered_expected_sentences:
    idx = predicted_sentences.index(sent)
    predicted_labels.append(expected_labels[idx])

# Calculate accuracy
correct_predictions = sum(1 for true_label, pred_label in zip(filtered_expected_labels, predicted_labels) if true_label == pred_label)
total_predictions = len(filtered_expected_labels)
accuracy = correct_predictions / total_predictions

# Generate classification report
report = classification_report(filtered_expected_labels, predicted_labels)
print("Classification Report:")
print(report)

# Calculate confusion matrix
confusion_mat = confusion_matrix(filtered_expected_labels, predicted_labels)
print("Confusion Matrix:")
print(confusion_mat)

# Calculate F1 score
f1 = f1_score(filtered_expected_labels, predicted_labels, average='weighted')
print("F1 Score:", f1)

# Print accuracy
print("Accuracy:", accuracy)


Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.33      0.00         6
           1       1.00      0.49      0.66      2707

    accuracy                           0.49      2713
   macro avg       0.50      0.41      0.33      2713
weighted avg       0.99      0.49      0.66      2713

Confusion Matrix:
[[   2    4]
 [1381 1326]]
F1 Score: 0.6554769920604203
Accuracy: 0.4894950239587173
