In [60]:
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from dsdr import DSDR
from rouge_score import rouge_scorer

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/ailyhotte/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [54]:
import pandas as pd

data = pd.read_json('../data/easy/train.json')
test = data.iloc[1]

In [55]:
#concatenate all the elements in the 'written_opinion' list into a single string
text = ''
for key, value in test['written_opinion']['parsed'].items():
    text += value
sentences = sent_tokenize(text)

In [56]:
vectorizer = TfidfVectorizer(stop_words='english')
V = vectorizer.fit_transform(sentences).toarray()

In [58]:
summary_indices = DSDR.lin(V, m=10, lamb=0.1)
print("Summary sentences:")
summary = ''
for i in summary_indices:
    print("-", sentences[i])
    summary += sentences[i] + ' '

Summary sentences:
- See App.
- Stumpf argues that his plea was so inconsistent with his denial of having shot Mrs. Stout that he could only have pleaded guilty out of ignorance of the aggravated murder charge’s specific intent element.
- 04-637 MARGARET BRADSHAW, WARDEN, PETITIONER v. JOHN DAVID STUMPF on writ of certiorari to the united states court of appeals for the sixth circuit [June 13, 2005] Justice O’Connor delivered the opinion of the Court.
- for Cert.
- to Pet.
- The State, however, claimed that Stumpf had shot Mrs. Stout, and that he therefore was the principal offender in her murder.
- After Wesley’s trial, Stumpf moved to withdraw his own plea or vacate his death sentence, arguing that the evidence endorsed by the State in Wesley’s trial cast doubt on Stumpf ’s conviction and sentence.
- While a guilty plea is invalid if the defendant has not been informed of the crime’s elements, Stumpf ’s attorneys represented at his plea hearing that they had explained the elements to

In [62]:
def calculate_rouge(prediction, reference):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference, prediction)
    return {k: v.fmeasure for k, v in scores.items()}

rouge_score = calculate_rouge(summary, test['Summary']['conclusion'])
print("Score ROUGE :", rouge_score)

Score ROUGE : {'rouge1': 0.4812834224598931, 'rouge2': 0.17741935483870966, 'rougeL': 0.21925133689839574}


In [77]:
def dsdr_rouge(index):
    data = pd.read_json('../data/easy/train.json')
    row = data.iloc[index]
    
    #concatenate all the elements in the 'written_opinion' list into a single string
    text = ''
    for key, value in row['written_opinion']['parsed'].items():
        text += value
    sentences = sent_tokenize(text)
    
    vectorizer = TfidfVectorizer(stop_words='english')
    V = vectorizer.fit_transform(sentences).toarray()
    
    summary_indices = DSDR.lin(V, m=10, lamb=0.1)
    summary = ''
    for i in summary_indices:
        summary += sentences[i] + ' '
    
    rouge_score = calculate_rouge(summary, test['Summary']['conclusion'])
    return rouge_score


In [66]:
res = {
    'rouge1': 0,
    'rouge2': 0,
    'rougeL': 0
}
for i in range(100):
    score = dsdr_rouge(i)
    res['rouge1'] += score['rouge1']
    res['rouge2'] += score['rouge2']
    res['rougeL'] += score['rougeL']
    if i % 10 == 0:
        print(f"Processed {i} documents")
print("Average ROUGE scores over the dataset:")
print({k: v / len(data) for k, v in res.items()})

Processed 0 documents
Processed 10 documents
Processed 20 documents
Processed 30 documents
Processed 40 documents
Processed 50 documents
Processed 60 documents
Processed 70 documents
Processed 80 documents
Processed 90 documents
Average ROUGE scores over the dataset:
{'rouge1': 0.028355263670596768, 'rouge2': 0.0025879263529449514, 'rougeL': 0.015979388712446682}


In [68]:
print({k: v / 100 for k, v in res.items()})

{'rouge1': 0.24101974120007252, 'rouge2': 0.021997374000032086, 'rougeL': 0.13582480405579678}


In [78]:
dsdr_rouge(5)

{'rouge1': 0.24832214765100674,
 'rouge2': 0.02027027027027027,
 'rougeL': 0.14093959731543623}