# Extractive Summarization

## Load necessary modules

In [2]:
from datasets import load_dataset
import pandas as pd
import optuna

from Dataset import Dataset

## Define optimization task functions

In [72]:
def CNN_scores_best_representation(trial, CNN_dataset, value, pipeline=None, lemma=False):
    scores = set()
    available_scores = CNN_dataset.get_num_weights(True)
    
    i = trial.suggest_int('Number_of_scores', 1, len(available_scores))
    for x in range(i):
        score = trial.suggest_categorical('Score-%d' % x, available_scores)
        scores.add(score)

    CNN_dataset.process_dataset(scoreList=scores, all_loc_scores=True, lemma=lemma, nlp=pipeline)
    results = CNN_dataset.rouge_computation(show=False)
    return results.loc['Mean'][value]


def CNN_scores_weights_task(trial, CNN_dataset, value, pipeline=None, lemma=False):
    scores = [x for x in CNN_dataset.get_num_weights(True)]
    weights = [1 for x in range(len(scores))]

    for score in scores:
        idx = scores.index(score)
        weights[idx] = trial.suggest_float(score, -10.0, 10.0, step=0.5)
    CNN_dataset.process_dataset(scoreList=scores, all_loc_scores=True, nlp=pipeline, lemma=lemma)
    results = CNN_dataset.rouge_computation(weights=weights, show=False)
    return results.loc['Mean'][value]

## Load CNN\_dailymail dataset and initiate Dataset data structure

In [4]:
# Load dataset into a variable
CNN_dataset = load_dataset('cnn_dailymail', '3.0.0')

# Create a new instance of the Dataset class with a custom name
CNN_processed = Dataset(name='CNN_processed.json')

Reusing dataset cnn_dailymail (/home/davide/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)


  0%|          | 0/3 [00:00<?, ?it/s]

## Meena and Gopalani strategy comparison

In [11]:
# Meena & Gopalani environment
MG_scores = {
             'comb1': ['TF_ISF_IDF', 'co_occur', 'sent_length'],
             'comb2': ['co_occur', 'sent_length', 'sent_location'],
             'comb3': ['TF_ISF_IDF', 'co_occur', 'sent_length',
                       'sent_location'],
             'comb4': ['sent_length', 'sent_location', 'named_entities',
                       'pos_keywords', 'proper_noun'],
             'comb5': ['co_occur', 'sent_length', 'sent_location',
                       'named_entities', 'pos_keywords', 'proper_noun'],
             'comb6': ['TF_ISF_IDF', 'co_occur', 'sent_length',
                       'sent_location', 'named_entities', 'pos_keywords',
                       'neg_keywords', 'sent_rank'],
             'comb7': ['TF_ISF_IDF', 'co_occur', 'sent_length',
                       'sent_location', 'named_entities', 'pos_keywords',
                       'neg_keywords']
             }

# Meena & Golapani initial test
MG_test = Dataset(name='MG_test_dataset.json')
MG_num_docs = 100
MG_test.build_dataset(CNN_dataset['train'], MG_num_docs)
MG_results = pd.DataFrame()
for comb, scores in MG_scores.items():
    MG_test.process_dataset(scoreList=scores)
    MG_rouge = MG_test.rouge_computation(show=False)
    MG_results = pd.concat([MG_results, MG_rouge.loc['Mean']], axis=1,
                           ignore_index=True)
print(MG_results.T)

  sentence_similarity = sentence.similarity(sent2)
processing dataset: :   0%|                                        | 100/287113 [00:17<14:15:38,  5.59it/s]


Dataset built in 18.49911141395569[sec]


computing scores: : 100%|████████████████████████████████████████████████| 100/100 [00:01<00:00, 59.28it/s]


Dataset processed in: 1.6881[sec]


computing scores: : 100%|███████████████████████████████████████████████| 100/100 [00:00<00:00, 104.89it/s]


Dataset processed in: 0.9544[sec]


computing scores: : 100%|████████████████████████████████████████████████| 100/100 [00:01<00:00, 63.07it/s]


Dataset processed in: 1.5865[sec]


computing scores: : 100%|████████████████████████████████████████████████| 100/100 [00:01<00:00, 51.96it/s]


Dataset processed in: 1.9255[sec]


computing scores: : 100%|████████████████████████████████████████████████| 100/100 [00:01<00:00, 54.21it/s]


Dataset processed in: 1.8458[sec]


computing scores: : 100%|████████████████████████████████████████████████| 100/100 [00:03<00:00, 30.57it/s]


Dataset processed in: 3.2728[sec]


computing scores: : 100%|████████████████████████████████████████████████| 100/100 [00:03<00:00, 32.32it/s]

Dataset processed in: 3.0951[sec]
    Rouge-2  Precision  F1-score
0  0.114578   0.035599  0.053768
1  0.070260   0.048146  0.055320
2  0.117600   0.036917  0.055564
3  0.146901   0.075990  0.097490
4  0.144287   0.074426  0.095598
5  0.120043   0.037519  0.056511
6  0.119123   0.037252  0.056104





## All scores computation

A warning is flagged in processing document 27 due to some of its token being unknown by spacy

In [9]:
results = pd.DataFrame(columns=['Rouge-2', 'Precision', 'F1-score'])
for _doc_num in [6, 100, 1000]:
    CNN_processed.process_dataset(CNN_dataset['train'], doc_th=_doc_num)
    res = CNN_processed.rouge_computation(show=False)
    results.loc[_doc_num] = res.loc['Mean'].T
print(results)

processing dataset: :   0%|                                          | 6/287113 [00:01<16:32:16,  4.82it/s]
computing scores: : 100%|████████████████████████████████████████████████████| 6/6 [00:00<00:00, 23.73it/s]


Dataset processed in: 0.2541[sec]


  sentence_similarity = sentence.similarity(sent2)
processing dataset: :   0%|                                        | 100/287113 [00:16<13:30:25,  5.90it/s]
computing scores: : 100%|████████████████████████████████████████████████| 100/100 [00:04<00:00, 24.16it/s]


Dataset processed in: 4.1399[sec]


processing dataset: :   0%|▏                                      | 1000/287113 [02:44<13:04:13,  6.08it/s]
computing scores: : 100%|██████████████████████████████████████████████| 1000/1000 [02:11<00:00,  7.63it/s]


Dataset processed in: 131.1135[sec]
       Rouge-2  Precision  F1-score
6     0.109382   0.028958  0.045368
100   0.120221   0.037403  0.056382
1000  0.123249   0.038178  0.057659


## Experiment: Maximum among all the computed Sentence location scores 

In [15]:
# Sum of Location scoring strategies
CNN_processed = Dataset('CNN_dataset')
CNN_processed.process_dataset(CNN_dataset['train'], doc_th=100,
                              all_loc_scores=True)
result = CNN_processed.rouge_computation()

  sentence_similarity = sentence.similarity(sent2)
processing dataset: :   0%|                                        | 100/287113 [00:17<13:50:42,  5.76it/s]
computing scores: : 100%|████████████████████████████████████████████████| 100/100 [00:04<00:00, 23.96it/s]

Dataset processed in: 4.1745[sec]
                                           Rouge-2  Precision  F1-score
0001d1afc246a7964130f43ae940af6bc6c57f01  0.120000   0.027650  0.044944
0002095e55fcbd3a2f366d9bf92a95433dc305ef  0.111111   0.017964  0.030928
00027e965c8264c35cc1bc55556db388da82b07f  0.027027   0.007246  0.011429
0002c17436637c4fe1837c935c04de47adb18e9a  0.062500   0.016760  0.026432
0003ad6ef0c37534f80b55b4235108024b407f0b  0.031250   0.010526  0.015748
...                                            ...        ...       ...
004f0f8c694c4b546b29565a8993a555537ff561  0.054054   0.016260  0.025000
004fc12e7cd2505a013d96e816afae3f3ce5015d  0.258065   0.109589  0.153846
00504275ede73591d94a6c1f994fd4856610421c  0.019608   0.007812  0.011173
00512126d65bf2a36801e4ef37f28c86c29deb28  0.085106   0.021053  0.033755
Mean                                      0.122133   0.038602  0.057941

[101 rows x 3 columns]





## Experiment: Location scoring with threshold analysis

In [None]:
# Nobata Location Treshold analysis
CNN_processed = Dataset('CNN_dataset')
CNN_processed.build_dataset(CNN_dataset['train'], doc_th=100)
loc_task = pd.DataFrame(columns=['Rouge-2', 'Precision', 'F1-score'])
for x in range(1, 21):
    CNN_processed.process_dataset(loc_th=x, all_loc_scores=True)
    loc_task.loc[x] = CNN_processed.rouge_computation(show=False).loc['Mean']
print(loc_task)

## Experiment: Location Scoring with treshold analysis using only Nobata et al. method

In [None]:
# Nobata Location Treshold analysis part 2
CNN_processed = Dataset('CNN_dataset')
CNN_processed.build_dataset(CNN_dataset['train'], doc_th=100)
loc_task = pd.DataFrame(columns=['Rouge-2', 'Precision', 'F1-score'])
for x in range(1, 21):
    CNN_processed.process_dataset(loc_th=x, locFilter=[0, 1, 0, 0, 0])  # Only NB1 used
    loc_task.loc[x] = CNN_processed.rouge_computation(show=False).loc['Mean']
print(loc_task)

## Experiment: Lemma Usage

In [25]:
results = pd.DataFrame(columns=['Rouge-2', 'Precision', 'F1-score'])
CNN_processed = Dataset('CNN_dataset')
# CNN_processed.build_dataset(CNN_dataset['train'], doc_th=100)

_doc_num = 100
for _lemma in [False, True]:
    CNN_processed.process_dataset(CNN_dataset['train'], doc_th=100,
                                  lemma=_lemma, all_loc_scores=True)
    res = CNN_processed.rouge_computation(show=False)
    results.loc[str(_lemma)] = res.loc['Mean'].T
print(results)

  sentence_similarity = sentence.similarity(sent2)
processing dataset: :   0%|                                        | 100/287113 [00:16<13:31:18,  5.90it/s]
computing scores: : 100%|████████████████████████████████████████████████| 100/100 [00:04<00:00, 24.14it/s]


Dataset processed in: 4.1430[sec]


processing dataset: :   0%|                                        | 100/287113 [00:16<13:25:31,  5.94it/s]
computing scores: : 100%|████████████████████████████████████████████████| 100/100 [00:04<00:00, 24.51it/s]

Dataset processed in: 4.0817[sec]
        Rouge-2  Precision  F1-score
False  0.122133   0.038602  0.057941
True   0.156094   0.059743  0.085454





## Experiment: Best Weights with and Without lemma usage optimization task

Will take 30min to run

In [43]:
# Scores weights optimization task
best_weights_res = pd.DataFrame(data=[], columns=['Rouge-2', 'Precision', 'F1-score'])
weights9 = {'False': [], 'True': []}

for _lemma in [False, True]:
    CNN_dataset = load_dataset('cnn_dailymail', '3.0.0')
    CNN_processed = Dataset('Weights_finding_optimisation_task.json')
    pipe = CNN_processed.build_dataset(CNN_dataset['train'], doc_th=100,
                                       return_pipe=True, lemma=_lemma)

    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: CNN_scores_weights_task(trial, CNN_processed, 'Precision',
                                                         pipe, lemma=_lemma),
                   n_trials=500)

    weights = list(study.best_params.values())
    weight_res = CNN_processed.rouge_computation(show=False, weights=weights)
    
    best_weights_res.loc[str(_lemma)] = weight_res.loc['Mean'].T
    
    for key, value in study.best_params.items():
        if value > 9:
            weights9[str(_lemma)].append(key)
    
    CNN_processed.process_dataset(lemma=_lemma, scoreList=weights9[str(_lemma)])
    weight_9_rouge = CNN_processed.rouge_computation(show=False, weights=weights)
    
    best_weights_res.loc[str(_lemma)+'_9'] = weight_9_rouge.loc['Mean'].T
print(best_weights_res)

Reusing dataset cnn_dailymail (/home/davide/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)


  0%|          | 0/3 [00:00<?, ?it/s]

processing dataset: :   0%|                                          | 3/287113 [00:00<22:47:50,  3.50it/s]
[32m[I 2021-12-21 23:09:02,370][0m A new study created in memory with name: no-name-aa819bbe-f8d3-44b2-b338-ee8f4d63c4ba[0m
computing scores: : 100%|████████████████████████████████████████████████████| 3/3 [00:00<00:00, 15.96it/s]
[32m[I 2021-12-21 23:09:02,563][0m Trial 0 finished with value: 0.0 and parameters: {'TF': 2.0, 'sent_location': -3.5, 'proper_noun': -2.5, 'co_occur': -3.0, 'sent_similarity': 4.0, 'num_val': 1.0, 'TF_ISF_IDF': 0.0, 'sent_rank': -3.5, 'sent_length': 3.0, 'pos_keywords': -10.0, 'neg_keywords': -10.0, 'thematic_features': 7.0, 'named_entities': 1.5}. Best is trial 0 with value: 0.0.[0m


Dataset processed in: 0.1890[sec]


computing scores: : 100%|████████████████████████████████████████████████████| 3/3 [00:00<00:00, 16.16it/s]
[32m[I 2021-12-21 23:09:02,754][0m Trial 1 finished with value: 0.020833333333333332 and parameters: {'TF': -1.0, 'sent_location': 7.5, 'proper_noun': -5.0, 'co_occur': 10.0, 'sent_similarity': -5.0, 'num_val': -7.0, 'TF_ISF_IDF': -7.0, 'sent_rank': 6.0, 'sent_length': -4.5, 'pos_keywords': 7.0, 'neg_keywords': 2.5, 'thematic_features': -7.0, 'named_entities': 10.0}. Best is trial 1 with value: 0.020833333333333332.[0m


Dataset processed in: 0.1866[sec]


computing scores: : 100%|████████████████████████████████████████████████████| 3/3 [00:00<00:00, 16.05it/s]
[32m[I 2021-12-21 23:09:02,947][0m Trial 2 finished with value: 0.016504154479928965 and parameters: {'TF': -2.0, 'sent_location': -5.0, 'proper_noun': -6.5, 'co_occur': 1.0, 'sent_similarity': -9.5, 'num_val': 6.5, 'TF_ISF_IDF': 5.0, 'sent_rank': 3.5, 'sent_length': 6.5, 'pos_keywords': 5.5, 'neg_keywords': 2.5, 'thematic_features': -5.0, 'named_entities': 5.0}. Best is trial 1 with value: 0.020833333333333332.[0m


Dataset processed in: 0.1881[sec]


computing scores: : 100%|████████████████████████████████████████████████████| 3/3 [00:00<00:00, 99.63it/s]


Dataset processed in: 0.0315[sec]


Reusing dataset cnn_dailymail (/home/davide/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)


  0%|          | 0/3 [00:00<?, ?it/s]

processing dataset: :   0%|                                          | 3/287113 [00:00<22:54:05,  3.48it/s]
[32m[I 2021-12-21 23:09:06,024][0m A new study created in memory with name: no-name-6fa33993-f8dd-466a-b7f4-fd1694bfd71b[0m
computing scores: : 100%|████████████████████████████████████████████████████| 3/3 [00:00<00:00, 16.53it/s]
[32m[I 2021-12-21 23:09:06,211][0m Trial 0 finished with value: 0.04047018111973586 and parameters: {'TF': 5.0, 'sent_location': 3.5, 'proper_noun': 6.5, 'co_occur': -7.0, 'sent_similarity': 1.0, 'num_val': -5.5, 'TF_ISF_IDF': 3.0, 'sent_rank': 0.0, 'sent_length': 6.5, 'pos_keywords': 1.5, 'neg_keywords': -0.5, 'thematic_features': -7.0, 'named_entities': -3.0}. Best is trial 0 with value: 0.04047018111973586.[0m


Dataset processed in: 0.1825[sec]


computing scores: : 100%|████████████████████████████████████████████████████| 3/3 [00:00<00:00, 16.53it/s]
[32m[I 2021-12-21 23:09:06,398][0m Trial 1 finished with value: 0.043833188805929894 and parameters: {'TF': 10.0, 'sent_location': 4.5, 'proper_noun': -9.5, 'co_occur': -8.0, 'sent_similarity': -3.0, 'num_val': -8.0, 'TF_ISF_IDF': 6.0, 'sent_rank': -10.0, 'sent_length': 3.0, 'pos_keywords': -1.5, 'neg_keywords': -8.0, 'thematic_features': -2.5, 'named_entities': -4.0}. Best is trial 1 with value: 0.043833188805929894.[0m


Dataset processed in: 0.1825[sec]


computing scores: : 100%|████████████████████████████████████████████████████| 3/3 [00:00<00:00, 16.54it/s]
[32m[I 2021-12-21 23:09:06,585][0m Trial 2 finished with value: 0.06032999842676776 and parameters: {'TF': -2.5, 'sent_location': 10.0, 'proper_noun': -8.0, 'co_occur': -4.5, 'sent_similarity': -7.0, 'num_val': 5.0, 'TF_ISF_IDF': 7.0, 'sent_rank': 5.5, 'sent_length': -3.0, 'pos_keywords': 1.5, 'neg_keywords': 5.0, 'thematic_features': -1.0, 'named_entities': -7.0}. Best is trial 2 with value: 0.06032999842676776.[0m


Dataset processed in: 0.1826[sec]


computing scores: : 100%|███████████████████████████████████████████████████| 3/3 [00:00<00:00, 121.93it/s]

Dataset processed in: 0.0258[sec]
          Rouge-2  Precision  F1-score
False    0.009009   0.020833  0.012579
False_9  0.117037   0.027061  0.043817
True     0.193471   0.060330  0.091542
True_9   0.139874   0.094984  0.109828





## Experiment: Best Representing subset, optimization task

In [112]:
CNN_dataset = load_dataset('cnn_dailymail', '3.0.0')
CNN_processed = Dataset('Weights_finding_optimisation_task.json')

subset_rouge = pd.DataFrame(columns=['Rouge-2', 'Precision', 'F1-score'])
subsets = {}
_num_doc = 100

for _lemma in [False, True]:
    pipe = CNN_processed.build_dataset(CNN_dataset['train'], doc_th=_num_doc,
                                       return_pipe=True, lemma=_lemma, suppress_warnings=True)
    study = optuna.create_study(direction='maximize')
    for _value in ['Precision', 'Rouge-2', 'F1-score']:
        study.optimize(lambda trial: CNN_scores_best_representation(trial,
                                                                    CNN_processed,
                                                                    _value,
                                                                    pipe,
                                                                    lemma=_lemma),
                       n_trials=500)

        subset = set()
        for key, value in study.best_params.items():
            if key != 'Number_of_scores':
                subset.add(value)
        
        CNN_processed.process_dataset(lemma=_lemma, scoreList=subset)
        index = _value + '_Lemma_{}'.format(_lemma)
        subsets[index] = subset
        subset_rouge.loc[index] = CNN_processed.rouge_computation(show=False).loc['Mean'].T
    CNN_processed.__init__('Weights_finding_optimisation_task.json')
print(subset_rouge, '\n')

for key, value in subsets.items():
    print(key, '\t', value)

Reusing dataset cnn_dailymail (/home/davide/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)


  0%|          | 0/3 [00:00<?, ?it/s]

processing dataset: :   0%|                                         | 10/287113 [00:01<15:20:56,  5.20it/s]
[32m[I 2021-12-22 00:44:32,519][0m A new study created in memory with name: no-name-2f835f8b-3b3f-4fde-8d73-be2d93216613[0m
computing scores: : 100%|██████████████████████████████████████████████████| 10/10 [00:00<00:00, 48.54it/s]
[32m[I 2021-12-22 00:44:32,732][0m Trial 0 finished with value: 0.02417095165362467 and parameters: {'Number_of_scores': 10, 'Score-0': 'sent_rank', 'Score-1': 'pos_keywords', 'Score-2': 'sent_rank', 'Score-3': 'sent_location', 'Score-4': 'thematic_features', 'Score-5': 'TF', 'Score-6': 'TF_ISF_IDF', 'Score-7': 'proper_noun', 'Score-8': 'co_occur', 'Score-9': 'num_val'}. Best is trial 0 with value: 0.02417095165362467.[0m


Dataset processed in: 0.2071[sec]


computing scores: : 100%|██████████████████████████████████████████████████| 10/10 [00:00<00:00, 47.94it/s]


Dataset processed in: 0.2097[sec]


computing scores: : 100%|██████████████████████████████████████████████████| 10/10 [00:00<00:00, 37.36it/s]
[32m[I 2021-12-22 00:44:33,758][0m Trial 1 finished with value: 0.09152306925512077 and parameters: {'Number_of_scores': 10, 'Score-0': 'co_occur', 'Score-1': 'named_entities', 'Score-2': 'pos_keywords', 'Score-3': 'neg_keywords', 'Score-4': 'neg_keywords', 'Score-5': 'sent_location', 'Score-6': 'sent_length', 'Score-7': 'num_val', 'Score-8': 'neg_keywords', 'Score-9': 'TF_ISF_IDF'}. Best is trial 1 with value: 0.09152306925512077.[0m


Dataset processed in: 0.2688[sec]


computing scores: : 100%|██████████████████████████████████████████████████| 10/10 [00:00<00:00, 35.65it/s]


Dataset processed in: 0.2817[sec]


computing scores: : 100%|█████████████████████████████████████████████████| 10/10 [00:00<00:00, 124.43it/s]
[32m[I 2021-12-22 00:44:34,702][0m Trial 2 finished with value: 0.03346488975981136 and parameters: {'Number_of_scores': 2, 'Score-0': 'TF_ISF_IDF', 'Score-1': 'named_entities'}. Best is trial 1 with value: 0.09152306925512077.[0m


Dataset processed in: 0.0814[sec]


computing scores: : 100%|██████████████████████████████████████████████████| 10/10 [00:00<00:00, 36.06it/s]


Dataset processed in: 0.2784[sec]


processing dataset: :   0%|                                         | 10/287113 [00:01<15:39:35,  5.09it/s]
[32m[I 2021-12-22 00:44:38,510][0m A new study created in memory with name: no-name-e160813b-3896-46f5-a741-6f221120001c[0m
computing scores: : 100%|██████████████████████████████████████████████████| 10/10 [00:00<00:00, 25.47it/s]
[32m[I 2021-12-22 00:44:38,909][0m Trial 0 finished with value: 0.04583317833089087 and parameters: {'Number_of_scores': 12, 'Score-0': 'TF_ISF_IDF', 'Score-1': 'TF', 'Score-2': 'TF_ISF_IDF', 'Score-3': 'sent_rank', 'Score-4': 'neg_keywords', 'Score-5': 'TF', 'Score-6': 'named_entities', 'Score-7': 'sent_length', 'Score-8': 'num_val', 'Score-9': 'pos_keywords', 'Score-10': 'sent_similarity', 'Score-11': 'sent_rank'}. Best is trial 0 with value: 0.04583317833089087.[0m


Dataset processed in: 0.3935[sec]


computing scores: : 100%|██████████████████████████████████████████████████| 10/10 [00:00<00:00, 26.36it/s]


Dataset processed in: 0.3806[sec]


computing scores: : 100%|██████████████████████████████████████████████████| 10/10 [00:00<00:00, 36.93it/s]
[32m[I 2021-12-22 00:44:40,096][0m Trial 1 finished with value: 0.11960157044567062 and parameters: {'Number_of_scores': 12, 'Score-0': 'TF_ISF_IDF', 'Score-1': 'TF_ISF_IDF', 'Score-2': 'proper_noun', 'Score-3': 'sent_location', 'Score-4': 'sent_length', 'Score-5': 'sent_rank', 'Score-6': 'pos_keywords', 'Score-7': 'TF', 'Score-8': 'pos_keywords', 'Score-9': 'named_entities', 'Score-10': 'neg_keywords', 'Score-11': 'thematic_features'}. Best is trial 1 with value: 0.11960157044567062.[0m


Dataset processed in: 0.2719[sec]


computing scores: : 100%|██████████████████████████████████████████████████| 10/10 [00:00<00:00, 35.85it/s]


Dataset processed in: 0.2801[sec]


computing scores: : 100%|██████████████████████████████████████████████████| 10/10 [00:00<00:00, 61.83it/s]
[32m[I 2021-12-22 00:44:41,080][0m Trial 2 finished with value: 0.11022877259293822 and parameters: {'Number_of_scores': 3, 'Score-0': 'TF', 'Score-1': 'neg_keywords', 'Score-2': 'thematic_features'}. Best is trial 1 with value: 0.11960157044567062.[0m


Dataset processed in: 0.1628[sec]


computing scores: : 100%|██████████████████████████████████████████████████| 10/10 [00:00<00:00, 36.06it/s]

Dataset processed in: 0.2784[sec]
                        Rouge-2  Precision  F1-score
Precision_Lemma_False  0.094984   0.027292  0.041944
Rouge-2_Lemma_False    0.086820   0.024171  0.037427
F1-score_Lemma_False   0.086820   0.024171  0.037427
Precision_Lemma_True   0.120567   0.042817  0.062490
Rouge-2_Lemma_True     0.098677   0.036076  0.052214
F1-score_Lemma_True    0.089973   0.036076  0.050930 

Precision_Lemma_False 	 {'co_occur', 'TF_ISF_IDF', 'thematic_features', 'sent_rank', 'pos_keywords', 'proper_noun', 'num_val', 'TF', 'sent_location'}
Rouge-2_Lemma_False 	 {'co_occur', 'TF_ISF_IDF', 'neg_keywords', 'pos_keywords', 'sent_length', 'num_val', 'named_entities', 'sent_location'}
F1-score_Lemma_False 	 {'co_occur', 'TF_ISF_IDF', 'neg_keywords', 'pos_keywords', 'sent_length', 'num_val', 'named_entities', 'sent_location'}
Precision_Lemma_True 	 {'sent_similarity', 'neg_keywords', 'sent_rank', 'pos_keywords', 'sent_length', 'num_val', 'named_entities', 'TF', 'TF_ISF_IDF'}
Rouge-


