### Section bodies combined by score order top 1000 words

#### Steps:

1. Get the sections whose scores are greater than or equal to the cutoff 0.744291
2. Extract the body of each section
3. Normalize the scores and adjust the number of words to be picked from each section
4. Maximize the word allocation in case some sections don't have the required number of words
5. Extract the top k words
6. If there are no relevant sections -> write the first 1000 words of the complete file as the summary
7. Compare the system summary with each gold summary and get the ROUGE score. Take the average over all gold summaries for the respective file.
8. Take the average of the scores over all the files.

In [None]:
# Mount Google Drive at /content/drive; the dataset and prediction paths
# configured later in this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pip install rouge_score

In [None]:
pip install bert-extractive-summarizer

In [None]:
import pandas as pd
import pickle
import os
import re
from tqdm import tqdm
import numpy as np
from summarizer import Summarizer
from extract_section_body import extract_section_body
from rouge_evaluation import get_rouge_scores
from maximal_word_allocation import get_number_of_words

In [None]:
# Build the extractive summarizer once; the processing loop calls it per section.
# NOTE(review): hidden=[-1,-2] with hidden_concat=True presumably selects the last
# two hidden layers and concatenates them for the sentence embeddings — confirm
# against the bert-extractive-summarizer documentation.
model = Summarizer('distilbert-base-uncased', hidden=[-1,-2], hidden_concat=True)

In [None]:
# Dataset switches: enable exactly one of these; the configuration cell picks
# its input paths from them and the evaluation cell only runs for validation.
VALIDATION_DATASET = True
TEST_DATASET = False

In [None]:
# Exactly one dataset must be selected: with neither flag set the paths below
# would be undefined, and with both set the test paths would silently replace
# the validation ones.
if bool(VALIDATION_DATASET) == bool(TEST_DATASET):
    raise ValueError("Set exactly one of VALIDATION_DATASET / TEST_DATASET to True")

if VALIDATION_DATASET:
    dir_ = '/content/drive/MyDrive/FNS_Dataset_2023/validation/'
    toc_loc_pkl_file_path = '/content/drive/MyDrive/FNS_Dataset_2023/validation/out/valid_toc_loc.pkl'
    df_predicted_path = '/content/drive/MyDrive/NLP_Project/2_Section_Classification/out/validation_df_predicted.pkl'

if TEST_DATASET:
    dir_ = '/content/drive/MyDrive/FNS_Dataset_2023/testing/'
    toc_loc_pkl_file_path = '/content/drive/MyDrive/FNS_Dataset_2023/testing/out/test_toc_loc.pkl'
    df_predicted_path = '/content/drive/MyDrive/NLP_Project/2_Section_Classification/out/test_df_predicted.pkl'

# Sub-directory names (relative to dir_) and output naming used by later cells.
annual_reports_dir = "annual_reports"
gold_summary_dir = "gold_summaries"
system_summary_dir = 'BERT_ranked_weighted_maximal_word_allocation'
team_name = 'IT-356'
dir_

'/content/drive/MyDrive/FNS_Dataset_2023/validation/'

In [None]:
# SECURITY NOTE: unpickling can execute arbitrary code, so only load prediction
# files that come from a trusted source.
# The context manager closes the file handle as soon as the frame is loaded.
with open(df_predicted_path, 'rb') as predictions_file:
    df_predicted = pickle.load(predictions_file)
df_predicted

Unnamed: 0,file_id,toc_section,toc_section_pos,toc_section_len,is_section_in_summary,toc_section_cleaned,pred,False,True
0,30777,Financial and operational highlights,161,22,0,financi oper highlight,1,0.407563,0.592437
1,30777,Strategic report,183,6,1,strateg report,0,0.932972,0.067028
2,30777,Global network,189,11,0,global network,0,0.756613,0.243387
3,30777,Chairman’s statement,200,4,1,chairman statement,1,0.018573,0.981427
4,30777,Chief Executive’s review,204,4,1,chief execut review,1,0.007140,0.992860
...,...,...,...,...,...,...,...,...,...
10547,4162,"to 110,",21914,1104,0,,0,0.983343,0.016657
10548,4162,and 117,23018,1104,0,,0,0.985658,0.014342
10549,4162,to 116,24122,1104,0,,0,0.987655,0.012345
10550,4162,to 122,25226,5440,0,,0,0.987178,0.012822


In [None]:
def get_relevant_sections_with_score(file_id, cutoff_score=0.744291):
    """Return the sections of one report whose predicted score reaches the cutoff.

    Args:
        file_id: Report identifier. It is converted with ``int()`` before being
            compared with the ``file_id`` column of the global ``df_predicted``.
        cutoff_score: Inclusive lower bound on the ``'True'`` column for a
            section to be kept. Defaults to 0.744291.

    Returns:
        dict mapping each kept ``toc_section`` to its ``'True'`` score, in the
        row order of ``df_predicted``. When a section title occurs more than
        once for the report, the last qualifying score is the one stored.

    Raises:
        ValueError: If ``file_id`` is a string that is not an integer.
    """
    report_rows = df_predicted[df_predicted.file_id == int(file_id)]
    qualifying_rows = report_rows[report_rows['True'] >= cutoff_score]
    # tolist() yields plain Python values, matching what to_dict('list') produced.
    return dict(zip(qualifying_rows['toc_section'].tolist(),
                    qualifying_rows['True'].tolist()))

In [None]:
def get_relevant_sections_with_body_len(file_id):
    """Map every relevant section of a report to the word count of its body.

    The relevant sections come from ``get_relevant_sections_with_score``. Each
    body is fetched with ``extract_section_body`` and measured as the number of
    pieces obtained by splitting it on a single space.
    """
    return {
        section_title: len(
            extract_section_body(
                file_id, section_title, dir_, annual_reports_dir, toc_loc_pkl_file_path
            ).split(' ')
        )
        for section_title in get_relevant_sections_with_score(file_id)
    }

In [None]:
def get_section_number_of_words(file_id, total_words=1000):
    """Decide how many words to take from each relevant section of a report.

    The scores of the relevant sections and the word counts of their bodies
    are handed to ``get_number_of_words`` together with the overall budget and
    an all-zero "previously required" vector; the returned allocation is
    truncated to integers.

    Args:
        file_id: Report identifier, forwarded to the section helpers.
        total_words: Overall word budget passed to the allocator. Defaults to
            1000.

    Returns:
        dict mapping section title to its ``int`` word budget. Empty when the
        report has no relevant section.
    """
    section_score_dict = get_relevant_sections_with_score(file_id)
    if not section_score_dict:
        # Nothing to allocate: skip body extraction and do not hand empty
        # arrays to the allocator.
        return {}

    sections = list(section_score_dict.keys())
    section_scores = np.array(list(section_score_dict.values()))
    section_body_len_dict = get_relevant_sections_with_body_len(file_id)
    # Look the lengths up by title so scores and lengths are aligned explicitly
    # rather than by relying on two dicts sharing an iteration order.
    section_body_len = np.array([section_body_len_dict[section] for section in sections])
    prev_num_required_words = np.zeros(len(section_body_len))
    num_words = get_number_of_words(section_scores, section_body_len, total_words, prev_num_required_words)
    return {section: int(num_words[i]) for i, section in enumerate(sections)}

In [None]:
import shutil
# Start from a clean output directory. ignore_errors=True keeps this cell from
# failing on a fresh runtime where the directory has not been created yet.
shutil.rmtree(system_summary_dir, ignore_errors=True)

In [None]:
import warnings
# Suppress FutureWarnings from scikit-learn
warnings.simplefilter(action='ignore', category=FutureWarning)

# Number of leading words used as the summary when a report has no relevant section.
_FALLBACK_WORD_LIMIT = 1000


def _summarize_section(section_body, word_budget, summarizer):
    """Return the text a single section contributes to the report summary.

    A body that already fits within ``word_budget`` is returned unchanged.
    Otherwise ``summarizer`` is asked for roughly ``word_budget`` words (via
    its ``ratio`` argument) and the result is cut to exactly that many words.
    The cut applies to this section only, never to previously collected text.
    """
    body_word_count = len(section_body.split(' '))
    if body_word_count <= word_budget:
        return section_body
    if word_budget <= 0:
        # No words allocated to this section; skip the summarizer call.
        return ""
    extracted = summarizer(section_body, ratio=word_budget / body_word_count)
    return ' '.join(extracted.split(' ')[:word_budget])


# exist_ok lets the cell be re-run without first deleting the output directory.
os.makedirs(system_summary_dir, exist_ok=True)
report_files = os.listdir(os.path.join(dir_, annual_reports_dir))
for file in tqdm(report_files, desc="Processing Files", total=len(report_files), ncols=80, colour='green', leave=True):
    # Skip macOS metadata files before doing any work on them.
    if ".DS_Store" in file:
        continue
    try:
        file_id = file.split('.')[0]
        # Section order is maintained
        relevant_sections = list(get_relevant_sections_with_score(file_id).keys())

        if relevant_sections:
            # Allocate words only when there is something to allocate them to.
            section_num_words_dict = get_section_number_of_words(file_id)
            section_summaries = []
            for section in relevant_sections:
                section_body = extract_section_body(file_id, section, dir_, annual_reports_dir, toc_loc_pkl_file_path)
                section_summaries.append(
                    _summarize_section(section_body, section_num_words_dict[section], model)
                )
            summary = ' '.join(section_summaries)
        else:
            # No relevant section: fall back to the first words of the whole report.
            with open(os.path.join(dir_, annual_reports_dir, file), "r", encoding="utf-8") as report:
                report_words = report.read().split(' ')
            summary = " ".join(report_words[:_FALLBACK_WORD_LIMIT])

        with open(os.path.join(system_summary_dir, file_id+'_'+team_name+'.txt'), 'w', encoding='utf-8') as f:
            f.write(str(summary))
    except Exception as e:
        # Best effort: report the failing file and continue with the next one.
        # No summary file is written for a report that fails here.
        print(file, e)

Processing Files:  52%|[32m███████████▊           [0m| 213/413 [39:40<13:27,  4.04s/it][0m

4155.txt [E088] Text of length 1724226 exceeds maximum of 1000000. The parser and NER models require roughly 1GB of temporary memory per 100,000 characters in the input. This means long texts may cause memory allocation errors. If you're not using the parser or NER, it's probably safe to increase the `nlp.max_length` limit. The limit is in number of characters, so you can check whether your inputs are too long by checking `len(text)`.


Processing Files: 100%|[32m█████████████████████[0m| 413/413 [1:19:19<00:00, 11.53s/it][0m


In [None]:
# ROUGE is only computed for the validation split. Default to None so the
# display cell that follows does not fail with a NameError on the test split.
rouge_scores = None
if VALIDATION_DATASET:
    gold_summary_dir_ = os.path.join(dir_, gold_summary_dir)
    rouge_scores = get_rouge_scores(system_summary_dir, gold_summary_dir_)

Processing File Number:  0
Processing File Number:  50
Processing File Number:  100
Processing File Number:  150
Processing File Number:  200
Processing File Number:  250
Processing File Number:  300
Processing File Number:  350
Processing File Number:  400
Number of files processed:  412


In [None]:
# Display the ROUGE result returned by get_rouge_scores in the evaluation cell.
rouge_scores

{'rouge-1': {'p': 0.6077849661465392,
  'r': 0.3203065572142669,
  'f': 0.3474837127313389},
 'rouge-2': {'p': 0.35032728864345336,
  'r': 0.2046857755327396,
  'f': 0.21397630966351391}}

In [None]:
import shutil

# Archive the directory this notebook wrote its summaries to. The previous
# hard-coded folder name belonged to a different run and is never created here.
source_folder = os.path.abspath(system_summary_dir)
shutil.make_archive('/content/source_folder', 'zip', source_folder)


'/content/source_folder.zip'

In [None]:
# Download the zip archive written to /content/source_folder.zip via Colab's files helper.
from google.colab import files
files.download('/content/source_folder.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>