In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# New Section

In [None]:
%cd /content/drive/MyDrive/FNS_Dataset_2023

/content/drive/MyDrive/FNS_Dataset_2023


In [None]:
import os

def count_files_in_folder(folder_path):
    return len([name for name in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, name))])

# Usage
annual_reports = '/content/drive/MyDrive/FNS_Dataset_2023/training/annual_reports'
gold_summaries = '/content/drive/MyDrive/FNS_Dataset_2023/training/gold_summaries'  # replace with your folder path
print("Number of annual reports: ",count_files_in_folder(annual_reports))
print("Number of gold summaries: ",count_files_in_folder(gold_summaries))

Number of annual reports:  3050
Number of gold summaries:  10007


In [None]:
!pip install stanza

Collecting stanza
  Downloading stanza-1.6.1-py3-none-any.whl (881 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/881.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m440.3/881.2 kB[0m [31m13.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m881.2/881.2 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting emoji (from stanza)
  Downloading emoji-2.8.0-py2.py3-none-any.whl (358 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/358.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m358.9/358.9 kB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: emoji, stanza
Successfully installed emoji-2.8.0 stanza-1.6.1


In [None]:
import numpy as np
import pandas as pd

import stanza
import spacy
import nltk
import re
import json
import os
from tqdm import tqdm
from bs4 import BeautifulSoup

from tqdm.notebook import tqdm
tqdm.pandas()

In [None]:
!pip install stop_words

Collecting stop_words
  Downloading stop-words-2018.7.23.tar.gz (31 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: stop_words
  Building wheel for stop_words (setup.py) ... [?25l[?25hdone
  Created wheel for stop_words: filename=stop_words-2018.7.23-py3-none-any.whl size=32896 sha256=65f58e1e570049efd08a11575386f9ed4c9215a15a991e3f54c9f9f2332554d6
  Stored in directory: /root/.cache/pip/wheels/d0/1a/23/f12552a50cb09bcc1694a5ebb6c2cd5f2a0311de2b8c3d9a89
Successfully built stop_words
Installing collected packages: stop_words
Successfully installed stop_words-2018.7.23


In [None]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from stop_words import get_stop_words
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
stanza.download('en')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

INFO:stanza:Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.6.0/models/default.zip:   0%|          | 0…

INFO:stanza:Finished downloading models and saved to /root/stanza_resources.


In [None]:
spacy_pipline = dict()
spacy_pipline['en'] = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [None]:
stemmer = dict()
stemmer['en'] = SnowballStemmer("english")

In [None]:
def preprocess_text(text, lang='en'):
    #remove html tags
    # text = BeautifulSoup(text, "html.parser").get_text()

    #remove links
    text = re.sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "", text)

    #remove text between []
    text = re.sub('\[[^]]*\]', '', text)

    #fix contractions
    def decontracted(phrase):

        phrase = re.sub("\u2019", '\'', phrase)
        phrase = re.sub("\u2018", '\'', phrase)
        phrase = re.sub("\u201C", '\"', phrase)
        phrase = re.sub("\u201D", '\"', phrase)

        # specific
        phrase = re.sub(r"won\'t", "will not", phrase)
        phrase = re.sub(r"can\'t", "can not", phrase)

        # general
        phrase = re.sub(r"n\'t", " not", phrase)
        phrase = re.sub(r"\'re", " are", phrase)
        phrase = re.sub(r"\'s", " is", phrase)
        phrase = re.sub(r"\'d", " would", phrase)
        phrase = re.sub(r"\'ll", " will", phrase)
        phrase = re.sub(r"\'t", " not", phrase)
        phrase = re.sub(r"\'ve", " have", phrase)
        phrase = re.sub(r"\'m", " am", phrase)

        return phrase

    text = decontracted(text)

    #Remove numbers
    text = re.sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "", text)

    #lowercase
    text = text.lower()

    # tokenize the string
    # remove punctuation and special characters
    text = re.sub(r'[_]', ' ', text)
    text = re.sub(r'[\r|\n|\r\n]+', ' ', text)
    tokenizer = RegexpTokenizer(r'\w+')

    text = tokenizer.tokenize(text)

    #Remove stopwords
    stopwords = set(get_stop_words(lang))
    text = [i for i in text if i not in stopwords]

    # #Stemming
    # text = [stemmer[lang].stem(word) for word in text]

    #Lemmatize
    text = [word for word in text if word is not None]
    doc = spacy_pipline[lang](' '.join(text))

    if lang == 'ar' or lang == 'tr':
        if (len(doc.sentences) != 0):
            text = [token.lemma for token in doc.sentences[0].words]
    else:
        text = [token.lemma_ for token in doc]

    return text

# Extract Report and Summary

In [None]:
report_dir = '/content/drive/MyDrive/FNS_Dataset_2023/training/annual_reports'
summaries_dir = "/content/drive/MyDrive/FNS_Dataset_2023/training/gold_summaries"

In [None]:
files = os.listdir(report_dir)
data = {'id':[], 'report':[]}

for f in tqdm(files):
    text_file = open(os.path.join(report_dir, f), 'r', encoding = 'utf-8')
    content = text_file.read()

    # pre-process the data
    # data = dict()
    # data['text'] = preprocess_text(content)

    # with open(os.path.join(dest, str(f[0:-3]) + "json"), 'w', encoding='utf-8') as outfile:
    #     json.dump(data, outfile)

    data['id'].append(f[0:-4])
    data['report'].append(content)

  0%|          | 0/3050 [00:00<?, ?it/s]

In [None]:
df_combined = pd.DataFrame.from_dict(data)
df_combined.set_index('id', inplace=True)
df_combined.head()

Unnamed: 0_level_0,report
id,Unnamed: 1_level_1
18772,24303.04 10 November 2015 12:43 PM proof...
15256,Synergy Health plc \nAnnual Report and Accou...
14148,Shanks Group plc Annual Report and Accounts 2...
17441,Annual Report & Accounts\n2012 Contents\nWynn...
24935,HSBC Holdings plc \nAnnual Report and Account...


In [None]:
summ = os.listdir(summaries_dir)

In [None]:
for f in tqdm(summ):
  id = f[0:-4].split('_')[0]
  num = f[0:-4].split('_')[1]

  text_file = open(os.path.join(summaries_dir, f), 'r', encoding = 'utf-8')
  content = text_file.read()

  df_combined.loc[id, 'summary_' + str(num)] = content

  0%|          | 0/10007 [00:00<?, ?it/s]

In [None]:
df_combined.head()

Unnamed: 0_level_0,report,summary_2,summary_3,summary_4,summary_1,summary_5,summary_6,summary_7
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
18772,24303.04 10 November 2015 12:43 PM proof...,24303.04 10 November 2015 12:43 PM proof...,24303.04 10 November 2015 12:43 PM proof...,,24303.04 10 November 2015 12:43 PM proof...,24303.04 10 November 2015 12:43 PM proof...,,
15256,Synergy Health plc \nAnnual Report and Accou...,2010 2011 2012 2013 2014\n286.4 287.3\n312.0\...,Synergy remains well placed \nto exploit its...,1\n2\n3\n4\n1. UK & Ireland £164.7m\n2. Europ...,Strategic report \nChief Executive's strategi...,,,
14148,Shanks Group plc Annual Report and Accounts 2...,2 shanks. annual report 2008\nfinancial highl...,shanks. annual report 2008 3\nchairman’s stat...,,4 shanks. annual report 2008\nI joined as Gro...,,,
17441,Annual Report & Accounts\n2012 Contents\nWynn...,1 Wynnstay Group Plc Annual Report & Accounts...,6 www.wynnstay.co.uk\nOVERVIEW\nIn my first s...,4 www.wynnstay.co.uk\nOur business at a glanc...,10 www.wynnstay.co.uk\nINTRODUCTION \nThe Gro...,,,
24935,HSBC Holdings plc \nAnnual Report and Account...,HSBC HOLDINGS PLC \nReport of the Directors: ...,HSBC HOLDINGS PLC \nReport of the Directors: ...,,7\nOverview Operating & Financial Review Gove...,,,


In [None]:
# df_combined.drop(['summary_2', 'summary_3', 'summary_4', 'summary_5', 'summary_6', 'summary_7'], axis=1)

In [None]:
def common_unigram(text1, text2):
  if text1 is np.nan or text2 is np.nan:
      return 0

  return len(set(text1) & set(text2))

In [None]:
s1 = preprocess_text(df_combined.iloc[0]['summary_1'])
s2 = preprocess_text(df_combined.iloc[0]['summary_2'])

print('summary_1', s1)
print('\n\nsummary_2', s2)
print(common_unigram(s1, s2))

summary_1 ['november', 'pm', 'proof', 'q', 'st', 'rateg', 'y', 'w', 'h', 'ceo', 'step', 'hen', 'w', 'ick', 'target', 'increase', 'land', 'bank', 'net', 'end', 'june', 'find', 'business', 'model', 'page', 'read', 'online', 'q', 'cou', 'ld', 'exp', 'lain', 't', 'logic', 'hind', 't', 'significant', 'g', 'rowt', 'h', 'hou', 'sebu', 'ilding', 'act', 'ivity', 'inte', 'nd', 'cont', 'inue', 't', 'futu', 're', 'inland', 'home', 'extract', 'maximum', 'value', 'well', 'locate', 'land', 'bank', 'strategy', 'acquire', 'brownfield', 'site', 'pre', 'planning', 'stage', 'long', 'track', 'record', 'planning', 'success', 'position', 'group', 'housebuilder', 'exceptional', 'skill', 'large', 'scale', 'brownfield', 'development', 'capitalise', 'favourable', 'market', 'condition', 'order', 'significantly', 'increase', 'housebuilding', 'activity', 'intend', 'maintain', 'growth', 'strategy', 'enhance', 'land', 'bank', 'however', 'change', 'condition', 'company', 'may', 'realign', 'strategy', 'accordingly', 'q

In [None]:
best_summary = dict()

for index, row in tqdm(df_combined.iterrows(), total=len(df_combined)):
    overlap_matrix = []

    # Find the first available summary column
    best_index = 1
    for i in range(1, 8):
        if not pd.isnull(row["summary_" + str(i)]):
            best_index = i
            break

    best_overlap = -np.inf

    for i in range(best_index, 8):
        if pd.isnull(row["summary_" + str(i)]):
            break

        summary_a = preprocess_text(row["summary_" + str(i)])

        if len(summary_a) == 0:
            break

        temp = []
        for j in range(best_index, 8):
            if pd.isnull(row["summary_" + str(j)]):
                break

            summary_b = preprocess_text(row["summary_" + str(j)])

            if len(summary_b) == 0:
                break

            overlap = common_unigram(summary_a, summary_b)
            temp.append(overlap)

        score = sum(temp) / len(summary_a)

        if best_overlap < score:
            best_overlap = score
            best_index = i

    best_summary[index] = (best_index, best_overlap)

  0%|          | 0/3050 [00:00<?, ?it/s]

In [None]:
best_summary

{'18772': (2, 0.9351851851851852),
 '15256': (2, 1.2652259332023577),
 '14148': (3, 0.707808564231738),
 '17441': (2, 1.131578947368421),
 '24935': (2, 0.7211895910780669),
 '18014': (2, 0.8929889298892989),
 '15530': (2, 1.1369863013698631),
 '15524': (2, 0.7719298245614035),
 '3205': (2, 0.7980392156862746),
 '5112': (2, 1.2211538461538463),
 '12277': (2, 1.0672097759674135),
 '17455': (1, 1.062135922330097),
 '5648': (2, 1.4855491329479769),
 '4230': (2, 1.0168067226890756),
 '17469': (2, 1.0723404255319149),
 '7739': (2, 1.4698795180722892),
 '804': (2, 1.5367965367965368),
 '12511': (2, 1.3641304347826086),
 '8595': (2, 1.0817610062893082),
 '16824': (2, 1.5226130653266332),
 '16830': (2, 2.5555555555555554),
 '8581': (2, 0.9148264984227129),
 '2912': (2, 1.4615384615384615),
 '757': (2, 1.5174129353233832),
 '13974': (2, 3.090909090909091),
 '6584': (2, 1.151624548736462),
 '5933': (2, 1.56),
 '2084': (2, 0.865979381443299),
 '16818': (2, 1.16),
 '14028': (2, 1.5087719298245614),

In [None]:
from tqdm.notebook import tqdm
tqdm.pandas()

df_combined['best_summary_index'], df_combined['best_summary_score'] = zip(*df_combined.index.map(lambda x: best_summary.get(x, (None, None))))

In [None]:
df_combined.head()

Unnamed: 0_level_0,report,summary_2,summary_3,summary_4,summary_1,summary_5,summary_6,summary_7,best_summary_index,best_summary_score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
18772,24303.04 10 November 2015 12:43 PM proof...,24303.04 10 November 2015 12:43 PM proof...,24303.04 10 November 2015 12:43 PM proof...,,24303.04 10 November 2015 12:43 PM proof...,24303.04 10 November 2015 12:43 PM proof...,,,2,0.935185
15256,Synergy Health plc \nAnnual Report and Accou...,2010 2011 2012 2013 2014\n286.4 287.3\n312.0\...,Synergy remains well placed \nto exploit its...,1\n2\n3\n4\n1. UK & Ireland £164.7m\n2. Europ...,Strategic report \nChief Executive's strategi...,,,,2,1.265226
14148,Shanks Group plc Annual Report and Accounts 2...,2 shanks. annual report 2008\nfinancial highl...,shanks. annual report 2008 3\nchairman’s stat...,,4 shanks. annual report 2008\nI joined as Gro...,,,,3,0.707809
17441,Annual Report & Accounts\n2012 Contents\nWynn...,1 Wynnstay Group Plc Annual Report & Accounts...,6 www.wynnstay.co.uk\nOVERVIEW\nIn my first s...,4 www.wynnstay.co.uk\nOur business at a glanc...,10 www.wynnstay.co.uk\nINTRODUCTION \nThe Gro...,,,,2,1.131579
24935,HSBC Holdings plc \nAnnual Report and Account...,HSBC HOLDINGS PLC \nReport of the Directors: ...,HSBC HOLDINGS PLC \nReport of the Directors: ...,,7\nOverview Operating & Financial Review Gove...,,,,2,0.72119


In [None]:
report= df_combined.loc['846', 'summary_1']
print(len(report))
best_summary['846']

5


(1, -inf)

In [None]:
# List to store indices with best_summary_score == -inf
indices_with_negative_inf = []

for index, (best_index, best_overlap) in best_summary.items():
    if best_overlap == float('-inf'):
        indices_with_negative_inf.append(index)

# Print the indices with best_summary_score == -inf
print("Indices with best_summary_score == -inf:", indices_with_negative_inf)


Indices with best_summary_score == -inf: ['14018', '846', '7497']


In [None]:
# List of indices with best_summary_score == -inf
indices_to_drop = [index for index, (_, best_overlap) in best_summary.items() if best_overlap == float('-inf')]

# Drop rows with the specified indices
df_combined.drop(indices_to_drop, inplace=True)

# Reset the index after dropping rows
df_combined.reset_index(drop=True, inplace=True)

In [None]:
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3047 entries, 0 to 3046
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   report              3047 non-null   object 
 1   summary_2           3043 non-null   object 
 2   summary_3           3042 non-null   object 
 3   summary_4           735 non-null    object 
 4   summary_1           3040 non-null   object 
 5   summary_5           127 non-null    object 
 6   summary_6           8 non-null      object 
 7   summary_7           1 non-null      object 
 8   best_summary_index  3047 non-null   int64  
 9   best_summary_score  3047 non-null   float64
dtypes: float64(1), int64(1), object(8)
memory usage: 238.2+ KB


In [None]:
df_combined.to_csv('/content/traindata.csv', index=True, escapechar='\\')


In [None]:
traindf=pd.read_csv('/content/traindata.csv')

In [None]:
traindf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3047 entries, 0 to 3046
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          3047 non-null   int64  
 1   report              3047 non-null   object 
 2   summary_2           3043 non-null   object 
 3   summary_3           3042 non-null   object 
 4   summary_4           735 non-null    object 
 5   summary_1           3040 non-null   object 
 6   summary_5           127 non-null    object 
 7   summary_6           8 non-null      object 
 8   summary_7           1 non-null      object 
 9   best_summary_index  3047 non-null   int64  
 10  best_summary_score  3047 non-null   float64
dtypes: float64(1), int64(2), object(8)
memory usage: 262.0+ KB
