In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
import pandas as pd
import nltk
import re
from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')
stopWords = set(stopwords.words("english"))
ps = PorterStemmer()
import math
import nltk
nltk.download('punkt')
import os

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Summarizer

In [5]:
text = """
Women education is a catch all term which refers to the state of primary, secondary, tertiary and health education in girls and women. There are 65 Million girls out of school across the globe; majority of them are in the developing and underdeveloped countries. All the countries of the world, especially the developing and underdeveloped countries must take necessary steps to improve their condition of female education; as women can play a vital role in the nation’s development.
If we consider society as tree, then men are like its strong main stem which supports the tree to face the elements and women are like its roots; most important of them all. The stronger the roots are the bigger and stronger the tree will be spreading its branches; sheltering and protecting the needy.
Women are the soul of a society; a society can well be judged by the way its women are treated. An educated man goes out to make the society better, while an educated woman; whether she goes out or stays at home, makes the house and its occupants better.
Women play many roles in a society- mother, wife, sister, care taker, nurse etc. They are more compassionate towards the needs of others and have a better understanding of social structure. An educated mother will make sure that her children are educated, and will weigh the education of a girl child, same as boys.
History is replete with evidences, that the societies in which women were treated equally to men and were educated; prospered and grew socially as well as economically. It will be a mistake to leave women behind in our goal of sustainable development, and it could only be achieved if both the genders are allowed equal opportunities in education and other areas.
Education makes women more confident and ambitious; they become more aware of their rights and can raise their voice against exploitation and violence. A society cannot at all progress if its women weep silently. They have to have the weapon of education to carve out a progressive path for their own as well as their families.
"""

In [6]:
sentences = sent_tokenize(text) # NLTK function
total_documents = len(sentences)

In [7]:
def _create_frequency_matrix(sentences):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            word = ps.stem(word)
            if word in stopWords:
                continue

            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix

In [8]:
freq_matrix = _create_frequency_matrix(text)

In [9]:
def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix

In [10]:
def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table

In [11]:
def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix

In [12]:
def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(),
                                                    f_table2.items()):  # here, keys are the same in both the table
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix

In [13]:
def _score_sentences(tf_idf_matrix) -> dict:
    """
    score a sentence by its word's TF
    Basic algorithm: adding the TF frequency of every non-stop word in a sentence divided by total no of words in a sentence.
    :rtype: dict
    """

    sentenceValue = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0

        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score

        sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence

    return sentenceValue

In [14]:
def _find_average_score(sentenceValue) -> int:
    """
    Find the average score from the sentence value dictionary
    :rtype: int
    """
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average value of a sentence from original summary_text
    average = (sumValues / len(sentenceValue))

    return average

In [15]:
def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= (threshold):
            summary += " " + sentence
            sentence_count += 1

    return summary

In [16]:
# 1 Sentence Tokenize
sentences = sent_tokenize(text)
total_documents = len(sentences)
#print(sentences)

# 2 Create the Frequency matrix of the words in each sentence.
freq_matrix = _create_frequency_matrix(sentences)
#print(freq_matrix)

'''
Term frequency (TF) is how often a word appears in a document, divided by how many words are there in a document.
'''
# 3 Calculate TermFrequency and generate a matrix
tf_matrix = _create_tf_matrix(freq_matrix)
#print(tf_matrix)

# 4 creating table for documents per words
count_doc_per_words = _create_documents_per_words(freq_matrix)
#print(count_doc_per_words)

'''
Inverse document frequency (IDF) is how unique or rare a word is.
'''
# 5 Calculate IDF and generate a matrix
idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)
#print(idf_matrix)

# 6 Calculate TF-IDF and generate a matrix
tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)
#print(tf_idf_matrix)

# 7 Important Algorithm: score the sentences
sentence_scores = _score_sentences(tf_idf_matrix)
#print(sentence_scores)

# 8 Find the threshold
threshold = _find_average_score(sentence_scores)
#print(threshold)

# 9 Important Algorithm: Generate the summary
summary = _generate_summary(sentences, sentence_scores, 1 * threshold)
print(summary)

 The stronger the roots are the bigger and stronger the tree will be spreading its branches; sheltering and protecting the needy. Women are the soul of a society; a society can well be judged by the way its women are treated. They are more compassionate towards the needs of others and have a better understanding of social structure. An educated mother will make sure that her children are educated, and will weigh the education of a girl child, same as boys. A society cannot at all progress if its women weep silently. They have to have the weapon of education to carve out a progressive path for their own as well as their families.


FNS Dataset

In [17]:
text = open("/content/drive/MyDrive/IR IA/training/annual_reports/10023.txt", "r", encoding='utf-8')

In [18]:
text1 = str(text.read())
print(text1)

 Registered office
C/- Emcee 
44 Southampton Buildings 
London UK  
WC2A 1AP
Operations Office – Rome  London Office
Via Cornelia 498  16 Old Queen Street
Roma 00166  London UK
Italy SW1H 9HP
www.medoilgas.com
Building a  
resource  
factory
Annual Report 2011
Mediterranean Oil & Gas Plc Annual Report 2011 01-17
Business review
01 Highlights
02 The Company at a glance
04 Our strategy for growth
06 Chairman’s statement
08 Chief Executive’s report
14 Financial review
16 Principal risks and uncertainties
18-25
Corporate governance
18 Board of Directors
20 Senior management
21 Directors’ report
23 Corporate governance statement
24 Remuneration report
26-56
Financial statements
26 Independent auditors’ report
27 Financial statements
57-59
57 Notice of Annual General Meeting
59 Corporate directory
FSC Logo  01
Mediterranean Oil & Gas Plc Annual Report 2011 www.medoilgas.com
BUSINESS
REVIEW
CORPORATE
GOVERNANCE
FINANCIAL
STATEMENTS
Highlights
Corporate and Operational Highlights within the 
R

In [19]:
# 1 Sentence Tokenize
sentences = sent_tokenize(text1)
total_documents = len(sentences)
#print(sentences)

# 2 Create the Frequency matrix of the words in each sentence.
freq_matrix = _create_frequency_matrix(sentences)
#print(freq_matrix)

'''
Term frequency (TF) is how often a word appears in a document, divided by how many words are there in a document.
'''
# 3 Calculate TermFrequency and generate a matrix
tf_matrix = _create_tf_matrix(freq_matrix)
#print(tf_matrix)

# 4 creating table for documents per words
count_doc_per_words = _create_documents_per_words(freq_matrix)
#print(count_doc_per_words)

'''
Inverse document frequency (IDF) is how unique or rare a word is.
'''
# 5 Calculate IDF and generate a matrix
idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)
#print(idf_matrix)

# 6 Calculate TF-IDF and generate a matrix
tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)
#print(tf_idf_matrix)

# 7 Important Algorithm: score the sentences
sentence_scores = _score_sentences(tf_idf_matrix)
#print(sentence_scores)

# 8 Find the threshold
threshold = _find_average_score(sentence_scores)
#print(threshold)

# 9 Important Algorithm: Generate the summary
summary = _generate_summary(sentences, sentence_scores, 1 * threshold)
print(summary)

 •  Structural geometry and hydrocarbon 
charge are the key risks. Grow production and move 
resources to reserves by maturing the 
portfolio in support of our production 
growth targets. 90% and Operator) Production Sharing 
Contract. 20%) on 25 October 2011. 128, dated 29 June 
2010 which amends the earlier Italian 
Environmental Code (Decree no. 152/2006). MOG is in active dialogue with the Italian 
Government with regard to lifting the 
imposed restrictions. We are 
working with external legal counsel on the 
analysis and implementation of alternative 
courses of action should discussions with 
the Italian authorities fail to progress. Although there is no certainty on either the 
timing or the results of our request for a 
production concession, the Company is 
optimistic of a positive outcome. The Board wishes to thank 
Michael, Andrew and Peter for their 
contributions to the Company. I am delighted to welcome someone with 
such international experience and industry 
knowledge t

In [27]:
def summarizer(text):
  sentences = sent_tokenize(text)
  total_documents = len(sentences)
  #print(sentences)

  # 2 Create the Frequency matrix of the words in each sentence.
  freq_matrix = _create_frequency_matrix(sentences)
  #print(freq_matrix)

  '''
  Term frequency (TF) is how often a word appears in a document, divided by how many words are there in a document.
  '''
  # 3 Calculate TermFrequency and generate a matrix
  tf_matrix = _create_tf_matrix(freq_matrix)
  #print(tf_matrix)

  # 4 creating table for documents per words
  count_doc_per_words = _create_documents_per_words(freq_matrix)
  #print(count_doc_per_words)

  '''
  Inverse document frequency (IDF) is how unique or rare a word is.
  '''
  # 5 Calculate IDF and generate a matrix
  idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)
  #print(idf_matrix)

  # 6 Calculate TF-IDF and generate a matrix
  tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)
  #print(tf_idf_matrix)

  # 7 Important Algorithm: score the sentences
  sentence_scores = _score_sentences(tf_idf_matrix)
  #print(sentence_scores)

  # 8 Find the threshold
  threshold = _find_average_score(sentence_scores)
  #print(threshold)

  # 9 Important Algorithm: Generate the summary
  summary = _generate_summary(sentences, sentence_scores, 1 * threshold)
  return summary




In [20]:
pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [21]:
from rouge import Rouge

In [22]:
test_txt = open("/content/drive/MyDrive/IR IA/training/gold_summaries/10023_1.txt", "r", encoding='utf-8')
test_txt = str(test_txt.read())

In [23]:
print(test_txt)

 08
Mediterranean Oil & Gas Plc Annual Report 2011 www.medoilgas.com
The Group is now in a strong financial and 
operational position, which will enable 
MOG to actively seek strategic growth 
opportunities while progressing its portfolio 
of production, development and 
exploration assets.
Looking back, 2011 was a difficult year due 
to unexpected regulatory developments 
affecting Ombrina Mare, which is a key 
asset for the Company, and the uncertain 
economic environment. However, the 
Group retained a clear focus on securing 
value from its operational structure and its 
extensive and diverse asset base; and it is 
now reaping the benefits, following the 
Group recapitalisation and the significant 
increase of gas production. 
The Company has a broad asset base 
across the exploration, development and 
production phases of the business. These 
assets have significant upside potential 
and we now have the financial strength to 
de-risk and mature these assets over the 
coming years,

In [24]:
model_out = summary

reference = test_txt

In [25]:
rouge = Rouge()

In [26]:
rouge.get_scores(model_out, reference)

[{'rouge-1': {'f': 0.3808200091072506,
   'p': 0.31621465666474324,
   'r': 0.47860262008733623},
  'rouge-2': {'f': 0.16966728613666876,
   'p': 0.13456705506054376,
   'r': 0.22954014029618083},
  'rouge-l': {'f': 0.36275190625804976,
   'p': 0.30121177149451817,
   'r': 0.4558951965065502}}]

In [51]:
path = "/content/drive/MyDrive/IR IA/training/annual_reports/"
label_path = "/content/drive/MyDrive/IR IA/training/gold_summaries/"
data = []
gs_summaries = []
summaries = []
i=0
for txt in os.listdir(path):
  
  text = open(path + txt , "r", encoding='utf-8')
  text = str(text.read())
  data.append(text)
  test_txt = open("/content/drive/MyDrive/IR IA/training/gold_summaries/"+txt[:-4]+"_1.txt", "r", encoding='utf-8')
  test_txt = str(test_txt.read())
  gs_summaries.append(test_txt)
  summ = summarizer(text)
  summaries.append(summ)
  i += 1
  if(i==10):
    break

In [52]:
data

[" innovation\nknowledge oege\nconstruction i\nrelationships\nannual report and\naccounts 2008\nii direction\nregeneration ege e a o\nTelford Homes PlC annual report and accounts 2008 www telfordhomes plc uk\nTelford RA  24/6/08  17:54  Page 1 Telford RA  24/6/08  17:54  Page 2 “The name Telford Homes is taken from the engineer\nThomas Telford, renowned for innovative ideas and\ndesigns. Innovation in all areas of the business\nincluding design, construction, partnerships and\nmarketing has played a major role in our success.” \ninnovation\nTelford RA  24/6/08  17:54  Page 3 financial highlights 2008\nrevenue\n£160.4m\n08 £160.4m\n07 £104.4m\ngross profit margin\n20.9%\n08 20.9%\n07 22.4%\noperating margin\n15.8%\n08 15.8%\n07 16.0%\nprofit before tax\n£17.7m\n08 £17.7m\n07 £13.5m\ndividend per share\n10.0p\n08 10.0p\n07 8.9p\nearnings per share\n33.3p\n08 33.3p\n07 29.4p\npre-tax return on\naverage equity\n30.0%\n08 30.0%\n07 31.2%\ngearing\n144%\n08 144%\n07 102%\nuncovered gearing\n

In [53]:
gs_summaries

[' chief executive’s review\n“An excellent performance\nin the first six months of\nthe year continued into\nthe second half despite\nincreasingly difficult market\nconditions. Revenue has\ngrown by 54% to £160.4\nmillion and profit before tax\nis up 31% to £17.7 million.”\nAndrew Wiseman\nChief Executive\nThis has been achieved as a result of the\nstrength of the Telford Homes brand,\nincorporating realistic pricing and consistent\ndelivery, together with strong marketing\nconcepts which have allowed the Company to\ncontinue selling homes to the investor market\nat an early stage in the development process. \nProperty sales and affordable housing\nContracts were exchanged on 523 open\nmarket private homes, 502 affordable\nhomes and ten commercial units, making\na total of 1,035 properties in the year.\nIncluded within this number are 230 homes\nbeing constructed under joint ventures\nwhere we recognise half of the revenue\nand profit from the development. \nWhere the market allows we 

In [54]:
summaries

[" At 31st March 2008 unrecognised\nrevenue secured by contracts exchanged was\n£125 million, higher than ever before. Our\nstrategy has put us in a very strong position\nin the face of a weakening market. This prudent approach will\ncontinue. We expect trading conditions to improve\nwhen restrictions ease in the mortgage\nmarket although it is not clear when this will\noccur. Vellum and Kinetica\nwere launched in the first few months of\n2008 and reservations are still being taken\nat these developments. Partnerships with affordable\nhousing providers remain integral to\nour business and typically 35% of any\ndevelopment is sold for affordable housing. Restrictions on mortgage finance make\nthe process of legally completing finished\nproperties more protracted. Our most recent acquisition is a parcel of\nfive sites in Southwark in partnership with\nFamily Mosaic Housing Association which\nare expected to deliver over 100 new\nhomes. Three\nof the sites are already under construction\n

In [55]:
model_out = summaries

reference = gs_summaries

In [56]:
rouge.get_scores(model_out, reference, avg=True)

{'rouge-1': {'f': 0.3180989680904308,
  'p': 0.24436739849998332,
  'r': 0.5962643903871239},
 'rouge-2': {'f': 0.14434038453093403,
  'p': 0.11099324265176036,
  'r': 0.2989968585369155},
 'rouge-l': {'f': 0.30212563233993206,
  'p': 0.23242392301604403,
  'r': 0.5659621792016016}}