Colab Upgrades

In [1]:
# !pip install PyPDF2 --q
# !pip install transformers --q
# !apt-get install -y poppler-utils --q
# !pip install transformers networkx --q
# !transformers-cli cache clear --q
# !pip install newspaper3k
# 

Imports

In [2]:
import PyPDF2
from io import StringIO
import os
import requests
from bs4 import BeautifulSoup
import torch
from transformers import LongformerTokenizer, LongformerModel, pipeline
import nltk
import re
import newspaper
import warnings
nltk.download('punkt')
warnings.filterwarnings("ignore")
# import spacy
# import networkx as nx
# import textract
# from spacy import displacy
# import textacy
# import sacremoses as sm

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/arjun/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Check for CUDA

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
# The following output must be cuda to ensure that the GPU is used

cuda:0


Sentence Split

In [4]:
def split_into_sentences(paragraph):
    sentences = nltk.sent_tokenize(paragraph)
    return sentences

Bart-large-cnn-samsum

In [5]:
summarizer = pipeline("summarization", model="philschmid/bart-large-cnn-samsum", device='cuda:0')

Longformer

In [6]:
# Initialize Longformer tokenizer and model
longformer_tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
longformer_model = LongformerModel.from_pretrained('allenai/longformer-base-4096')

# Move the model to the GPU
longformer_model.to(device)
print("Loaded longformer to device")


Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Loaded longformer to device


For counting words

In [7]:
def count_words(sentence):
    return len(re.findall(r"\w+", sentence))

Spacy Entity recognition

In [8]:
# Load the entity recognition pipeline
# nlp = spacy.load("en_core_web_lg")

Read funtions

In [9]:
# Function to read and extract text from a PDF document
def read_pdf(file_path):
    if not os.path.isfile(file_path) or not file_path.endswith('.pdf'):
        raise ValueError('Invalid file path or file format. Please provide a valid PDF file.')

    from pdfminer.high_level import extract_text
    text = extract_text(file_path)
    return text



# Function to read and extract text from a text document
def read_text(file_path):
    if not os.path.isfile(file_path) or not file_path.endswith('.txt'):
        raise ValueError('Invalid file path or file format. Please provide a valid text file.')
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
        return text

# Function to read and extract text from a website URL
def read_url(url):
    try:
        def get_main_text_from_website(url):
            article = newspaper.Article(url)
            article.download()
            article.parse()
            return article.text

        # Example usage
        # url = 'https://www.example.com'
        main_text = get_main_text_from_website(url)
        return main_text
    except:
        raise ValueError('Invalid URL or unable to extract text from URL. Please provide a valid website URL.')

Extractive summarisation functions

In [10]:
def extractive_summarization(input_text):
    in_words = count_words(input_text)
    if in_words<500:
        out_sentence = int(in_words/50)
    else:
        out_sentence = int(in_words/30)


    sentences = split_into_sentences(input_text)
    hierarchical_summary = []
    count = 0
    for chunk in sentences:
        input_ids = longformer_tokenizer.encode(chunk, return_tensors='pt').to(device)
        outputs = longformer_model(input_ids=input_ids)
        hidden_states = outputs.last_hidden_state
        sentence_embeddings = hidden_states[0]
        sentence_scores = torch.matmul(sentence_embeddings, sentence_embeddings.T)
        sentence_scores = sentence_scores.squeeze(0).tolist()
        if not chunk:
            continue  # Skip empty chunks

        hierarchical_summary.append((count,chunk, sentence_scores))
        count +=1

    hierarchical_summary.sort(key=lambda x: x[1], reverse=True)  # Sort sentences by score
    num_summary_tokens = 0
    summary = []
    for ct,sentence, score in hierarchical_summary:
        if num_summary_tokens + len(longformer_tokenizer.tokenize(sentence)) <= 50000:
            summary.append([ct,sentence])
            num_summary_tokens += len(longformer_tokenizer.tokenize(sentence))
        else:
            break
    summary = summary[:out_sentence]
    sorted_data = sorted(summary, key=lambda x: x[0])
    extractive_result = []
    for i in sorted_data:
        extractive_result.append(i[1])
    out = ' '.join(extractive_result)
    return out


Abstractive summarisation functions

In [11]:
def abstractive_summarization(input_text):
    num_of_sentence_per_chunk = 10
    sentences = split_into_sentences(input_text)
    sentence_chunks = []
    summarised_output = ''
    ct = 0
    while ct<=len(sentences):
        chunk = sentences[ct: ct+num_of_sentence_per_chunk]
        chunk = ' '.join(chunk)
        if len(chunk) > 700:    # Little data in the tail-end will lead to super-bad summarisation
            sentence_chunks.append(chunk)
        ct += num_of_sentence_per_chunk
    # length = len(sentence_chunks)
    progress = 0
    for chunk in sentence_chunks:
        if len(chunk) < 10:
            continue
        data = summarizer(chunk,min_length = 150, max_length=200)[0]['summary_text']
        summarised_output += data
        # print(f"Progress = {progress+1}/{length}")
        progress+=1
    return summarised_output

knowledge graph-guided summarization funtions -- In progress

In [12]:
def extract_relations(text):
    doc = nlp(text)
    relations = []
    
    for sent in doc.sents:
        for token in sent:
            if token.dep_ == "ROOT" and token.pos_ == "VERB":
                subject = [t for t in token.children if t.dep_ in ["nsubj", "nsubjpass"]]
                object_ = [t for t in token.children if t.dep_ in ["dobj", "pobj"]]
                if subject and object_:
                    relations.append((subject[0].text, token.text, object_[0].text))
    return relations

def create_knowledge_graph(input_text):
    relations = extract_relations(input_text)
    
    graph = nx.Graph()
    for subject, verb, obj in relations:
        graph.add_node(subject, label="ENTITY")
        graph.add_node(obj, label="ENTITY")
        graph.add_edge(subject, obj, relation=verb)
    return graph

# Function to perform knowledge graph-guided summarization
def knowledge_graph_guided_summarization(input_text):
    # Step 1: Create knowledge graph
    graph = create_knowledge_graph(input_text)

    # Step 2: Perform extractive summarization using knowledge graph
    summary_sentences = []
    for node in graph.nodes(data=True):
        if node[1]['label'].startswith('Ġ'):
            for edge in graph.edges(nbunch=node[0], data=True):
                if edge[2]['relation'] in ['attr', 'dobj', 'prep']:
                    if graph.nodes[edge[1]]['label'].startswith('Ġ'):
                        summary_sentences.append(graph.nodes[edge[1]]['label'])
    summary_sentences = list(set(summary_sentences))
    summary_sentences.sort(key=lambda s: input_text.find(s))
    summary_text = '. '.join(summary_sentences)

    # Step 3: Perform abstractive summarization on the extracted summary
    summary = abstractive_summarization(summary_text)
    return summary

Summarisation functions

In [13]:
def summarize_file(file_path_or_url):
    # Check if the input is a file path or a website URL
    if os.path.isfile(file_path_or_url):
        file_path = file_path_or_url
        # Read and extract text from the input file
        if file_path.endswith('.pdf'):
            text = read_pdf(file_path)
        elif file_path.endswith('.txt'):
            text = read_text(file_path)
        else:
            raise ValueError('Invalid file format. Please provide a valid PDF or text file.')
    else:
        # Read and extract text from the website URL
        text = read_url(file_path_or_url)

    print('TEXT: -------------------\n',text,'\n-----------------------------')

    # Perform extractive summarization
    extractive_summary = extractive_summarization(text)
    
    # Perform abstractive summarization
    abstractive_summary = abstractive_summarization(text)

    # graph = create_knowledge_graph(text)
    # print(graph.nodes())
    # print(graph.edges())
    # Perform knowledge graph-guided summarization on the extracted text
    # summary = knowledge_graph_guided_summarization(text)
    return abstractive_summary, extractive_summary

Summary generation

In [17]:
file_path = 'EARTH EATS.pdf'
abstractive_summary, extractive_summary = summarize_file(file_path)
extractive_summary = '.'.join(extractive_summary.split('\n'))

TEXT: -------------------
 EARTH EATS

We try to bridge the fact that 40% of food produced is wasted while approximately 14.8% of the
people remain undernourished. We will provide an interface via an app for people to buy and
sell excess food from large functions or events thus reducing the practice of using food after
long refrigeration and financial stress on food manufacturers by reducing storage cost and We
hire employees on independent contracts whose wages are based on the number of deliveries
and thus act as a part time job opportunity.

Vendors can sell excess Fruits and vegetables at reduced cost before it spoils. This reduces
stress on these vendors preventing excess use of pesticides and ensures minimum profit.

This app helps producers trade surplus food at functions at 50-60% cost, preventing food
wastage or free distribution giving enough returns. A worker depending on daily wage to win
bread when falls sick and cannot go to work can avail food from a base price of 40% vi

In [18]:
print("Extractive Summary:")
print(extractive_summary)
print("\n-------------------------------------------------------------------------------------\nAbstractive Summary:")
print(abstractive_summary)
# print("\nThe hybrid method Summary is:")
# print(summary)

Extractive Summary:
We will provide an interface via an app for people to buy and.sell excess food from large functions or events thus reducing the practice of using food after.long refrigeration and financial stress on food manufacturers by reducing storage cost and We.hire employees on independent contracts whose wages are based on the number of deliveries.and thus act as a part time job opportunity. Vendors can sell excess Fruits and vegetables at reduced cost before it spoils. Via this app we can reduce the expenditure of buying food from external sources;.reduce the public hunger in some scenarios and reduce the use of refrigerators promoting.greener earth. We have an.advantage because we address both food waste and malnourishment, have wider product.coverage, handle cooked food, and provide information about local food resources, while others.only address one or the other. We work based on the model that is integrated with the ideas of.the above three apps. With expansion in.serv

# Word Count

In [19]:
if os.path.isfile(file_path):
    file_path = file_path
    # Read and extract text from the input file
    if file_path.endswith('.pdf'):
        in_text = read_pdf(file_path)
    elif file_path.endswith('.txt'):
        in_text = read_text(file_path)
    else:
        raise ValueError('Invalid file format. Please provide a valid PDF or text file.')
else:
    # Read and extract text from the website URL
        
        in_text = read_url(file_path)
print(f"Words in Input text is: {count_words(in_text)}")
print(f"Words in Extractive summary is: {count_words(extractive_summary)}")
print(f"Words in Abstractive summary is: {count_words(abstractive_summary)}")

Words in Input text is: 302
Words in Extractive summary is: 180
Words in Abstractive summary is: 128
