In [66]:
!pip install sentence_transformers



In [68]:
#import all the neccessary libraries
import warnings
warnings.filterwarnings("ignore")
import torch
import pickle
import time
import os 
from transformers import T5ForConditionalGeneration,T5Tokenizer

In [69]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [70]:
from transformers import T5ForConditionalGeneration,T5Tokenizer

In [71]:
#getting the summary model and its tokenizer
if os.path.exists("t5_summary_model.pkl"):
    with open('t5_summary_model.pkl', 'rb') as f:
        summary_model = pickle.load(f)
    print("summary model found in the disc ---- model loaded successfully.")

else:
    print("summary model does not exist in the path specified ---- downloading the model from web.")
    start_time = time.time()
    summary_model = T5ForConditionalGeneration.from_pretrained('t5-base')
    end_time = time.time()

    print("downloaded the summary model in ",(end_time-start_time)/60," min, saving it to disc.")

    with open("t5_summary_model.pkl", 'wb') as f:
        pickle.dump(summary_model,f)
    
    print("saved the model to disc.")

if os.path.exists("t5_summary_tokenizer.pkl"):
    with open('t5_summary_tokenizer.pkl', 'rb') as f:
        summary_tokenizer = pickle.load(f)
    print("summary tokenizer found in the disc ---- loaded successfully.")
else: 
    print("summary tokenizer does not exist in the path specified ---- downloading the model from web.")

    start_time = time.time()
    summary_tokenizer = T5Tokenizer.from_pretrained('t5-base')
    end_time = time.time()

    print("downloaded the summary tokenizer in ",(end_time-start_time)/60," min, saving it to disc.")

    with open("t5_summary_tokenizer.pkl",'wb') as f:
        pickle.dump(summary_tokenizer,f)

    print("saved the tokenizer to disc.")

summary model found in the disc ---- model loaded successfully.
summary tokenizer found in the disc ---- loaded successfully.


In [72]:
summary_model = summary_model.to(device)

In [73]:
#test
input_text = "Climate change is one of the most pressing issues facing our planet today. As average temperatures rise due to increased greenhouse gas emissions, ecosystems around the world are experiencing significant shifts. Polar regions are warming at an alarming rate, leading to the melting of glaciers and the loss of habitats for species such as polar bears and seals.In temperate regions, changes in temperature and precipitation patterns are affecting plant and animal life cycles. For example, many species are blooming earlier in the spring, which can disrupt the food chain and impact pollinators like bees. In tropical regions, coral reefs, which are vital for marine biodiversity, are suffering from coral bleaching due to rising sea temperatures and ocean acidification.The effects of climate change are not only limited to natural ecosystems; they also impact human societies. Changes in weather patterns can lead to food and water shortages, displacement of populations, and increased health risks due to heatwaves and the spread of diseases. Mitigating the effects of climate change requires collective action on a global scale, including reducing carbon emissions, transitioning to renewable energy sources, and promoting sustainable land use practices.Addressing climate change is not only essential for preserving biodiversity but also crucial for ensuring the well-being of future generations."

#tokenize the input text
input_ids = summary_tokenizer.encode(input_text, return_tensors='pt', max_length=512, truncation=True).to(device)

#generate the summary
with torch.no_grad():
    summary_ids = summary_model.generate(
        input_ids,
        max_length=150,  #desired max length for the summary
        min_length=40,   #desired min length for the summary
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )

#decode the summary
summary = summary_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

#print the generated summary
print("Summary:", summary)

Summary: plant and animal life cycles are disrupting the food chain and impacting pollinators like bees. In tropical regions, coral reefs are suffering from coral bleaching due to rising sea temperatures and ocean acidification. Climate change is one of the most pressing issues facing our planet today.Mitigating the effects of climate change requires collective action on a global scale.


In [74]:
def summarizer(text,model,tokenizer):
  """
  takes the given text along with the model and tokenizer, which summarize the large text into useful information
  """
  text = text.strip().replace("\n"," ")
  text = "summarize: "+text
#   print (text)
    
  #tokenize the input text
  max_len = 512
  encoding = tokenizer.encode_plus(text,max_length=max_len, pad_to_max_length=False,truncation=True, return_tensors="pt").to(device)

  input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]

  #generate summmmary
  outs = model.generate(input_ids=input_ids,
                                  attention_mask=attention_mask,
                                  early_stopping=True,
                                  num_beams=3,
                                  num_return_sequences=1,
                                  no_repeat_ngram_size=2,
                                  min_length = 75,
                                  max_length=300)

  #decode the summary and apply postprocessing
  dec = [tokenizer.decode(ids,skip_special_tokens=True) for ids in outs]
  summary = dec[0]
  summary = postprocesstext(summary)
  summary= summary.strip()

  return summary

In [75]:
text = """
The transformer architecture, introduced in 2017 by Vaswani et al., revolutionized natural language processing (NLP) by moving away from recurrent neural networks (RNNs) and convolutional neural networks (CNNs). Instead, the transformer relies entirely on attention mechanisms to draw global dependencies between input and output. The self-attention mechanism allows the model to weigh the importance of different words in a sentence relative to each other, enabling it to capture more complex relationships in text. This has made the transformer highly efficient for tasks such as machine translation, text summarization, and language modeling.
Before transformers, RNNs and their variants, such as Long Short-Term Memory (LSTM) networks, were the dominant models for sequence processing tasks. These models processed inputs sequentially, which limited their ability to handle long-range dependencies. The transformer, on the other hand, processes the entire sequence simultaneously, making it faster and more effective for long sequences. It also allows for better parallelization during training, significantly reducing the time required to train large models.
"""
summary = summarizer(text, summary_model, summary_tokenizer)
print("summary")
print(summary)

summary
The transformer architecture revolutionized natural language processing (nlp) it relies entirely on attention mechanisms to draw global dependencies between input and output . This has made it highly efficient for tasks such as machine translation, text summarization, and language modeling. Transformer processes the entire sequence simultaneously, making it faster and more effective for long sequences. Cnn.com/nlp/2017/02/13


In [76]:
def get_nouns_multipartite(content):
    """
    takes the content text given and then outputs the phrases which are build around the nouns , 
    so that we can use them for context based distractors
    extract key phrases centered around nouns from a given text using the pke library's MultipartiteRank model
    """
    out=[] #store the extracted key phrases
    
    try:
        """
        an instance of the MultipartiteRank class from the pke (Python Keyphrase Extraction) library is created
        """
        extractor = pke.unsupervised.MultipartiteRank()
        extractor.load_document(input=content,language='en') #loads the input text
        #not contain punctuation marks or stopwords as candidates.
        #pos = {'PROPN','NOUN',}
        #defines the parts of speech (POS) that will be considered for candidate selection
        pos = {'PROPN', 'NOUN', 'ADJ', 'VERB', 'ADP', 'ADV', 'DET', 'CONJ', 'NUM', 'PRON', 'X'}

        #filter out common punctuation marks and stop words from English
        stoplist = list(string.punctuation)
        stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-'] #specific tokens for parentheses and brackets 
        stoplist += stopwords.words('english') #list of common English stopwords obtained from NLTK (like "the", "is", etc.).
        
        # extractor.candidate_selection(pos=pos, stoplist=stoplist)
        #identifies potential key phrases based on the given POS tags
        extractor.candidate_selection( pos=pos)
        # 4. build the Multipartite graph and rank candidates using random walk,
        #    alpha controls the weight adjustment mechanism, see TopicRank for
        #    threshold/method parameters.
        extractor.candidate_weighting(alpha=1.1,
                                      threshold=0.75,
                                      method='average')
        keyphrases = extractor.get_n_best(n=15)
        

        for val in keyphrases:
            out.append(val[0])
    except:
        out = []
        #traceback.print_exc()

    return out

In [77]:
def get_keywords(originaltext):
  #just a simpler name
  keywords = get_nouns_multipartite(originaltext)
  return keywords

In [78]:
originaltext = """
Natural Language Processing (NLP) is a subfield of artificial intelligence (AI) that focuses on the interaction between computers and humans through natural language.
The ultimate goal of NLP is to enable computers to understand, interpret, and generate human language in a valuable way.
Recent advancements in NLP have been driven by the development of transformer models, which have significantly improved the performance of various language tasks, including machine translation, sentiment analysis, and summarization.
"""
extracted_keywords = get_keywords(originaltext)

print("Extracted Keywords:", extracted_keywords)

Extracted Keywords: ['understand', 'sentiment analysis', 'enable computers', 'interpret', 'including machine translation', 'nlp', 'natural language processing', 'summarization']


In [79]:
summarized_text = summarizer(originaltext,summary_model,summary_tokenizer)

In [80]:
summarized_text

'Natural language processing (nlp) is a subfield of artificial intelligence (ai) the goal of nlp is to enable computers to understand, interpret, and generate human language . Transformer models have significantly improved the performance of various language tasks, including machine translation, sentiment analysis and summarization. The transformer model is an example of how transformers can be used to analyze human speech.'

In [81]:
get_keywords(originaltext)

['understand',
 'sentiment analysis',
 'enable computers',
 'interpret',
 'including machine translation',
 'nlp',
 'natural language processing',
 'summarization']

In [82]:
get_keywords(summarized_text)

['transformer models',
 'generate human language',
 'understand',
 'interpret',
 'sentiment analysis',
 'enable computers',
 'including machine translation',
 'summarization',
 'nlp',
 'natural language processing',
 'analyze human speech',
 'transformers']