# Code inspired from article https://towardsdatascience.com/text-summarization-with-nlp-textrank-vs-seq2seq-vs-bart-474943efeb09  

# import libraries

In [2]:
!pip install datasets



In [3]:
!pip install pytextrank



In [1]:
!pip install sumy

Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl (97 kB)
     ---------------------------------------- 97.3/97.3 kB 1.9 MB/s eta 0:00:00
Collecting docopt<0.7,>=0.6.1 (from sumy)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting breadability>=0.1.20 (from sumy)
  Downloading breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting pycountry>=18.2.23 (from sumy)
  Downloading pycountry-22.3.5.tar.gz (10.1 MB)
     --------------------------------------- 10.1/10.1 MB 11.0 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.tom

In [24]:
## for data
import datasets 
import pandas as pd 
import numpy as np

## for preprocessing
import re
import nltk 
#import contractions 
import os
## for textrank
import pytextrank
import spacy
#for LSA summarize
from sumy.summarizers.lsa import LsaSummarizer
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
##for generating sentence after textrank 
#import openai
import torch
from transformers import pipeline, set_seed
from transformers import GPT2Tokenizer, GPT2LMHeadModel

import pprint as pprint

# import dataset 

In [5]:
## load the full dataset of 300k articles
dataset = datasets.load_dataset("cnn_dailymail", '3.0.0')
lst_dics = [dic for dic in dataset["train"]]

Found cached dataset cnn_dailymail (C:/Users/loren/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)


  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
## keep the first N articles if you want to keep it lite 
dtf = pd.DataFrame(lst_dics).rename(columns={"article":"text", 
      "highlights":"y"})[["text","y"]].head(20000)
dtf.head()

Unnamed: 0,text,y
0,"LONDON, England (Reuters) -- Harry Potter star...",Harry Potter star Daniel Radcliffe gets £20M f...
1,Editor's note: In our Behind the Scenes series...,Mentally ill inmates in Miami are housed on th...
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...","NEW: ""I thought I was going to die,"" driver sa..."
3,WASHINGTON (CNN) -- Doctors removed five small...,"Five small polyps found during procedure; ""non..."
4,(CNN) -- The National Football League has ind...,"NEW: NFL chief, Atlanta Falcons owner critical..."


In [7]:
#check one example 
i = 0
print("--- Full text ---")
print(dtf["text"][i])
print("--- Summary ---")
print(dtf["y"][i])

--- Full text ---
LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don't think I'll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office cha

# create train/test dataset

In [8]:
dtf_train = dtf.iloc[i+1:]
dtf_test = dtf.iloc[:i+1]

# TextRank algorithm

In [9]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.4.1

DEPRECATION: https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl#egg=en_core_web_sm==3.4.1 contains an egg fragment with a non-PEP 508 name pip 25.0 will enforce this behaviour change. A possible replacement is to use the req @ url syntax, and remove the egg fragment. Discussion can be found at https://github.com/pypa/pip/issues/11617



  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
     --------------------------------------- 12.8/12.8 MB 11.1 MB/s eta 0:00:00
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [10]:
# Load a spaCy model
nlp = spacy.load("en_core_web_sm")

# Add PyTextRank to the spaCy pipeline
nlp.add_pipe("textrank")

<pytextrank.base.BaseTextRankFactory at 0x21f3f1521d0>

In [11]:
# Define a function to apply textrank algorithm to corpus with a ratio parameter
def ptrank(corpus, ratio=0.2):
    if type(corpus) is str:
        corpus = [corpus]
    lst_phrases = []
    for txt in corpus:
        # Parse the document with spaCy
        doc = nlp(txt)
        # Extract the top-ranked phrases from the document
        phrases = []
        for phrase in doc._.phrases:
            phrases.append((phrase.text, phrase.rank))
        # Sort the phrases by rank
        phrases = sorted(phrases, key=lambda x: x[1], reverse=True)
        # Limit the number of phrases based on the ratio
        limit = int(len(phrases) * ratio)
        # Append only the top phrases to the lst_phrases list
        lst_phrases.append(phrases[:limit])
    return lst_phrases

In [12]:
# Apply the function to corpus with a ratio of 0.2
predicted  = ptrank(corpus=dtf_test["text"], ratio=0.2)
predicted [i]

[('last month', 0.08595116774453293),
 ('celebrity parties', 0.07360487668427271),
 ('Harry Potter star Daniel Radcliffe', 0.07173102430947365),
 ('Harry Potter', 0.0700529663997606),
 ('fast cars', 0.06614130599204038),
 ('party', 0.06235120083336138),
 ('Potter', 0.06000465313631846),
 ('author Rudyard Kipling', 0.058800658974486227),
 ('Part II', 0.05854371943929253),
 ('Rudyard Kipling', 0.05285235731077936),
 ('the UK box office chart', 0.05004295304209587),
 ('kid star', 0.048683864451682046),
 ('DVDs', 0.048405433842217314),
 ('Reuters', 0.04807361820664859),
 ('gossip columnists', 0.046926515780545304),
 ('Australian', 0.046474305422124496),
 ('release', 0.04609343323620385),
 ('fair game', 0.04459941879428323),
 ('UK', 0.04405555003201066),
 ('an Australian film', 0.043610134638487646),
 ('Daniel Radcliffe', 0.0435964701698019),
 ('the horror film', 0.04193512504589283),
 ('a massive sports car collection', 0.04148315779730591),
 ('wraps', 0.041008660529068666),
 ('Potters lat

## Result using basic concatenation of top[ratio] words 

In [13]:
dirty_sentence = " ".join([str(t[0]) for t in predicted[i]])
print(dirty_sentence)

last month celebrity parties Harry Potter star Daniel Radcliffe Harry Potter fast cars party Potter author Rudyard Kipling Part II Rudyard Kipling the UK box office chart kid star DVDs Reuters gossip columnists Australian release fair game UK an Australian film Daniel Radcliffe the horror film a massive sports car collection wraps Potters latest » Hostel: Part II


## Generate content using GPT-2 

In [25]:
generator = pipeline('text-generation', model='gpt2')
set_seed(42)

In [29]:
!pip install --upgrade numpy



ERROR: Could not install packages due to an OSError: [WinError 5] Accès refusé: 'C:\\Users\\loren\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\~umpy\\.libs\\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll'
Check the permissions.




Collecting numpy
  Downloading numpy-1.24.3-cp310-cp310-win_amd64.whl (14.8 MB)
     --------------------------------------- 14.8/14.8 MB 11.1 MB/s eta 0:00:00
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.22.3
    Uninstalling numpy-1.22.3:
      Successfully uninstalled numpy-1.22.3


In [31]:
import numpy as np

In [33]:
generator(dirty_sentence, max_length=100, num_return_sequences=5)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


RuntimeError: Numpy is not available

## Result using keyword and OPEN API GPT3 Davinci (billing problem for now, contacted openAI)

In [59]:
with open("open_ai_api_key.txt", "r") as f:
    openai.api_key = f.read().rstrip()

In [60]:
separator = ","

In [61]:
# Format predicted as a partial text
input = "Generate a sentence from the following keywords:\n\n"
for keyword, value in predicted[i]:
    input += f"- {keyword}, {value}\n"
input += "\nSentence:"

In [64]:
# Send a request to the OpenAI API using the text completion feature
response = openai.Completion.create(
    engine="davinci",
    prompt=input,
    max_tokens=50,
    temperature=0.5,
    frequency_penalty=0.5,
)

RateLimitError: You exceeded your current quota, please check your plan and billing details.

In [55]:
# Print the generated sentence
print(response["choices"][0]["text"])

NameError: name 'response' is not defined

# Using LSA Method 

In [28]:
parser = PlaintextParser.from_string(dtf_train["text"], Tokenizer("english"))
summarizer_lsa = LsaSummarizer()
summary = summarizer_lsa(parser.document, 8)
for sentence in summary:
    pprint.pprint(sentence)

<Sentence: 1        Editor's note: In our Behind the Scenes series... 2        MINNEAPOLIS, Minnesota (CNN) -- Drivers who we... 3        WASHINGTON (CNN) -- Doctors removed five small... 4        (CNN)  -- The National Football League has ind... 5        BAGHDAD, Iraq (CNN) -- Dressed in a Superman s... ... 19995    Los Angeles (CNN) -- A 24-year-old Connecticut... 19996    Jerusalem (CNN) -- Israeli officials were work... 19997    New York City is the quintessential metropolis... 19998    Los Angeles Film Festival (June 18 -- 28) Sinc... 19999    (CNN) -- Ten years ago, NBC, eager to come up ... Name: text, Length: 19999, dtype: object>
