### Importing libraries

In [1]:
import requests
from bs4 import BeautifulSoup
from googlesearch import search as google_search
from transformers import pipeline
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from heapq import nlargest

nltk.download('punkt')
nltk.download('stopwords')


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /Users/a81081529/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/a81081529/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### fetch articles using google_Search and bs4

In [2]:
def fetch_articles_google(query, num_results=5):
    return list(google_search(query, num_results=num_results))

def get_article_text(url):
    try:
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        paragraphs = soup.find_all('p')
        return ' '.join([p.get_text() for p in paragraphs])
    except Exception as e:
        return f"Error: {e}"

query = "Virat Kohli IPL 2023"
articles = fetch_articles_google(query)
print("Fetched article URLs:")
for url in articles:
    print(url)

sample_text = get_article_text(articles[0])
print("\nSample Article Text (first 1000 chars):\n", sample_text[:1000])


Fetched article URLs:
https://www.mykhel.com/cricket/players/virat-kohli-ipl-p3788/
https://www.espncricinfo.com/ask/cricket-qna/Virat-Kohli-strike-rate-in-each-IPL-year&tournament=ipl/1000
https://advancecricket.com/player/virat-kohli-ipl-stats-2023/102442
https://timesofindia.indiatimes.com/sports/cricket/ipl/virat-kohli-ipl-career/featureshow/116493433.cms
https://www.espncricinfo.com/records/tournament/batting-most-runs-career/indian-premier-league-2023-15129

Sample Article Text (first 1000 chars):
 This website is using a security service to protect itself from online attacks. The action you just performed triggered the security solution. There are several actions that could trigger this block including submitting a certain word or phrase, a SQL command or malformed data. You can email the site owner to let them know you were blocked. Please include what you were doing when this page came up and the Cloudflare Ray ID found at the bottom of this page. 
Cloudflare Ray ID: 961932917

### Summarising using extractive summarizer

In [3]:
def extractive_summarizer(text, target_length=150):
    sentences = sent_tokenize(text)
    words = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    word_freq = {}
    for word in words:
        if word.isalnum() and word not in stop_words:
            word_freq[word] = word_freq.get(word, 0) + 1
    sentence_scores = {}
    for sentence in sentences:
        for word in word_tokenize(sentence.lower()):
            if word in word_freq:
                sentence_scores[sentence] = sentence_scores.get(sentence, 0) + word_freq[word]
    summary_sentences = nlargest(max(1, min(len(sentences), target_length // 20)), sentence_scores, key=sentence_scores.get)
    return ' '.join(summary_sentences)

extractive_summary = extractive_summarizer(sample_text, 150)
print("\nExtractive Summary:\n", extractive_summary)



Extractive Summary:
 Cloudflare Ray ID: 961932917cc8ba26
•

      Your IP:
      Click to reveal
2401:4900:1cb0:c0c4:3c40:41a7:cd54:18e3
•

Performance & security by Cloudflare Please include what you were doing when this page came up and the Cloudflare Ray ID found at the bottom of this page. There are several actions that could trigger this block including submitting a certain word or phrase, a SQL command or malformed data. This website is using a security service to protect itself from online attacks. The action you just performed triggered the security solution. You can email the site owner to let them know you were blocked.


### Using fine-tuned bart model, news articles are summarised.

In [4]:
summarizer = pipeline("summarization", model="facebook/bart-base")
abstractive_summary = summarizer(sample_text, max_length=180, min_length=30, do_sample=False)[0]['summary_text']
print("\nAbstractive Summary (BART):\n", abstractive_summary)


Device set to use mps:0
Your max_length is set to 180, but your input_length is only 169. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=84)
Both `max_new_tokens` (=256) and `max_length`(=180) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Abstractive Summary (BART):
 This website is using a security service to protect itself from online attacks. The action you just performed triggered the security solution. There are several actions that could trigger this block including submitting a certain word or phrase, a SQL command or malformed data. You can email the site owner to let them know you were blocked. Please include what you were doing when this page came up and the Cloudflare Ray ID found at the bottom of this page. __________________________________________Cloudflare ID: 961932917cc8ba26• Â    Your IP: [email protected]•   •   Click to reveal your IP address:• IP Address: 961942401:4900:1cb0:c0c4:3c40:41a7:cd54:18e3• Email Address: https://www.cloudflare.com/Email Address: __________________________Performance & security by CloudFlare Ray
