In [2]:
!pip install newsapi-python lxml_html_clean newspaper3k sacremoses nltk --quiet

In [3]:
import json
import os
from datetime import datetime, timedelta, timezone

import nltk
import pandas as pd
from google.colab import drive
from newsapi import NewsApiClient
from newspaper import Article
from transformers import BartTokenizer, AutoTokenizer, AutoModelForSeq2SeqLM
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.tokenize import sent_tokenize

drive.mount('/content/drive/')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [4]:
# API
# The NewsAPI key is read from the environment so it is never stored in the notebook itself.
api_key = os.environ.get("NEWSAPI_KEY", "")
if not api_key:
  raise ValueError("Set the NEWSAPI_KEY environment variable to your NewsAPI key.")
newsapi = NewsApiClient(api_key=api_key)

# Only yesterday's articles (by UTC date) will be loaded
yesterday = datetime.now(timezone.utc).date() - timedelta(days=1)
from_date = to_date = yesterday.isoformat()  # 'YYYY-MM-DD'

# The articles will be filtered by the following keywords
query = "(artificial intelligence OR machine learning OR data science) AND (business OR marketing OR logistics OR finance OR operations)"

response = newsapi.get_everything(
  q=query,
  from_param=from_date,
  to=to_date,
  language='en',
  sort_by='publishedAt',
  page_size=100
)

# Parse results:
# Passing the column list to the constructor (instead of indexing afterwards) keeps
# this cell working when no article matched: an empty list yields an empty frame
# that still has the expected columns.
article_columns = ['title', 'description', 'content', 'url', 'publishedAt', 'source']
articles = response.get('articles', [])
df = pd.DataFrame(articles, columns=article_columns)
# 'source' arrives as a dict; keep only its display name.
df['source'] = df['source'].apply(lambda s: s['name'] if isinstance(s, dict) else s)

print(f"{len(df)} articles published on {from_date}:")
df.head()

30 articles published on 2025-06-04:


Unnamed: 0,title,description,content,url,publishedAt,source
0,Fooling ourselves,"Truth and Love will survive AI, despite humani...",Steven Spielbergs 2001 film A.I.: Artificial I...,https://wng.org/articles/fooling-ourselves-174...,2025-06-04T22:05:00Z,Wng.org
1,Ocean Power Technologies Announces Hiring of J...,"MONROE TOWNSHIP, N.J., June 04, 2025 (GLOBE NE...","MONROE TOWNSHIP, N.J., June 04, 2025 (GLOBE NE...",https://www.globenewswire.com/news-release/202...,2025-06-04T20:15:00Z,GlobeNewswire
2,Snowflake orchestrates its AI strategy through...,A small orchestra provided the opening prelude...,A small orchestra provided the opening prelude...,https://siliconangle.com/2025/06/04/snowflake-...,2025-06-04T20:05:35Z,SiliconANGLE News
3,The Seabed Is Now a Battlefield,Great power competition is taking new forms un...,The rules-based global order is under siege at...,http://foreignpolicy.com/2025/06/04/seabed-chi...,2025-06-04T19:11:04Z,Foreign Policy
4,Dream job no more? AI is coming for Wall Stree...,AI is starting to take over entry-level jobs o...,Wall Streets entry-level junior analyst jobs a...,https://economictimes.indiatimes.com/news/inte...,2025-06-04T16:29:18Z,The Times of India


In [5]:
# The free NewsAPI plan returns only truncated article content, so we load the full text from each URL with the newspaper3k package

def get_full_article_text(url):
  """Download and parse one article and return its full text.

  Args:
    url: Address of the article page to fetch.

  Returns:
    The parsed article text, or None when downloading/parsing fails or when
    the page yields no text at all. Returning None in both cases lets the
    caller drop unusable rows with a single null filter.
  """
  try:
    article = Article(url)
    article.download()
    article.parse()
    text = article.text
  except Exception as e:
    # Best-effort scraping: report the failure and move on to the next URL.
    print(f"Error parsing {url}: {e}")
    return None

  # A page can parse without raising and still produce an empty body; treat
  # that as a failure too, so blank text is never passed on for summarization.
  if not text or not text.strip():
    print(f"Error parsing {url}: no article text extracted")
    return None
  return text

# Fetch the full text for every article URL; fetches that fail come back as None.
full_texts = df['url'].apply(get_full_article_text)
df['text'] = full_texts

# Keep only the rows whose text could be retrieved and renumber them from 0.
has_text = df['text'].notna()
df = df.loc[has_text].reset_index(drop=True)

Error parsing https://siliconangle.com/2025/06/04/snowflake-orchestrates-ai-strategy-cortex-unstructured-data-solutions-agents/: Article `download()` failed with 403 Client Error: Forbidden for url: https://siliconangle.com/2025/06/04/snowflake-orchestrates-ai-strategy-cortex-unstructured-data-solutions-agents/ on URL https://siliconangle.com/2025/06/04/snowflake-orchestrates-ai-strategy-cortex-unstructured-data-solutions-agents/
Error parsing https://www.forbes.com/sites/juliadhar/2025/06/04/ai-talent-meet-the-guardians-of-the-ai-algorithms/: Article `download()` failed with 403 Client Error: Max restarts limit reached for url: https://www.forbes.com/sites/juliadhar/2025/06/04/ai-talent-meet-the-guardians-of-the-ai-algorithms/ on URL https://www.forbes.com/sites/juliadhar/2025/06/04/ai-talent-meet-the-guardians-of-the-ai-algorithms/
Error parsing https://www.forbes.com/councils/forbestechcouncil/2025/06/04/if-ai-gave-you-all-the-answers-how-would-you-change-the-world/: Article `downlo

# Summarization:

In [6]:
# Load the summarization tokenizer and seq2seq model from the same checkpoint path.
summarizer_model_path = "./bart-lora-tuned"
tokenizer = BartTokenizer.from_pretrained(summarizer_model_path)
summarizer = AutoModelForSeq2SeqLM.from_pretrained(summarizer_model_path)

def summarize(text, max_input_length=1024, max_output_length=128):
  """Generate a summary of `text` with the module-level summarizer model.

  Args:
    text: Text to summarize.
    max_input_length: Token limit for the encoded input; longer input is truncated.
    max_output_length: `max_length` passed to generation.

  Returns:
    The first generated sequence decoded to a string, special tokens removed.
  """
  encoded = tokenizer(
    text,
    truncation=True,
    max_length=max_input_length,
    return_tensors="pt",
  )
  generated = summarizer.generate(
    input_ids=encoded["input_ids"],
    attention_mask=encoded["attention_mask"],
    max_length=max_output_length,
  )
  first_sequence = generated[0]
  return tokenizer.decode(first_sequence, skip_special_tokens=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Translation from English to Ukrainian:

In [7]:
# Load the English -> Ukrainian translation tokenizer and model from the same checkpoint id.
translator_model_path = "Helsinki-NLP/opus-mt-en-uk"
translator_tokenizer = AutoTokenizer.from_pretrained(translator_model_path)
translator = AutoModelForSeq2SeqLM.from_pretrained(translator_model_path)

def translate_to_ua(text, max_length=512):
  """Translate English text to Ukrainian one sentence at a time.

  The text is split into sentences, each sentence is translated separately
  with the module-level translator, and the results are joined with spaces.

  Args:
    text: English text to translate.
    max_length: Token limit applied both to the encoded input (with
      truncation) and to the generated output. Defaults to 512 rather than
      the previous 1025, because Marian opus-mt checkpoints are normally
      configured with 512 positions and a longer sequence would index past
      the position table.
      NOTE(review): confirm against translator.config.max_position_embeddings.

  Returns:
    The translated sentences joined by single spaces; "" for text that
    contains no sentences.
  """
  translated_sentences = []

  for sent in sent_tokenize(text):
    tokens = translator_tokenizer(sent, return_tensors="pt", truncation=True, max_length=max_length)
    output = translator.generate(**tokens, max_length=max_length)
    translated_sentences.append(translator_tokenizer.decode(output[0], skip_special_tokens=True))

  return " ".join(translated_sentences)

# Generate output:

In [8]:
# Build one record per article: the Ukrainian translation of its summary plus provenance.
results = []
for row_idx in range(len(df)):
  ukrainian_summary = translate_to_ua(summarize(df.loc[row_idx, 'text']))
  results.append({
    'summary': ukrainian_summary,
    'source': df.loc[row_idx, 'source'],
    'url': df.loc[row_idx, 'url'],
  })

# ensure_ascii=False keeps the Cyrillic text readable in the file instead of \u escapes.
serialized = json.dumps(results, ensure_ascii=False, indent=4)
with open("summary.json", "w", encoding="utf-8") as out_file:
  out_file.write(serialized)

In [9]:
def backup_colab_content_to_drive(folder_name='Colab Notebooks', src='/content', drive_root='/content/drive/MyDrive'):
  """Copy the contents of the working directory into a Drive folder.

  Args:
    folder_name: Destination folder, relative to `drive_root`. Created if missing.
    src: Directory whose top-level entries are backed up.
    drive_root: Directory under which `folder_name` is created.

  The entry named 'drive' in `src`, and any entry that contains the
  destination, are skipped so the backup is never copied into itself.
  Running the function again overwrites the files of a previous backup
  instead of failing on directories that already exist.
  """
  import shutil
  import os

  dest = f'{drive_root}/{folder_name}'
  os.makedirs(dest, exist_ok=True)
  dest_abs = os.path.abspath(dest)

  for item in os.listdir(src):
    s = os.path.join(src, item)
    s_abs = os.path.abspath(s)
    # Never descend into the Drive mount point or into whatever holds the destination.
    if item == 'drive' or dest_abs == s_abs or dest_abs.startswith(s_abs + os.sep):
      continue
    d = os.path.join(dest, item)
    if os.path.isdir(s):
      # dirs_exist_ok lets a repeated backup merge into the directory copied earlier;
      # without it copytree raises FileExistsError on the second run.
      shutil.copytree(s, d, dirs_exist_ok=True)
    else:
      shutil.copy2(s, d)

  print(f'📁 Backup complete. Files saved to: {dest}')

# Save everything produced in this session to the project's results folder on Drive.
backup_colab_content_to_drive(folder_name='robot_dreams/backup/results')

📁 Backup complete. Files saved to: /content/drive/MyDrive/robot_dreams/backup/results
