# Installing libraries

In [None]:
import sys
print(sys.executable)

import sys
!{sys.executable} -m pip install --upgrade pip
!{sys.executable} -m pip uninstall -y newspaper3k newspaper
!{sys.executable} -m pip install newspaper3k
!{sys.executable} -m pip install lxml-html-clean
!{sys.executable} -m pip install pymongo


# 🧠 InBrief: Tech

This notebook automates a pipeline that:

1. ⏬ Fetches the latest **technology-related news articles**
2. ✂️ Summarizes each article using **Google's Gemini (Generative AI)**
3. 🧮 Scores each article based on **relevance, impact, sentiment, and novelty**
4. 🗃 Stores raw articles, summaries, and scores into separate **MongoDB collections**

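*Use your own API keys: replace the `"API_KEY"` placeholders (News API request header and Gemini client) before running.*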

In [None]:
import requests
from datetime import date,timedelta
from newspaper import Article
import json
from pymongo import MongoClient
from google import genai
from google.genai import types
from pydantic import BaseModel

# Utility function to generate a date range string for the News API
def get_date_range(No_of_days):
    """Build the ``from=...&to=...`` query fragment used in the News API URL.

    The window ends today and starts ``No_of_days`` days earlier. Both dates
    are rendered the way ``str(date)`` renders them (ISO ``YYYY-MM-DD``).
    """
    end_day = date.today()
    start_day = end_day - timedelta(days=No_of_days)
    return f"from={start_day}&to={end_day}"

# Fetches technology news articles from the News API and parses their full text
def get_articles(date_range):
    """Fetch technology articles from the News API and download their full text.

    Args:
        date_range: Query fragment of the form ``from=YYYY-MM-DD&to=YYYY-MM-DD``
            that is spliced into the request URL.

    Returns:
        A ``(art_str, articles)`` tuple. ``articles`` is a list of
        ``{'title': ..., 'text': ...}`` dicts, one per article that could be
        downloaded and parsed; ``art_str`` is that list serialized with
        ``json.dumps``.

    Raises:
        requests.HTTPError: If the News API responds with an error status.
    """
    import logging

    logger = logging.getLogger(__name__)

    url = "https://newsapi.org/v2/everything?q=technology&"+date_range+"&language=en&sortBy=relevancy"
    headers = {"Authorization": "API_KEY",
    "Content-Type": "application/json"}
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, headers=headers, timeout=30)
    # Fail here with a clear HTTP error instead of an opaque KeyError on
    # 'articles' further down when the API rejects the request.
    response.raise_for_status()
    data = response.json()

    articles = []
    for item in data['articles']:
        article_url = item['url']
        try:
            article = Article(article_url)
            article.download()
            article.parse()
            parsed = {
                'title': item['title'],
                'text': article.text,
            }
        except Exception as exc:
            # Best effort: one unreachable or unparsable page must not abort
            # the whole batch, but record which one was skipped and why.
            logger.warning("Skipping article %s: %s", article_url, exc)
            continue
        articles.append(parsed)

    # Return both the serialized string and the original list format.
    art_str = json.dumps(articles)
    return art_str, articles

# Uses Gemini (via Google GenAI) to summarize articles and extract key metadata
def summarization(input_string):
    """Ask Gemini to summarize the articles and extract per-article metadata.

    ``input_string`` is sent as the first content part, followed by the
    instruction prompt. The response is requested as JSON shaped like a list
    of ``Recipe`` entries, and the raw response text is returned.
    """

    # Schema for one entry of the structured response.
    class Recipe(BaseModel):
        title: str
        summary: str
        keyword: list[str]
        topic: str
        entities: list[str]
        sentiment: str

    # Instruction prompt sent alongside the article data.
    system_instruction = (
        'generate the output stricly in json \n'
        '''
instructions:
-remove the word json at the start, the triple quotes(most importantly)
-remove any newline elements in the output string
-format the json in a way it could be successfully parsed and given as input to json.load()
-I have provided a list containing dictionaries of multiple articles,each provinding the id, title and text of a particular article.
-summarize the text given in a few sentences
-add a key 'title' and add value x['title'] to it.
-add a key 'summary' which contains the summarised text including all the vital points
-add a key 'keyword' which includes all significant keywords and terms related to the relavant technology and trends
-add a key 'topic' which includes the catergory the news belong to.
-add a key 'entities' which includes a list of important entities involved in the technology, for example-organization,people,location.
-add a key 'sentiment' which includes sentiments as positive, negative or nuetral.
'''
    )

    generation_config = {
        "response_mime_type": "application/json",
        "response_schema": list[Recipe],
    }

    client = genai.Client(api_key="API_KEY")
    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=[input_string, system_instruction],
        config=generation_config,
    )
    return response.text

# Uses Gemini to assign a relevancy score to each article based on multiple dimensions
def relevancy_score(input_string):
    """Ask Gemini to add a ``relevancy_score`` to each summarized article.

    Args:
        input_string: JSON string of summarized articles (the text returned
            by ``summarization``).

    Returns:
        The raw response text, requested as JSON shaped like a list of
        ``Recipe`` entries, each carrying an extra ``relevancy_score``.
    """

    # Same fields as the summary schema plus the score.
    class Recipe(BaseModel):
        title: str
        summary: str
        keyword: list[str]
        topic: str
        entities: list[str]
        sentiment: str
        # Numeric, not str: the notebook later filters the stored documents
        # with {'relevancy_score': {'$gt': 7}}, and MongoDB comparison
        # operators do not match string values against a number.
        relevancy_score: int

    system_instruction = (
        'generate the output stricly in json \n'
        '''
instructions:
-remove the word json at the start, the triple quotes(most importantly)
-remove any newline elements in the output string
-format the json in a way it could be successfully parsed and given as input to json.load()
-I have provided a json string including fields such as topic,summary,keyword,entities and sentiment.
-add an extra field 'relevancy_score' to rank each acticles on the basis of criteria such as impact, novelty, most relevant and important technology,
sentiments,etc. after analyzing all the fields provided.
'''
    )

    client = genai.Client(api_key="API_KEY")
    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=[input_string, system_instruction],
        config={
            "response_mime_type": "application/json",
            "response_schema": list[Recipe],
        },
    )

    return response.text


# Fetch the last two days of technology news from the News API.

date_range = get_date_range(2)
articles_string, articles_list  = get_articles(date_range)

# Store the raw title/text pairs in tech_news_db.articles.
# A single client is reused for all three databases below; they all live on
# the same local server (MongoClient() defaults to localhost:27017).

client = MongoClient("mongodb://localhost:27017/")
db = client["tech_news_db"]
collection = db["articles"]
result = collection.insert_many(articles_list)

# Summarize the text of each news article.

summary_index = summarization(articles_string)
list_of_summarized_articles = json.loads(summary_index)


# Store the summary, title, keywords, ... in new_database.articles.
new_database = client['new_database']
new_collection = new_database['articles']
y = new_collection.insert_many(list_of_summarized_articles)

# Score the summarized articles.


relevancy_score_data = relevancy_score(summary_index)
# Print the returned data, not the relevancy_score function object.
print(relevancy_score_data)
list_of_relevant_articles = json.loads(relevancy_score_data)

# 📚 Store final articles with relevancy score into DB

Client_2 = client  # alias kept so the existing name still resolves
news_articles_info = Client_2['news_article_info']
news_collection = news_articles_info['news_collection']
result = news_collection.insert_many(list_of_relevant_articles)





[
  {
    "title": "UK Scientists Achieve First Commercial Tritium Production",
    "summary": "UK scientists, led by Professor Tom Scott from the University of Bristol, have successfully achieved the first commercial production of tritium, a crucial step towards scalable nuclear fusion energy. This breakthrough demonstrates the potential of Multi-State Fusion for isotope production, with nuclear fusion offering an energy output four times higher than fission.",
    "keyword": ["Tritium production", "nuclear fusion", "isotopes", "Multi-State Fusion", "energy output", "commercial production"],
    "topic": "Science & Technology",
    "entities": ["University of Bristol", "Royal Academy of Engineering", "UK Atomic Energy Authority", "Tom Scott", "International Atomic Energy Agency", "The Cooldown"],
    "sentiment": "positive"
  },
  {
    "title": "XBOW's AI-Powered Pentester Grabs Top Rank on HackerOne, Raises $75M to Grow Platform",
    "summary": "XBOW's AI-powered penetration tester

In [9]:
# Show every stored article whose relevancy score is above 7.
high_relevance_filter = {'relevancy_score': {'$gt': 7}}
for scored_article in news_collection.find(high_relevance_filter):
    print('title:', scored_article['title'])
    print('news_summary:', scored_article['summary'])
    print('relevance_score', scored_article['relevancy_score'])
    print()

title: Huawei makes bold claim with 3,000 km-range electric car battery
news_summary: Chinese tech giant Huawei has filed a patent for a solid-state electric vehicle battery claiming an astonishing 3,000 km range and 5-minute charging time, a "holy grail" for the EV sector. While this claim is met with skepticism regarding feasibility and weight, major automakers like BMW, Mercedes-Benz, Toyota, and Stellantis are also actively developing solid-state batteries due to their higher energy density and faster charging capabilities. This development aims to combat "range anxiety" and long charging times, which are significant barriers to EV adoption.
relevance_score 9

title: HOLY SMOKES! A new, 200% faster DeepSeek R1-0528 variant appears from German lab TNG Technology Consulting GmbH
news_summary: German firm TNG Technology Consulting GmbH has released DeepSeek-TNG R1T2 Chimera, a new variant of the DeepSeek R1-0528 large language model (LLM) that is significantly faster and more concise,