# Imports

In [1]:
import json
import os
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from newsapi import NewsApiClient

# Source-level categorization by country and category

In [2]:
# The API key is read from the NEWSAPI_KEY environment variable so that it is
# never stored in the notebook. os.environ[...] raises KeyError if it is unset.
# NOTE(review): the key that used to be hardcoded here should be treated as
# leaked and rotated.
newsapi = NewsApiClient(api_key=os.environ['NEWSAPI_KEY'])
data = newsapi.get_sources()

In [3]:
# Pull the response status and the list of source records out of the payload
status, sources = data['status'], data['sources']

In [4]:
# Listing sources as (name, id) pairs.
# Built with a comprehension so no module-level variable called `id` is
# created (that would shadow the builtin for the rest of the session).
source_list = [(source['name'], source['id']) for source in sources]

source_list

[('ABC News', 'abc-news'),
 ('ABC News (AU)', 'abc-news-au'),
 ('Aftenposten', 'aftenposten'),
 ('Al Jazeera English', 'al-jazeera-english'),
 ('ANSA.it', 'ansa'),
 ('Argaam', 'argaam'),
 ('Ars Technica', 'ars-technica'),
 ('Ary News', 'ary-news'),
 ('Associated Press', 'associated-press'),
 ('Australian Financial Review', 'australian-financial-review'),
 ('Axios', 'axios'),
 ('BBC News', 'bbc-news'),
 ('BBC Sport', 'bbc-sport'),
 ('Bild', 'bild'),
 ('Blasting News (BR)', 'blasting-news-br'),
 ('Bleacher Report', 'bleacher-report'),
 ('Bloomberg', 'bloomberg'),
 ('Breitbart News', 'breitbart-news'),
 ('Business Insider', 'business-insider'),
 ('Business Insider (UK)', 'business-insider-uk'),
 ('Buzzfeed', 'buzzfeed'),
 ('CBC News', 'cbc-news'),
 ('CBS News', 'cbs-news'),
 ('CNN', 'cnn'),
 ('CNN Spanish', 'cnn-es'),
 ('Crypto Coins News', 'crypto-coins-news'),
 ('Der Tagesspiegel', 'der-tagesspiegel'),
 ('Die Zeit', 'die-zeit'),
 ('El Mundo', 'el-mundo'),
 ('Engadget', 'engadget'),
 ('E

In [5]:
# # Filtering by categories
# categorie_source = {}
# for source in sources:
#     category = source['category']
#     if category not in categorie_source:
#         categorie_source[category] = []
#     categorie_source[category].append(source['name'])

# Sources are to be filtered manually to generate a mask

In [6]:
# Group the source records twice: once by category and once by country.
# Each dict maps a key to the list of full source records sharing that key.
categories = {}
countries = {}

for source in sources:
    categories.setdefault(source['category'], []).append(source)
    countries.setdefault(source['country'], []).append(source)

In [7]:
def create_dataframe(data_dict, outer_key_name, inner_key_name):
    """Flatten a ``{key: [source, ...]}`` mapping into a two-level indexed DataFrame.

    Parameters
    ----------
    data_dict : dict
        Maps a grouping key to a list of source records. Each record must
        provide 'name', 'id', 'description', 'url', 'category', 'language'
        and 'country'.
    outer_key_name : str
        Name of the outer index level; it holds the keys of ``data_dict``.
    inner_key_name : str
        Name of the inner index level; it holds each source's 'name'.

    Returns
    -------
    pandas.DataFrame
        One row per source, indexed by ``[outer_key_name, inner_key_name]``.
        An empty ``data_dict`` yields an empty frame with the same index
        levels and columns instead of raising.

    Raises
    ------
    KeyError
        If a source record lacks one of the required fields.
    """
    records = [
        {
            outer_key_name: outer_key,
            inner_key_name: source['name'],
            'id': source['id'],
            'description': source['description'],
            'url': source['url'],
            'category': source['category'],
            'language': source['language'],
            'country': source['country'],
        }
        for outer_key, sources in data_dict.items()
        for source in sources
    ]

    # Pin the columns so that set_index still finds the index columns when
    # `records` is empty. dict.fromkeys de-duplicates while keeping order, in
    # case an index name coincides with one of the fixed field names.
    columns = list(dict.fromkeys([
        outer_key_name, inner_key_name,
        'id', 'description', 'url', 'category', 'language', 'country',
    ]))
    return pd.DataFrame(records, columns=columns).set_index([outer_key_name, inner_key_name])

# Date string (YYYY-MM-DD) for the day before the notebook is run
yesterday_date = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')

# Category-indexed frame: the 'category' column duplicates the outer index
# level, so it is dropped; every row is stamped with yesterday's date.
df_categories = (
    create_dataframe(categories, 'Category', 'Source Name')
    .drop(columns='category')
    .assign(yesterday_date=yesterday_date)
)

# Country-indexed frame keeps all columns
df_countries = create_dataframe(countries, 'Country', 'Source Name')

A filtering mask will be applied to these sources.

In [8]:
# Display the category-indexed source table (the rendered output is truncated in the middle by pandas)
df_categories

Unnamed: 0_level_0,Unnamed: 1_level_0,id,description,url,language,country,yesterday_date
Category,Source Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
general,ABC News,abc-news,"Your trusted source for breaking news, analysi...",https://abcnews.go.com,en,us,2024-06-07
general,ABC News (AU),abc-news-au,"Australia's most trusted source of local, nati...",https://www.abc.net.au/news,en,au,2024-06-07
general,Aftenposten,aftenposten,Norges ledende nettavis med alltid oppdaterte ...,https://www.aftenposten.no,no,no,2024-06-07
general,Al Jazeera English,al-jazeera-english,"News, analysis from the Middle East and worldw...",https://www.aljazeera.com,en,us,2024-06-07
general,ANSA.it,ansa,"Agenzia ANSA: ultime notizie, foto, video e ap...",https://www.ansa.it,it,it,2024-06-07
...,...,...,...,...,...,...,...
entertainment,The Lad Bible,the-lad-bible,The LAD Bible is one of the largest community ...,https://www.theladbible.com,en,gb,2024-06-07
health,Medical News Today,medical-news-today,Medical news and health news headlines posted ...,http://www.medicalnewstoday.com,en,us,2024-06-07
science,National Geographic,national-geographic,Reporting our world daily: original nature and...,http://news.nationalgeographic.com,en,us,2024-06-07
science,New Scientist,new-scientist,Breaking science and technology news from arou...,https://www.newscientist.com/section/news,en,us,2024-06-07


In [9]:
# Fetch the raw JSON for one source from the NewsAPI "everything" endpoint
def fetch_json(source_id, date, api_key=None, timeout=10):
    """Query NewsAPI for articles from one source and return the decoded JSON.

    Parameters
    ----------
    source_id : str
        NewsAPI source identifier, sent as the ``sources`` query parameter.
    date : str
        Sent as the ``from`` query parameter (the notebook passes 'YYYY-MM-DD').
    api_key : str, optional
        NewsAPI key. When omitted, the NEWSAPI_KEY environment variable is
        used, so that the key never has to be written into the notebook.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    dict
        The response body decoded from JSON. No HTTP status check is done
        here; the caller sees whatever JSON body the API returned.

    Raises
    ------
    RuntimeError
        If no API key is supplied and NEWSAPI_KEY is not set.
    """
    key = api_key or os.environ.get('NEWSAPI_KEY')
    if not key:
        raise RuntimeError(
            'No NewsAPI key available: pass api_key= or set the NEWSAPI_KEY environment variable'
        )

    # Let requests build and URL-encode the query string rather than
    # concatenating it by hand.
    response = requests.get(
        'https://newsapi.org/v2/everything',
        params={
            'sources': source_id,
            'from': date,
            'sortBy': 'popularity',
            'apiKey': key,
        },
        timeout=timeout,
    )
    return response.json()

In [29]:
def _scrape_paragraph_text(url, timeout=10):
    """Download ``url`` and return the text of all its <p> tags joined by single spaces."""
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx so that an error page is not mistaken for article text.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    return ' '.join(paragraph.get_text() for paragraph in soup.find_all('p'))


def json_to_df(data, timeout=10):
    """Turn a NewsAPI JSON payload into a DataFrame with scraped article text.

    Parameters
    ----------
    data : dict
        Decoded API response; must contain an 'articles' list of dicts.
    timeout : float, optional
        Seconds to wait for each article page (default 10).

    Returns
    -------
    pandas.DataFrame
        Columns 'author', 'title', 'url', 'description', 'date' (renamed from
        'publishedAt') and 'content'. 'content' holds the joined <p> text of
        the article page, or ``"Error: <message>"`` when the page could not be
        fetched or parsed. An empty 'articles' list yields an empty frame with
        these columns.

    Raises
    ------
    KeyError
        If ``data`` has no 'articles' key (for example an error payload).
    """
    if 'articles' not in data:
        # Surface whatever status/message the payload carries rather than a bare KeyError.
        raise KeyError(
            f"response has no 'articles' key "
            f"(status={data.get('status')!r}, message={data.get('message')!r})"
        )

    # Passing `columns` selects the wanted fields and also keeps them present
    # when the article list is empty.
    df = pd.DataFrame(
        data['articles'],
        columns=['author', 'title', 'url', 'description', 'publishedAt'],
    ).rename(columns={'publishedAt': 'date'})

    # Adding the content column by scraping the URLs. This is best-effort:
    # a failing page is recorded as an error string instead of aborting the batch.
    contents = []
    for url in df['url']:
        try:
            contents.append(_scrape_paragraph_text(url, timeout))
        except Exception as e:
            contents.append(f"Error: {str(e)}")

    df['content'] = contents
    return df

Run the next block only after filtering

In [30]:
# df_categories['data'] = df_categories.apply(
#     lambda row: json_to_df(fetch_json(row['id'], row['yesterday_date'])),
#     axis=1
# )

Demo: fetch and scrape articles for the first two sources only

In [31]:
def _articles_for(row):
    """Fetch yesterday's articles for the source in ``row`` and return them as a DataFrame."""
    return json_to_df(fetch_json(row['id'], row['yesterday_date']))


# Work on a copy of the first two sources so df_categories itself is left untouched
df_first_two_rows = df_categories.iloc[:2].copy()
df_first_two_rows['data'] = df_first_two_rows.apply(_articles_for, axis=1)

In [32]:
# Peek at the scraped text of the 7th article for the first source.
# The 'data' Series is indexed by (Category, Source Name), so a bare [0] only
# works through positional fallback; .iloc states the positional lookup explicitly.
df_first_two_rows['data'].iloc[0]['content'].iloc[6]

"Prince Harry has been given permission to appeal the British government's rejection to provide him with police protection in the U.K. The Court of Appeal gave the Duke of Sussex the go-ahead to challenge a ruling earlier this year in the High Court LONDON -- Prince Harry has been given permission to appeal the British government’s rejection to provide him with publicly funded police protection in the U.K.  The Court of Appeal gave the Duke of Sussex the go-ahead to challenge a ruling earlier this year in the High Court. The permission was granted in May but only reported Thursday. Judge Peter Lane ruled in February that a government panel’s decision to provide “bespoke” security on an as-needed basis after Harry quit as a working member of the royal family was not unlawful, irrational or unjustified.  “Insofar as the case-by-case approach may otherwise have caused difficulties, they have not been shown to be such as to overcome the high hurdle so as to render the decision-making irrat

In [None]:
# Re-display df_categories; the demo ran on a copy, so this frame has no 'data' column
df_categories

Unnamed: 0_level_0,Unnamed: 1_level_0,id,description,url,language,country,yesterday_date
Category,Source Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
general,ABC News,abc-news,"Your trusted source for breaking news, analysi...",https://abcnews.go.com,en,us,2024-06-07
general,ABC News (AU),abc-news-au,"Australia's most trusted source of local, nati...",https://www.abc.net.au/news,en,au,2024-06-07
general,Aftenposten,aftenposten,Norges ledende nettavis med alltid oppdaterte ...,https://www.aftenposten.no,no,no,2024-06-07
general,Al Jazeera English,al-jazeera-english,"News, analysis from the Middle East and worldw...",https://www.aljazeera.com,en,us,2024-06-07
general,ANSA.it,ansa,"Agenzia ANSA: ultime notizie, foto, video e ap...",https://www.ansa.it,it,it,2024-06-07
...,...,...,...,...,...,...,...
entertainment,The Lad Bible,the-lad-bible,The LAD Bible is one of the largest community ...,https://www.theladbible.com,en,gb,2024-06-07
health,Medical News Today,medical-news-today,Medical news and health news headlines posted ...,http://www.medicalnewstoday.com,en,us,2024-06-07
science,National Geographic,national-geographic,Reporting our world daily: original nature and...,http://news.nationalgeographic.com,en,us,2024-06-07
science,New Scientist,new-scientist,Breaking science and technology news from arou...,https://www.newscientist.com/section/news,en,us,2024-06-07


# The pipeline is now in place
It will be improved later.