# Imports

In [59]:
!pip install newsapi-python



In [60]:
from newsapi import NewsApiClient
import pandas as pd
import numpy as np1
from datetime import datetime, timedelta
import requests
import json
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import os

load_dotenv()

True

# Source-level categorization by country and category

In [61]:
# Authenticate with the key held in the NEWS_API_key environment variable
# (load_dotenv() has already been called) and download the source catalogue.
newsapi = NewsApiClient(api_key=os.environ.get("NEWS_API_key"))
data = newsapi.get_sources()

In [62]:
# Pull the response status and the list of source records out of the payload
status, sources = data['status'], data['sources']

In [63]:
# List every source as a (name, id) pair.
# Built with a comprehension so no notebook-level variable named `id` is created:
# the previous loop rebound `id` and shadowed the builtin for all later cells.
source_list = [(source['name'], source['id']) for source in sources]

source_list

[('ABC News', 'abc-news'),
 ('ABC News (AU)', 'abc-news-au'),
 ('Aftenposten', 'aftenposten'),
 ('Al Jazeera English', 'al-jazeera-english'),
 ('ANSA.it', 'ansa'),
 ('Argaam', 'argaam'),
 ('Ars Technica', 'ars-technica'),
 ('Ary News', 'ary-news'),
 ('Associated Press', 'associated-press'),
 ('Australian Financial Review', 'australian-financial-review'),
 ('Axios', 'axios'),
 ('BBC News', 'bbc-news'),
 ('BBC Sport', 'bbc-sport'),
 ('Bild', 'bild'),
 ('Blasting News (BR)', 'blasting-news-br'),
 ('Bleacher Report', 'bleacher-report'),
 ('Bloomberg', 'bloomberg'),
 ('Breitbart News', 'breitbart-news'),
 ('Business Insider', 'business-insider'),
 ('Business Insider (UK)', 'business-insider-uk'),
 ('Buzzfeed', 'buzzfeed'),
 ('CBC News', 'cbc-news'),
 ('CBS News', 'cbs-news'),
 ('CNN', 'cnn'),
 ('CNN Spanish', 'cnn-es'),
 ('Crypto Coins News', 'crypto-coins-news'),
 ('Der Tagesspiegel', 'der-tagesspiegel'),
 ('Die Zeit', 'die-zeit'),
 ('El Mundo', 'el-mundo'),
 ('Engadget', 'engadget'),
 ('E

In [64]:
# # Filtering by categories
# categorie_source = {}
# for source in sources:
#     category = source['category']
#     if category not in categorie_source:
#         categorie_source[category] = []
#     categorie_source[category].append(source['name'])

The sources are filtered manually below to generate a selection mask.

In [65]:
# Group the full source records twice: once by category and once by country
categories = {}
countries = {}

for source in sources:
    # setdefault creates the bucket the first time a key is seen, then returns it
    categories.setdefault(source['category'], []).append(source)
    countries.setdefault(source['country'], []).append(source)

In [87]:
def create_dataframe(data_dict, outer_key_name, inner_key_name):
    """Flatten a ``{group: [source, ...]}`` mapping into a two-level indexed DataFrame.

    Parameters
    ----------
    data_dict : dict
        Maps a grouping key (e.g. a category or a country) to a list of source
        dicts. Each source dict must provide 'name', 'id', 'description', 'url',
        'category', 'language' and 'country'.
    outer_key_name : str
        Name of the first index level; it holds the grouping key.
    inner_key_name : str
        Name of the second index level; it holds the source's 'name'.

    Returns
    -------
    pandas.DataFrame
        One row per source, indexed by ``[outer_key_name, inner_key_name]``.
        An empty mapping yields an empty frame with the same index levels and
        columns instead of raising ``KeyError`` from ``set_index``.

    Raises
    ------
    KeyError
        If a source dict lacks one of the required fields.
    """
    source_fields = ('id', 'description', 'url', 'category', 'language', 'country')

    records = [
        {
            outer_key_name: outer_key,
            inner_key_name: source['name'],
            **{field: source[field] for field in source_fields},
        }
        for outer_key, sources in data_dict.items()
        for source in sources
    ]

    # Passing the columns explicitly keeps the frame well-formed when there are no
    # records; dict.fromkeys de-duplicates (order-preserving) in case an index name
    # coincides with one of the source fields.
    columns = list(dict.fromkeys((outer_key_name, inner_key_name, *source_fields)))
    return pd.DataFrame(records, columns=columns).set_index([outer_key_name, inner_key_name])

# Date string (YYYY-MM-DD) for one day before now; used as the query start date
yesterday_date = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')

# Category-level table: stamp every row with the query date and drop the
# 'category' column, which duplicates the 'Category' index level
df_categories = (
    create_dataframe(categories, 'Category', 'Source Name')
    .assign(yesterday_date=yesterday_date)
    .drop(labels='category', axis=1)
)

# Country-level table
df_countries = create_dataframe(countries, 'Country', 'Source Name')

In [88]:
df_categories['id'].unique()

array(['abc-news', 'abc-news-au', 'aftenposten', 'al-jazeera-english',
       'ansa', 'ary-news', 'associated-press', 'axios', 'bbc-news',
       'bild', 'blasting-news-br', 'breitbart-news', 'cbc-news',
       'cbs-news', 'cnn', 'cnn-es', 'der-tagesspiegel', 'el-mundo',
       'focus', 'fox-news', 'globo', 'google-news', 'google-news-ar',
       'google-news-au', 'google-news-br', 'google-news-ca',
       'google-news-fr', 'google-news-in', 'google-news-is',
       'google-news-it', 'google-news-ru', 'google-news-sa',
       'google-news-uk', 'goteborgs-posten', 'independent', 'infobae',
       'la-gaceta', 'la-nacion', 'la-repubblica', 'le-monde', 'lenta',
       'liberation', 'msnbc', 'national-review', 'nbc-news', 'news24',
       'news-com-au', 'newsweek', 'new-york-magazine', 'nrk', 'politico',
       'rbc', 'reddit-r-all', 'reuters', 'rt', 'rte', 'rtl-nieuws',
       'sabq', 'spiegel-online', 'svenska-dagbladet',
       'the-american-conservative', 'the-globe-and-mail', 'the-hil

In [89]:
# list_countries = ['us','in','gb', 'fr', 'au', 'sa']
# Hand-picked source ids to keep; order is irrelevant because the list is only
# used for membership filtering.
list_sources = [
    'abc-news',
    'al-jazeera-english',
    'bbc-news',
    'google-news',
    'google-news-in',
    'politico',
    'reuters',
    'reddit-r-all',
    'the-hindu',
    'bloomberg',
    'the-times-of-india',
    'business-insider',
    'vice-news',
    'the-wall-street-journal',
    'wired',
    'national-geographic',
    'bbc-sport',
    'espn',
    'techcrunch',
    'crypto-coins-news',
    'buzzfeed',
    'entertainment-weekly',
    'medical-news-today',
    'fortune',
    'new-scientist',
    'the-lad-bible',
]

In [90]:
# Keep only the rows whose source id is in the hand-picked list
df_categories = df_categories.loc[df_categories['id'].isin(list_sources)]

Filtering mask will be applied

In [91]:
df_categories

Unnamed: 0_level_0,Unnamed: 1_level_0,id,description,url,language,country,yesterday_date
Category,Source Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
general,ABC News,abc-news,"Your trusted source for breaking news, analysi...",https://abcnews.go.com,en,us,2024-06-15
general,Al Jazeera English,al-jazeera-english,"News, analysis from the Middle East and worldw...",https://www.aljazeera.com,en,us,2024-06-15
general,BBC News,bbc-news,"Use BBC News for up-to-the-minute news, breaki...",https://www.bbc.co.uk/news,en,gb,2024-06-15
general,Google News,google-news,"Comprehensive, up-to-date news coverage, aggre...",https://news.google.com,en,us,2024-06-15
general,Google News (India),google-news-in,"Comprehensive, up-to-date India news coverage,...",https://news.google.com,en,in,2024-06-15
general,Politico,politico,"Political news about Congress, the White House...",https://www.politico.com,en,us,2024-06-15
general,Reddit /r/all,reddit-r-all,"Reddit is an entertainment, social news networ...",https://www.reddit.com/r/all,en,us,2024-06-15
general,Reuters,reuters,Reuters.com brings you the latest news from ar...,https://www.reuters.com,en,us,2024-06-15
general,The Hindu,the-hindu,"The Hindu. latest news, analysis, comment, in-...",http://www.thehindu.com,en,in,2024-06-15
general,The Times of India,the-times-of-india,Times of India brings the Latest News and Top ...,http://timesofindia.indiatimes.com,en,in,2024-06-15


In [92]:
def fetch_json(source_id, date, api_key=None, timeout=30):
    """Query the NewsAPI ``/v2/everything`` endpoint for one source and return the parsed JSON.

    Parameters
    ----------
    source_id : str
        NewsAPI source identifier, sent as the ``sources`` query parameter.
    date : str
        Start date, sent as the ``from`` query parameter.
    api_key : str, optional
        NewsAPI key. Defaults to the ``NEWS_API_key`` environment variable, the
        same variable the notebook uses to build ``NewsApiClient``. The key is no
        longer hard-coded in the source.
    timeout : float, optional
        Seconds to wait for the HTTP request so a stalled connection cannot hang
        the notebook.

    Returns
    -------
    dict
        The decoded JSON body. The HTTP status is deliberately not checked, so
        API error payloads are returned to the caller as-is.

    Raises
    ------
    RuntimeError
        If no API key is passed and ``NEWS_API_key`` is not set.
    """
    api_key = api_key or os.getenv("NEWS_API_key")
    if not api_key:
        raise RuntimeError(
            "NewsAPI key not found: pass api_key or set the NEWS_API_key environment variable."
        )

    # Results are sorted by popularity; `params` lets requests handle URL encoding.
    response = requests.get(
        'https://newsapi.org/v2/everything',
        params={
            'sources': source_id,
            'from': date,
            'sortBy': 'popularity',
            'apiKey': api_key,
        },
        timeout=timeout,
    )
    return response.json()

In [93]:
def json_to_df(data, timeout=10):
    """Turn a NewsAPI ``/v2/everything`` response into a DataFrame of scraped articles.

    Keeps the 'author', 'title', 'url', 'description' and 'publishedAt' fields
    (the last renamed to 'date') and adds a 'content' column: the text of all
    ``<p>`` tags found at each article URL, joined with single spaces.

    Parameters
    ----------
    data : dict
        Parsed JSON response; the articles are read from ``data['articles']``.
    timeout : float, optional
        Seconds to wait for each article download so one unresponsive site
        cannot hang the whole run.

    Returns
    -------
    pandas.DataFrame
        One row per article. An empty DataFrame is returned (and a message
        printed) when 'articles' is missing, e.g. for an API error payload, or
        when the list is empty.

    Notes
    -----
    A failed download is recorded as the string ``"Error: <message>"`` in
    'content'; HTTP error statuses are treated as failures rather than being
    scraped as if they were article text.
    """
    try:
        articles = data['articles']
        if not articles:
            raise ValueError("No articles found for the given date.")
        df = pd.DataFrame(articles)
        df = df[['author', 'title', 'url', 'description', 'publishedAt']]
        df = df.rename(columns={'publishedAt': 'date'})
    except KeyError as e:
        print(f"KeyError: {e}")
        return pd.DataFrame()
    except ValueError as e:
        print(f"ValueError: {e}")
        return pd.DataFrame()

    # Adding the content column by scraping the URLs
    contents = []
    for url in df['url']:
        try:
            response = requests.get(url, timeout=timeout)
            # Reject 4xx/5xx responses instead of scraping their error pages
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            contents.append(' '.join(p.get_text() for p in soup.find_all('p')))
        except Exception as e:
            # Best effort: keep the row and record why the download failed
            contents.append(f"Error: {str(e)}")

    df['content'] = contents
    return df

Run the next block only after filtering

In [94]:
# df_categories['data'] = df_categories.apply(
#     lambda row: fetch_json(row['id'], row['yesterday_date']),
#     axis=1
# )

In [95]:
df_categories

Unnamed: 0_level_0,Unnamed: 1_level_0,id,description,url,language,country,yesterday_date
Category,Source Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
general,ABC News,abc-news,"Your trusted source for breaking news, analysi...",https://abcnews.go.com,en,us,2024-06-15
general,Al Jazeera English,al-jazeera-english,"News, analysis from the Middle East and worldw...",https://www.aljazeera.com,en,us,2024-06-15
general,BBC News,bbc-news,"Use BBC News for up-to-the-minute news, breaki...",https://www.bbc.co.uk/news,en,gb,2024-06-15
general,Google News,google-news,"Comprehensive, up-to-date news coverage, aggre...",https://news.google.com,en,us,2024-06-15
general,Google News (India),google-news-in,"Comprehensive, up-to-date India news coverage,...",https://news.google.com,en,in,2024-06-15
general,Politico,politico,"Political news about Congress, the White House...",https://www.politico.com,en,us,2024-06-15
general,Reddit /r/all,reddit-r-all,"Reddit is an entertainment, social news networ...",https://www.reddit.com/r/all,en,us,2024-06-15
general,Reuters,reuters,Reuters.com brings you the latest news from ar...,https://www.reuters.com,en,us,2024-06-15
general,The Hindu,the-hindu,"The Hindu. latest news, analysis, comment, in-...",http://www.thehindu.com,en,in,2024-06-15
general,The Times of India,the-times-of-india,Times of India brings the Latest News and Top ...,http://timesofindia.indiatimes.com,en,in,2024-06-15


demo

# Initial pipeline created
It will be improved in the next section.

### Improved Pipeline

In [131]:
from newsapi import NewsApiClient
import pandas as pd
import numpy as np1
from datetime import datetime, timedelta
import requests
import json
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import os

load_dotenv()

# Authenticate with the key held in the NEWS_API_key environment variable
# (load_dotenv() has already been called) and download the source catalogue.
newsapi = NewsApiClient(api_key=os.environ.get("NEWS_API_key"))
data = newsapi.get_sources()

# Group the full source records twice: once by category and once by country
categories = {}
countries = {}

for source in data['sources']:
    # setdefault creates the bucket the first time a key is seen, then returns it
    categories.setdefault(source['category'], []).append(source)
    countries.setdefault(source['country'], []).append(source)

def create_dataframe(data_dict, outer_key_name, inner_key_name):
    """Flatten a ``{group: [source, ...]}`` mapping into a two-level indexed DataFrame.

    Parameters
    ----------
    data_dict : dict
        Maps a grouping key (e.g. a category or a country) to a list of source
        dicts. Each source dict must provide 'name', 'id', 'description', 'url',
        'category', 'language' and 'country'.
    outer_key_name : str
        Name of the first index level; it holds the grouping key.
    inner_key_name : str
        Name of the second index level; it holds the source's 'name'.

    Returns
    -------
    pandas.DataFrame
        One row per source, indexed by ``[outer_key_name, inner_key_name]``.
        An empty mapping yields an empty frame with the same index levels and
        columns instead of raising ``KeyError`` from ``set_index``.

    Raises
    ------
    KeyError
        If a source dict lacks one of the required fields.
    """
    source_fields = ('id', 'description', 'url', 'category', 'language', 'country')

    records = [
        {
            outer_key_name: outer_key,
            inner_key_name: source['name'],
            **{field: source[field] for field in source_fields},
        }
        for outer_key, sources in data_dict.items()
        for source in sources
    ]

    # Passing the columns explicitly keeps the frame well-formed when there are no
    # records; dict.fromkeys de-duplicates (order-preserving) in case an index name
    # coincides with one of the source fields.
    columns = list(dict.fromkeys((outer_key_name, inner_key_name, *source_fields)))
    return pd.DataFrame(records, columns=columns).set_index([outer_key_name, inner_key_name])

# Date string (YYYY-MM-DD) used as the query start date.
# Despite the variable/column name, this is three days before now (days=3).
yesterday_date = (datetime.now() - timedelta(days=3)).strftime('%Y-%m-%d')

# Category-level table: stamp every row with the query date and drop the
# 'category' column, which duplicates the 'Category' index level
df_categories = (
    create_dataframe(categories, 'Category', 'Source Name')
    .assign(yesterday_date=yesterday_date)
    .drop(labels='category', axis=1)
)

# Country-level table
df_countries = create_dataframe(countries, 'Country', 'Source Name')

# Hand-picked source ids to keep; only used for membership filtering.
# Fixes the misspelt 'asutralian-financial-review' (the API id is
# 'australian-financial-review', so the typo silently matched nothing) and drops
# the repeated 'entertainment-weekly', 'the-hindu' and 'fortune' entries.
list_sources = [
    'abc-news', 'al-jazeera-english', 'bbc-news', 'google-news', 'google-news-in',
    'politico', 'reuters', 'reddit-r-all', 'the-hindu', 'bloomberg',
    'the-times-of-india', 'business-insider', 'vice-news', 'the-wall-street-journal',
    'wired', 'national-geographic', 'bbc-sport', 'espn', 'techcrunch',
    'crypto-coins-news', 'buzzfeed', 'entertainment-weekly', 'medical-news-today',
    'fortune', 'new-scientist', 'the-lad-bible', 'new-york-magazine',
    'the-washington-times', 'the-washington-post', 'cnn', 'ign', 'the-verge',
    'fox-sports', 'talksport', 'the-sport-bible', 'hacker-news', 'recode',
    'the-next-web', 'time', 'cbs-news', 'australian-financial-review',
    'business-insider-uk', 'financial-post', 'info-money',
]

# Keep only the rows whose source id is in the hand-picked list
df_categories = df_categories.loc[df_categories['id'].isin(list_sources)]

def fetch_json(source_id, date, api_key=None, timeout=30):
    """Query the NewsAPI ``/v2/everything`` endpoint for one source and return the parsed JSON.

    Parameters
    ----------
    source_id : str
        NewsAPI source identifier, sent as the ``sources`` query parameter.
    date : str
        Start date, sent as the ``from`` query parameter.
    api_key : str, optional
        NewsAPI key. Defaults to the ``NEWS_API_key`` environment variable, the
        same variable the notebook uses to build ``NewsApiClient``. The key is no
        longer hard-coded in the source.
    timeout : float, optional
        Seconds to wait for the HTTP request so a stalled connection cannot hang
        the notebook.

    Returns
    -------
    dict
        The decoded JSON body. The HTTP status is deliberately not checked, so
        API error payloads are returned to the caller as-is.

    Raises
    ------
    RuntimeError
        If no API key is passed and ``NEWS_API_key`` is not set.
    """
    api_key = api_key or os.getenv("NEWS_API_key")
    if not api_key:
        raise RuntimeError(
            "NewsAPI key not found: pass api_key or set the NEWS_API_key environment variable."
        )

    # Results are sorted by popularity; `params` lets requests handle URL encoding.
    response = requests.get(
        'https://newsapi.org/v2/everything',
        params={
            'sources': source_id,
            'from': date,
            'sortBy': 'popularity',
            'apiKey': api_key,
        },
        timeout=timeout,
    )
    return response.json()

def json_to_df(data, timeout=10):
    """Turn a NewsAPI ``/v2/everything`` response into a DataFrame of scraped articles.

    Keeps the 'author', 'title', 'url', 'description' and 'publishedAt' fields
    (the last renamed to 'date') and adds a 'content' column: the text of all
    ``<p>`` tags found at each article URL, joined with single spaces.

    Parameters
    ----------
    data : dict
        Parsed JSON response; the articles are read from ``data['articles']``.
    timeout : float, optional
        Seconds to wait for each article download so one unresponsive site
        cannot hang the whole run.

    Returns
    -------
    pandas.DataFrame
        One row per article. An empty DataFrame is returned (and a message
        printed) when 'articles' is missing, e.g. for an API error payload, or
        when the list is empty.

    Notes
    -----
    A failed download is recorded as the string ``"Error: <message>"`` in
    'content'; HTTP error statuses are treated as failures rather than being
    scraped as if they were article text.
    """
    try:
        articles = data['articles']
        if not articles:
            raise ValueError("No articles found for the given date.")
        df = pd.DataFrame(articles)
        df = df[['author', 'title', 'url', 'description', 'publishedAt']]
        df = df.rename(columns={'publishedAt': 'date'})
    except KeyError as e:
        print(f"KeyError: {e}")
        return pd.DataFrame()
    except ValueError as e:
        print(f"ValueError: {e}")
        return pd.DataFrame()

    # Adding the content column by scraping the URLs
    contents = []
    for url in df['url']:
        try:
            response = requests.get(url, timeout=timeout)
            # Reject 4xx/5xx responses instead of scraping their error pages
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            contents.append(' '.join(p.get_text() for p in soup.find_all('p')))
        except Exception as e:
            # Best effort: keep the row and record why the download failed
            contents.append(f"Error: {str(e)}")

    df['content'] = contents
    return df
    
def remove_empty_content(df):
    """Drop rows whose 'content' is empty or whitespace-only and renumber the index.

    A frame without a 'content' column is handed back untouched.
    """
    if 'content' not in df.columns:
        return df

    # A stripped string is falsy exactly when nothing but whitespace was present
    has_text = df['content'].str.strip().astype(bool)
    return df[has_text].reset_index(drop=True)




NewsAPIException: {'status': 'error', 'code': 'rateLimited', 'message': 'You have made too many requests recently. Developer accounts are limited to 100 requests over a 24 hour period (50 requests available every 12 hours). Please upgrade to a paid plan if you need more requests.'}

In [122]:
# Work on a copy so the filtered source table (df_categories) is left unchanged
df_trial = df_categories.copy()
# For every source row: query NewsAPI for that source id and date, then convert the
# response into a DataFrame of scraped articles. Each cell of 'data' therefore holds
# one DataFrame (an empty one when the response had no usable 'articles').
df_trial['data'] = df_trial.apply(
    lambda row: json_to_df(fetch_json(row['id'], row['yesterday_date'])),
    axis=1
)

# Apply the function to clean up each DataFrame in the 'data' column
df_trial['data'] = df_trial['data'].apply(remove_empty_content)
# Remove rows where 'data' column has empty DataFrames
df_trial1 = df_trial[df_trial['data'].apply(lambda x: not x.empty)].copy()

ValueError: No articles found for the given date.
ValueError: No articles found for the given date.
ValueError: No articles found for the given date.
ValueError: No articles found for the given date.
ValueError: No articles found for the given date.
ValueError: No articles found for the given date.
ValueError: No articles found for the given date.
KeyError: 'articles'
KeyError: 'articles'


In [132]:
df_trial.shape

(26, 7)

In [133]:
df_trial1

Unnamed: 0_level_0,Unnamed: 1_level_0,id,description,url,language,country,yesterday_date,data
Category,Source Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
general,ABC News,abc-news,"Your trusted source for breaking news, analysi...",https://abcnews.go.com,en,us,2024-06-13,...
general,Al Jazeera English,al-jazeera-english,"News, analysis from the Middle East and worldw...",https://www.aljazeera.com,en,us,2024-06-13,author ...
general,BBC News,bbc-news,"Use BBC News for up-to-the-minute news, breaki...",https://www.bbc.co.uk/news,en,gb,2024-06-13,author \ 0 ht...
general,Google News,google-news,"Comprehensive, up-to-date news coverage, aggre...",https://news.google.com,en,us,2024-06-13,author ...
general,Google News (India),google-news-in,"Comprehensive, up-to-date India news coverage,...",https://news.google.com,en,in,2024-06-13,author ...
general,The Times of India,the-times-of-india,Times of India brings the Latest News and Top ...,http://timesofindia.indiatimes.com,en,in,2024-06-13,author ...
general,Vice News,vice-news,"Vice News is Vice Media, Inc.'s current affair...",https://news.vice.com,en,us,2024-06-13,author \ 0 ...
business,Bloomberg,bloomberg,"Bloomberg delivers business and markets news, ...",http://www.bloomberg.com,en,us,2024-06-13,author ...
business,Business Insider,business-insider,Business Insider is a fast-growing business si...,http://www.businessinsider.com,en,us,2024-06-13,author ...
business,Fortune,fortune,Fortune 500 Daily and Breaking Business News,http://fortune.com,en,us,2024-06-13,...


In [136]:
# Write the per-source table to CSV.
# NOTE(review): index=False leaves the (Category, Source Name) index out of the file, and
# the 'data' cells hold DataFrames, which are presumably written as their text
# representation rather than as structured rows — confirm this file is usable downstream.
df_trial1.to_csv("trial.csv", index=False)

In [138]:
# Export the DataFrame to line-delimited JSON: one JSON object per source row
# NOTE(review): orient='records' does not write the (Category, Source Name) index — confirm
# that dropping it is intended.
df_trial1.to_json('trial.json', orient='records', lines=True)


In [143]:
df_trial1

Unnamed: 0_level_0,Unnamed: 1_level_0,id,description,url,language,country,yesterday_date,data
Category,Source Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
general,ABC News,abc-news,"Your trusted source for breaking news, analysi...",https://abcnews.go.com,en,us,2024-06-13,...
general,Al Jazeera English,al-jazeera-english,"News, analysis from the Middle East and worldw...",https://www.aljazeera.com,en,us,2024-06-13,author ...
general,BBC News,bbc-news,"Use BBC News for up-to-the-minute news, breaki...",https://www.bbc.co.uk/news,en,gb,2024-06-13,author \ 0 ht...
general,Google News,google-news,"Comprehensive, up-to-date news coverage, aggre...",https://news.google.com,en,us,2024-06-13,author ...
general,Google News (India),google-news-in,"Comprehensive, up-to-date India news coverage,...",https://news.google.com,en,in,2024-06-13,author ...
general,The Times of India,the-times-of-india,Times of India brings the Latest News and Top ...,http://timesofindia.indiatimes.com,en,in,2024-06-13,author ...
general,Vice News,vice-news,"Vice News is Vice Media, Inc.'s current affair...",https://news.vice.com,en,us,2024-06-13,author \ 0 ...
business,Bloomberg,bloomberg,"Bloomberg delivers business and markets news, ...",http://www.bloomberg.com,en,us,2024-06-13,author ...
business,Business Insider,business-insider,Business Insider is a fast-growing business si...,http://www.businessinsider.com,en,us,2024-06-13,author ...
business,Fortune,fortune,Fortune 500 Daily and Breaking Business News,http://fortune.com,en,us,2024-06-13,...
