In [1]:
import requests
import pandas as pd
from datetime import datetime, timedelta, timezone
import getpass
import json

In [2]:
# Base URL for News API
NEWSAPI_BASE_URL = 'https://newsapi.org/v2/everything'

# API Key from newsapi.org
newsapi_key = ['5ddc207db38548beb32baf54a71c680e', # N2N
               'fe81eb5f8465452ca563ecc0f39cdf9e', # V2S
               '260e896096e340069621dc89e664e4d7' # S2V
               #'994aa1fc07e74b39acf7df2a61ec6df1'  # N53
              ]

# NEWSAPI Usage
query_string = 'US Presidential Elections OR 2024 Elections OR Election Results OR Democrat OR Republican'

# Global variables
all_articles = []
data_rows = []
run_id = 'UNDEFINED'

# Output Files
articles_csv = "news_data/filtered_news_data.csv"  # Specify the file path
rundata_csv = "news_data/run_data.csv"


articles_columns = ['run_id', 'api_key', 'start_time', 'end_time', 'Total_articles', 'Filtered_articles']
rundata_columns = ["API","Source", "Title", "PublishedAt", "Url"]

In [3]:
# Function to get news data
def get_news(api_key=None, country='us', category=None, query=None, from_time=None, to_time=None, page_size=10):
    # Define parameters
    params = {
        'apiKey': api_key,
        'from': from_time,              # Start date for the articles
        'to': to_time,                # End date for the articles
        'q': query,            # Optional: keyword or phrase to search for
        'pageSize': page_size, # Number of articles to return
        'searchIn': "title,content,description",
        'sortBy': 'popularity',
        'pageSize': 100,
        'language': 'en',
        #'country': country,    # Country code (e.g., 'us' for the United States)
        #'category': category,  # Optional: e.g., 'technology', 'sports'
        #'from': "2024-10-21",              # Start date for the articles
        #'to': "2024-10-21",                # End date for the articles

    }

    # Send GET request to the API
    response = requests.get(NEWSAPI_BASE_URL, params=params)

    #if response.status_code == 200:
    #    articles = response.json().get("articles", [])
    #    for article in articles:
    #        print(f"Title: {article['title']}")
    #        print(f"Source: {article['source']['name']}")
    #        print(f"Published At: {article['publishedAt']}")
    #        print(f"URL: {article['url']}")
    #        print("-" * 80)
    #else:
    #    print(f"Failed to retrieve data: {response.status_code}, {response.text}")
    
    # Check if the request was successful
    if response.status_code == 200:
        # Parse JSON response
        news_data = response.json()
        return news_data['articles']
    else:
        print("Failed to retrieve data:", response.status_code, response.text)
        return None

In [4]:
def create_newsDf_newsapi(articles):
    filtered_data = []
    
    for item in articles:
        title_removed = item.get('title') == '[Removed]'
        source_removed = item.get('source', {}).get('name') == '[Removed]'
        url_videos = 'videos' in item.get('url', '')

        if not ((title_removed and source_removed) or url_videos):
            filtered_data.append({
                'API': 'newsapi',  # Fixed value
                'source': item.get('source', {}).get('name'),
                'title': item.get('title'),
                'publishedAt': item.get('publishedAt'),
                'url': item.get('url')
            })
    return pd.DataFrame(filtered_data)

In [5]:
def print_news(article):
    print(f"Keys: {article[0].keys()}")
    
    for idx, article in enumerate(articles, start=1):
        print(f"{idx}. Title: {article['title']}")
        print(f"   Source: {article['source']['name']}")
        #print(f"   Description: {article['description']}")
        print(f"   Published At: {article['publishedAt']}")
        print(f"   URL: {article['url']}")
        print("-" * 80)

In [6]:
def write_to_csv(df, file_path, head):
    df.to_csv(file_path, index=False, mode='a', header=False)
    print(f"Data written to {file_path}")

In [7]:
def create_runDf_data(api_key, articles, newsDf, start_time, end_time):
    run_data = []

    # Data for this iteration
    length_of_fetched_articles = len(articles)
    length_of_filtered_articles = newsDf.shape[0]

    all_articles.extend(articles)
    
    # Append a row for the DataFrame
    run_data.append({
        "run_id": run_id,
        "api_key": api_key,
        "start_time": start_time,
        "end_time": end_time,
        "Total_articles": length_of_fetched_articles,
        "Filtered_articles": length_of_filtered_articles,
    })

    return pd.DataFrame(run_data)
    

In [8]:
def generate_run_id():
    username = getpass.getuser()
    timestamp = datetime.now().strftime('%Y%m%d%H%M')
    run_id = "{}_{}".format(username, timestamp)
    print("\n\033[94mRun ID: {}\033[0m\n".format(run_id))
    return run_id

In [9]:
def generate_time_blocks(start_time, end_time):
    """
    Generate 6-hour time blocks between start_time and end_time.

    Args:
        start_time (datetime): The start datetime.
        end_time (datetime): The end datetime.

    Returns:
        list: A list of dictionaries containing 'from' and 'to' for each time block.
    """
    time_blocks = []
    current_time = start_time

    while current_time <= end_time:
        # Divide each day into 6-hour blocks
        for start_hour in range(0, 24, 6):
            block_start = datetime(current_time.year, current_time.month, current_time.day, start_hour, 0, 0)
            block_end = block_start + timedelta(hours=6) - timedelta(seconds=1)

            # Ensure block_end does not exceed end_time
            if block_start > end_time:
                break

            time_blocks.append({
                'from': block_start.strftime("%Y-%m-%dT%H:%M:%S"),
                'to': block_end.strftime("%Y-%m-%dT%H:%M:%S")
            })

        # Move to the next day
        current_time += timedelta(days=1)

    return time_blocks

In [10]:
#Generate Run ID
run_id = generate_run_id()

# Define the start and end times
start_time = datetime.utcnow() - timedelta(days=30)
#end_time = datetime.utcnow() - timedelta(days=29)
end_time = datetime.utcnow()

# Call the function
time_blocks = generate_time_blocks(start_time, end_time)

request_count = 0
#newsDf = pd.DataFrame()

# Print the time blocks
for block in time_blocks:
    request_count += 1
    #api_key = (lambda count: newsapi_key[(count // 2) % len(newsapi_key)])(int(request_count))
    api_key = (lambda count: newsapi_key[(count // 40) % len(newsapi_key)])(int(request_count))
    print(f"count: {request_count} api_key: {api_key}")
    
    articles = get_news(api_key,
                        query = query_string, 
                        category='general', 
                        from_time=block['from'],
                        to_time=block['to'],
                        page_size=10)
    
    newsDf = create_newsDf_newsapi(articles)
    write_to_csv(newsDf, articles_csv, articles_columns)

    #if articles:
    #   print_news(articles)

    runDf = create_runDf_data(api_key, articles, newsDf, block['from'], block['to'])
    write_to_csv(runDf, rundata_csv, rundata_columns)

# Save all articles to a JSON file
with open(f"news_data/{run_id}.json", "w") as json_file:
    json.dump(all_articles, json_file, indent=4)


[94mRun ID: nirmv_202411211653[0m

count: 1 api_key: 5ddc207db38548beb32baf54a71c680e


  start_time = datetime.utcnow() - timedelta(days=30)
  end_time = datetime.utcnow()


Data written to news_data/filtered_news_data.csv
Data written to news_data/run_data.csv
count: 2 api_key: 5ddc207db38548beb32baf54a71c680e
Data written to news_data/filtered_news_data.csv
Data written to news_data/run_data.csv
count: 3 api_key: 5ddc207db38548beb32baf54a71c680e
Data written to news_data/filtered_news_data.csv
Data written to news_data/run_data.csv
count: 4 api_key: 5ddc207db38548beb32baf54a71c680e
Data written to news_data/filtered_news_data.csv
Data written to news_data/run_data.csv
count: 5 api_key: 5ddc207db38548beb32baf54a71c680e
Data written to news_data/filtered_news_data.csv
Data written to news_data/run_data.csv
count: 6 api_key: 5ddc207db38548beb32baf54a71c680e
Data written to news_data/filtered_news_data.csv
Data written to news_data/run_data.csv
count: 7 api_key: 5ddc207db38548beb32baf54a71c680e
Data written to news_data/filtered_news_data.csv
Data written to news_data/run_data.csv
count: 8 api_key: 5ddc207db38548beb32baf54a71c680e
Data written to news_data/f