In [4]:
import json
import os
import re
from datetime import datetime, timedelta

import pandas as pd
import requests

# The NewsAPI key is read from the environment so the credential is never
# stored in the notebook. Set NEWSAPI_KEY before running the fetch step;
# when it is unset the key is an empty string and the API will reject the
# request.
# NOTE(review): a literal key was previously committed here and should be
# treated as compromised (revoke/rotate it).
API_KEY = os.environ.get("NEWSAPI_KEY", "")
# Endpoint queried by fetch_articles.
URL = "https://newsapi.org/v2/everything"

In [2]:
def fetch_articles(query, from_date, to_date, page_size=100, timeout=30):
    """Fetch English-language articles matching ``query`` within a date window.

    Sends a single GET request to ``URL``, searching only article titles and
    descriptions. Only one page is requested, so no pagination is performed.

    Args:
        query: Search phrase, sent as the ``q`` parameter.
        from_date: Start of the window, sent as ``from`` (the notebook
            passes ``YYYY-MM-DD`` strings).
        to_date: End of the window, sent as ``to``.
        page_size: Number of results requested, sent as ``pageSize``.
        timeout: Seconds to wait for the server before giving up; without
            a timeout a stalled connection would block indefinitely.

    Returns:
        The list stored under the response's ``"articles"`` key. An empty
        list is returned (after printing an ``Error: ...`` line) when the
        request fails, the body is not valid JSON, or the reported status
        is not ``"ok"``.
    """
    params = {
        "q": query,
        "from": from_date,
        "to": to_date,
        "language": "en",
        "pageSize": page_size,
        "searchIn": "title,description",  # Limit search to title and description
        "apiKey": API_KEY,
    }

    try:
        response = requests.get(URL, params=params, timeout=timeout)
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Network failures, timeouts and non-JSON bodies follow the same
        # print-and-return-empty path as API-level errors, so one bad
        # window does not abort a multi-window fetch loop.
        print(f"Error: {exc}")
        return []

    # Use .get so an unexpected payload cannot raise KeyError.
    if data.get("status") != "ok":
        print(f"Error: {data.get('message', 'unknown error')}")
        return []

    # A missing or null "articles" entry is reported as an empty list.
    return data.get("articles") or []


def save_articles_to_json(articles, filename):
    """Write ``articles`` to ``filename`` as JSON indented by four spaces.

    The parent directory of ``filename`` is created when it does not exist,
    so saving to e.g. ``dataset/...`` works on a fresh checkout. The file
    is written as UTF-8 regardless of the platform's default encoding.

    Args:
        articles: JSON-serialisable object (the notebook passes a list of
            article dicts).
        filename: Destination path; an existing file is overwritten.
    """
    parent_dir = os.path.dirname(filename)
    if parent_dir:  # empty when filename has no directory component
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, "w", encoding="utf-8") as file:
        json.dump(articles, file, indent=4)


# Function to split date range into weekly intervals
def date_range_splitter(start_date, end_date, delta=7):
    """Split the span from ``start_date`` to ``end_date`` into consecutive windows.

    Each window covers ``delta`` days, except the last one, which is
    clamped so that it never extends past ``end_date``. Consecutive windows
    share a boundary date (the end of one is the start of the next).

    Args:
        start_date: ``datetime`` marking the start of the first window.
        end_date: ``datetime`` marking the end of the last window.
        delta: Window length in days; must be positive.

    Returns:
        A list of ``(from, to)`` tuples of ``YYYY-MM-DD`` strings. The list
        is empty when ``start_date`` is not before ``end_date``.

    Raises:
        ValueError: If ``delta`` is not positive (the loop could otherwise
            never reach ``end_date``).
    """
    if delta <= 0:
        raise ValueError("delta must be a positive number of days")

    step = timedelta(days=delta)
    date_ranges = []
    current_date = start_date
    while current_date < end_date:
        # Clamp so the final window stops at end_date instead of overshooting.
        next_date = min(current_date + step, end_date)
        date_ranges.append(
            (current_date.strftime("%Y-%m-%d"), next_date.strftime("%Y-%m-%d"))
        )
        current_date = next_date
    return date_ranges

In [3]:
# # Set the date range: from a fixed start date (2024-09-16) up to today
# today = datetime.now()
# start_date = datetime(2024, 9, 16)
# date_ranges = date_range_splitter(
#     start_date, today, delta=7
# )  # Split into weekly intervals

# all_trump_articles = []
# all_harris_articles = []

# # Fetch articles for each date range
# for from_date, to_date in date_ranges:
#     print(f"Fetching Trump articles from {from_date} to {to_date}")
#     trump_articles = fetch_articles("Donald Trump", from_date, to_date)
#     all_trump_articles.extend(trump_articles)

#     print(f"Fetching Harris articles from {from_date} to {to_date}")
#     harris_articles = fetch_articles("Kamala Harris", from_date, to_date)
#     all_harris_articles.extend(harris_articles)

# # Save the articles in dataset/ folder
# save_articles_to_json(all_trump_articles, "dataset/trump_articles.json")
# save_articles_to_json(all_harris_articles, "dataset/harris_articles.json")

# print(
#     f"Saved {len(all_trump_articles)} Trump articles and {len(all_harris_articles)} Harris articles."
# )

Fetching Trump articles from 2024-09-16 to 2024-09-23
Fetching Harris articles from 2024-09-16 to 2024-09-23
Fetching Trump articles from 2024-09-23 to 2024-09-30
Fetching Harris articles from 2024-09-23 to 2024-09-30
Fetching Trump articles from 2024-09-30 to 2024-10-07
Fetching Harris articles from 2024-09-30 to 2024-10-07
Fetching Trump articles from 2024-10-07 to 2024-10-14
Fetching Harris articles from 2024-10-07 to 2024-10-14
Fetching Trump articles from 2024-10-14 to 2024-10-21
Fetching Harris articles from 2024-10-14 to 2024-10-21
Saved 500 Trump articles and 500 Harris articles.


In [7]:
# Load the saved articles.
# The files are opened explicitly as UTF-8 so that reading them does not
# depend on the platform's default text encoding.
with open("dataset/trump_articles.json", "r", encoding="utf-8") as file:
    trump_articles = json.load(file)

with open("dataset/harris_articles.json", "r", encoding="utf-8") as file:
    harris_articles = json.load(file)

# Convert to DataFrames for easy manipulation
# (presumably one row per article dict; the cleaning steps rely on the
# 'title' and 'description' columns being present).
df_trump = pd.DataFrame(trump_articles)
df_harris = pd.DataFrame(harris_articles)


# Basic cleaning functions
def clean_text(text):
    """Return a normalised copy of ``text``.

    The text is lower-cased, then two kinds of content are deleted in order:

    1. anything that looks like a URL (``http`` followed by non-whitespace);
    2. every character that is neither an ASCII letter nor whitespace.

    Digits and punctuation are removed outright rather than replaced by a
    space, so ``"It's 2024"`` becomes ``"its "``. Whitespace is kept as is.

    Args:
        text: A string; callers are expected to filter out missing values.

    Returns:
        The cleaned string.
    """
    cleaned = text.lower()
    # Order matters: URLs must go before punctuation is stripped, otherwise
    # their remains would no longer match the URL pattern.
    for pattern in (r"http\S+", r"[^a-zA-Z\s]"):
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned

# Function to filter out entries with "removed" in title or description
def filter_removed_entries(df):
    """Drop rows whose title or description mentions 'removed'.

    The match is a case-insensitive substring search on the ``title`` and
    ``description`` columns; missing values count as "no match". A row is
    kept only when neither column matches.

    NOTE(review): because this is a substring match, genuine articles that
    merely contain the word "removed" are dropped too - confirm that this
    is intended.

    Args:
        df: DataFrame with ``title`` and ``description`` columns.

    Returns:
        The rows of ``df`` that passed the filter (original index kept).
    """
    title_hit = df['title'].str.contains('removed', case=False, na=False)
    description_hit = df['description'].str.contains('removed', case=False, na=False)
    keep = ~title_hit & ~description_hit
    return df[keep]

In [8]:
# Normalise the text columns of both frames: non-null values go through
# clean_text, missing values become an empty string.
for articles_df in (df_trump, df_harris):
    for column in ('title', 'description'):
        articles_df[column] = articles_df[column].apply(
            lambda value: clean_text(value) if pd.notnull(value) else ''
        )

# Drop rows that repeat an earlier (title, description) pair, then discard
# the entries flagged by filter_removed_entries.
df_trump = filter_removed_entries(
    df_trump.drop_duplicates(subset=['title', 'description'])
)
df_harris = filter_removed_entries(
    df_harris.drop_duplicates(subset=['title', 'description'])
)

# Persist the cleaned frames as JSON, one object per article.
df_trump.to_json('dataset/cleaned_trump_articles.json', orient='records', indent=4)
df_harris.to_json('dataset/cleaned_harris_articles.json', orient='records', indent=4)

print(f"Cleaned data: {len(df_trump)} Trump articles and {len(df_harris)} Harris articles saved.")

Cleaned data: 444 Trump articles and 444 Harris articles saved.
