In [None]:
import ast
import base64
import datetime
import json
import os

import openai
import pandas as pd
from fp.fp import FreeProxy
from googlenewsdecoder import gnewsdecoder
from pygooglenews import GoogleNews
from tqdm import tqdm

### get dataset

In [None]:
import pandas as pd
from pygooglenews import GoogleNews
import datetime

# Module-level Google News client; get_news() below sends its searches through it.
gn = GoogleNews()

def get_news(search, start_date=datetime.date(2025, 1, 1), end_date=datetime.date(2025, 2, 28), client=None):
    """Collect Google News stories for a query, one search per one-day window.

    The date range is walked a day at a time: each search covers ``day`` to
    ``day + 1 day``, and the final window ends on ``end_date``.

    Args:
        search: Query string passed to the client's ``search`` method.
        start_date: First day of the range. Defaults to 2025-01-01.
        end_date: Last day of the range. Defaults to 2025-02-28.
        client: Object exposing ``search(query, from_=..., to_=...)`` that
            returns a mapping with an ``'entries'`` list whose items have
            ``title``, ``link`` and ``published`` attributes. Defaults to the
            module-level ``gn`` client.

    Returns:
        A list of dicts with the keys ``'title'``, ``'link'`` and
        ``'published'``, in the order the searches returned them. Stories
        returned by more than one window are not de-duplicated.
    """
    if client is None:
        client = gn

    one_day = datetime.timedelta(days=1)
    stories = []

    # The last date is dropped because it only serves as the end of the final window.
    for day in pd.date_range(start_date, end_date)[:-1]:
        result = client.search(
            search,
            from_=day.strftime('%Y-%m-%d'),
            to_=(day + one_day).strftime('%Y-%m-%d'),
        )
        stories.extend(
            {'title': item.title, 'link': item.link, 'published': item.published}
            for item in result['entries']
        )

    return stories

In [None]:
# Run the scrape for the 'Tariffs' query and tabulate the returned story dicts.
df = pd.DataFrame(get_news('Tariffs'))

In [None]:
# Number of rows currently in df.
len(df) 

In [None]:
# Persist df to disk; later cells reload this same file with pd.read_excel.
df.to_excel("news_dataset.xlsx")

### Decode URLs

### ReIndex DFs

In [None]:
### Decode only about 100 links per run to avoid HTTP 429 (Too Many Requests) errors

In [None]:
def decode_url(link, decoder=None):
    """Resolve a Google News link to the URL it points at.

    Args:
        link: The Google News article link to decode.
        decoder: Callable that takes the link and returns a mapping with a
            truthy ``"status"`` and a ``"decoded_url"`` entry on success.
            Defaults to ``gnewsdecoder``.

    Returns:
        The decoded URL, or the sentinel string ``"error"`` when the decoder
        reports failure or raises.
    """
    try:
        if decoder is None:
            decoder = gnewsdecoder
        decoded_url = decoder(link)

        if decoded_url.get("status"):
            return decoded_url["decoded_url"]
        return "error"

    except Exception as e:
        print(f"Error occurred: {e}")
        # Return the same sentinel as the status-failure branch so callers see
        # one failure value instead of a mix of "error" and None.
        return "error"

In [None]:
# index_col=0 reads the saved index back as the index. Without it the index
# comes back as an extra "Unnamed: 0" column, and every load/save round trip
# through df.to_excel adds another one.
df = pd.read_excel('news_dataset.xlsx', index_col=0)

In [None]:
# Spot-check the 'URL' value at row position 2999 (requires the 'URL' column to exist already).
df.iloc[2999]['URL']

In [None]:
# Decode one slice of rows per run (row positions 4700-4849 here) and store
# each result in the 'URL' column under the row's index label.
for index,row in df.iloc[4700:4850].iterrows(): 
    df.at[index,'URL'] = decode_url(row['link'])

In [None]:
# Spot-check the last row of the slice decoded above.
df.iloc[4849]['URL']

In [None]:
# Write df, including the 'URL' column, back to the same workbook.
df.to_excel("news_dataset.xlsx")

In [None]:
#4849 total articles

### Create Summaries

In [None]:
## TODO: if the article doesn't exist, return nothing; if it's a video, decide how to handle it

In [None]:
from newspaper import Article

In [None]:
# Take the key from the OPENAI_API_KEY environment variable so it never has to
# be pasted into (and saved with) the notebook. Falls back to "" when unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

def _parse_summary_payload(raw):
    """Turn the model's reply text into a dict, raising if that is not possible."""
    text = raw.strip()
    if text.startswith("```"):
        # The reply is wrapped in a Markdown code fence: drop the opening fence
        # line (with any language tag) and the closing fence.
        text = text.split("\n", 1)[1] if "\n" in text else ""
        text = text.rsplit("```", 1)[0].strip()
    try:
        # The prompt asks for JSON, so parse it as JSON (handles null/true/false).
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # Still accept Python-literal syntax (single quotes, None).
        parsed = ast.literal_eval(text)
    if not isinstance(parsed, dict):
        raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")
    return parsed


def summarize_text(url, model="gpt-3.5-turbo", client=None):
    """
    Asks an OpenAI chat model to summarize the article behind a link.

    The prompt contains the link itself and asks for a JSON object with the
    keys summary, entities and topics.

    NOTE(review): only the URL is sent, not the article text; whether the
    chosen model can actually read the page should be confirmed.

    Args:
        url: Link to the article to summarize.
        model: The OpenAI model to use (default: "gpt-3.5-turbo").
        client: Optional object exposing ``chat.completions.create``;
            defaults to the ``openai`` module.

    Returns:
        The reply parsed into a dict, or None if the request fails or the
        reply cannot be parsed into a dict.
    """
    prompt = f"""Please analyze the following link:\n{url}
    Create a paragraph summary of the article and extract the key entities (people, organizations, countries) as well as the key topics of the article. If the link
    directs to a page that doesn't exist or get redirected to a new page, then return None for the summary, the entities, and topics

    Format your return as a JSON format where the keys are summary, entities, and topics and the summary value is a paragraph of. The entities and topics values
    should be a list of strings. 

    """

    try:
        if client is None:
            client = openai
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": prompt}
            ]
        )
        return _parse_summary_payload(response.choices[0].message.content)
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [None]:
# Reload the dataset and create empty result columns for the summarisation step.
df = pd.read_excel('news_dataset.xlsx', index_col=0)
df['summary'] = None
df['entities'] = None
df['topics'] = None
# 'entities' and 'topics' will hold lists, so keep both columns as object dtype.
df['entities'] = df['entities'].astype('object')
df['topics'] = df['topics'].astype('object')

In [None]:
# Reload df from the workbook. This replaces the in-memory df, so any columns
# added since the last df.to_excel call are discarded.
df = pd.read_excel('news_dataset.xlsx', index_col=0)

In [None]:
# Summarise one slice of rows per run (row positions 4000-4744 here).
# Iterating the slice's index, which has a length, lets tqdm show a total.
for index in tqdm(df.iloc[4000:4745].index):
    url = df.at[index, 'URL']
    summaries = summarize_text(url)
    if not isinstance(summaries, dict):
        # Retry once before giving up on this row.
        summaries = summarize_text(url)
    if not isinstance(summaries, dict):
        # Leave the row empty rather than abort the whole batch with a
        # TypeError from subscripting None.
        continue
    for field in ('summary', 'entities', 'topics'):
        # .get tolerates a reply that omits one of the requested keys.
        df.at[index, field] = summaries.get(field)

In [None]:
# Write df, including the summary columns, back to the same workbook.
df.to_excel("news_dataset.xlsx")

In [None]:
# Number of rows currently in df.
len(df)

In [None]:
# Show every column of the row at position 3998.
df.iloc[3998]

In [None]:
# summarize_text already returns the parsed reply (or None on failure), so it
# must not be passed through ast.literal_eval again — that only accepts strings
# and AST nodes and raises on a dict or None.
results = summarize_text(df.iloc[474]['URL'])

In [None]:
## Need error catching and retrying if ChatGPT breaks/fails
## Sometimes ChatGPT can't read the article

In [None]:
# Actors and what they broadly want to do
# sorting and applying things? 

### GovInfo

In [None]:
import requests
import csv
from bs4 import BeautifulSoup
import pandas as pd

# The API key is read from the GOVINFO_API_KEY environment variable so that it
# is never stored in the notebook; a missing variable raises KeyError here.
# NOTE(review): the key that used to be embedded in this URL should be treated
# as exposed and rotated.
content_url = (
    "https://api.govinfo.gov/published/2025-01-01/2025-02-28"
    f"?offsetMark=*&pageSize=100&collection=CPD&api_key={os.environ['GOVINFO_API_KEY']}"
)
content_response = requests.get(content_url)
# Fail here with the HTTP error rather than later when the body is parsed as JSON.
content_response.raise_for_status()

In [None]:
# Parse the body of content_response as JSON.
content_data = content_response.json()

In [None]:
# Read the key from the environment instead of storing it in the notebook;
# a missing GOVINFO_API_KEY raises KeyError before any request is made.
API_KEY = os.environ["GOVINFO_API_KEY"]
BASE_URL = "https://api.govinfo.gov"

documents_list = []

# NOTE(review): only the packages present in content_data are fetched; if the
# listing spans more than one page the remaining pages are not followed — confirm.
for package in content_data['packages']:
    title = package['title']
    date_issued = package['dateIssued']
    # Loop-local names keep content_url / content_response / content_data
    # pointing at the package listing, so this cell can be re-run and the
    # listing can still be inspected afterwards.
    package_url = f"{BASE_URL}/packages/{package['packageId']}/htm?api_key={API_KEY}"
    package_response = requests.get(package_url)
    if package_response.status_code == 200:
        soup = BeautifulSoup(package_response.text, 'html.parser')
        content = str(soup.get_text())
    else:
        # Keep a row for the document even when its text could not be fetched.
        content = "Error retrieving content"
    documents_list.append([title, date_issued, content])

# Convert to DataFrame (this rebinds df, replacing any earlier DataFrame of that name)
df = pd.DataFrame(documents_list, columns=["Title", "Date", "Content"])

# Save as Excel file
xlsx_filename = "cpd_documents.xlsx"
df.to_excel(xlsx_filename, index=False)

print(f"Data successfully saved to {xlsx_filename}")

In [None]:
# Display the current value of content_data.
content_data