# Imports

In [1]:
# !pip install yfinance --upgrade --no-cache-dir
# !pip install finnhub-python
# !pip3 install news-please

# !pip install torch
# !pip install peft
# !pip install -U accelerate
# !pip install transformers bitsandbytes
# !pip install pynvml
# !pip install gnews
# !pip install selenium
# !pip install webdriver_manager
# !pip install --upgrade undetected-chromedriver
# !pip install -U selenium

News from other sources (Google)

In [6]:
# !pip3 install newspaper3k
from gnews import GNews
import json
from datetime import datetime
import time

ticker = "TSLA"
search_term = "Tesla stock"

file_path = f"{ticker}_news.jsonl"
google_news = GNews()
def get_news(search_terms):
    stock_news = google_news.get_news(search_terms)
    # Define a function to convert date strings into datetime objects
    def parse_date(date_str):
        # Example date: "Thu, 08 Feb 2024 15:35:00 GMT"
        return datetime.strptime(date_str, '%a, %d %b %Y %H:%M:%S GMT')

    # Sort stock_news by parsed 'publish date'
    stock_news = sorted(stock_news, key=lambda x: parse_date(x['published date']), reverse=True)
    
    return stock_news

def get_content(url, homepage_url):
    article = google_news.get_full_article(url, homepage_url)
    return article

import json
from dateutil import parser


def write_articles_to_file(articles, file_path):
    """Function to update or add articles, not adding an article if the title matches an existing record and the published date is within 1 day."""
    
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            existing_articles = [json.loads(line) for line in file]
    except FileNotFoundError:
        existing_articles = []

    updated = False  # Flag to track if any article was updated or added
    number = 0

    for new_article in articles:
            new_article_date = parser.parse(new_article['published date'])
            article_exists = False

            # Iterate over existing articles to find a match
            for i, existing_article in enumerate(existing_articles):
                existing_article_date = parser.parse(existing_article['published date'])

                # Check if the titles match and the dates are within 1 day
                if new_article['title'] == existing_article['title'] and abs((new_article_date - existing_article_date).total_seconds()) / 3600 <= 24:
                    article_exists = True
                    
                    # Optional: Update the existing article if the new article is considered "better"
                    # This condition can be modified depending on what "better" means (e.g., more recent, more complete)
                    if new_article.get('content'):
                        existing_articles[i] = new_article  # Update the article in-place
                        updated = True
                        number += 1
                    
                    break  # Stop looking for more matches

            # If the article does not exist, append it to the list
            if not article_exists:
                existing_articles.append(new_article)
                updated = True
                number += 1

    # Write back to the file only if there was an update
    if updated:
        with open(file_path, 'w', encoding='utf-8') as file:
            for article in existing_articles:
                file.write(json.dumps(article, ensure_ascii=False) + '\n')

    print("New Articles?", updated, ":", number)
    return len(articles)  # Returning count of processed articles for simplicity


def load_articles_with_empty_content(file_path):
    """Load articles from a file, returning only those with empty or missing 'content'."""
    articles_with_empty_content = []

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                article = json.loads(line)
                # Check if 'content' is missing or empty
                if not article.get('content'):
                    articles_with_empty_content.append(article)
    except FileNotFoundError:
        print(f"No file found at {file_path}")
    
    return articles_with_empty_content

def load_articles_with_full_content(file_path):
    """Load articles from a file, returning only those with empty or missing 'content'."""
    articles_with_empty_content = []

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                article = json.loads(line)
                # Check if 'content' is missing or empty
                if article.get('content'):
                    articles_with_empty_content.append(article)
    except FileNotFoundError:
        print(f"No file found at {file_path}")
    
    return articles_with_empty_content

def parse_date(date_str):
    # Example date: "Thu, 08 Feb 2024 15:35:00 GMT"
    return datetime.strptime(date_str, '%a, %d %b %Y %H:%M:%S GMT')

Entry point for the scraping process to get their contents

In [7]:
exclude = ['wsj.com', 'fool.com']

# Only do x articles before continuing (for loop, a skipped article does not count as a step)
def get_news_content(stock_news, file_path, articles_to_scrape=20):
    print("Articles Unscraped:", len(stock_news))
    for i, article in enumerate(stock_news):
        if any(excluded_domain in article['publisher']['href'] for excluded_domain in exclude):
            print("Closed due to  domain exclusion: ", article['publisher']['href'])
            continue
        
        article['content'] = get_content(article['url'], article['publisher']['href']).text
        article = [article]
        
        print("+_+_+_+_Content: ", article[0]['content'][:100])
        
        write_articles_to_file(article, file_path)
        print(articles_to_scrape, "left")
        
        articles_to_scrape -= 1
        if articles_to_scrape == 0:
            break

Start the process every x seconds

In [8]:
while True:
    stock_news = get_news(search_term)
    write_articles_to_file(stock_news, file_path)
    print(stock_news)

    unscrapped_stock_news = load_articles_with_empty_content(file_path)
    unscrapped_stock_news = sorted(unscrapped_stock_news, key=lambda x: parse_date(x['published date']), reverse=True)
    get_news_content(unscrapped_stock_news, file_path)
    
    print("Sleeping")
    for i in range(300):
        time.sleep(1)

New Articles? True : 41
Articles Unscraped: 185
The 'Accept all' button was not found within the given time.
+_+_+_+_Content:  Because these price spikes are a state of mind. And when that state of mind changes – see Tesla.

Nv
New Articles? True : 1
20 left
The 'Accept all' button was not found within the given time.
+_+_+_+_Content:  (Bloomberg) -- There was a time when the backing of some of the world’s deepest pockets and the mere
New Articles? True : 1
19 left
The 'Accept all' button was not found within the given time.
+_+_+_+_Content:  "Benzinga's Top Stocks to Buy Today" There’s only two mistakes you can make when investing. One is n
New Articles? True : 1
18 left
Closed due to  domain exclusion:  https://www.fool.com
The 'Accept all' button was not found within the given time.
