In [1]:
import time
import warnings
from pathlib import Path
from urllib.parse import urljoin

import nltk
import numpy as np
import pandas as pd
import requests
import twython
from bs4 import BeautifulSoup
from htmldate import find_date
from newspaper import Article
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from tqdm import tqdm

# Fetch the VADER lexicon used by SentimentIntensityAnalyzer; NLTK reports
# "already up-to-date" and skips the download when the package is present.
nltk.downloader.download('vader_lexicon')
# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings from pandas/selenium.
warnings.filterwarnings('ignore')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ctori\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [7]:
import sys
# NOTE(review): sys.path only controls where Python looks for importable
# modules, so this entry does not help Selenium locate the chromedriver
# binary -- presumably a leftover from a hosted-notebook recipe; confirm
# before removing.
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

from selenium import webdriver
from selenium.webdriver.common.by import By

# Command-line switches passed to Chrome: run without a visible window, and
# relax sandbox / shared-memory usage.
chrome_flags = ('--headless', '--no-sandbox', '--disable-dev-shm-usage')

options = webdriver.ChromeOptions()
for chrome_flag in chrome_flags:
    options.add_argument(chrome_flag)

# Single browser session shared by the scraping cells below.
driver = webdriver.Chrome(options=options)

In [8]:
def get_article_links(company, page_number, browser=None):
    """Collect the article URLs listed on one Investing.com opinion page.

    The page is opened in a Selenium session, scrolled until the scroll
    position stops changing (so lazily loaded entries are rendered), and every
    anchor inside the first element with class ``mb-4`` is inspected.

    Parameters
    ----------
    company : str
        URL slug of the company's opinion section,
        e.g. ``"apple-computer-inc-opinion"``.
    page_number : int
        Page of the listing to open.
    browser : selenium WebDriver, optional
        Session to drive. When omitted, the notebook-level ``driver`` is used.

    Returns
    -------
    numpy.ndarray
        Sorted, de-duplicated absolute URLs; links containing ``"comments"``
        are excluded.

    Notes
    -----
    The session is deliberately left open: it is shared across calls, and
    quitting it here would make every call after the first one fail. The code
    that created the session is responsible for calling ``quit()`` on it.
    Exceptions raised by Selenium (for example when no ``mb-4`` element is
    present) are propagated to the caller.
    """
    base_url = "https://investing.com/equities"
    url = f"{base_url}/{company}/{page_number}"

    # Fall back to the shared notebook session when none is supplied.
    session = driver if browser is None else browser

    # Current vertical scroll offset; the fallback reads ``scrollTop`` so the
    # script always yields a number rather than a DOM element.
    scroll_position_js = (
        "return (window.pageYOffset !== undefined) ? "
        "window.pageYOffset : (document.documentElement || "
        "document.body.parentNode || document.body).scrollTop;"
    )
    scroll_to_bottom_js = (
        "var scrollingElement = (document.scrollingElement || "
        "document.body);scrollingElement.scrollTop = "
        "scrollingElement.scrollHeight;"
    )

    session.get(url)

    # Keep jumping to the bottom until the offset no longer changes.
    old_scroll_position = 0
    new_scroll_position = None
    while new_scroll_position != old_scroll_position:
        old_scroll_position = session.execute_script(scroll_position_js)
        time.sleep(1)  # give lazily loaded content a moment to render
        session.execute_script(scroll_to_bottom_js)
        new_scroll_position = session.execute_script(scroll_position_js)

    # Container holding the article list, then every anchor inside it.
    article_div_element = session.find_element(By.CLASS_NAME, "mb-4")
    article_links = article_div_element.find_elements(By.TAG_NAME, "a")

    cleaned_links = []
    for article_link in article_links:
        href = article_link.get_attribute("href")
        if not href or "comments" in href:
            continue
        if href.startswith("https://"):
            cleaned_links.append(href)
        elif href.startswith("/"):
            # Resolve against the page URL so "/analysis/x" becomes
            # "https://investing.com/analysis/x" instead of being appended
            # after the "/equities" path.
            cleaned_links.append(urljoin(url, href))

    return np.unique(cleaned_links)

In [9]:
# Short company name -> URL slug of its Investing.com opinion section.
companies = {"apple":"apple-computer-inc-opinion", "microsoft":"microsoft-corp-opinion", "amazon":"amazon-com-inc-opinion"}

# Listing pages to visit for every company (inclusive bounds).
FIRST_PAGE = 1
LAST_PAGE = 99

# One list of article URLs per company, derived from `companies` so the two
# cannot drift apart.
article_urls = {company: [] for company in companies}

try:
    for company, slug in companies.items():
        # Tracks links already stored for this company: the same article can
        # be listed on more than one page.
        seen_links = set()
        for page in tqdm(range(FIRST_PAGE, LAST_PAGE + 1), desc=company):
            for link in get_article_links(slug, page):
                link = str(link)  # plain str instead of numpy.str_
                if link not in seen_links:
                    seen_links.add(link)
                    # Append as we go so partial results survive a failure.
                    article_urls[company].append(link)
finally:
    # Release the shared browser once all pages have been visited (or the
    # crawl failed); re-run the driver setup cell before crawling again.
    driver.quit()

Results = ['https://www.investing.com/analysis/3-beatendown-stocks-poised-for-rebound-in-2024-200643983'
 'https://www.investing.com/analysis/3-tech-stocks-to-buy-as-fed-pivot-odds-soar-on-peak-inflation-bets-200643620'
 'https://www.investing.com/analysis/apple-can-the-stock-continue-to-go-higher-200643930'
 'https://www.investing.com/analysis/could-a-strong-us-dollar-curb-rise-of-bitcoin-nasdaq-200643572'
 'https://www.investing.com/analysis/is-warren-buffett-bracing-for-a-market-correction-200643837'
 'https://www.investing.com/analysis/microsoft-may-dethrone-apple-as-the-most-valuable-stock-soon-200643877'
 'https://www.investing.com/analysis/sp-500-5-reasons-to-still-expect-a-yearend-rally-200643540'
 'https://www.investing.com/analysis/stocks-week-ahead-nvidia-earnings-pose-crucial-test-for-yearend-rally-hopes-200643722'
 'https://www.investing.com/analysis/unlock-prograde-portfolio-management-for-sustainable-longterm-returns-200643696'
 'https://www.investing.com/analysis/will-u

In [10]:
# Tickers to process, in output order.
# NOTE(review): Apple's exchange ticker is "AAPL". "APPL" is kept unchanged
# because it is used as the dictionary key here and ends up in the saved file
# names; confirm downstream consumers before renaming.
input_tickers = ['APPL', 'MSFT', 'AMZN']

# Resulting DataFrame of article sentiments for each ticker.
ticker_sentiments = {}

# Mapping between tickers and the company keys used in `article_urls`.
ticker_company_name_data = {'APPL': "apple", 'MSFT': "microsoft", 'AMZN': "amazon"}

# Column order of every per-ticker DataFrame (also used when no article could
# be processed, so the frame still has the expected columns).
SENTIMENT_COLUMNS = [
    'ticker', 'publish_date', 'title', 'body_text', 'url',
    'neg', 'neu', 'pos', 'compound',
]

# One analyzer is reused for all articles instead of being rebuilt per article.
sid = SentimentIntensityAnalyzer()

for input_ticker in input_tickers:
    # Rows are collected in a list and turned into a DataFrame once per
    # ticker; concatenating inside the loop copies the whole frame each time.
    article_records = []

    for link in tqdm(article_urls[ticker_company_name_data[input_ticker]], desc=input_ticker):
        try:
            # Download, parsing and date detection all depend on the remote
            # page, so any of them failing skips just this article.
            article = Article(link)
            article.download()
            article.parse()
            publish_date = find_date(link)
        except Exception as e:
            print(f"Error while parsing article: {e}")
            continue

        text = article.text

        # Article information first, then the neg / neu / pos / compound
        # scores returned by VADER.
        article_data = {
            'ticker': input_ticker,
            'publish_date': publish_date,
            'title': article.title,
            'body_text': text,
            'url': link
        }
        article_data.update(sid.polarity_scores(text))
        article_records.append(article_data)

    ticker_sentiments[input_ticker] = pd.DataFrame(article_records, columns=SENTIMENT_COLUMNS)

In [12]:
# Save each ticker's DataFrame as a pickle and as a CSV file.
# to_pickle / to_csv do not create missing parent folders, so make sure both
# output directories exist before writing.
for output_dir in ("Raw Data/pickle", "Raw Data/csv"):
    Path(output_dir).mkdir(parents=True, exist_ok=True)

for ticker in input_tickers:
    ticker_sentiments[ticker].to_pickle(f"Raw Data/pickle/{ticker}_sentiments_data.pkl")
    ticker_sentiments[ticker].to_csv(f"Raw Data/csv/{ticker}_sentiments_data.csv", sep=',', encoding='utf-8', header=True)