In [31]:
# !pip install git+https://github.com/psf/requests-html.git#egg=requests-html
%reset
import httpx
import re
import requests
import time
import nltk

import pandas as pd
import yfinance as yf

from bs4 import BeautifulSoup as bs

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options


Once deleted, variables cannot be recovered. Proceed (y/[n])? n
Nothing done.


In [3]:
HEADERS = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
           'Accept-Encoding': 'gzip, deflate, br',
           'Accept-Language': 'en-US,en;q=0.9,nl;q=0.8',
           'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
          }

DATA_LOCATION = 'Datasets/'

HOMEPAGE_URL = 'https://www.investopedia.com/markets-news-4427704'

VUSA_DOWNLOAD_URL = 'https://datahub.io/core/s-and-p-500-companies'
SP_DOWNLOAD_URL = 'https://datahub.io/core/s-and-p-500'
DATAHUB_URL = 'https://datahub.io'


YAHOO_URL = 'https://finance.yahoo.com/'

nltk.download(['punkt', 'wordnet', 'stopwords', 'omw-1.4', 'vader_lexicon'])

options = Options()
options.add_argument('--headless')
options.add_argument('--disable-gpu') 

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mark\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mark\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mark\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Mark\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Mark\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [4]:
def get_response(url, cookies={}, headers=HEADERS):
    r = httpx.get(url, cookies=cookies, headers=headers, follow_redirects=True)
    soup = bs(r.content)
    return soup

In [5]:
def get_card_titles(soup):
    titles = []
    a_tags = soup.find_all('a', class_='card')
    for i in range(len(a_tags)):
        titles.append(a_tags[i].span.text)
    return titles

def get_card_urls(soup):
    urls = []
    a_tags = soup.find_all('a', class_='card')
    for i in range(len(a_tags)):
        urls.append(a_tags[i]['href'])
    return urls

In [6]:
def clean_text(text):
    return re.sub(r'[^\w\s]', '', text)

In [7]:
def get_url_content(urls):
    articles = []
    raw_articles = []
    for url in urls:
        soup = get_response(url)
        paragraphs = soup.find('div', class_='article-content').find_all('p')
        article = ''.join([p.get_text(separator=' ') for p in paragraphs])
        
        raw_articles.append(paragraphs)
        articles.append(clean_text(article))
    return articles, raw_articles

def get_symbols(raw_articles, href_ref):
    symbol_list = []
    for p_list in raw_articles:
        article_symbols = []
        for p in p_list:
            symbols = [a_tag.text for a_tag in p.find_all('a') if re.search(href_ref, a_tag['href'])]
            if symbols:
                for symbol in symbols:
                    article_symbols.append(symbol)
        symbol_list.append(article_symbols)
    return symbol_list

In [8]:
def get_datahub_csv(URL, filename):
    soup = get_response(URL)
    download_url = soup.find_all('table')[1].find_all('a')[1]['href']
    download_url = DATAHUB_URL + download_url
    with open(DATA_LOCATION + filename, 'wb') as f:
        f.write(requests.get(download_url).content)

In [9]:
def get_article_links(driver):
    links = driver.find_elements(By.TAG_NAME, 'a')
    return links

In [10]:
def get_yahoo_news_articles_df():
    yahoo_news_driver = webdriver.Chrome(options=options)

    yahoo_news_driver.get(YAHOO_URL)
    time.sleep(1)
    yahoo_news_driver.find_element(By.CLASS_NAME, 'reject-all').click()
    actions = ActionChains(yahoo_news_driver)
    for i in range(1500):
        actions.scroll_by_amount(0, 500)
    actions.perform()
    actions.reset_actions()

    links = get_article_links(yahoo_news_driver)
    df = pd.DataFrame({'title': [link.text for link in links], 'url': [link.get_attribute('href') for link in links], 'classes': [link.get_attribute('class') for link in links]})
    yahoo_news_driver.quit()
    df = df[df['url'] \
            .str.contains('/news/') & df['classes'].str.contains('js-content-viewer')]\
            .drop_duplicates(subset='url', keep='last')\
            .reset_index(drop=True)
    return df

In [11]:
def progress_bar(current, total, bar_length=20):
    fraction = current / total

    arrow = int(fraction * bar_length - 1) * '-' + '>'
    padding = int(bar_length - len(arrow)) * ' '

    ending = '\n' if current == total else '\r'

    print(f'Progress: [{arrow}{padding}] {int(fraction*100)}%', end=ending)

In [12]:
def scrape_articles(urls):
    articles = []
    raw_articles = []
    yahoo_driver = webdriver.Chrome(options=options)
    yahoo_driver.get(YAHOO_URL)
    yahoo_driver.find_element(By.CLASS_NAME, 'reject-all').click()

    for index, url in enumerate(urls):
        yahoo_driver.get(url)
        try:
            yahoo_driver.find_element(By.CLASS_NAME, 'collapse-button').click()
        except:
            pass
        article_p_elements = yahoo_driver.find_element(By.CLASS_NAME, 'caas-body').find_elements(By.TAG_NAME, 'p')
        article_by_paragraph = [p.text for p in article_p_elements if p.text!='']
        
        raw_article = [bs(element.get_attribute('outerHTML'), "html.parser") for element in article_p_elements]
        article = " ".join(article_by_paragraph)
        clean_article = clean_text(article)
        articles.append(clean_article)
        raw_articles.append(raw_article)
        progress_bar(index+1, len(urls))
    
        
    yahoo_driver.quit()
    
    return articles, raw_articles

In [42]:
def combine_text_dfs(investopedia_df, yahoo_df):
    df1 = investopedia_df.drop(['symbols', 'appears_in_vusa'], axis=1)
    df2 = yahoo_df.drop('classes', axis=1)
    article_df = pd.concat([df1, df2])
    article_df = article_df.reset_index(drop=True)
    return article_df

In [14]:
def clean_and_tokenize(articles):
    lemmatizer = WordNetLemmatizer()
    
    tokenized_articles = []
    for article in articles:
        tokenized_article = nltk.word_tokenize(article)
        tokenized_article = [token for token in tokenized_article if token not in stopwords.words('english')]
        tokenized_article = [lemmatizer.lemmatize(token) for token in tokenized_article]
        
        tokenized_articles.append(tokenized_article)
        
    return tokenized_articles

In [37]:
def get_api_df(vusa_df): 
    symbol_list = list(vusa_df['Symbol'])
    data = yf.download(symbol_list, group_by="ticker")
    filtered_data = data.dropna(axis=1, how='all')
    filtered_data.columns = filtered_data.columns.remove_unused_levels()
    return filtered_data

In [28]:
def combine_dfs(api_df, text_df, downloaded_df):
    pos = text_df.describe().T[:1].reset_index(drop=True)
    neg = text_df.describe().T[1:].reset_index(drop=True)

    pos.columns = ['pos_' + col for col in pos.columns]
    neg.columns = ['neg_' + col for col in neg.columns]

    pos_neg_row = pd.concat([pos, neg], axis=1)
    
    prices_df = api_df.merge(downloaded_df, how='left', left_index=True, right_on='Date')
    
    prices_df = prices_df.iloc[::-1]
    
    final_df = pd.concat([prices_df, pos_neg_row], axis=1)
    
    return final_df

In [38]:
def extract():
    # download
    get_datahub_csv(VUSA_DOWNLOAD_URL, 'vusa_holdings.csv')
    get_datahub_csv(SP_DOWNLOAD_URL, 'time_data.csv')
    
    vusa_df = pd.read_csv(DATA_LOCATION + 'vusa_holdings.csv')
    time_data_df = pd.read_csv(DATA_LOCATION + 'time_data.csv')
    
    #api
    api_df = get_api_df(vusa_df)
    
    #investopedia
    soup = get_response(HOMEPAGE_URL, HEADERS)

    titles = get_card_titles(soup)
    urls = get_card_urls(soup)

    investopedia_df = pd.DataFrame({'url': urls, 'title': titles})

    articles, raw_articles = get_url_content(investopedia_df['url'])
    investopedia_df['article'] = articles
    symbols = get_symbols(raw_articles, 'widgetsymbol')
    investopedia_df['symbols'] = symbols
    
    #yahoo
    yahoo_news_df = get_yahoo_news_articles_df()
    
    yahoo_news_articles, raw_yahoo_news_articles = scrape_articles(yahoo_news_df['url'])

    yahoo_news_df['article'] = yahoo_news_articles
    
    return vusa_df, api_df, time_data_df, investopedia_df, yahoo_news_df


In [39]:
vusa_df, api_df, time_data_df, investopedia_df, yahoo_news_df = extract()

[*********************100%%**********************]  505 of 505 completed


27 Failed downloads:
['PBCT', 'ANTM', 'DISCK', 'BRK.B', 'ABC', 'CERN', 'WLTW', 'INFO', 'XLNX', 'BLL', 'DISCA', 'SIVB', 'ATVI', 'FRC', 'NLSN', 'KSU', 'FBHS', 'PKI', 'RE', 'FISV', 'TWTR', 'DRE', 'VIAC', 'FB', 'CTXS', 'NLOK']: Exception('%ticker%: No timezone found, symbol may be delisted')
['BF.B']: Exception('%ticker%: No price data found, symbol may be delisted (1d 1925-02-08 -> 2024-01-16)')



Progress: [------------------->] 100%


In [43]:
def transform(vusa_df, api_df, time_data_df, investopedia_df, yahoo_news_df):
    symbols_in_vusa = vusa_df['Symbol'].tolist()

    investopedia_df['appears_in_vusa'] = investopedia_df['symbols'].apply(lambda symbols: any(symbol in symbols_in_vusa for symbol in symbols))
    investopedia_df = investopedia_df[investopedia_df['appears_in_vusa']]

    text_df = combine_text_dfs(investopedia_df, yahoo_news_df)

    text_df['tokenized_article'] = clean_and_tokenize(text_df['article'])

    analyzer = SentimentIntensityAnalyzer()
    scaler = MinMaxScaler()
    text_df['positivity_score'] = [analyzer.polarity_scores(article)['pos'] for article in text_df['article']]
    text_df['negativity_score'] = [analyzer.polarity_scores(article)['neg'] for article in text_df['article']]
    
    final_df = combine_dfs(api_df, text_df, time_data_df)
    
    return final_df


In [44]:
final_df = transform(vusa_df, api_df, time_data_df, investopedia_df, yahoo_news_df)
final_df

MergeError: Not allowed to merge between different levels. (2 levels on the left, 1 on the right)

In [None]:
soup = get_response(HOMEPAGE_URL, HEADERS)

titles = get_card_titles(soup)
urls = get_card_urls(soup)

investopedia_df = pd.DataFrame({'url': urls, 'title': titles})

articles, raw_articles = get_url_content(investopedia_df['url'])
investopedia_df['article'] = articles
symbols = get_symbols(raw_articles, 'widgetsymbol')
investopedia_df['symbols'] = symbols

# get_datahub_csv(VUSA_DOWNLOAD_URL, 'vusa_holdings.csv')
# get_datahub_csv(SP_DOWNLOAD_URL, 'time_data.csv')

time_data_df = pd.read_csv(DATA_LOCATION + 'time_data.csv')

# vusa_df = pd.read_csv(DATA_LOCATION + 'vusa_holdings.csv')
# symbols_in_vusa = vusa_df['Symbol'].tolist()

# investopedia_df['appears_in_vusa'] = investopedia_df['symbols'].apply(lambda symbols: any(symbol in symbols_in_vusa for symbol in symbols))
# investopedia_df = investopedia_df[investopedia_df['appears_in_vusa']]

display(investopedia_df)
display(time_data_df)

# yahoo_news_df = get_yahoo_news_articles_df(
# )
# yahoo_news_articles, raw_yahoo_news_articles = scrape_articles(yahoo_news_df['url'])

# yahoo_news_df['article'] = yahoo_news_articles

# df = combine_dfs(investopedia_df, yahoo_news_df)

# df['tokenized_article'] = clean_and_tokenize(df['article'])

# analyzer = SentimentIntensityAnalyzer()
# scaler = MinMaxScaler()
# df['positivity_score'] = [analyzer.polarity_scores(article)['pos'] for article in df['article']]
# df['negativity_score'] = [analyzer.polarity_scores(article)['neg'] for article in df['article']]