In [None]:
!pip install requests
!pip install bs4
!pip install pandas
!pip install pathlib



In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from pathlib import Path
import re

In [None]:
def get_headlines(url, timeout=10):
  """Scrape the news headline links from a supported finviz page.

  Args:
    url: Either the finviz main news page
      ("https://finviz.com/news.ashx") or a ticker quote page
      ("https://finviz.com/quote.ashx?t=<SYMBOL>").
    timeout: Seconds to wait for the HTTP response; without it a stalled
      connection would block the notebook indefinitely.

  Returns:
    A list of the matching <a> elements (the headline links) found on the
    page, possibly empty, or None when the URL is not supported.
  """
  main_news_url = "https://finviz.com/news.ashx"
  #dots are escaped so only the literal finviz.com host / quote.ashx path match
  pattern = r"(https://finviz\.com/quote\.ashx\?t=.*)"

  #validate the url first so unsupported urls never trigger a network request
  if url == main_news_url:
    link_class = "nn-tab-link"
  elif re.match(pattern, url):
    link_class = "tab-link-news"
  else:
    print("This URL is not supported!")
    return None

  agent = {"User-Agent":"Mozilla/5.0"}
  page = requests.get(url, headers=agent, timeout=timeout).text
  #name the parser explicitly so results do not depend on which parsers are installed
  bs = BeautifulSoup(page, "html.parser")
  headlines = bs.find_all("a", class_=link_class)

  if url == main_news_url:
    #the original scraper treats the first link and everything from index 91
    #onwards as non-news items; slicing keeps items 1..90 and, unlike
    #del headlines[0], does not fail when the page returned no links
    headlines = headlines[1:91]
  return headlines

In [None]:
def headlines_to_df(headlines):
  """Wrap scraped headlines in a data frame ready for labelling.

  Args:
    headlines: List of headlines, or None (which get_headlines returns for
      an unsupported url). None is treated as "no headlines".

  Returns:
    A data frame with a 'news' column holding the headlines and a
    'sentiment' column initialised to 0 for every row.
  """
  #a None 'news' value would make pandas raise ValueError (all-scalar input
  #without an index), so fall back to an empty list
  if headlines is None:
    headlines = []
  news_sentiment_df = pd.DataFrame({
      'news': headlines,
      'sentiment': 0
  })
  return news_sentiment_df

In [None]:
def append_df(news_sentiment_df, file_name):
  """Append a data frame to the csv dataset, creating the file if needed.

  If the file already exists, its rows are loaded, the new rows are added
  after them and exact duplicate rows are dropped before the file is
  rewritten. Otherwise the data frame is written as a new file.

  Args:
    news_sentiment_df: Data frame to store.
    file_name: Path of the csv file, as a string or a pathlib.Path.
  """
  my_file = Path(file_name)
  if my_file.exists():
    #str() keeps the message working when file_name is a Path
    print("Appending to existing file named " + str(file_name))
    #read with the same encoding the file is written with below
    orig_df = pd.read_csv(my_file, encoding='utf-8-sig')
    new_df = pd.concat([orig_df, news_sentiment_df], ignore_index=True).drop_duplicates()
  else:
    print("Creating new file named " + str(file_name))
    new_df = news_sentiment_df
  new_df.to_csv(my_file, index=False, header=True, encoding='utf-8-sig')

In [None]:
#scrapes the finviz main news page plus the quote page of every ticker below
#and merges all headlines found into news_headlines.csv

#ticker symbols whose news should be scraped
ticker_list = [
    "AAPL", "MSFT", "DIS", "INTC", "JNJ", "JPM", "KO", "O", "PFE", "XOM",
    "SPG", "T", "UN", "WM", "RDS-A", "LMT", "COST", "CMCSA", "ADBE", "QCOM",
    "CSCO", "IBM", "TXN",
]

#finviz main news page
finviz_url = "https://finviz.com/news.ashx"

#prefix of a ticker quote page (prefix + ticker symbol = url for that ticker)
ticker_url = "https://finviz.com/quote.ashx?t="

#main news page first, followed by one url per ticker symbol
url_list = [finviz_url] + [ticker_url + symbol for symbol in ticker_list]
print("Start Updating File")

#scrape each url in turn and fold its headlines into the csv
for url in url_list:
  headlines = get_headlines(url)
  scraped_df = headlines_to_df(headlines)
  append_df(scraped_df, "news_headlines.csv")
print("Update Completed")

Start Updating File
Appending to existing file named news_headlines.csv
Appending to existing file named news_headlines.csv
Appending to existing file named news_headlines.csv
Appending to existing file named news_headlines.csv
Appending to existing file named news_headlines.csv
Appending to existing file named news_headlines.csv
Appending to existing file named news_headlines.csv
Appending to existing file named news_headlines.csv
Appending to existing file named news_headlines.csv
Appending to existing file named news_headlines.csv
Appending to existing file named news_headlines.csv
Appending to existing file named news_headlines.csv
Appending to existing file named news_headlines.csv
Appending to existing file named news_headlines.csv
Appending to existing file named news_headlines.csv
Appending to existing file named news_headlines.csv
Appending to existing file named news_headlines.csv
Appending to existing file named news_headlines.csv
Appending to existing file named news_headli