# Python LAB - Natural Language Processing in Finance [2.1]

## 0 - Packages

In [9]:
import pandas as pd
import numpy as np
import time
import nltk
import warnings
warnings.filterwarnings('ignore')

from htmldate import find_date
from bs4 import BeautifulSoup
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from newspaper import Article

from selenium import webdriver
from selenium.webdriver.common.by import By
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.downloader.download('vader_lexicon')
from htmldate import find_date

from newspaper import Article

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/marco/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## 1 - Starting the Webdriver

In order for the webscraping function to work you need to install in you computer the webdriver. Depending on the computer you are using and the version of the browser you have (I suggest Chrome, but it works also on other browsers) you will need to download it and put it in the same folder you are working on.

Take a look at the following links in case you are interested:
* [How to install the webdriver](https://www.youtube.com/watch?v=2WVxzRD6Ds4)
* [Chromedriver download](https://chromedriver.chromium.org/) 

In [10]:
driver = webdriver.Chrome()

## 2 - Collecting News Data

### 2.1 News-collection Function

In [12]:
def get_newslinks(company, page_number):
    """For a given URL, scroll to relevant section to load appropriate HTML into driver,
    iterate through all articles on page and append article URLs to a list

    :param company: name of company to scrape articles for
    :param page_number: page number on news website to iterate over 

    :return: list of articles URLs
    """
    
    url = f"https://uk.investing.com/equities/{company}-news/{page_number}"
    driver.get(url)
    
    old_position = 0
    new_position = None

    while new_position != old_position:
        # Get old scroll position
        old_position = driver.execute_script(
                ("return (window.pageYOffset !== undefined) ?"
                " window.pageYOffset : (document.documentElement ||"
                " document.body.parentNode || document.body);"))
        # Sleep and Scroll
        time.sleep(1)
        driver.execute_script((
                "var scrollingElement = (document.scrollingElement ||"
                " document.body);scrollingElement.scrollTop ="
                " scrollingElement.scrollHeight;"))
        # Get new position
        new_position = driver.execute_script(
                ("return (window.pageYOffset !== undefined) ?"
                " window.pageYOffset : (document.documentElement ||"
                " document.body.parentNode || document.body);"))
        
    cleaned_links = []

    # Iterate through all the articles on the page
    for art_n in range(1,11): 
        article = driver.find_element(By.XPATH, value = f'/html/body/div[1]/div[2]/div[2]/div[2]/div[1]/div[2]/ul/li[{art_n}]/article/div')
        article_html = article.get_attribute('innerHTML')
        soup = BeautifulSoup(article_html, "lxml")
        for link in soup.find_all('a'): 
            # Get the href
            partial_link = link.get('href')
            if partial_link is not None:
                if 'https' in partial_link: 
                    cleaned_links.append(partial_link)
                # Some links are 'internal' to the page and don't have https in them. The web page must be appended to these links
                elif partial_link[0] == '/': 
                    cleaned_links.append('https://uk.investing.com/'+partial_link) 

    return np.unique(cleaned_links)

### 2.2 News Collection

In [13]:
all_company_urls = []
for page in range(1,5):
    results = get_newslinks('oracle-corp', page)
    all_company_urls.extend(results)

driver.quit()

In [14]:
all_company_urls

['https://uk.investing.com//news/stock-market-news/1000-invested-in-this-stock-20-years-ago-would-be-worth-8500-today-3330008',
 'https://uk.investing.com//news/stock-market-news/90s-tech-titans-are-outshining-magnificent-seven--analyst-tells-why-market-is-going-gaga-over-them-3312571',
 'https://uk.investing.com//news/stock-market-news/north-carolina-adopts-oracle-cloud-erp-for-financial-management-93CH-3333821',
 'https://uk.investing.com//news/stock-market-news/oracle-named-a-leader-in-the-idc-marketscape-us-ehr-vendor-assessment-93CH-3329846',
 'https://uk.investing.com//news/stock-market-news/orc--analyzing-orchid-island-caps-short-interest-3312894',
 'https://uk.investing.com//pro/offers/breaking-news-offer?referral=3326567_news_8',
 'https://uk.investing.com//news/economy/as-davos-crowd-gathers-governments-urged-to-rein-in-billionaire-class-3298033',
 'https://uk.investing.com//news/stock-market-news/frances-ovhcloud-firstquarter-sales-jump-12-on-europe-boost-3294590',
 'https:/

## 3 - Sentiment Analysis

In [15]:
# stock ticker
ticker = 'ORCL'
# Create a DataFrame to populate while iterating
article_sentiments = pd.DataFrame({'ticker':[],
                                'publish_date':[],
                                'title': [],
                                'body_text': [],
                                'url':[],
                                'neg':[],
                                'neu':[], 
                                'pos':[], 
                                'compound':[]})
# Loop over all the articles
for link in all_company_urls:
      article = Article(link)
      article.download()
      
      try:
          article.parse()
          text = article.text

      except: 
          print("I didn't get this")
          continue

      # Initialise sentiment analyser    
      sid = SentimentIntensityAnalyzer()
      # Get positive, negative, neutral and compound scores
      polarity = sid.polarity_scores(text)

      tmpdic = {'ticker': ticker, 'publish_date': find_date(link), 'title': article.title, 'body_text': article.text, 'url': link}
      # Update ticker with the new entry polarity
      tmpdic.update(polarity)
      # tmpdic now has all keys and values needed to populate the DataFrame
      article_sentiments= article_sentiments.append(pd.DataFrame(tmpdic, index=[0]))
      article_sentiments.reset_index(drop=True, inplace=True)


In [16]:
article_sentiments

Unnamed: 0,ticker,publish_date,title,body_text,url,neg,neu,pos,compound
0,ORCL,2024-02-08,$1000 Invested In This Stock 20 Years Ago Woul...,"Published Feb 08, 2024 15:30 Updated Feb 08, 2...",https://uk.investing.com//news/stock-market-ne...,0.0,0.952,0.048,0.7717
1,ORCL,2024-01-26,90's Tech Titans Are Outshining 'Magnificent S...,"Published Jan 26, 2024 12:12 Updated Jan 26, 2...",https://uk.investing.com//news/stock-market-ne...,0.011,0.871,0.118,0.9962
2,ORCL,2024-02-12,North Carolina adopts Oracle Cloud ERP for fin...,"Published Feb 12, 2024 13:04\n\n© Reuters\n\nO...",https://uk.investing.com//news/stock-market-ne...,0.014,0.905,0.081,0.9647
3,ORCL,2024-02-08,Oracle Named a Leader in the IDC MarketScape U...,"Oracle recognized for AI automation, value del...",https://uk.investing.com//news/stock-market-ne...,0.017,0.848,0.135,0.9966
4,ORCL,2024-01-26,Analyzing Orchid Island Cap's Short Interest B...,"Published Jan 26, 2024 14:30 Updated Jan 26, 2...",https://uk.investing.com//news/stock-market-ne...,0.012,0.825,0.164,0.9965
5,ORCL,2024-01-01,InvestingPro,,https://uk.investing.com//pro/offers/breaking-...,0.0,0.0,0.0,0.0
6,ORCL,2024-01-15,"As Davos crowd gathers, governments urged to r...","Published Jan 15, 2024 00:19 Updated Jan 15, 2...",https://uk.investing.com//news/economy/as-davo...,0.034,0.836,0.13,0.9924
7,ORCL,2024-01-11,France's OVHcloud first-quarter sales rise 12%...,"Published Jan 11, 2024 06:47 Updated Jan 11, 2...",https://uk.investing.com//news/stock-market-ne...,0.013,0.881,0.106,0.9744
8,ORCL,2024-01-25,Hollywood Power Play: David Ellison Aims to Tr...,"Published Jan 25, 2024 18:56 Updated Jan 25, 2...",https://uk.investing.com//news/stock-market-ne...,0.041,0.872,0.088,0.9486
9,ORCL,2024-01-23,If You Invested $1000 In This Stock 5 Years Ag...,"Published Jan 23, 2024 18:37 Updated Jan 23, 2...",https://uk.investing.com//news/stock-market-ne...,0.0,0.962,0.038,0.7003


## 4 - Storing Dataframe

In [None]:
article_sentiments.to_csv("Oracle_articles.csv", sep=',', encoding='utf-8', header=True)

with open('nvidia_urls_20210105.txt', 'w') as f:
    for link in all_company_urls: 
        f.write("%s\n" % link)