# Python LAB - Natural Language Processing in Finance [2.1]

## 0 - Packages

In [None]:
%pip install nltk
%pip install bs4
%pip install htmldate
%pip install selenium
%pip install newspaper3k

In [1]:
import pandas as pd
import numpy as np
import time
import nltk
import warnings
warnings.filterwarnings('ignore')

from htmldate import find_date
from bs4 import BeautifulSoup
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from newspaper import Article

from selenium import webdriver
from selenium.webdriver.common.by import By
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.downloader.download('vader_lexicon')
from htmldate import find_date

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/marco/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## 1 - Starting the Webdriver

In order for the webscraping function to work you need to install in you computer the webdriver. Depending on the computer you are using and the version of the browser you have (I suggest Chrome, but it works also on other browsers) you will need to download it and put it in the same folder you are working on.

Take a look at the following links in case you are interested:
* [How to install the webdriver](https://www.youtube.com/watch?v=2WVxzRD6Ds4)
* [Chromedriver download](https://chromedriver.chromium.org/) 

In [9]:
driver = webdriver.Chrome()

## 2 - Collecting News Data

### 2.1 News-collection Function

In [11]:
def get_newslinks(company, page_number):
    """For a given URL, scroll to relevant section to load appropriate HTML into driver,
    iterate through all articles on page and append article URLs to a list

    :param company: name of company to scrape articles for
    :param page_number: page number on news website to iterate over 

    :return: list of articles URLs
    """
    
    url = f"https://uk.investing.com/equities/{company}-news/{page_number}"
    driver.get(url)
    
    old_position = 0
    new_position = None

    while new_position != old_position:

        old_position = driver.execute_script(
                ("return (window.pageYOffset !== undefined) ?"
                " window.pageYOffset : (document.documentElement ||"
                " document.body.parentNode || document.body);"))

        time.sleep(1)
        driver.execute_script((
                "var scrollingElement = (document.scrollingElement ||"
                " document.body);scrollingElement.scrollTop ="
                " scrollingElement.scrollHeight;"))

        new_position = driver.execute_script(
                ("return (window.pageYOffset !== undefined) ?"
                " window.pageYOffset : (document.documentElement ||"
                " document.body.parentNode || document.body);"))
        
    cleaned_links = []

    # Iteration on the page
    for art_n in range(1,11): 
        article = driver.find_element(By.XPATH, value = f'/html/body/div[1]/div[2]/div[2]/div[2]/div[1]/div[2]/ul/li[{art_n}]/article/div')
        article_html = article.get_attribute('innerHTML')
        soup = BeautifulSoup(article_html, "lxml")
        for link in soup.find_all('a'): 
            # Getting href
            partial_link = link.get('href')
            if partial_link is not None:
                if 'https' in partial_link: 
                    cleaned_links.append(partial_link)
                # Solving issue with internal links
                elif partial_link[0] == '/': 
                    cleaned_links.append('https://uk.investing.com/'+partial_link) 

    return np.unique(cleaned_links)

### 2.2 News Collection

In [12]:
all_company_urls = []
for page in range(1,3):
    results = get_newslinks('royal-dutch-shell-a-shr', page)
    all_company_urls.extend(results)

driver.quit()

In [13]:
all_company_urls

['https://uk.investing.com//news/commodities-news/shell-agrees-to-sell-its-nigeria-onshore-oil-and-gas-business-3298980',
 'https://uk.investing.com//news/stock-market-news/european-stocks-weaken-ahead-of-boe-eurozone-cpi-deutsche-bank-shines-3319459',
 'https://uk.investing.com//news/stock-market-news/ftse-100-live-stocks-in-the-green-supported-by-oil-majors-3314447',
 'https://uk.investing.com//news/stock-market-news/ftse-100-sees-minor-decline-amidst-robust-retail-sector-performance-93CH-3229236',
 'https://uk.investing.com//news/stock-market-news/ftse-100-sees-minor-decline-amidst-robust-retail-sector-performance-93CH-3229236#comments',
 'https://uk.investing.com//news/stock-market-news/labour-would-unashamedly-champion-city-says-shadow-chancellor-3317588',
 'https://uk.investing.com//news/stock-market-news/london-midday-ftse-pares-gains-amid-diverging-fortunes-for-next-jd-sports-3286775',
 'https://uk.investing.com//news/stock-market-news/london-open-stocks-edge-down-on-weak-asian

## 3 - Sentiment Analysis

In [14]:
ticker = 'SHEL'
# DataFrame
article_sentiments = pd.DataFrame({'ticker':[],
                                'publish_date':[],
                                'title': [],
                                'body_text': [],
                                'url':[],
                                'neg':[],
                                'neu':[], 
                                'pos':[], 
                                'compound':[]})
# Loop over articles
for link in all_company_urls:
      article = Article(link)
      article.download()
      
      try:
          article.parse()
          text = article.text

      except: 
          print("I didn't get this")
          continue
        
      sid = SentimentIntensityAnalyzer()
      # Get sentiment scores
      polarity = sid.polarity_scores(text)

      tmpdic = {'ticker': ticker, 'publish_date': find_date(link), 'title': article.title, 'body_text': article.text, 'url': link}
      tmpdic.update(polarity)

      article_sentiments= article_sentiments.append(pd.DataFrame(tmpdic, index=[0]))
      article_sentiments.reset_index(drop=True, inplace=True)


In [15]:
article_sentiments

Unnamed: 0,ticker,publish_date,title,body_text,url,neg,neu,pos,compound
0,SHEL,2024-01-16,Shell to exit Nigeria's troubled onshore oil a...,"Published Jan 16, 2024 09:30 Updated Jan 16, 2...",https://uk.investing.com//news/commodities-new...,0.078,0.843,0.079,-0.2073
1,SHEL,2024-02-01,"European stocks weaken ahead of BOE, eurozone ...","Published Feb 01, 2024 08:24\n\n© Reuters.\n\n...",https://uk.investing.com//news/stock-market-ne...,0.064,0.852,0.083,0.9469
2,SHEL,2024-01-29,FTSE 100 Live: Stocks in the green supported b...,"Published Jan 29, 2024 10:45 Updated Jan 29, 2...",https://uk.investing.com//news/stock-market-ne...,0.103,0.802,0.095,-0.517
3,SHEL,2023-11-08,FTSE 100 sees minor decline amidst robust reta...,"Published Nov 08, 2023 17:48\n\n© Reuters.\n\n...",https://uk.investing.com//news/stock-market-ne...,0.029,0.842,0.129,0.9874
4,SHEL,2023-11-08,FTSE 100 sees minor decline amidst robust reta...,"Published Nov 08, 2023 17:48\n\n© Reuters.\n\n...",https://uk.investing.com//news/stock-market-ne...,0.029,0.842,0.129,0.9874
5,SHEL,2024-01-31,"Labour would 'unashamedly champion' City, says...","Published Jan 31, 2024 09:35 Updated Jan 31, 2...",https://uk.investing.com//news/stock-market-ne...,0.026,0.851,0.123,0.9834
6,SHEL,2024-01-04,London midday: FTSE pares gains amid diverging...,"Published Jan 04, 2024 12:02 Updated Jan 04, 2...",https://uk.investing.com//news/stock-market-ne...,0.038,0.875,0.088,0.996
7,SHEL,2023-11-27,London open: Stocks edge down on weak Asian cu...,"Published Nov 27, 2023 08:35 Updated Nov 27, 2...",https://uk.investing.com//news/stock-market-ne...,0.029,0.911,0.06,0.9559
8,SHEL,2024-02-05,Shell agrees to develop Nigeria gas field for ...,"Published Feb 05, 2024 09:21\n\n© Reuters. FIL...",https://uk.investing.com//news/stock-market-ne...,0.009,0.91,0.081,0.9485
9,SHEL,2023-10-11,Shell Amsterdam shares hit highest since 2001 ...,"Published Oct 11, 2023 08:29\n\n© Reuters. She...",https://uk.investing.com//news/stock-market-ne...,0.012,0.942,0.045,0.3818


## 4 - Storing Dataframe

In [None]:
article_sentiments.to_csv("Shell_articles.csv", sep=',', encoding='utf-8', header=True)

with open('nvidia_urls_20210105.txt', 'w') as f:
    for link in all_company_urls: 
        f.write("%s\n" % link)