In [52]:
%cd /content/drive/MyDrive/Projects/ML_Labs/

/content/drive/MyDrive/Projects/ML_Labs


In [53]:
!pip install transformers



In [179]:
import numpy as np
import pandas as pd
from datetime import datetime
from transformers import InputExample, InputFeatures, BertForSequenceClassification, BertTokenizer
import torch
import plotly.graph_objects as go

# DataFrame cleaning

Import the dataset with the news.

In [55]:
news_df = pd.read_json(path_or_buf="historic_news9.json")

In [56]:
news_df.head(5)

Unnamed: 0,title,body,date,first_paragraph
0,[Dow's 300-point fall led by losses in shares ...,The Dow Jones Industrial Average is declining ...,"[Last Updated: May 12, 2021 at 10:47 a.m. ET, ...",
1,[10-year Treasury yield rises after U.S. infla...,"U.S. Treasury yields rose Wednesday morning, a...","[Last Updated: May 12, 2021 at 9:34 a.m. ET, F...","U.S. Treasury yields rose Wednesday morning, a..."
2,[U.S. Stocks Are Lower as Inflation Data Comes...,"U.S. stocks opened lower on Wednesday, extendi...","[Last Updated: May 12, 2021 at 9:36 a.m. ET, F...","U.S. stocks opened lower on Wednesday, extendi..."
3,[Stocks fall in early trade after stronger-tha...,"Stocks opened lower Wednesday, with the tech-h...","[May 12, 2021 at 9:38 a.m. ET]","Stocks opened lower Wednesday, with the tech-h..."
4,"[Honeywell International Inc., Walmart share l...",Behind negative returns for shares of Honeywel...,"[May 12, 2021 at 9:46 a.m. ET]",


Headlines ("title") and date columns are composed by lists, so the elements will be extracted by list comprenhension. Note that some lists in dates have two elements (last updated and first published). Only first published date will be left.

Also, all thouse rows with more than two na values, meaning some not real news returning only a list of dates, will be dropped.

In [57]:
news_df.dropna(axis=0, thresh=2, inplace=True)
#Convert each element of columns title and date in str from 1-element list
news_df['title'] = [i[0] if len(i)==1 else i[1] for i in news_df['title']] 
# Select the publishing date if last updated available
news_df['date'] = [i[0] if len(i)==1 else i[1] for i in news_df['date']] 
news_df.reset_index(inplace=True, drop=True)

Check the remaining amount of NaN values

In [59]:
news_df.isna().sum()

title               0
body                0
date                0
first_paragraph    54
dtype: int64

Convert datetime string to datetime

In [60]:
# Remove the "Last Updated:" str bit
news_df['date'] = news_df['date'].str.replace('Last Updated: ', '')
# Remove the dots from the str, to clean the a.m./p.m. tag
news_df['date'] = news_df['date'].str.replace('.', '')
# Remove the ET timezone label, not needed
news_df['date'] = news_df['date'].str.replace(' ET', '')
# Change the full month names by the abbreviations
news_df['date'] = news_df['date'].str.replace('March', 'Mar')
news_df['date'] = news_df['date'].str.replace('April', 'Apr')
news_df['date'] = news_df['date'].str.replace('June', 'Jun')
news_df['date'] = news_df['date'].str.replace('July', 'Jul')
# Remove the "first" tag indicating first published.
news_df['date'] = news_df['date'].str.replace('First  ', '')

news_df['date'] = [datetime.strptime(i, '%b %d, %Y at %I:%M %p') for i in news_df['date']] 


In [61]:
news_df.head(10)

Unnamed: 0,title,body,date,first_paragraph
0,Dow's 300-point fall led by losses in shares o...,The Dow Jones Industrial Average is declining ...,2021-05-12 10:46:00,
1,10-year Treasury yield rises after U.S. inflat...,"U.S. Treasury yields rose Wednesday morning, a...",2021-05-12 08:05:00,"U.S. Treasury yields rose Wednesday morning, a..."
2,U.S. Stocks Are Lower as Inflation Data Comes ...,"U.S. stocks opened lower on Wednesday, extendi...",2021-05-12 06:16:00,"U.S. stocks opened lower on Wednesday, extendi..."
3,Stocks fall in early trade after stronger-than...,"Stocks opened lower Wednesday, with the tech-h...",2021-05-12 09:38:00,"Stocks opened lower Wednesday, with the tech-h..."
4,"Honeywell International Inc., Walmart share lo...",Behind negative returns for shares of Honeywel...,2021-05-12 09:46:00,
5,Dow slides after inflation climbs to highest i...,U.S. stock indexes opened lower Wednesday morn...,2021-05-12 06:55:00,U.S. stock indexes opened lower Wednesday morn...
6,Gold prices on track for first back-to-back de...,Gold prices were on track to log their first b...,2021-05-12 08:16:00,Gold prices were on track to log their first b...
7,"FanDuel Boss Quits, Sending Flutter Stock Tumb...",Shares in Flutter Entertainment are falling mo...,2021-05-12 10:21:00,Shares in Flutter Entertainment are falling mo...
8,Shares of Domino's Pizza jumps to record early...,"Domino's Pizza DPZ, trades at records Wednesda...",2021-05-12 10:33:00,"Domino's Pizza DPZ, trades at records Wednesda..."
9,A Sustainable Stock Fund That Isn’t Your ‘Typi...,"Years before carbon neutrality, diversity, equ...",2021-05-12 07:00:00,"Years before carbon neutrality, diversity, equ..."


# Sentiment analysis

IMport the finBERT model, fine tuned with finantial texts for polarity classification. More info can be found here: https://arxiv.org/abs/1908.10063

In [62]:
model = BertForSequenceClassification.from_pretrained('ProsusAI/finbert')
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')

Create the new columns for positivity of the sentiment:

In [63]:
news_df[['Head_Pos', 'Head_neutral', 'Head_Neg']] = ''

Before running the predictions lets send the model to the GPU to speed up the process:

In [None]:
model.to('cuda')

Tokenize each headline, add special tokens for BERT processing, add padding up to an input size of 512, feed it to the model and apply softwax to the scores returned to get a probability alike results.

In [65]:
for i in range(len(news_df)):
  txt = news_df.title[i]
  # Tokenize the headline with no special tokens yet
  tokens = tokenizer.encode_plus(txt, add_special_tokens=False, return_tensors='pt')
  # Check how many tokens we need to fill the input dimension size of 512 (minus the [CLS] and [SEP] tokens, 510 indeed)
  pad_len = 510 - len(tokens['input_ids'][0])
  # Enclose the tokenized headline with the special BERT tokens and pad the tensor with empty tokens.
  input_tokens = torch.cat([torch.Tensor([101]), tokens['input_ids'][0], torch.Tensor([102]), torch.Tensor([0] * pad_len)])
  # Similar process for the attention mask, but special token locations get an attention of 1 and the padding tokens an attention of 0
  input_attention = torch.cat([torch.Tensor([1]), tokens['attention_mask'][0], torch.Tensor([1]), torch.Tensor([0] * pad_len)])
  # Send the input tensors to the GPU
  input_tokens = input_tokens.to('cuda')
  input_attention = input_attention.to('cuda')
  # Rebuild the input dict with the same keys as the tokenized dict had
  input_dict = {
      'input_ids': torch.unsqueeze(input_tokens, dim=0).long(),
      'attention_mask': torch.unsqueeze(input_attention, dim=0).int()
  }
  # Feed the model and pass the resulting output through a softmax to get a
  # probability-alike result
  outputs = model(**input_dict)
  probs = torch.nn.functional.softmax(outputs[0], dim=-1)
  # Fill the new columns with the calculated probabilies
  news_df.iloc[i, 4:7] = probs.tolist()[0]


In [241]:
news_df.head()

Unnamed: 0,title,body,date,first_paragraph,Head_Pos,Head_neutral,Head_Neg,Body_Pos,Body_neutral,Body_Neg
0,Dow's 300-point fall led by losses in shares o...,The Dow Jones Industrial Average is declining ...,2021-05-12 10:46:00,,0.0126025,0.967921,0.0194768,0.0085831,0.97376,0.0176571
1,10-year Treasury yield rises after U.S. inflat...,"U.S. Treasury yields rose Wednesday morning, a...",2021-05-12 08:05:00,"U.S. Treasury yields rose Wednesday morning, a...",0.563017,0.318066,0.118916,0.647187,0.29819,0.0546221
2,U.S. Stocks Are Lower as Inflation Data Comes ...,"U.S. stocks opened lower on Wednesday, extendi...",2021-05-12 06:16:00,"U.S. stocks opened lower on Wednesday, extendi...",0.025765,0.952473,0.0217623,0.00837315,0.971647,0.0199798
3,Stocks fall in early trade after stronger-than...,"Stocks opened lower Wednesday, with the tech-h...",2021-05-12 09:38:00,"Stocks opened lower Wednesday, with the tech-h...",0.0184038,0.949918,0.031678,0.0702624,0.916773,0.0129649
4,"Honeywell International Inc., Walmart share lo...",Behind negative returns for shares of Honeywel...,2021-05-12 09:46:00,,0.0284092,0.94198,0.029611,0.00838892,0.972547,0.0190642


Repeat the same process for the first apragraph. Some few news have no first paragraph retrieved. If that is the case, the first 510 tokens in the body of the text will be used instead (less if the new is that short).

In [72]:
news_df[['Body_Pos', 'Body_neutral', 'Body_Neg']] = ''

In [104]:
for i in range(len(news_df)):
  if news_df.first_paragraph[i] == news_df.first_paragraph[i]: # Using the fact that Nan variables in Python are not equal to themselves
    txt = news_df.first_paragraph[i]
    # Tokenize the headline with no special tokens yet
    tokens = tokenizer.encode_plus(txt, add_special_tokens=False, return_tensors='pt')
    # Check how many tokens we need to fill the input dimension size of 512 (minus the [CLS] and [SEP] tokens, 510 indeed)
    pad_len = 510 - len(tokens['input_ids'][0])
    # Enclose the tokenized headline with the special BERT tokens and pad the tensor with empty tokens.
    input_tokens = torch.cat([torch.Tensor([101]), tokens['input_ids'][0], torch.Tensor([102]), torch.Tensor([0] * pad_len)])
    # Similar process for the attention mask, but special token locations get an attention of 1 and the padding tokens an attention of 0
    input_attention = torch.cat([torch.Tensor([1]), tokens['attention_mask'][0], torch.Tensor([1]), torch.Tensor([0] * pad_len)])
  else:
    txt = news_df.body[i]
    # Tokenize the headline with no special tokens yet
    tokens = tokenizer.encode_plus(txt, add_special_tokens=False, return_tensors='pt')
    # Truncate the tokens vector if too long
    if len(tokens['input_ids'][0]) > 510:
      truncated_tok = tokens['input_ids'][0][0:510].float()
      input_tokens = torch.cat([torch.Tensor([101]), torch.Tensor(truncated_tok), torch.Tensor([102])])
      input_attention = torch.Tensor([1]*512)
    else:
    # Check how many tokens we need to fill the input dimension size of 512 (minus the [CLS] and [SEP] tokens, 510 indeed)
      pad_len = 510 - len(tokens['input_ids'][0])
      # Enclose the tokenized headline with the special BERT tokens and pad the tensor with empty tokens.
      input_tokens = torch.cat([torch.Tensor([101]), tokens['input_ids'][0], torch.Tensor([102]), torch.Tensor([0] * pad_len)])
      # Similar process for the attention mask, but special token locations get an attention of 1 and the padding tokens an attention of 0
      input_attention = torch.cat([torch.Tensor([1]), tokens['attention_mask'][0], torch.Tensor([1]), torch.Tensor([0] * pad_len)])
  
  # Send the input tensors to the GPU
  input_tokens = input_tokens.to('cuda')
  input_attention = input_attention.to('cuda')
  # Rebuild the input dict with the same keys as the tokenized dict had
  input_dict = {
      'input_ids': torch.unsqueeze(input_tokens, dim=0).long(),
      'attention_mask': torch.unsqueeze(input_attention, dim=0).int()
  }
  # Feed the model and pass the resulting output through a softmax to get a
  # probability-alike result
  outputs = model(**input_dict)
  probs = torch.nn.functional.softmax(outputs[0], dim=-1)
  # Fill the new columns with the calculated probabilies
  news_df.iloc[i, 7:10] = probs.tolist()[0]


In [105]:
news_df.head(10)

Unnamed: 0,title,body,date,first_paragraph,Head_Pos,Head_neutral,Head_Neg,Body_Pos,Body_neutral,Body_Neg
0,Dow's 300-point fall led by losses in shares o...,The Dow Jones Industrial Average is declining ...,2021-05-12 10:46:00,,0.0126025,0.967921,0.0194768,0.0085831,0.97376,0.0176571
1,10-year Treasury yield rises after U.S. inflat...,"U.S. Treasury yields rose Wednesday morning, a...",2021-05-12 08:05:00,"U.S. Treasury yields rose Wednesday morning, a...",0.563017,0.318066,0.118916,0.647187,0.29819,0.0546221
2,U.S. Stocks Are Lower as Inflation Data Comes ...,"U.S. stocks opened lower on Wednesday, extendi...",2021-05-12 06:16:00,"U.S. stocks opened lower on Wednesday, extendi...",0.025765,0.952473,0.0217623,0.00837315,0.971647,0.0199798
3,Stocks fall in early trade after stronger-than...,"Stocks opened lower Wednesday, with the tech-h...",2021-05-12 09:38:00,"Stocks opened lower Wednesday, with the tech-h...",0.0184038,0.949918,0.031678,0.0702624,0.916773,0.0129649
4,"Honeywell International Inc., Walmart share lo...",Behind negative returns for shares of Honeywel...,2021-05-12 09:46:00,,0.0284092,0.94198,0.029611,0.00838892,0.972547,0.0190642
5,Dow slides after inflation climbs to highest i...,U.S. stock indexes opened lower Wednesday morn...,2021-05-12 06:55:00,U.S. stock indexes opened lower Wednesday morn...,0.105886,0.838431,0.0556824,0.0109086,0.970619,0.0184724
6,Gold prices on track for first back-to-back de...,Gold prices were on track to log their first b...,2021-05-12 08:16:00,Gold prices were on track to log their first b...,0.11303,0.869712,0.0172585,0.31597,0.636654,0.0473768
7,"FanDuel Boss Quits, Sending Flutter Stock Tumb...",Shares in Flutter Entertainment are falling mo...,2021-05-12 10:21:00,Shares in Flutter Entertainment are falling mo...,0.0269484,0.315002,0.65805,0.00650946,0.973014,0.0204765
8,Shares of Domino's Pizza jumps to record early...,"Domino's Pizza DPZ, trades at records Wednesda...",2021-05-12 10:33:00,"Domino's Pizza DPZ, trades at records Wednesda...",0.53097,0.0875937,0.381436,0.881438,0.0341895,0.0843724
9,A Sustainable Stock Fund That Isn’t Your ‘Typi...,"Years before carbon neutrality, diversity, equ...",2021-05-12 07:00:00,"Years before carbon neutrality, diversity, equ...",0.0669435,0.0167878,0.916269,0.074256,0.0191513,0.906593


# Sentiment aggregation

As a preliminar analysis:
1. The news will be grouped by date.
2. The sentiment scores (positive, neutral and negative) will be averaged within each day.
3. The count of news for each day will be also returned.
4. A polarity score will be calculated substracting the mean negativity from the mean positivity for each day, both for the headlines and the first paragraph.

In [147]:
sent_df = news_df.copy(deep=True)
sent_df = sent_df.drop(['body', 'title', 'first_paragraph'], axis=1)

In [148]:
# COnvert from type object to numeric to be able to perform the mean()
for i in range(1,7,1):
  sent_df.iloc[:,i] = pd.to_numeric(sent_df.iloc[:,i])

In [154]:
# Group by day and perform the mean.
sent_mean = sent_df.groupby(pd.Grouper(key="date", freq="D")).mean()
sent_mean.head()

Unnamed: 0_level_0,Head_Pos,Head_neutral,Head_Neg,Body_Pos,Body_neutral,Body_Neg
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-02-27,0.696476,0.013272,0.290252,0.676893,0.093456,0.22965
2021-02-28,,,,,,
2021-03-01,,,,,,
2021-03-02,0.464482,0.047671,0.487847,0.075778,0.720338,0.203883
2021-03-03,,,,,,


Some days with no news will return NaN. This missing values will be imputed replacing by a value of 0 instead.

In [155]:
sent_mean.fillna(value=0, axis=0, inplace=True)

Define the summarised polarity for headlines and first paragraph (here Summ_Body)

In [162]:
sent_mean['Summ_Head'] = sent_mean['Head_Pos']-sent_mean['Head_Neg']
sent_mean['Summ_Body'] = sent_mean['Body_Pos']-sent_mean['Body_Neg']

In [163]:
sent_mean.head()

Unnamed: 0_level_0,Head_Pos,Head_neutral,Head_Neg,Body_Pos,Body_neutral,Body_Neg,Summ_Head,Summ_Body
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-02-27,0.696476,0.013272,0.290252,0.676893,0.093456,0.22965,0.406223,0.447242
2021-02-28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-03-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-03-02,0.464482,0.047671,0.487847,0.075778,0.720338,0.203883,-0.023364,-0.128105
2021-03-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Group by to get the news count for each day.

In [200]:
sent_vol = sent_df.groupby(pd.Grouper(key="date", freq="D")).count()
sent_vol.head()

Unnamed: 0_level_0,Head_Pos,Head_neutral,Head_Neg,Body_Pos,Body_neutral,Body_Neg
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-02-27,1,1,1,1,1,1
2021-02-28,0,0,0,0,0,0
2021-03-01,0,0,0,0,0,0
2021-03-02,1,1,1,1,1,1
2021-03-03,0,0,0,0,0,0


Visualising the aggregations for each day.

In [240]:
fig = go.Figure()
fig.add_trace(go.Line(x=sent_mean.index, y=sent_mean.Summ_Head,
                    mode='lines',
                    name='Headlines'))
fig.add_trace(go.Line(x=sent_mean.index, y=sent_mean.Summ_Body,
                    mode='lines',
                    name='First Paragraph'))
fig.add_trace(go.Bar(x=sent_mean.index, y=sent_vol.Body_Pos,
                    name='Daily news', opacity=0.3, marker=dict(color="Black"), yaxis="y2"))
fig.update_layout(
    xaxis=dict(
        domain=[0.3, 0.7]
    ),
    yaxis=dict(title="Polarity"),
    yaxis2=dict(
        title="Daily news",
        overlaying="y",
        side="left",
        position=0.20
    ))
fig.update_layout(
    title_text="Headlines and First paragraph polarity",
    title_font_size=30,
    xaxis_title="Date")


plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.


