# Phase 0. Dataset for testing sentiment models

In [None]:
import numpy as np
import pandas as pd

In [None]:
#Dataset (Financial Phrasebank) for testing our three methods
data = pd.read_csv('benchmark_newsSA.csv', names = ['sentence', 'label'], encoding='latin-1')[1:]

In [None]:
data.head()

Unnamed: 0,sentence,label
1,The GeoSolutions technology will leverage Bene...,positive
2,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
3,"For the last quarter of 2010 , Componenta 's n...",positive
4,According to the Finnish-Russian Chamber of Co...,neutral
5,The Swedish buyout firm has sold its remaining...,neutral


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5842 entries, 1 to 5842
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  5842 non-null   object
 1   label     5842 non-null   object
dtypes: object(2)
memory usage: 91.4+ KB


In [None]:
data.label.unique()

array(['positive', 'negative', 'neutral'], dtype=object)

**1.1 Using Keras**

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import tensorflow as tf

In [None]:
def get_sequences(texts):
  tokenizer= Tokenizer()
  tokenizer.fit_on_texts(texts)
  return tokenizer.word_index

In [None]:
def get_sequences(texts):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    
    sequences = tokenizer.texts_to_sequences(texts)
    print("Vocab length:", len(tokenizer.word_index) + 1)
    
    max_seq_length = np.max(list(map(lambda x: len(x), sequences)))
    print("Maximum sequence length:", max_seq_length)
    
    sequences = pad_sequences(sequences, maxlen=max_seq_length, padding='post')
    
    return sequences

In [None]:
def preprocess_inputs(df):
    df = df.copy()
    
    sequences = get_sequences(df['sentence'])
    
    label_mapping = {
        'negative': 0,
        'neutral': 1,
        'positive': 2
    }
    
    y = df['label'].replace(label_mapping)
    
    train_sequences, test_sequences, y_train, y_test = train_test_split(sequences, y, train_size=0.7, shuffle=True, random_state=1)
    
    return train_sequences, test_sequences, y_train, y_test

In [None]:
train_sequences, test_sequences, y_train, y_test = preprocess_inputs(data)

Vocab length: 11547
Maximum sequence length: 71


In [None]:
train_sequences

array([[ 221,  108, 9570, ...,    0,    0,    0],
       [7501,  103,   71, ...,    0,    0,    0],
       [  93,  737,    5, ...,    0,    0,    0],
       ...,
       [  30,   11, 1181, ...,    0,    0,    0],
       [   1,  387,   36, ...,    0,    0,    0],
       [   2,  264,   66, ...,    0,    0,    0]], dtype=int32)

In [None]:
inputs = tf.keras.Input(shape=(train_sequences.shape[1],))
x = tf.keras.layers.Embedding(
    input_dim=11547,
    output_dim=128,
    input_length=train_sequences.shape[1]
)(inputs)
x = tf.keras.layers.GRU(256, return_sequences=True, activation='tanh')(x)
x = tf.keras.layers.Flatten()(x)
outputs = tf.keras.layers.Dense(3, activation='softmax')(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

history = model.fit(
    train_sequences,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=100,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True
        )
    ]
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100


In [None]:
results = model.evaluate(test_sequences, y_test, verbose=0)

print("    Test Loss: {:.5f}".format(results[0]))
print("Test Accuracy: {:.2f}%".format(results[1] * 100))

    Test Loss: 0.75261
Test Accuracy: 68.80%


**1.2 Using Vader**

In [None]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [None]:
vader = SentimentIntensityAnalyzer()
data_vader = data.copy()
scores = data_vader['sentence'].apply(vader.polarity_scores).tolist()
scores_df = pd.DataFrame(scores)
data_vader = data_vader.join(scores_df, rsuffix='_right')
data_vader.head()



Unnamed: 0,sentence,label,neg,neu,pos,compound
1,The GeoSolutions technology will leverage Bene...,positive,0.167,0.833,0.0,-0.2023
2,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative,0.064,0.856,0.08,0.1531
3,"For the last quarter of 2010 , Componenta 's n...",positive,0.0,1.0,0.0,0.0
4,According to the Finnish-Russian Chamber of Co...,neutral,0.0,1.0,0.0,0.0
5,The Swedish buyout firm has sold its remaining...,neutral,0.192,0.808,0.0,-0.1695


In [None]:
data_vader['sentiment_type']=''
data_vader.loc[data_vader.compound>0,'sentiment_type']='positive'
data_vader.loc[data_vader.compound==0,'sentiment_type']='neutral'
data_vader.loc[data_vader.compound<0,'sentiment_type']='negative'
data_vader.head()

Unnamed: 0,sentence,label,neg,neu,pos,compound,sentiment_type
1,The GeoSolutions technology will leverage Bene...,positive,0.167,0.833,0.0,-0.2023,negative
2,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative,0.064,0.856,0.08,0.1531,positive
3,"For the last quarter of 2010 , Componenta 's n...",positive,0.0,1.0,0.0,0.0,neutral
4,According to the Finnish-Russian Chamber of Co...,neutral,0.0,1.0,0.0,0.0,neutral
5,The Swedish buyout firm has sold its remaining...,neutral,0.192,0.808,0.0,-0.1695,negative


In [None]:
from sklearn.metrics import f1_score, accuracy_score

In [None]:
accuracy_score(data_vader['label'], data_vader['sentiment_type'])

0.38240328654570355

**1.3 Using FinBERT**

In [None]:
headlines_list = list(data['sentence'])

In [None]:
!pip install transformers

#Getting the tokenizer and the model

from transformers import BertTokenizer, BertForSequenceClassification

finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')



Downloading:   0%|          | 0.00/533 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/419M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/221k [00:00<?, ?B/s]

In [None]:
X = data['sentence'].to_list()
y = data['label'].to_list()

for i,x in enumerate(X):
    print(i+1,")",x)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
843 ) MarketsBP promotes upstream boss to deputy CEO
844 ) Britain's FTSE bounces back, Mondi and Barratt lead
845 ) Loudeye Corp. , up $ 2.56 at $ 4.33 Nokia Corp. , down 10 cents at $ 19.46 Nokia agreed to buy the digital music distributor for $ 60 million .
846 ) Ragutis , controlled by the Finnish brewery Olvi , achieved a 5.7 percent rise in beer sales to 22.6 million liters and held a 10.75 percent market share .
847 ) Finnair said that the cancellation of flights would cause daily losses of  x20ac 2.5 million US$ 3 million .
848 ) The energy sector accounted for approximately 33 % and the steel industry for about 57 % of the transportation volume .
849 ) The 5,000 megawatt wind farm being planned in Raahe would be built offshore in front of Ruukki 's Raahe Works .
850 ) The Australian company Mirabela Nickel has awarded Outokumpu Technology a contract for grinding technology for its nickel sulfide project in Bahia 

In [None]:
labels = {0:'neutral', 1:'positive',2:'negative'}

In [None]:
sent_val = list()
for x in X:
    inputs = tokenizer(x, return_tensors="pt", padding=True)
    outputs = finbert(**inputs)[0]
   
    val = labels[np.argmax(outputs.detach().numpy())]
    print(x, '----', val)
    print('#######################################################')    
    sent_val.append(val)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
During the past 10 years the factory has produced many of Nokia 's most popular models including the Nokia 2760 , the Nokia 6300 as well as Nokia 's latest music device the Nokia 5800 Express Music . ---- neutral
#######################################################
STX Finland Oy signed a a preliminary agreement for the building of an environmentally friendly , new generation cruise ferry for Viking Line to manage on between Turku , Finland , and Stockholm , Sweden withViking Line ABP . ---- neutral
#######################################################
The rebuilds are designed to improve the machines ' performance and product quality . ---- positive
#######################################################
Operating profit , excluding non-recurring items , totalled EUR 2.2 mn , down from EUR 2.7 mn in the corresponding period in 2008 . ---- negative
#######################################################
Goodwill and 

In [None]:
len(sent_val)

5842

In [None]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y, sent_val))

0.6951386511468675
