# Sentiment Analysis for Financial News

In [176]:
import pandas as pd
import numpy as np

# cleaning
import unidecode

# stop words
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# vocab
from collections import Counter

# models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim

[nltk_data] Downloading package stopwords to C:\Users\Camille
[nltk_data]     Leempoels\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Preprocessing

Loading the dataset.

In [19]:
data = pd.read_csv('data.csv', encoding='latin-1', names=['sentiment','text'])

In [20]:
data.head()

Unnamed: 0,sentiment,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


Cleaning the text.

In [21]:
# Removing punctuation and special caracters
data['text'] = data['text'].str.replace('[^\w\s]', '', regex=True)
data['text'] = data['text'].str.replace('_', ' ')
data['text'] = data['text'].astype('unicode')
data['text'] = data['text'].transform(lambda x: unidecode.unidecode(x))

# Lowercase the text
data['text'] = data['text'].str.lower()

# Removing numbers
data['text'] = data['text'].str.replace('\d+', '', regex=True)

# Removing stop words
nltk_stopwords = stopwords.words('english')
data['text'] = data['text'].apply(lambda x: [item for item in x.split() if item not in nltk_stopwords])

# Lemmatization and/or Expand Contractions ?

Split train and test set

In [208]:
_train_set_, _test_set_ = train_test_split(data, test_size=0.2, random_state=1)

Building the vocabulary from the training set

In [209]:
# Extract all the words and count them
vocab = Counter([word for sentence in _train_set_['text'].values.tolist() for word in sentence])

print('Number of words in the training set : ' + str(sum(vocab.values())))
print('Number of unique words in the training set : ' + str(len(list(vocab))))

# Sort the words
vocab = vocab.most_common()

print('Most frequent words : ' + str(vocab[0:10]))

# Convert the vocabulary to a Python Dict
vocab = dict(vocab)

# encode words as integers
sparse_vocab = {word:i for i, word in enumerate(vocab, 1)} 

Number of words in the training set : 46400
Number of unique words in the training set : 8337
Most frequent words : [('eur', 1057), ('company', 680), ('mn', 475), ('said', 437), ('finnish', 405), ('sales', 354), ('million', 335), ('profit', 325), ('net', 323), ('finland', 278)]


In [226]:
def encode_text(data):
    # encode sentences 
    # set -1 if the word is not in the vocabulary
    data['text_encoded'] = data['text'].apply(
        lambda sentence : [sparse_vocab.get(word, -1) for word in sentence]
    )

    # if sentences are extremely short or long, drop them
    data['length'] = data['text_encoded'].apply(lambda sentence : len(sentence))
    limit_low, limit_high  = data['length'].quantile(q=[0.01, 0.99]).values
    data = data[(data['length'] <= limit_high) & (data['length'] >= limit_low)].copy()

    # pad sentences so that each vector has the same length
    max_len = int(limit_high)
    data['text_encoded'] = data.apply(lambda row: row['text_encoded'] + [0] * (max_len - row['length']), axis=1)
    data['length'] = data['text_encoded'].apply(lambda sentence : len(sentence))

    if sum(data['length'] != max_len):
        print('WARNING')

    return data

train_set_txt = encode_text(_train_set_.copy())
test_set_txt = encode_text(_test_set_.copy())

train_set_labels = OneHotEncoder(handle_unknown='ignore', sparse_output=False).fit_transform(train_set_txt['sentiment'].values.reshape(-1, 1))
test_set_labels = OneHotEncoder(handle_unknown='ignore', sparse_output=False).fit_transform(test_set_txt['sentiment'].values.reshape(-1, 1))

## Supervised learning

Dataloaders and Batch

In [166]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU is available


In [227]:
batch_size = 50

train_set = TensorDataset(
    torch.tensor(np.vstack(train_set_txt['text_encoded'])), 
    torch.tensor(train_set_labels)
)

train_dataloader = DataLoader(
    train_set, 
    batch_size=batch_size,
    shuffle=True
)

Reccurent Neural Network

In [229]:
class RNN_model(nn.Module):

    def __init__(self, vocab_dim, embedding_dim, hidden_dim):
        super(RNN_model, self).__init__()

        # embedding layer
        self.embedding = nn.Embedding(vocab_dim, embedding_dim)
        # RNN layer
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        # fully connected layer
        self.fc = nn.Linear(hidden_dim, 3)
        self.softmax = nn.Softmax(dim = 1)

    def forward(self, txt):
        emb_txt = self.embedding(txt.T)
        rnn_out, rnn_hidden = self.rnn(emb_txt)
        fc_out = self.fc(rnn_hidden.squeeze(0))
        output = self.softmax(fc_out)

        return fc_out

In [230]:
VOCAB_DIM = len(vocab) + 1
EMBEDDING_DIM = 100
HIDDEN_DIM = 32
LEARNING_RATE = 1e-2

model = RNN_model(VOCAB_DIM, EMBEDDING_DIM, HIDDEN_DIM)
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()


In [231]:
print(model)

RNN_model(
  (embedding): Embedding(8338, 100)
  (rnn): RNN(100, 32)
  (fc): Linear(in_features=32, out_features=3, bias=True)
  (softmax): Softmax(dim=1)
)


In [232]:
def train_loop(dataloader, model, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    count = 0
    
    model.train()
    
    for txt, label in dataloader:
        count = count + 1
        print(count)

        optimizer.zero_grad()  
        pred = model(txt)
        print(pred.shape)
        print(label.shape)
        loss = criterion(pred, label)
        
    #     acc = binary_accuracy(predictions, batch.label)
        
    #     loss.backward()
        
    #     optimizer.step()
        
    #     epoch_loss += loss.item()
    #     epoch_acc += acc.item()
        
    # return epoch_loss / len(iterator), epoch_acc / len(iterator)

train_loop(train_dataloader, model, optimizer, criterion)

1
torch.Size([50, 3])
torch.Size([50, 3])


RuntimeError: 1D target tensor expected, multi-target not supported