# Sentiment Analysis for Financial News

In [255]:
import pandas as pd
import numpy as np
import unidecode
import torch
import nltk
from sklearn.preprocessing import MultiLabelBinarizer
from nltk.corpus import stopwords
nltk.download('stopwords')
from collections import Counter

[nltk_data] Downloading package stopwords to C:\Users\Camille
[nltk_data]     Leempoels\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Preprocessing

### Loading the dataset

In [51]:
# Read the raw CSV as Latin-1; the two column names are supplied explicitly via `names`.
_data_ = pd.read_csv(
    'data.csv',
    names=['sentiment', 'text'],
    encoding='latin-1',
)

In [53]:
# Preview the first five rows of the raw frame.
_data_.head(n=5)

Unnamed: 0,sentiment,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


### Cleaning the text

In [171]:
# Work on a copy so the raw frame stays untouched and this cell can be re-run safely.
data = _data_.copy()

# Remove punctuation and special characters.
# Raw string: keeps \w and \s as regex escapes instead of invalid Python string escapes.
data['text'] = data['text'].str.replace(r'[^\w\s]', '', regex=True)
# `_` is a word character for \w, so it survives the step above; replace it with a
# space. regex=False makes the literal replacement explicit, whatever the pandas default.
data['text'] = data['text'].str.replace('_', ' ', regex=False)
# Transliterate every remaining character to its closest ASCII form.
# NOTE(review): this runs after punctuation removal, so anything unidecode emits
# that is not a letter would survive - confirm this ordering is intended.
data['text'] = data['text'].astype(str).map(unidecode.unidecode)

# Lowercase the text
data['text'] = data['text'].str.lower()

# Remove numbers (raw string for the same reason as above)
data['text'] = data['text'].str.replace(r'\d+', '', regex=True)

# Remove stop words; each document becomes a list of the remaining tokens.
nltk_stopwords = stopwords.words('english')
# Set membership is O(1) per token, versus scanning the whole list each time.
# NOTE(review): punctuation is already stripped here, so any stop word that
# contains an apostrophe can no longer match - confirm whether that matters.
stopword_set = set(nltk_stopwords)
data['text'] = data['text'].apply(
    lambda x: [item for item in x.split() if item not in stopword_set]
)

# TODO: consider lemmatization and/or expanding contractions.

### Building the vocabulary

In [225]:
# Tally how often each token appears across all cleaned documents.
vocab = Counter()
for tokens in data['text']:
    vocab.update(tokens)

total_words = sum(vocab.values())
unique_words = len(vocab)
print('Number of words : ' + str(total_words))
print('Number of unique words : ' + str(unique_words))

# Rebind `vocab` to a list of (word, count) pairs, most frequent first.
vocab = vocab.most_common()

print('Most frequent words : ' + str(vocab[:10]))

# Possible follow-ups, currently disabled:
# - convert the vocabulary to a plain Python dict
#   vocab = dict(vocab)
# - encode words as integers
#   sparse_vocab = {word:i for i, word in enumerate(vocab, 1)}

Number of words : 58223
Number of unique words : 9361
Most frequent words : [('eur', 1310), ('company', 848), ('mn', 593), ('said', 544), ('finnish', 512), ('sales', 453), ('million', 441), ('net', 412), ('profit', 409), ('finland', 337)]


In [226]:
# Binary bag-of-words encoding with sklearn's MultiLabelBinarizer:
# learn the set of tokens from the corpus, then encode every document against it.
one_hot_enc = MultiLabelBinarizer()
one_hot_enc.fit(data['text'])
one_hot_txt = one_hot_enc.transform(data['text'])

In [254]:
# Toy example: inspect the encoding of a single known word.
learned_classes = one_hot_enc.classes_
print('Number of unique words : ' + str(len(learned_classes)))

# Column index of the word inside the learned vocabulary.
idx = np.where(learned_classes == 'company')[0][0]
print('Position of "company" : ' + str(idx))

# Encode a one-word document once and reuse the result for both prints.
company_vec = one_hot_enc.transform([['company']])
print('One Hot representation of "company" : ' + str(company_vec))
print(str(idx) + '-th element of the One Hot encoding of "company" : ' + str(company_vec[0, idx]))

Number of unique words : 9361
Position of "company" : 1483
One Hot representation of "company" : [[0 0 0 ... 0 0 0]]
1483-th element of the One Hot encoding of "company" : 1
