# Sentiment Analysis for Financial News

In [14]:
import pandas as pd
import numpy as np
import unidecode
import torch
import nltk
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
# Make sure the NLTK 'stopwords' corpus is available locally; it is read
# later through `stopwords.words('english')` when cleaning the text.
nltk.download('stopwords')
from collections import Counter

[nltk_data] Downloading package stopwords to C:\Users\Camille
[nltk_data]     Leempoels\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Preprocessing

### Loading the dataset

In [15]:
# Read the raw CSV as latin-1 text; the two column names are supplied explicitly.
_data_ = pd.read_csv(
    'data.csv',
    names=['sentiment', 'text'],
    encoding='latin-1',
)

In [16]:
# Preview the first five rows of the raw frame.
_data_.head(n=5)

Unnamed: 0,sentiment,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


### Cleaning the text

In [17]:
# Work on a copy so the raw frame `_data_` stays untouched and this cell can be re-run.
data = _data_.copy()

# Removing punctuation and special characters: drop everything that is neither a
# word character nor whitespace. The pattern is a raw string so that `\w` / `\s`
# reach the regex engine instead of being parsed as (invalid) string escapes.
data['text'] = data['text'].str.replace(r'[^\w\s]', '', regex=True)
# `_` is a word character for the pattern above, so it is turned into a space
# separately; regex=False makes this a literal replacement on every pandas version.
data['text'] = data['text'].str.replace('_', ' ', regex=False)
# Force every cell to a Python str, then transliterate non-ASCII characters to ASCII.
data['text'] = data['text'].astype(str)
data['text'] = data['text'].apply(unidecode.unidecode)

# Lowercase the text
data['text'] = data['text'].str.lower()

# Removing numbers (every run of digits)
data['text'] = data['text'].str.replace(r'\d+', '', regex=True)

# Removing stop words: split on whitespace and keep only the tokens that are not
# English stop words. From here on, each 'text' cell is a list of tokens.
nltk_stopwords = stopwords.words('english')
# Set lookup instead of scanning the whole stop-word list for every token.
stopword_set = set(nltk_stopwords)
data['text'] = data['text'].apply(
    lambda text: [token for token in text.split() if token not in stopword_set]
)
# NOTE(review): punctuation was stripped before this step, so any stop word that
# contains an apostrophe can no longer match its stripped form in the text -
# confirm whether that matters for this dataset.

# TODO: Lemmatization and/or Expand Contractions ?

### Split train and test set

In [44]:
# Hold out 20% of the cleaned rows as the test set; the fixed random_state makes
# the split reproducible across runs.
# Each part is copied explicitly so that columns can be added to `data_train` /
# `data_test` later without pandas raising SettingWithCopyWarning.
data_train, data_test = (
    part.copy()
    for part in train_test_split(data, test_size=0.2, random_state=1)
)

### Building the vocabulary

In [45]:
# Tally every token occurrence across the training sentences
word_counts = Counter()
for tokens in data_train['text']:
    word_counts.update(tokens)

total_tokens = sum(word_counts.values())
print('Number of words in the training set : ' + str(total_tokens))
print('Number of unique words in the training set : ' + str(len(word_counts)))

# Rank the words from most to least frequent
ranked_words = word_counts.most_common()

print('Most frequent words : ' + str(ranked_words[:10]))

# Keep the vocabulary as a plain dict {word: count}, ordered by decreasing frequency
vocab = dict(ranked_words)

# Give every word an integer id following that order; ids start at 1
sparse_vocab = dict(zip(vocab, range(1, len(vocab) + 1)))

Number of words in the training set : 46400
Number of unique words in the training set : 8337
Most frequent words : [('eur', 1057), ('company', 680), ('mn', 475), ('said', 437), ('finnish', 405), ('sales', 354), ('million', 335), ('profit', 325), ('net', 323), ('finland', 278)]


In [49]:
# Encode sentences: replace every token by its integer id from `sparse_vocab`.
# NOTE(review): `text_train` is not used in this cell - kept in case a later cell fills it.
text_train = []
data_train['text_encoded'] = data_train['text'].apply(
    lambda sentence: [sparse_vocab[word] for word in sentence]
)

# Encode labels as integers with an explicit mapping. `map` is used instead of
# `replace` because replacing strings by ints relies on implicit object -> int
# downcasting, which recent pandas versions deprecate.
# Any sentiment value outside these three keys becomes NaN.
label_ids = {'positive': 1, 'neutral': 0, 'negative': -1}
data_train['label'] = data_train['sentiment'].map(label_ids)

# Check for outliers: token count of each encoded sentence, and the 1st / 99th
# percentiles of that count as lower / upper limits.
data_train['length'] = data_train['text_encoded'].apply(len)
limit_low, limit_high = data_train['length'].quantile(q=[0.01, 0.99]).values



In [55]:
# data_train.iloc[data_train['length'].clip(limit_low, limit_high).index.values]

#### One-Hot Encoding (I'm not using it, just an experiment)

In [262]:
# One-hot encoding with scikit-learn's MultiLabelBinarizer:
# fit the encoder on the tokenised training sentences, then encode those same sentences.
train_tokens = data_train['text']
one_hot_enc = MultiLabelBinarizer()
one_hot_enc.fit(train_tokens)
one_hot_txt = one_hot_enc.transform(train_tokens)

In [263]:
# Toy example: look at how the single word "company" is encoded
print('Number of unique words : ' + str(len(one_hot_enc.classes_)))

# Position of "company" among the classes learned by the encoder
idx = np.flatnonzero(one_hot_enc.classes_ == 'company')[0]
print('Position of "company" : ' + str(idx))

# Encode a one-word sentence once and reuse the result for both prints
company_vec = one_hot_enc.transform([['company']])
print('One Hot representation of "company" : ' + str(company_vec))
print(str(idx) + '-th element of the One Hot encoding of "company" : ' + str(company_vec[0, idx]))

Number of unique words : 8337
Position of "company" : 1342
One Hot representation of "company" : [[0 0 0 ... 0 0 0]]
1342-th element of the One Hot encoding of "company" : 1
