Using TensorFlow backend.


In [1]:
import keras
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
from datetime import timedelta
import numpy as np
import nltk 
from nltk.corpus import stopwords
from sklearn.utils import class_weight
from sklearn.cross_validation import train_test_split


nltk.download()

import pandas as pd
from sagemaker import get_execution_role


#Load data from S3 buckets
role = get_execution_role()
bucket='thesisdatabucketad'
data_key = 'truth_data.csv'
data_location = 's3://{}/{}'.format(bucket, data_key)


bucket2='thesisdatabucketad'
data_key2 = 'model_data.csv'
data_location2 = 's3://{}/{}'.format(bucket2, data_key2)


Using TensorFlow backend.


NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> all
    Downloading collection u'all'
       | 
       | Downloading package abc to /home/ec2-user/nltk_data...
       |   Unzipping corpora/abc.zip.
       | Downloading package alpino to /home/ec2-user/nltk_data...
       |   Unzipping corpora/alpino.zip.
       | Downloading package biocreative_ppi to
       |     /home/ec2-user/nltk_data...
       |   Unzipping corpora/biocreative_ppi.zip.
       | Downloading package brown to /home/ec2-user/nltk_data...
       |   Unzipping corpora/brown.zip.
       | Downloading package brown_tei to /home/ec2-user/nltk_data...
       |   Unzipping corpora/brown_tei.zip.
       | Downloading package cess_cat to /home/ec2-user/nltk_data...
   

       |   Unzipping corpora/twitter_samples.zip.
       | Downloading package udhr to /home/ec2-user/nltk_data...
       |   Unzipping corpora/udhr.zip.
       | Downloading package udhr2 to /home/ec2-user/nltk_data...
       |   Unzipping corpora/udhr2.zip.
       | Downloading package unicode_samples to
       |     /home/ec2-user/nltk_data...
       |   Unzipping corpora/unicode_samples.zip.
       | Downloading package universal_treebanks_v20 to
       |     /home/ec2-user/nltk_data...
       | Downloading package verbnet to /home/ec2-user/nltk_data...
       |   Unzipping corpora/verbnet.zip.
       | Downloading package verbnet3 to /home/ec2-user/nltk_data...
       |   Unzipping corpora/verbnet3.zip.
       | Downloading package webtext to /home/ec2-user/nltk_data...
       |   Unzipping corpora/webtext.zip.
       | Downloading package wordnet to /home/ec2-user/nltk_data...
       |   Unzipping corpora/wordnet.zip.
       | Downloading package wordnet_ic to /home/ec2-user/nltk

In [6]:
df = pd.read_csv(data_location)
df.head()

#remove stpo words from text
stop = set(stopwords.words('english'))
df['text2'] = df['text'].str.lower().str.split()
df['text2'] = df['text2'].apply(lambda x: [item for item in x if item not in stop])


df = df.rename(columns = {'Sent Rating Sean':'sentiment'})

In [7]:

#reset classifications to non-numeric (easier process)
df['new_sent'] = df.sentiment.map({-2:'negative',-1:'negative',0:'neutral',1:'positive',2:'positive'})
df = df[['ticks', 'date','text2', 'new_sent']]

In [8]:
#convert to lists 
input_sentences = [text for text in df["text2"].values.tolist()]
labels = df["new_sent"].values.tolist()


In [9]:
word2id = dict()
label2id = dict()

max_words = 0 # maximum number of words in a sentence

# Construction of word2id dict
for sentence in input_sentences:
    for word in sentence:
        # Add words to word2id dict if not exist
        if word not in word2id:
            word2id[word] = len(word2id)
    # If length of the sentence is greater than max_words, update max_words
    if len(sentence) > max_words:
        max_words = len(sentence)

In [10]:
#turn classifications to numeric values for LSTM to process
label2id = {l: i for i, l in enumerate(set(labels))}
id2label = {v: k for k, v in label2id.items()}
id2label

{0: 'positive', 1: 'neutral', 2: 'negative'}

In [11]:
# Encode input words and labels
X = [[word2id[word] for word in sentence] for sentence in input_sentences]
Y = [label2id[label] for label in labels]


# Apply Padding to X

X = pad_sequences(X, max_words)

# Convert Y to numpy array
Y = keras.utils.to_categorical(Y, num_classes=len(label2id))

# Print shapes
print("Shape of X: {}".format(X.shape))
print("Shape of Y: {}".format(Y.shape))


Shape of X: (2500, 89)
Shape of Y: (2500, 3)


In [12]:
#Balance training data due to large amount of bias
Z = Y[:,0]
weight = class_weight.compute_class_weight('balanced', np.unique(Z), Z)


In [13]:
embedding_dim = 100 

# Input Tensor
sequence_input = keras.Input(shape=(max_words,), dtype='int32')

# Word embedding
embedded_inputs =keras.layers.Embedding(len(word2id) + 1,
                                        embedding_dim,
                                        input_length=max_words)(sequence_input)

# Apply dropout to prevent overfitting
embedded_inputs = keras.layers.Dropout(0.1)(embedded_inputs)

# Apply Bidirectional LSTM over embedded inputs
lstm_outs = keras.layers.wrappers.Bidirectional(
    keras.layers.LSTM(embedding_dim, return_sequences=True)
)(embedded_inputs)

# Apply dropout to LSTM outputs to prevent overfitting
lstm_outs = keras.layers.Dropout(0.1)(lstm_outs)

# Attention Mechanism - Generate attention vectors
input_dim = int(lstm_outs.shape[2])
permuted_inputs = keras.layers.Permute((2, 1))(lstm_outs)
attention_vector = keras.layers.TimeDistributed(keras.layers.Dense(1))(lstm_outs)
attention_vector = keras.layers.Reshape((max_words,))(attention_vector)
attention_vector = keras.layers.Activation('softmax', name='attention_vec')(attention_vector)
attention_output = keras.layers.Dot(axes=1)([lstm_outs, attention_vector])

# Last layer: softmax activation
fc = keras.layers.Dense(embedding_dim, activation='relu')(attention_output)
output = keras.layers.Dense(len(label2id), activation='softmax')(fc)

# Building model
model = keras.Model(inputs=[sequence_input], outputs=output)
model.compile(loss="categorical_crossentropy", metrics=['accuracy'], optimizer='adam')

# Model summary
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 89)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 89, 100)      867700      input_1[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 89, 100)      0           embedding_1[0][0]                
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 89, 200)      160800      dropout_1[0][0]                  
__________________________________________________________________________________________________
dropout_2 

In [14]:
#Split data and fit model
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=123)

model.fit(X_train, y_train, epochs=3, batch_size=32, validation_split=0.1, shuffle=True, class_weight= weight)

Train on 2025 samples, validate on 225 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f065854bf90>

In [58]:
#testing outputs of y_test x_test (visual check of data) 
test = model.predict(X_test)
dftest = pd.DataFrame(test)
print(dftest[0].round())
print(pd.DataFrame(y_test))

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
5      0.0
6      0.0
7      0.0
8      0.0
9      0.0
10     0.0
11     0.0
12     0.0
13     1.0
14     0.0
15     0.0
16     0.0
17     0.0
18     0.0
19     0.0
20     0.0
21     0.0
22     0.0
23     0.0
24     1.0
25     0.0
26     0.0
27     0.0
28     0.0
29     0.0
      ... 
220    0.0
221    0.0
222    1.0
223    0.0
224    0.0
225    0.0
226    0.0
227    0.0
228    0.0
229    0.0
230    0.0
231    0.0
232    0.0
233    0.0
234    0.0
235    0.0
236    0.0
237    0.0
238    0.0
239    1.0
240    0.0
241    0.0
242    0.0
243    0.0
244    0.0
245    0.0
246    0.0
247    0.0
248    0.0
249    0.0
Name: 0, Length: 250, dtype: float32
       0    1    2
0    0.0  1.0  0.0
1    0.0  1.0  0.0
2    0.0  1.0  0.0
3    0.0  0.0  1.0
4    0.0  1.0  0.0
5    0.0  1.0  0.0
6    0.0  1.0  0.0
7    0.0  1.0  0.0
8    0.0  1.0  0.0
9    1.0  0.0  0.0
10   0.0  1.0  0.0
11   0.0  1.0  0.0
12   0.0  1.0  0.0
13   1.0  0.0  0.0
14   0.

In [59]:
#Confusion Matrix Validation
from sklearn.metrics import confusion_matrix
pred = np.argmax(test, axis =1)
y_test2 = np.argmax(y_test, axis = 1)

cm = confusion_matrix(y_test2, pred)
np.set_printoptions(precision=2)
print(cm)

[[ 21  27   0]
 [ 23 154   0]
 [  7  18   0]]


The above results yeild weak recall for Negative sentiment classification. 

In [22]:
#earnings calls dataset
df_calls = pd.read_csv(data_location2)
df_calls = df_calls[['ticker','date','tokenized']]
df_calls.head()

Unnamed: 0,ticker,date,tokenized
0,AMZN,02/02/2017,"Good day, everyone, and welcome to the Amazon...."
1,AMZN,02/02/2017,"At this time, all participants are in a listen..."
2,AMZN,02/02/2017,"After the presentation, we will conduct a ques..."
3,AMZN,02/02/2017,Today's call is being recorded.For opening rem...
4,AMZN,02/02/2017,"Please, go ahead.Darin Manney Amazon.com, In..."


In [23]:
#remove stop words from text
df_calls['tokenized'] = df_calls['tokenized'].str.lower().str.split()
df_calls['tokenized'] = df_calls['tokenized'].apply(lambda x: [item for item in x if item not in stop])
df_calls.head()

Unnamed: 0,ticker,date,tokenized
0,AMZN,02/02/2017,"[good, day,, everyone,, welcome, amazon.com, q..."
1,AMZN,02/02/2017,"[time,, participants, listen, mode.]"
2,AMZN,02/02/2017,"[presentation,, conduct, question, answer, ses..."
3,AMZN,02/02/2017,"[today's, call, recorded.for, opening, remarks..."
4,AMZN,02/02/2017,"[please,, go, ahead.darin, manney, amazon.com,..."


In [165]:
#prepare data for keras model. 
input_sentences2 = [text for text in df_calls["tokenized"].values.tolist()]

word2id2 = dict()

max_words2 = 0 # maximum number of words in a sentence

# Construction of word2id dict
for sent in input_sentences2:
    for word in sent:
        if word not in word2id2:
            word2id2[word] = len(word2id2)
    if len(sent) > max_words2:
        max_words2 = len(sent)

In [166]:

X2 = [[word2id2[word] for word in sent] for sentence in input_sentences2]

X2 = pad_sequences(X2, max_words)
print("Shape of X2: {}".format(X2.shape))

Shape of X2: (1919467, 89)


In [None]:
#predict data (Tested LSTM against full dataset anyway)
prediction = model.predict(X2)


[[9.9979931e-01 3.3908800e-12 2.3848116e-13 ... 8.8095904e-13
  4.8229337e-12 2.0069852e-04]
 [9.9979931e-01 3.3908800e-12 2.3848116e-13 ... 8.8095904e-13
  4.8229337e-12 2.0069852e-04]
 [9.9979931e-01 3.3908800e-12 2.3848116e-13 ... 8.8095904e-13
  4.8229337e-12 2.0069852e-04]
 ...
 [9.9979931e-01 3.3908800e-12 2.3848116e-13 ... 8.8095904e-13
  4.8229337e-12 2.0069852e-04]
 [9.9979931e-01 3.3908800e-12 2.3848116e-13 ... 8.8095904e-13
  4.8229337e-12 2.0069852e-04]
 [9.9979931e-01 3.3908800e-12 2.3848116e-13 ... 8.8095904e-13
  4.8229337e-12 2.0069852e-04]]
