In [1]:
from numpy.random import seed
seed(4)

In [2]:
import os
import numpy as np
import pandas as pd

from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

In [3]:
training_data_path = os.path.join('sentimentsamp.xls')
comp_data_path = os.path.join('complete_merged_dataset.csv')

alltrainingdata_df = pd.read_excel(training_data_path)
allcompdata_df = pd.read_csv(comp_data_path)

alltrainingdata_df.head()

Unnamed: 0,id,sentiment,sentence,ticker,call_title,speaker,call_section
0,1,3,All participants will be in a listen-only mode.,CMG,"Chipotle Mexican Grill, Inc. (NYSE:CMG) Q3 201...",Operator,operator_instruction
1,3,3,[Operator Instructions] Please note this event...,CMG,"Chipotle Mexican Grill, Inc. (NYSE:CMG) Q3 201...",Operator,operator_instruction
2,4,3,I would now like to turn the conference over t...,CMG,"Chipotle Mexican Grill, Inc. (NYSE:CMG) Q3 201...",Operator,operator_instruction
3,5,3,Please go ahead.,CMG,"Chipotle Mexican Grill, Inc. (NYSE:CMG) Q3 201...",Operator,operator_instruction
4,6,3,"Hello, everyone, and welcome to our third qua...",CMG,"Chipotle Mexican Grill, Inc. (NYSE:CMG) Q3 201...",Ashish Kohli,presentation


In [4]:
y_train_df = pd.DataFrame(alltrainingdata_df['sentiment'])

y_train_categorical = to_categorical(y_train_df)

y_train_categorical[:10]

array([[0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.]], dtype=float32)

In [5]:
X_train_df = pd.DataFrame(alltrainingdata_df['sentence'])
X_comp_df = pd.DataFrame(allcompdata_df['sentence'])

X_comp_df.head()

Unnamed: 0,sentence
0,Good morning and welcome to the Dominion Ener...
1,"At this time, each of your lines is in a liste..."
2,"At the conclusion of today’s presentation, we ..."
3,[Operator Instructions] I would now like to tu...
4,Good morning and welcome.


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_train_df, y_train_categorical, test_size=0.25,random_state=109)

In [7]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(X_train['sentence'])]

In [8]:
max_epochs = 100
vec_size = 20
alpha = 0.025
model2 = Doc2Vec(size=vec_size,
               alpha=alpha,
               min_alpha=0.00025,
               min_count=1,
               dm =0)
model2.build_vocab(tagged_data)
for epoch in range(max_epochs):
   print('iteration {0}'.format(epoch))
   model2.train(tagged_data,
               total_examples=model2.corpus_count,
               epochs=model2.iter)
   # decrease the learning rate
   model2.alpha -= 0.0002
   # fix the learning rate, no decay
   model2.min_alpha = model2.alpha
model2.save('d2v.model')
print('Model Saved')

  


iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration

In [16]:
vectored_list = []
for row in X_train['sentence']:
    word_vec = word_tokenize(row.lower())
    sent_vec = model2.infer_vector(word_vec)
    vectored_list.append(sent_vec)

vectored_test = []
for row in X_test['sentence']:
    word_vec = word_tokenize(row.lower())
    sent_vec = model2.infer_vector(word_vec)
    vectored_test.append(sent_vec)

complete_list = []
for row in X_comp_df['sentence']:
    word_vec = word_tokenize(row.lower())
    sent_vec = model2.infer_vector(word_vec)
    complete_list.append(sent_vec)
complete_list

[array([ 0.01736411, -0.21470611, -0.23160847, -0.02604425,  0.11554756,
        -0.02481574,  0.01001908,  0.03716357,  0.06039271,  0.04458696,
         0.17747737,  0.07227363, -0.05892136,  0.1206639 , -0.10001837,
         0.00180762,  0.4122296 ,  0.17521496, -0.15385212,  0.01480173],
       dtype=float32),
 array([ 0.09999196, -0.13239759, -0.12271192, -0.1463715 , -0.05530236,
         0.0397372 ,  0.02792274,  0.07906936, -0.04758149,  0.13696338,
        -0.03559607,  0.06518657, -0.02472934, -0.13578907, -0.16283047,
         0.10979559,  0.29341108,  0.13416572, -0.10348592,  0.01023454],
       dtype=float32),
 array([ 0.11879861, -0.05840684, -0.07637407, -0.10983685,  0.05379957,
        -0.08036621,  0.09266077,  0.2454859 , -0.17188428,  0.0483309 ,
         0.09418663,  0.11570016, -0.01115066, -0.12561017, -0.08594596,
        -0.04074367,  0.41477856,  0.02181136, -0.13415053,  0.05798939],
       dtype=float32),
 array([-0.09095382, -0.10214263, -0.28231913, -0.19

In [32]:
vectored_list = np.array(vectored_list)
new_vectored_list = vectored_list[:2250]

vectored_test = np.array(vectored_test)
new_vectored_test = vectored_test[:750]

complete_list = np.array(complete_list)
new_complete = complete_list#[:42940]

print(new_vectored_list.shape, new_vectored_test.shape, new_complete.shape)

(2250, 20) (750, 20) (42941, 20)


In [23]:
model = Sequential()
number_inputs = 20
number_hidden_nodes = 2
model.add(Dense(units=number_hidden_nodes, activation='relu', input_dim=number_inputs))

number_classes = 6
model.add(Dense(units=number_classes, activation='softmax'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 2)                 42        
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 18        
Total params: 60
Trainable params: 60
Non-trainable params: 0
_________________________________________________________________


In [24]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.fit(new_vectored_list, y_train, epochs=150, shuffle=True, verbose=2)

Train on 2250 samples
Epoch 1/150
2250/2250 - 1s - loss: 1.7321 - accuracy: 0.5253
Epoch 2/150
2250/2250 - 0s - loss: 1.5923 - accuracy: 0.6067
Epoch 3/150
2250/2250 - 0s - loss: 1.4382 - accuracy: 0.6178
Epoch 4/150
2250/2250 - 0s - loss: 1.3195 - accuracy: 0.6187
Epoch 5/150
2250/2250 - 0s - loss: 1.2424 - accuracy: 0.6187
Epoch 6/150
2250/2250 - 0s - loss: 1.1956 - accuracy: 0.6187
Epoch 7/150
2250/2250 - 0s - loss: 1.1646 - accuracy: 0.6187
Epoch 8/150
2250/2250 - 0s - loss: 1.1422 - accuracy: 0.6187
Epoch 9/150
2250/2250 - 0s - loss: 1.1249 - accuracy: 0.6187
Epoch 10/150
2250/2250 - 0s - loss: 1.1105 - accuracy: 0.6187
Epoch 11/150
2250/2250 - 0s - loss: 1.0982 - accuracy: 0.6187
Epoch 12/150
2250/2250 - 0s - loss: 1.0873 - accuracy: 0.6187
Epoch 13/150
2250/2250 - 0s - loss: 1.0773 - accuracy: 0.6187
Epoch 14/150
2250/2250 - 0s - loss: 1.0681 - accuracy: 0.6187
Epoch 15/150
2250/2250 - 0s - loss: 1.0592 - accuracy: 0.6187
Epoch 16/150
2250/2250 - 0s - loss: 1.0510 - accuracy: 0.

Epoch 133/150
2250/2250 - 0s - loss: 0.8518 - accuracy: 0.6658
Epoch 134/150
2250/2250 - 0s - loss: 0.8516 - accuracy: 0.6631
Epoch 135/150
2250/2250 - 0s - loss: 0.8514 - accuracy: 0.6622
Epoch 136/150
2250/2250 - 0s - loss: 0.8513 - accuracy: 0.6653
Epoch 137/150
2250/2250 - 0s - loss: 0.8511 - accuracy: 0.6640
Epoch 138/150
2250/2250 - 0s - loss: 0.8513 - accuracy: 0.6649
Epoch 139/150
2250/2250 - 0s - loss: 0.8511 - accuracy: 0.6618
Epoch 140/150
2250/2250 - 0s - loss: 0.8511 - accuracy: 0.6622
Epoch 141/150
2250/2250 - 0s - loss: 0.8509 - accuracy: 0.6636
Epoch 142/150
2250/2250 - 0s - loss: 0.8508 - accuracy: 0.6649
Epoch 143/150
2250/2250 - 0s - loss: 0.8505 - accuracy: 0.6644
Epoch 144/150
2250/2250 - 0s - loss: 0.8505 - accuracy: 0.6653
Epoch 145/150
2250/2250 - 0s - loss: 0.8507 - accuracy: 0.6636
Epoch 146/150
2250/2250 - 0s - loss: 0.8504 - accuracy: 0.6662
Epoch 147/150
2250/2250 - 0s - loss: 0.8501 - accuracy: 0.6636
Epoch 148/150
2250/2250 - 0s - loss: 0.8500 - accuracy:

<tensorflow.python.keras.callbacks.History at 0x1e99331da58>

In [25]:
model_loss, model_accuracy = model.evaluate(new_vectored_test, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

750/1 - 0s - loss: 0.6784 - accuracy: 0.6707
Loss: 0.8294264671007792, Accuracy: 0.6706666946411133


In [37]:
model_results = model.predict_classes(new_complete)

In [38]:
allcompdata_df['model_results'] = model_results
allcompdata_df.head(30)

Unnamed: 0.1,Unnamed: 0,call_datetime,Unnamed: 0_x,id,sentiment,sentence,ticker,call_title,speaker,call_section,...,next_date,base_close,base_volume,next_close,next_volume,Unnamed: 0_y,marketCap,percent_change_price,percent_change_volume,model_results
0,0,2019-11-01 10:00:00,457,457,,Good morning and welcome to the Dominion Ener...,D,"Dominion Energy, Inc. (NYSE:D) Q3 2019 Earning...",Operator,operator_instruction,...,2019-11-01,82.550003,2424000.0,83.269997,3169600.0,D,67584167936,0.872191,30.759076,3
1,1,2019-11-01 10:00:00,458,458,,"At this time, each of your lines is in a liste...",D,"Dominion Energy, Inc. (NYSE:D) Q3 2019 Earning...",Operator,operator_instruction,...,2019-11-01,82.550003,2424000.0,83.269997,3169600.0,D,67584167936,0.872191,30.759076,3
2,2,2019-11-01 10:00:00,459,459,,"At the conclusion of today’s presentation, we ...",D,"Dominion Energy, Inc. (NYSE:D) Q3 2019 Earning...",Operator,operator_instruction,...,2019-11-01,82.550003,2424000.0,83.269997,3169600.0,D,67584167936,0.872191,30.759076,3
3,3,2019-11-01 10:00:00,460,460,,[Operator Instructions] I would now like to tu...,D,"Dominion Energy, Inc. (NYSE:D) Q3 2019 Earning...",Operator,operator_instruction,...,2019-11-01,82.550003,2424000.0,83.269997,3169600.0,D,67584167936,0.872191,30.759076,3
4,4,2019-11-01 10:00:00,461,461,,Good morning and welcome.,D,"Dominion Energy, Inc. (NYSE:D) Q3 2019 Earning...",Steven Ridge,presentation,...,2019-11-01,82.550003,2424000.0,83.269997,3169600.0,D,67584167936,0.872191,30.759076,3
5,5,2019-11-01 10:00:00,462,462,,I encourage you to visit our Investor Relation...,D,"Dominion Energy, Inc. (NYSE:D) Q3 2019 Earning...",Steven Ridge,presentation,...,2019-11-01,82.550003,2424000.0,83.269997,3169600.0,D,67584167936,0.872191,30.759076,3
6,6,2019-11-01 10:00:00,463,463,,The Investor Relations team will be available ...,D,"Dominion Energy, Inc. (NYSE:D) Q3 2019 Earning...",Steven Ridge,presentation,...,2019-11-01,82.550003,2424000.0,83.269997,3169600.0,D,67584167936,0.872191,30.759076,3
7,7,2019-11-01 10:00:00,464,464,,Earnings materials including our prepared rema...,D,"Dominion Energy, Inc. (NYSE:D) Q3 2019 Earning...",Steven Ridge,presentation,...,2019-11-01,82.550003,2424000.0,83.269997,3169600.0,D,67584167936,0.872191,30.759076,3
8,8,2019-11-01 10:00:00,465,465,,"Please refer to our SEC filings, including our...",D,"Dominion Energy, Inc. (NYSE:D) Q3 2019 Earning...",Steven Ridge,presentation,...,2019-11-01,82.550003,2424000.0,83.269997,3169600.0,D,67584167936,0.872191,30.759076,3
9,9,2019-11-01 10:00:00,466,466,,"This morning, we will discuss some measures of...",D,"Dominion Energy, Inc. (NYSE:D) Q3 2019 Earning...",Steven Ridge,presentation,...,2019-11-01,82.550003,2424000.0,83.269997,3169600.0,D,67584167936,0.872191,30.759076,3
