In [1]:
from numpy.random import seed
seed(4)

In [2]:
import os
import numpy as np
import pandas as pd

from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

In [3]:
training_data_path = os.path.join('sentimentsamp.xls')
comp_data_path = os.path.join('complete_merged_dataset.csv')

alltrainingdata_df = pd.read_excel(training_data_path)
allcompdata_df = pd.read_csv(comp_data_path)

alltrainingdata_df.head()

Unnamed: 0,id,sentiment,sentence,ticker,call_title,speaker,call_section
0,1,3,All participants will be in a listen-only mode.,CMG,"Chipotle Mexican Grill, Inc. (NYSE:CMG) Q3 201...",Operator,operator_instruction
1,3,3,[Operator Instructions] Please note this event...,CMG,"Chipotle Mexican Grill, Inc. (NYSE:CMG) Q3 201...",Operator,operator_instruction
2,4,3,I would now like to turn the conference over t...,CMG,"Chipotle Mexican Grill, Inc. (NYSE:CMG) Q3 201...",Operator,operator_instruction
3,5,3,Please go ahead.,CMG,"Chipotle Mexican Grill, Inc. (NYSE:CMG) Q3 201...",Operator,operator_instruction
4,6,3,"Hello, everyone, and welcome to our third qua...",CMG,"Chipotle Mexican Grill, Inc. (NYSE:CMG) Q3 201...",Ashish Kohli,presentation


In [43]:
# shift sentiment scores down by 1 to establish a 0-based scale. this makes the 'to_categorical' ca
y_train_df = pd.DataFrame(alltrainingdata_df['sentiment'] - 1)

y_train_categorical = to_categorical(y_train_df)

y_train_categorical[:10]

array([[0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.]], dtype=float32)

In [44]:
temp = pd.DataFrame(y_train_categorical)
temp.describe()

Unnamed: 0,0,1,2,3,4
count,3000.0,3000.0,3000.0,3000.0,3000.0
mean,0.023667,0.102,0.629333,0.212667,0.032333
std,0.152034,0.302699,0.483064,0.409262,0.176913
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,1.0,0.0,0.0
75%,0.0,0.0,1.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0


In [5]:
X_train_df = pd.DataFrame(alltrainingdata_df['sentence'])
X_comp_df = pd.DataFrame(allcompdata_df['sentence'])

X_comp_df.head()

Unnamed: 0,sentence
0,Good morning and welcome to the Dominion Ener...
1,"At this time, each of your lines is in a liste..."
2,"At the conclusion of today’s presentation, we ..."
3,[Operator Instructions] I would now like to tu...
4,Good morning and welcome.


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_train_df, y_train_categorical, test_size=0.25,random_state=109)

In [7]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(X_train['sentence'])]

In [18]:
max_epochs = 100
vec_size = 100  #20
alpha = 0.025
model2 = Doc2Vec(size=vec_size,
               alpha=alpha,
               min_alpha=0.00025,
               min_count=1,
               dm =0)
model2.build_vocab(tagged_data)
for epoch in range(max_epochs):
   print('iteration {0}'.format(epoch))
   model2.train(tagged_data,
               total_examples=model2.corpus_count,
               epochs=model2.iter)
   # decrease the learning rate
   model2.alpha -= 0.0002
   # fix the learning rate, no decay
   model2.min_alpha = model2.alpha
model2.save('d2v.model')
print('Model Saved')



iteration 0


  


iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration 77
iteratio

In [19]:
vectored_list = []
for row in X_train['sentence']:
    word_vec = word_tokenize(row.lower())
    sent_vec = model2.infer_vector(word_vec)
    vectored_list.append(sent_vec)

vectored_test = []
for row in X_test['sentence']:
    word_vec = word_tokenize(row.lower())
    sent_vec = model2.infer_vector(word_vec)
    vectored_test.append(sent_vec)

complete_list = []
for row in X_comp_df['sentence']:
    word_vec = word_tokenize(row.lower())
    sent_vec = model2.infer_vector(word_vec)
    complete_list.append(sent_vec)
complete_list

[array([ 0.09534958,  0.12362999,  0.08797708,  0.07517424, -0.01748127,
        -0.01571908,  0.04217381,  0.08824681, -0.00842437, -0.01111273,
        -0.12383788,  0.00636612,  0.0840239 ,  0.0690809 ,  0.01778505,
        -0.01446245, -0.07622142,  0.02315907, -0.09666924,  0.04858153,
         0.09204348,  0.0484802 ,  0.03609145, -0.00408379, -0.10481262,
        -0.06605194, -0.02295987, -0.04907992, -0.07698714,  0.03522179,
         0.02217733, -0.04558641, -0.01234356,  0.08636804,  0.06024797,
        -0.10422048, -0.15355004, -0.10251132,  0.01593729,  0.01094206,
        -0.03653228,  0.01882618, -0.01701333,  0.00964052,  0.1225697 ,
         0.08581571,  0.06790363,  0.13177305, -0.02077999, -0.01944217,
        -0.18071274, -0.05976663, -0.0567357 , -0.02342262, -0.01895306,
         0.05417015, -0.05764448,  0.01130051,  0.05586588,  0.01781254,
        -0.03912186,  0.1788402 , -0.03423815, -0.10972805,  0.07714941,
        -0.05828052,  0.05278726, -0.0468566 ,  0.0

In [20]:
vectored_list = np.array(vectored_list)
new_vectored_list = vectored_list[:2250]

vectored_test = np.array(vectored_test)
new_vectored_test = vectored_test[:750]

complete_list = np.array(complete_list)
new_complete = complete_list#[:42940]

print(new_vectored_list.shape, new_vectored_test.shape, new_complete.shape)

(2250, 100) (750, 100) (42941, 100)


In [21]:
model = Sequential()
number_inputs = 100
number_hidden_nodes = 250
model.add(Dense(units=number_hidden_nodes, activation='relu', input_dim=number_inputs))
model.add(Dense(units=number_hidden_nodes*2, activation='relu'))

number_classes = 5
model.add(Dense(units=number_classes, activation='softmax'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 250)               25250     
_________________________________________________________________
dense_3 (Dense)              (None, 500)               125500    
_________________________________________________________________
dense_4 (Dense)              (None, 5)                 2505      
Total params: 153,255
Trainable params: 153,255
Non-trainable params: 0
_________________________________________________________________


In [22]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.fit(new_vectored_list, y_train, epochs=150, shuffle=True, verbose=2)

ValueError: A target array with shape (2250, 6) was passed for an output of shape (None, 5) while using as loss `categorical_crossentropy`. This loss expects targets to have the same shape as the output.

In [13]:
model_loss, model_accuracy = model.evaluate(new_vectored_test, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

750/1 - 0s - loss: 0.6737 - accuracy: 0.6760
Loss: 0.8273717204729716, Accuracy: 0.6759999990463257


In [14]:
model_results = model.predict_classes(new_complete)

In [15]:
allcompdata_df['model_results'] = model_results
allcompdata_df.head(30)

Unnamed: 0.1,Unnamed: 0,call_datetime,Unnamed: 0_x,id,sentiment,sentence,ticker,call_title,speaker,call_section,...,next_date,base_close,base_volume,next_close,next_volume,Unnamed: 0_y,marketCap,percent_change_price,percent_change_volume,model_results
0,0,2019-11-01 10:00:00,457,457,,Good morning and welcome to the Dominion Ener...,D,"Dominion Energy, Inc. (NYSE:D) Q3 2019 Earning...",Operator,operator_instruction,...,2019-11-01,82.550003,2424000.0,83.269997,3169600.0,D,67584167936,0.872191,30.759076,3
1,1,2019-11-01 10:00:00,458,458,,"At this time, each of your lines is in a liste...",D,"Dominion Energy, Inc. (NYSE:D) Q3 2019 Earning...",Operator,operator_instruction,...,2019-11-01,82.550003,2424000.0,83.269997,3169600.0,D,67584167936,0.872191,30.759076,3
2,2,2019-11-01 10:00:00,459,459,,"At the conclusion of today’s presentation, we ...",D,"Dominion Energy, Inc. (NYSE:D) Q3 2019 Earning...",Operator,operator_instruction,...,2019-11-01,82.550003,2424000.0,83.269997,3169600.0,D,67584167936,0.872191,30.759076,3
3,3,2019-11-01 10:00:00,460,460,,[Operator Instructions] I would now like to tu...,D,"Dominion Energy, Inc. (NYSE:D) Q3 2019 Earning...",Operator,operator_instruction,...,2019-11-01,82.550003,2424000.0,83.269997,3169600.0,D,67584167936,0.872191,30.759076,3
4,4,2019-11-01 10:00:00,461,461,,Good morning and welcome.,D,"Dominion Energy, Inc. (NYSE:D) Q3 2019 Earning...",Steven Ridge,presentation,...,2019-11-01,82.550003,2424000.0,83.269997,3169600.0,D,67584167936,0.872191,30.759076,3
5,5,2019-11-01 10:00:00,462,462,,I encourage you to visit our Investor Relation...,D,"Dominion Energy, Inc. (NYSE:D) Q3 2019 Earning...",Steven Ridge,presentation,...,2019-11-01,82.550003,2424000.0,83.269997,3169600.0,D,67584167936,0.872191,30.759076,3
6,6,2019-11-01 10:00:00,463,463,,The Investor Relations team will be available ...,D,"Dominion Energy, Inc. (NYSE:D) Q3 2019 Earning...",Steven Ridge,presentation,...,2019-11-01,82.550003,2424000.0,83.269997,3169600.0,D,67584167936,0.872191,30.759076,3
7,7,2019-11-01 10:00:00,464,464,,Earnings materials including our prepared rema...,D,"Dominion Energy, Inc. (NYSE:D) Q3 2019 Earning...",Steven Ridge,presentation,...,2019-11-01,82.550003,2424000.0,83.269997,3169600.0,D,67584167936,0.872191,30.759076,3
8,8,2019-11-01 10:00:00,465,465,,"Please refer to our SEC filings, including our...",D,"Dominion Energy, Inc. (NYSE:D) Q3 2019 Earning...",Steven Ridge,presentation,...,2019-11-01,82.550003,2424000.0,83.269997,3169600.0,D,67584167936,0.872191,30.759076,3
9,9,2019-11-01 10:00:00,466,466,,"This morning, we will discuss some measures of...",D,"Dominion Energy, Inc. (NYSE:D) Q3 2019 Earning...",Steven Ridge,presentation,...,2019-11-01,82.550003,2424000.0,83.269997,3169600.0,D,67584167936,0.872191,30.759076,3


In [16]:
allcompdata_df.describe()

Unnamed: 0.1,Unnamed: 0,Unnamed: 0_x,id,sentiment,vader_sentiment,base_close,base_volume,next_close,next_volume,marketCap,percent_change_price,percent_change_volume,model_results
count,42941.0,42941.0,42941.0,0.0,42941.0,42941.0,42941.0,42941.0,42941.0,42941.0,42941.0,42941.0,42941.0
mean,23556.465895,18447.076407,18447.076407,,0.2598,143.517541,6102612.0,139.94265,13198340.0,51018300000.0,-0.341603,98.786692,3.137584
std,16081.709925,12520.269232,12520.269232,,0.330205,265.461199,11403930.0,246.591195,29333280.0,60921870000.0,5.707717,127.013049,0.366289
min,0.0,0.0,0.0,,-0.9493,9.07,228000.0,10.11,413500.0,5205218000.0,-20.808657,-30.40586,2.0
25%,10735.0,6440.0,6440.0,,0.0,42.650002,1473700.0,43.68,2355400.0,12797130000.0,-3.140449,30.74141,3.0
50%,21470.0,17778.0,17778.0,,0.2732,82.730003,2525500.0,84.559998,4400500.0,23865130000.0,0.439476,65.650464,3.0
75%,32205.0,29037.0,29037.0,,0.4939,141.029999,6106800.0,138.119995,10707400.0,56882010000.0,2.937574,124.076569,3.0
max,64976.0,40712.0,40712.0,,0.993,2012.089966,78656000.0,1849.930054,215943400.0,296209000000.0,13.692753,771.327086,4.0


In [17]:
allcompdata_df[allcompdata_df['model_results']==2]

Unnamed: 0.1,Unnamed: 0,call_datetime,Unnamed: 0_x,id,sentiment,sentence,ticker,call_title,speaker,call_section,...,next_date,base_close,base_volume,next_close,next_volume,Unnamed: 0_y,marketCap,percent_change_price,percent_change_volume,model_results
40,40,2019-11-01 10:00:00,497,497,,"The updated ROE will impact in the near-term, ...",D,"Dominion Energy, Inc. (NYSE:D) Q3 2019 Earning...",James Chapman,presentation,...,2019-11-01,82.550003,2424000.0,83.269997,3169600.0,D,67584167936,0.872191,30.759076,2
128,128,2019-11-01 10:00:00,585,585,,"However, even if the timing of the BO reissues...",D,"Dominion Energy, Inc. (NYSE:D) Q3 2019 Earning...",Thomas Farrell,presentation,...,2019-11-01,82.550003,2424000.0,83.269997,3169600.0,D,67584167936,0.872191,30.759076,2
205,205,2019-11-01 10:00:00,662,662,,"We have a little bit of exposure, mostly aroun...",D,"Dominion Energy, Inc. (NYSE:D) Q3 2019 Earning...",Paul Koonce,answer,...,2019-11-01,82.550003,2424000.0,83.269997,3169600.0,D,67584167936,0.872191,30.759076,2
549,549,2019-10-29 09:00:00,1006,1006,,"At the same time, we've sold $2.675 billion of...",WELL,Welltower Inc. (NYSE:WELL) Q3 2019 Earnings Co...,Shankh Mitra,presentation,...,2019-10-29,87.040001,3518600.0,89.440002,3377300.0,WELL,34083141632,2.757355,-4.015802,2
577,577,2019-10-29 09:00:00,1034,1034,,"So Vikram, it's - I'm not going to get into to...",WELL,Welltower Inc. (NYSE:WELL) Q3 2019 Earnings Co...,Thomas DeRosa,answer,...,2019-10-29,87.040001,3518600.0,89.440002,3377300.0,WELL,34083141632,2.757355,-4.015802,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41343,56624,2019-10-22 16:30:00,26719,26719,,The higher average check includes a price impa...,CMG,"Chipotle Mexican Grill, Inc. (NYSE:CMG) Q3 201...",Jack Hartung,presentation,...,2019-10-23,831.070007,1154200.0,788.190002,1779800.0,CMG,20975925248,-5.159614,54.202045,2
41666,59208,2019-10-22 16:30:00,27042,27042,,"Wondering on kind of looking out, obviously, t...",CMG,"Chipotle Mexican Grill, Inc. (NYSE:CMG) Q3 201...",Andy Barish,question,...,2019-10-23,831.070007,1154200.0,788.190002,1779800.0,CMG,20975925248,-5.159614,54.202045,2
41860,60760,2019-10-22 16:30:00,74,74,,This deferral is lower than previous quarters ...,CMG,"Chipotle Mexican Grill, Inc. (NYSE:CMG) Q3 201...",Jack Hartung,presentation,...,2019-10-23,831.070007,1154200.0,788.190002,1779800.0,CMG,20975925248,-5.159614,54.202045,2
42360,64396,2019-10-24 16:30:00,36264,36264,,Operating cash flow for the third quarter was ...,VRSN,"VeriSign, Inc. (NASDAQ:VRSN) Q3 2019 Results E...",George Kilguss,presentation,...,2019-10-25,185.729996,526800.0,190.160004,1151700.0,VRSN,21702838272,2.385187,118.621868,2
