In [1]:
# Seed every random number generator this notebook relies on, before anything else runs.
from numpy.random import seed
seed(42)

# Keras layers draw their initial weights from TensorFlow's own generator,
# which the NumPy seed above does not touch.
import tensorflow as tf
tf.random.set_seed(42)

In [2]:
import os
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense

from sklearn.preprocessing import StandardScaler

In [3]:
import numpy as np
import pandas as pd
import math

In [4]:
# Relative workbook names, so both files are looked up in the current working directory.
training_data_path, test_data_path = (
    os.path.join(workbook_name)
    for workbook_name in ('sentimentsamp.xls', 'sentimentsamp2.xls')
)

# Read one frame for fitting and one for evaluation.
alltrainingdata_df, alltestdata_df = map(
    pd.read_excel, (training_data_path, test_data_path)
)

# Preview the first test rows.
alltestdata_df.head()

Unnamed: 0,id,sentiment,sentence,ticker,call_title,speaker,call_section
0,3431,3,"So I wanted to ask you, do you agree with that...",SBUX,Starbucks Corporation (NASDAQ:SBUX) Q4 2019 Ea...,David Palmer,question
1,3432,3,And how do you think those will play out in fi...,SBUX,Starbucks Corporation (NASDAQ:SBUX) Q4 2019 Ea...,David Palmer,question
2,3434,3,Yes.,SBUX,Starbucks Corporation (NASDAQ:SBUX) Q4 2019 Ea...,Kevin Johnson,answer
3,3438,3,A lot of that driven with our China digital pa...,SBUX,Starbucks Corporation (NASDAQ:SBUX) Q4 2019 Ea...,Kevin Johnson,answer
4,3440,3,We launched Starbucks Delivers.,SBUX,Starbucks Corporation (NASDAQ:SBUX) Q4 2019 Ea...,Kevin Johnson,answer


In [5]:
# Pull the label column out of each split as a single-column frame.
y_train_df = alltrainingdata_df['sentiment'].to_frame()
y_test_df = alltestdata_df['sentiment'].to_frame()

y_test_df.head()

Unnamed: 0,sentiment
0,3
1,3
2,3
3,3
4,3


In [6]:
# Pull the raw sentence text out of each split as a single-column frame.
X_train_df = alltrainingdata_df['sentence'].to_frame()
X_test_df = alltestdata_df['sentence'].to_frame()

X_test_df.head()

Unnamed: 0,sentence
0,"So I wanted to ask you, do you agree with that..."
1,And how do you think those will play out in fi...
2,Yes.
3,A lot of that driven with our China digital pa...
4,We launched Starbucks Delivers.


In [7]:
# Array view of the sentence frames (one column each).
X_train = X_train_df.to_numpy()
X_test = X_test_df.to_numpy()

X_test.shape

(1000, 1)

In [8]:
# One-hot encode the integer sentiment labels.
# Without num_classes, to_categorical sizes its output as max(label) + 1
# separately for each call, so the two splits would get a different number of
# columns whenever the largest label is absent from one of them. Use one
# shared width taken from both splits.
n_label_columns = int(max(y_train_df['sentiment'].max(),
                          y_test_df['sentiment'].max())) + 1
y_train = to_categorical(y_train_df, num_classes=n_label_columns)
y_test = to_categorical(y_test_df, num_classes=n_label_columns)
y_test[:10]

array([[0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0.]], dtype=float32)

In [9]:
def _tokenize_sentences(frame):
    """Lower-case and whitespace-split every value of ``frame["sentence"]``.

    Returns a tuple ``(sentence_tokens, sentence_vocabs, vocabulary)``:

    * ``sentence_tokens`` - the token list of each sentence, in row order
    * ``sentence_vocabs`` - the set of distinct tokens of each sentence
    * ``vocabulary``      - the set of distinct tokens over all sentences
    """
    sentence_tokens = []
    sentence_vocabs = []
    vocabulary = set()
    for sentence in frame["sentence"]:
        cleartoken = sentence.lower().split()
        sentence_tokens.append(cleartoken)
        sentence_vocabs.append(set(cleartoken))
        # Grow the vocabulary in place; rebuilding the whole set for every
        # row made this loop quadratic in the corpus size.
        vocabulary.update(cleartoken)
    return sentence_tokens, sentence_vocabs, vocabulary


# Same tokenization for both splits.
train_sentence_list, train_dict_list, train_word_set = _tokenize_sentences(X_train_df)
test_sentence_list, test_dict_list, test_word_set = _tokenize_sentences(X_test_df)
print(test_word_set)

{'leader', 'weeks', 'excellence.', 'inc.', 'fourth', 'job.', 'together', 'billion.', 'track', 'lloyd.', 'ended', 'product', 'optimal', 'degradation,', 'asia-pacific,', 'periods,', 'advertising,', 'what’s', 'costing,', 'regard', 'promoted', 'some?', '71', 'exists', 'holistically', 'relative', 'seasonality,', 'rooted-in-performance', 'honor', 'doug', 'classes.', 'become', 'place.', 'repayment', 'weeks.', 'kernan', 'financing', 'authenticated', 'cetera,', 'announcement', 'swipe', 'an', 'firmly', 'cultivate,', 'not', 'third,', 'economic', 'john.', 'discipline.', 'directly', 'risk', 'timeline.', 'words', 'commitment', 'partial', '3', 'dialogue', 'simplifying', 'thailand', 'strength', 'useful', 'respect', 'closing', 'included', 'you’ve', 'long-term.', 'ed.', 'questions.', 'commercial', 'others', 'area', 'lejuez.', 'mark.', 'much.', 'omar', 'familiarity', 'nine', 'magnified', 'over', 'holistic', 'personality', 'forecasting', 'remain', 'got', 'express', 'happens', 'million', 'areas', 'improved

In [13]:
# Vocabulary-wide lookup tables with every word starting at zero.
train_word_dict = {word: 0 for word in train_word_set}
test_word_dict = {word: 0 for word in test_word_set}

# Swap each sentence's token collection for a zeroed per-word counter,
# keeping the same list objects.
train_dict_list[:] = [dict.fromkeys(tokens, 0) for tokens in train_dict_list]
test_dict_list[:] = [dict.fromkeys(tokens, 0) for tokens in test_dict_list]
print(test_dict_list)

[{'with': 0, 'the': 0, 'in': 0, 'agree': 0, 'drivers': 0, 'terms': 0, 'that': 0, 'you': 0, 'of': 0, 'to': 0, 'you,': 0, 'characterization': 0, 'wanted': 0, 'ask': 0, 'top': 0, 'each': 0, 'so': 0, 'market?': 0, 'do': 0, 'i': 0}, {'2020?': 0, 'play': 0, 'out': 0, 'in': 0, 'will': 0, 'you': 0, 'how': 0, 'fiscal': 0, 'and': 0, 'those': 0, 'think': 0, 'do': 0}, {'yes.': 0}, {'rewards': 0, 'spend-based': 0, 'lot': 0, 'partnership': 0, 'digital': 0, 'in': 0, 'driven': 0, 'china': 0, 'of': 0, 'that': 0, 'a': 0, 'we': 0, 'december.': 0, 'introduced': 0, 'program': 0, 'our': 0, 'with': 0, 'alibaba,': 0}, {'launched': 0, 'starbucks': 0, 'we': 0, 'delivers.': 0}, {'are': 0, 'mobile': 0, 'scenarios.': 0, 'the': 0, 'in': 0, 'look': 0, 'china': 0, 'digital': 0, 'that': 0, 'you': 0, 'numbers': 0, 'at': 0, 'scenarios': 0}, {'anchored': 0, 'our': 0, 'belinda': 0, 'step': 0, 'was': 0, 'leadership': 0, 'digital': 0, 'work': 0, 'that': 0, 'many': 0, 'team': 0, 'so': 0, 'this': 0, 'did': 0, 'in': 0, 'big': 

In [14]:
# for word in word_list:
#     word_dict[word] += 1

#2nd
def _count_tokens(sentences, counters):
    """Store in ``counters[i]`` how often each word occurs in ``sentences[i]``.

    Every counter is rebuilt from zero using the keys (and key order) it
    already has, so running this again gives the same counts instead of
    adding a second round on top of the first.
    """
    for i, tokens in enumerate(sentences):
        fresh_counts = dict.fromkeys(counters[i], 0)
        for word in tokens:
            fresh_counts[word] += 1
        counters[i] = fresh_counts


_count_tokens(train_sentence_list, train_dict_list)
_count_tokens(test_sentence_list, test_dict_list)
print(test_dict_list)

[{'with': 1, 'the': 1, 'in': 2, 'agree': 1, 'drivers': 1, 'terms': 1, 'that': 1, 'you': 1, 'of': 1, 'to': 1, 'you,': 1, 'characterization': 1, 'wanted': 1, 'ask': 1, 'top': 1, 'each': 1, 'so': 1, 'market?': 1, 'do': 1, 'i': 1}, {'2020?': 1, 'play': 1, 'out': 1, 'in': 1, 'will': 1, 'you': 1, 'how': 1, 'fiscal': 1, 'and': 1, 'those': 1, 'think': 1, 'do': 1}, {'yes.': 1}, {'rewards': 1, 'spend-based': 1, 'lot': 1, 'partnership': 1, 'digital': 1, 'in': 1, 'driven': 1, 'china': 1, 'of': 1, 'that': 1, 'a': 2, 'we': 1, 'december.': 1, 'introduced': 1, 'program': 1, 'our': 1, 'with': 2, 'alibaba,': 1}, {'launched': 1, 'starbucks': 1, 'we': 1, 'delivers.': 1}, {'are': 1, 'mobile': 1, 'scenarios.': 1, 'the': 1, 'in': 1, 'look': 1, 'china': 1, 'digital': 1, 'that': 1, 'you': 1, 'numbers': 1, 'at': 1, 'scenarios': 1}, {'anchored': 1, 'our': 1, 'belinda': 1, 'step': 1, 'was': 1, 'leadership': 1, 'digital': 1, 'work': 1, 'that': 1, 'many': 1, 'team': 1, 'so': 1, 'this': 1, 'did': 1, 'in': 2, 'big': 

In [15]:
# tf = {}
# sum_nk = len(word_list)
# for word, count in word_dict.items():
#     tf[word] = count/sum_nk

#2nd
# Two separate accumulators for the per-sentence term-frequency tables.
train_tf_list, test_tf_list = [], []
def compute_tf(wd, l):
    """Return the term frequency of every word in one sentence.

    wd -- mapping of word to its number of occurrences
    l  -- the token list; its length is the divisor

    The result maps each key of ``wd`` to ``count / len(l)``.
    """
    total_tokens = len(l)
    return {term: occurrences / total_tokens for term, occurrences in wd.items()}

# Per-sentence term frequencies, appended in sentence order.
train_tf_list.extend(
    compute_tf(counts, tokens)
    for counts, tokens in zip(train_dict_list, train_sentence_list)
)
test_tf_list.extend(
    compute_tf(counts, tokens)
    for counts, tokens in zip(test_dict_list, test_sentence_list)
)
print(test_tf_list)

[{'with': 0.047619047619047616, 'the': 0.047619047619047616, 'in': 0.09523809523809523, 'agree': 0.047619047619047616, 'drivers': 0.047619047619047616, 'terms': 0.047619047619047616, 'that': 0.047619047619047616, 'you': 0.047619047619047616, 'of': 0.047619047619047616, 'to': 0.047619047619047616, 'you,': 0.047619047619047616, 'characterization': 0.047619047619047616, 'wanted': 0.047619047619047616, 'ask': 0.047619047619047616, 'top': 0.047619047619047616, 'each': 0.047619047619047616, 'so': 0.047619047619047616, 'market?': 0.047619047619047616, 'do': 0.047619047619047616, 'i': 0.047619047619047616}, {'2020?': 0.08333333333333333, 'play': 0.08333333333333333, 'out': 0.08333333333333333, 'in': 0.08333333333333333, 'will': 0.08333333333333333, 'you': 0.08333333333333333, 'how': 0.08333333333333333, 'fiscal': 0.08333333333333333, 'and': 0.08333333333333333, 'those': 0.08333333333333333, 'think': 0.08333333333333333, 'do': 0.08333333333333333}, {'yes.': 1.0}, {'rewards': 0.05, 'spend-based'

In [17]:
# Start one document-frequency table per split, one zero entry per vocabulary word.
train_idf = {word: 0 for word in train_word_dict}
test_idf = {word: 0 for word in test_word_dict}

# for row in range(len(X_train_df)):
#     string_list = X_train_df.loc[row, "sentence"].lower().split()
#     string_set = set(string_list)
#     for word in string_set:
#         idf[word] += 1
            
#2nd
def compute_idf(strings_list, idf):
    """Turn a zero-initialised vocabulary table into inverse document frequencies.

    Parameters
    ----------
    strings_list : list of dict
        One ``{word: count}`` mapping per document (here: per sentence).
    idf : dict
        ``{word: 0}`` table. It is updated in place and also returned.

    Returns
    -------
    dict
        The same ``idf`` object, each value replaced by
        ``math.log(n / document_frequency)``, where ``n`` is the number of
        documents. A word that occurs in no document gets ``0.0`` instead of
        raising ``ZeroDivisionError``.

    Raises
    ------
    KeyError
        If a document contains a word that is missing from ``idf``.
    """
    n = len(strings_list)

    # Document frequency: in how many documents the word has a positive count.
    for word_counts in strings_list:
        for word, count in word_counts.items():
            if count > 0:
                idf[word] += 1

    for word, doc_freq in idf.items():
        # n / 0 is undefined for words no document contains (this also covers
        # an empty corpus); give those words a neutral weight of zero.
        idf[word] = math.log(n / float(doc_freq)) if doc_freq else 0.0
    return idf

# Fill each split's table from that split's own per-sentence counters
# (training split first, then test split).
train_idf, test_idf = (
    compute_idf(train_dict_list, train_idf),
    compute_idf(test_dict_list, test_idf),
)
print(test_idf)

{'leader': 6.214608098422191, 'weeks': 6.907755278982137, 'excellence.': 6.907755278982137, 'inc.': 6.907755278982137, 'fourth': 3.816712825623821, 'job.': 6.907755278982137, 'together': 6.907755278982137, 'billion.': 6.907755278982137, 'track': 6.214608098422191, 'lloyd.': 6.907755278982137, 'ended': 6.214608098422191, 'product': 3.473768074496991, 'optimal': 6.907755278982137, 'degradation,': 6.907755278982137, 'asia-pacific,': 6.907755278982137, 'periods,': 6.907755278982137, 'advertising,': 6.907755278982137, 'what’s': 6.214608098422191, 'costing,': 6.907755278982137, 'regard': 5.809142990314028, 'promoted': 6.907755278982137, 'some?': 6.907755278982137, '71': 6.907755278982137, 'exists': 6.214608098422191, 'holistically': 6.907755278982137, 'relative': 4.422848629194137, 'seasonality,': 6.907755278982137, 'rooted-in-performance': 6.907755278982137, 'honor': 6.907755278982137, 'doug': 6.907755278982137, 'classes.': 6.907755278982137, 'become': 6.907755278982137, 'place.': 6.9077552

In [18]:
#for word, count in idf.items():
    #idf[word] = math.log(n/float(count))
    
#print(idf)

In [19]:
# tf_idf = dict.fromkeys(tf.keys(), 0)
# for word, v in tf.items():
#     tf_idf[word] = v*idf[word]

#2nd
# Two separate accumulators for the per-sentence TF-IDF tables.
train_tf_idf_list, test_tf_idf_list = [], []
def compute_tf_idf(tf, idf):
    """Multiply each term frequency by the matching inverse document frequency.

    tf  -- mapping of word to term frequency for one sentence
    idf -- mapping of word to inverse document frequency; must contain every
           word of ``tf`` (a missing word raises KeyError)

    Returns a new dict with the keys of ``tf`` and the products as values.
    """
    return {term: frequency * idf[term] for term, frequency in tf.items()}

# Weight every sentence's term frequencies with the IDF table of its own split.
# NOTE(review): the test rows are weighted with test_idf (built from the test
# sentences), not with train_idf - confirm that this is intended.
train_tf_idf_list.extend(compute_tf_idf(tf_row, train_idf) for tf_row in train_tf_list)
test_tf_idf_list.extend(compute_tf_idf(tf_row, test_idf) for tf_row in test_tf_list)
print(test_tf_idf_list)

[{'with': 0.09465592157876249, 'the': 0.03455573201264311, 'in': 0.11062400842304608, 'agree': 0.2766258566816204, 'drivers': 0.2362783395203249, 'terms': 0.1737932733790788, 'that': 0.05670131321790072, 'you': 0.09033904689932767, 'of': 0.05151215102865174, 'to': 0.04447360319585123, 'you,': 0.22991970177630006, 'characterization': 0.3289407275705779, 'wanted': 0.2362783395203249, 'ask': 0.22991970177630006, 'top': 0.22991970177630006, 'each': 0.26292671037439264, 'so': 0.11259335698629204, 'market?': 0.3289407275705779, 'do': 0.14265391778828526, 'i': 0.09195340650293155}, {'2020?': 0.48409524919283564, 'play': 0.3925442251371598, 'out': 0.3041382284133879, 'in': 0.09679600737016532, 'will': 0.22650837807964264, 'you': 0.15809333207382342, 'how': 0.25304618900618714, 'fiscal': 0.344597213061863, 'and': 0.06994414089483555, 'those': 0.2979625640672444, 'think': 0.20542533520765172, 'do': 0.24964435612949923}, {'yes.': 5.298317366548036}, {'rewards': 0.2302585092994046, 'spend-based': 

In [20]:
# for row in range(len(X_train_df)):
#     sentence_v = 0
#     string_list = X_train_df.loc[row, "sentence"].lower().split()
#     for word in string_list:
#         sentence_v += tf_idf[word]
#     X_train_df.loc[row, "sentence_weight"] = sentence_v
    
#2nd
# Collapse each sentence to one number: the sum of its words' TF-IDF scores.
# The whole column is assigned at once; the previous per-row .loc writes
# triggered a separate DataFrame update for every sentence.
X_train_df["sentence_weight"] = [sum(scores.values()) for scores in train_tf_idf_list]
X_test_df["sentence_weight"] = [sum(scores.values()) for scores in test_tf_idf_list]
X_test_df.head()

Unnamed: 0,sentence,sentence_weight
0,"So I wanted to ask you, do you agree with that...",3.363606
1,And how do you think those will play out in fi...,3.082795
2,Yes.,5.298317
3,A lot of that driven with our China digital pa...,3.574384
4,We launched Starbucks Delivers.,4.425223


In [21]:
#X_train_df = pd.DataFrame(X_train_df["sentence_weight"])

#2nd
# From here on only the numeric feature is kept; the sentence text is dropped.
X_train_df = X_train_df["sentence_weight"].to_frame()
X_test_df = X_test_df["sentence_weight"].to_frame()

X_test_df.head()

Unnamed: 0,sentence_weight
0,3.363606
1,3.082795
2,5.298317
3,3.574384
4,4.425223


In [22]:
#X_train = X_train_df.values
#X_train.shape

#2nd
# Feature matrices for scaling and training (one column: sentence_weight).
X_train = X_train_df.to_numpy()
X_test = X_test_df.to_numpy()

X_test.shape

(1000, 1)

In [23]:
#X_scaler = StandardScaler().fit(X_train)

#2nd
# Fit the scaler on the training feature only.
X_scaler = StandardScaler()
X_scaler.fit(X_train);

In [24]:
#X_train_scaled = X_scaler.transform(X_train)

#2nd
# Apply the scaler fitted on the training data to both splits.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [68]:
# Layer sizes: one input feature feeding 20 hidden units.
number_inputs = 1
number_hidden_nodes = 20

# Start the network with a single fully connected ReLU hidden layer.
model = Sequential([
    Dense(units=number_hidden_nodes, activation='relu', input_dim=number_inputs),
])

In [69]:
# Softmax output layer with one unit per class.
# NOTE(review): 6 has to equal the number of one-hot label columns - confirm against y_train.
number_classes = 6
output_layer = Dense(activation='softmax', units=number_classes)
model.add(output_layer)

In [70]:
# Print each layer's output shape and parameter count.
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_14 (Dense)             (None, 20)                40        
_________________________________________________________________
dense_15 (Dense)             (None, 6)                 126       
Total params: 166
Trainable params: 166
Non-trainable params: 0
_________________________________________________________________


In [71]:
# Configure training: categorical cross-entropy loss (the labels are one-hot),
# Adam optimizer, and accuracy reported as the metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [72]:
# Train on the scaled training feature for 500 epochs, reshuffling each epoch
# and logging one line per epoch (verbose=2).
model.fit(x=X_train_scaled, y=y_train, epochs=500, shuffle=True, verbose=2)

Train on 2000 samples
Epoch 1/500
2000/2000 - 0s - loss: 1.7290 - accuracy: 0.3450
Epoch 2/500
2000/2000 - 0s - loss: 1.5346 - accuracy: 0.5145
Epoch 3/500
2000/2000 - 0s - loss: 1.4030 - accuracy: 0.5615
Epoch 4/500
2000/2000 - 0s - loss: 1.3130 - accuracy: 0.5940
Epoch 5/500
2000/2000 - 0s - loss: 1.2315 - accuracy: 0.6110
Epoch 6/500
2000/2000 - 0s - loss: 1.1630 - accuracy: 0.6155
Epoch 7/500
2000/2000 - 0s - loss: 1.1189 - accuracy: 0.6155
Epoch 8/500
2000/2000 - 0s - loss: 1.0984 - accuracy: 0.6155
Epoch 9/500
2000/2000 - 0s - loss: 1.0891 - accuracy: 0.6155
Epoch 10/500
2000/2000 - 0s - loss: 1.0844 - accuracy: 0.6155
Epoch 11/500
2000/2000 - 0s - loss: 1.0815 - accuracy: 0.6155
Epoch 12/500
2000/2000 - 0s - loss: 1.0796 - accuracy: 0.6155
Epoch 13/500
2000/2000 - 0s - loss: 1.0780 - accuracy: 0.6155
Epoch 14/500
2000/2000 - 0s - loss: 1.0772 - accuracy: 0.6155
Epoch 15/500
2000/2000 - 0s - loss: 1.0761 - accuracy: 0.6155
Epoch 16/500
2000/2000 - 0s - loss: 1.0752 - accuracy: 0.

Epoch 133/500
2000/2000 - 0s - loss: 1.0667 - accuracy: 0.6155
Epoch 134/500
2000/2000 - 0s - loss: 1.0668 - accuracy: 0.6155
Epoch 135/500
2000/2000 - 0s - loss: 1.0663 - accuracy: 0.6155
Epoch 136/500
2000/2000 - 0s - loss: 1.0665 - accuracy: 0.6155
Epoch 137/500
2000/2000 - 0s - loss: 1.0666 - accuracy: 0.6155
Epoch 138/500
2000/2000 - 0s - loss: 1.0662 - accuracy: 0.6155
Epoch 139/500
2000/2000 - 0s - loss: 1.0666 - accuracy: 0.6155
Epoch 140/500
2000/2000 - 0s - loss: 1.0669 - accuracy: 0.6155
Epoch 141/500
2000/2000 - 0s - loss: 1.0663 - accuracy: 0.6155
Epoch 142/500
2000/2000 - 0s - loss: 1.0662 - accuracy: 0.6155
Epoch 143/500
2000/2000 - 0s - loss: 1.0666 - accuracy: 0.6155
Epoch 144/500
2000/2000 - 0s - loss: 1.0663 - accuracy: 0.6155
Epoch 145/500
2000/2000 - 0s - loss: 1.0661 - accuracy: 0.6155
Epoch 146/500
2000/2000 - 0s - loss: 1.0666 - accuracy: 0.6155
Epoch 147/500
2000/2000 - 0s - loss: 1.0664 - accuracy: 0.6155
Epoch 148/500
2000/2000 - 0s - loss: 1.0666 - accuracy:

Epoch 264/500
2000/2000 - 0s - loss: 1.0655 - accuracy: 0.6155
Epoch 265/500
2000/2000 - 0s - loss: 1.0656 - accuracy: 0.6155
Epoch 266/500
2000/2000 - 0s - loss: 1.0657 - accuracy: 0.6155
Epoch 267/500
2000/2000 - 0s - loss: 1.0654 - accuracy: 0.6155
Epoch 268/500
2000/2000 - 0s - loss: 1.0651 - accuracy: 0.6155
Epoch 269/500
2000/2000 - 0s - loss: 1.0652 - accuracy: 0.6155
Epoch 270/500
2000/2000 - 0s - loss: 1.0653 - accuracy: 0.6155
Epoch 271/500
2000/2000 - 0s - loss: 1.0657 - accuracy: 0.6155
Epoch 272/500
2000/2000 - 0s - loss: 1.0654 - accuracy: 0.6155
Epoch 273/500
2000/2000 - 0s - loss: 1.0652 - accuracy: 0.6155
Epoch 274/500
2000/2000 - 0s - loss: 1.0653 - accuracy: 0.6155
Epoch 275/500
2000/2000 - 0s - loss: 1.0654 - accuracy: 0.6155
Epoch 276/500
2000/2000 - 0s - loss: 1.0655 - accuracy: 0.6155
Epoch 277/500
2000/2000 - 0s - loss: 1.0652 - accuracy: 0.6155
Epoch 278/500
2000/2000 - 0s - loss: 1.0655 - accuracy: 0.6155
Epoch 279/500
2000/2000 - 0s - loss: 1.0651 - accuracy:

2000/2000 - 0s - loss: 1.0645 - accuracy: 0.6155
Epoch 395/500
2000/2000 - 0s - loss: 1.0646 - accuracy: 0.6155
Epoch 396/500
2000/2000 - 0s - loss: 1.0656 - accuracy: 0.6155
Epoch 397/500
2000/2000 - 0s - loss: 1.0649 - accuracy: 0.6155
Epoch 398/500
2000/2000 - 0s - loss: 1.0643 - accuracy: 0.6155
Epoch 399/500
2000/2000 - 0s - loss: 1.0644 - accuracy: 0.6155
Epoch 400/500
2000/2000 - 0s - loss: 1.0649 - accuracy: 0.6155
Epoch 401/500
2000/2000 - 0s - loss: 1.0646 - accuracy: 0.6155
Epoch 402/500
2000/2000 - 0s - loss: 1.0646 - accuracy: 0.6155
Epoch 403/500
2000/2000 - 0s - loss: 1.0646 - accuracy: 0.6155
Epoch 404/500
2000/2000 - 0s - loss: 1.0644 - accuracy: 0.6155
Epoch 405/500
2000/2000 - 0s - loss: 1.0645 - accuracy: 0.6155
Epoch 406/500
2000/2000 - 0s - loss: 1.0652 - accuracy: 0.6155
Epoch 407/500
2000/2000 - 0s - loss: 1.0644 - accuracy: 0.6155
Epoch 408/500
2000/2000 - 0s - loss: 1.0645 - accuracy: 0.6155
Epoch 409/500
2000/2000 - 0s - loss: 1.0650 - accuracy: 0.6155
Epoch 

<tensorflow.python.keras.callbacks.History at 0x2adff921f98>

In [73]:
# Score the trained network on the scaled test split; with the single
# 'accuracy' metric compiled above, evaluate() yields the loss and the accuracy.
model_loss, model_accuracy = model.evaluate(
    x=X_test_scaled,
    y=y_test,
    verbose=0,
)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Loss: 0.9799915122985839, Accuracy: 0.6570000052452087
