In [1]:
# Fix the state of NumPy's global random number generator so that NumPy-driven
# randomness is repeatable from one run of the notebook to the next.
from numpy.random import seed

RANDOM_SEED = 4
seed(RANDOM_SEED)

In [2]:
import os
import math
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

In [3]:
# Read the two Excel workbooks of labelled sentences: one used for training,
# one kept as separate test data. Paths are relative to the working directory.
training_data_path = os.path.join('sentimentsamp.xls')
test_data_path = os.path.join('sentimentsamp2.xls')

alltrainingdata_df, alltestdata_df = (
    pd.read_excel(workbook_path)
    for workbook_path in (training_data_path, test_data_path)
)

# Preview the first few training rows.
alltrainingdata_df.head()

Unnamed: 0,id,sentiment,sentence,ticker,call_title,speaker,call_section
0,1,3,All participants will be in a listen-only mode.,CMG,"Chipotle Mexican Grill, Inc. (NYSE:CMG) Q3 201...",Operator,operator_instruction
1,3,3,[Operator Instructions] Please note this event...,CMG,"Chipotle Mexican Grill, Inc. (NYSE:CMG) Q3 201...",Operator,operator_instruction
2,4,3,I would now like to turn the conference over t...,CMG,"Chipotle Mexican Grill, Inc. (NYSE:CMG) Q3 201...",Operator,operator_instruction
3,5,3,Please go ahead.,CMG,"Chipotle Mexican Grill, Inc. (NYSE:CMG) Q3 201...",Operator,operator_instruction
4,6,3,"Hello, everyone, and welcome to our third qua...",CMG,"Chipotle Mexican Grill, Inc. (NYSE:CMG) Q3 201...",Ashish Kohli,presentation


In [4]:
# Isolate the sentiment column of each workbook as a single-column frame.
y_train_df = alltrainingdata_df['sentiment'].to_frame()
y_test_df = alltestdata_df['sentiment'].to_frame()

# One-hot encode the integer sentiment labels.
# NOTE(review): no num_classes is passed, so each matrix is sized from its own
# labels; the two widths only agree if both sets share the same largest label.
y_train_categorical, y_test_categorical = (
    to_categorical(label_frame) for label_frame in (y_train_df, y_test_df)
)

# Show the first ten encoded training labels.
y_train_categorical[:10]

array([[0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.]], dtype=float32)

In [5]:
# Keep the raw training sentences as a single-column frame for inspection.
# The equivalent extraction for the test workbook is left commented out.
X_train_df = alltrainingdata_df['sentence'].to_frame()
#X_test_df = pd.DataFrame(alltestdata_df['sentence'])

X_train_df.head()

Unnamed: 0,sentence
0,All participants will be in a listen-only mode.
1,[Operator Instructions] Please note this event...
2,I would now like to turn the conference over t...
3,Please go ahead.
4,"Hello, everyone, and welcome to our third qua..."


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from every training sentence and build the
# sparse document-term matrix (one row per sentence, one column per term).
# NOTE(review): the vectorizer is fitted before the train/test split performed in a
# later cell, so rows that end up in the evaluation split also influence the
# vocabulary and IDF weights.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(alltrainingdata_df['sentence'])

# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever one this installation provides so the
# cell runs on both old and current versions.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    vocabulary_terms = tfidf_vectorizer.get_feature_names_out()
else:
    vocabulary_terms = tfidf_vectorizer.get_feature_names()

# Inspect the weights of the first sentence only, as a term-indexed column.
# toarray() yields a plain ndarray rather than the np.matrix that todense() returns.
first_vector_tfidfvectorizer = tfidf_vectorizer_vectors[0]
df = pd.DataFrame(first_vector_tfidfvectorizer.T.toarray(), index=vocabulary_terms, columns=["tfidf"])

# Highest-weighted terms first; this only affects the display, df itself stays unsorted.
df.sort_values(by=['tfidf'], ascending=False)

Unnamed: 0,tfidf
mode,0.482935
participants,0.482935
listen,0.460968
only,0.352267
all,0.260886
will,0.248983
be,0.215851
in,0.142546
piper,0.000000
pipeline,0.000000


In [7]:
from sklearn import preprocessing

# A single LabelEncoder is fitted twice in a row, so once this cell finishes `le`
# only holds the mapping learned from the sentiment column.
# NOTE(review): the first fit encodes the continuous TF-IDF weights of one sentence
# as categories; confirm that is intended.
le = preprocessing.LabelEncoder()

first_sentence_weights = df['tfidf']
feature_encoded = le.fit_transform(first_sentence_weights)

sentiment_labels = alltrainingdata_df['sentiment']
label_encoded = le.fit_transform(sentiment_labels)

print(label_encoded, feature_encoded.shape)

[2 2 2 ... 2 2 2] (4635,)


In [8]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the vectorised training workbook for evaluation. Both splits come
# from the training workbook's TF-IDF matrix and one-hot labels; the fixed
# random_state keeps the partition repeatable.
split_parts = train_test_split(
    tfidf_vectorizer_vectors,
    y_train_categorical,
    test_size=0.25,
    random_state=109,
)
X_train, X_test, y_train, y_test = split_parts

In [9]:
# Densify the sparse TF-IDF splits; the Keras model further down is fitted and
# evaluated on these dense arrays.
X_array, X_test_array = (
    sparse_split.toarray() for sparse_split in (X_train, X_test)
)

In [10]:
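# Disabled Gaussian Naive Bayes baseline, kept for reference.
# NOTE(review): y_train is one-hot encoded at this point, whereas GaussianNB.fit
# expects a 1-D label vector; pass e.g. y_train.argmax(axis=1) if this is re-enabled.
# from sklearn.naive_bayes import GaussianNB

# gnb = GaussianNB()

# gnb.fit(X_array, y_train)

# y_pred = gnb.predict(X_test_array)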

In [11]:
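# Disabled accuracy check for the Naive Bayes baseline.
# NOTE(review): accuracy_score needs both arguments in the same label format; y_test
# is one-hot here, so convert it (e.g. y_test.argmax(axis=1)) to match 1-D predictions.
# from sklearn import metrics

# print(metrics.accuracy_score(y_test, y_pred))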

In [12]:
# Build a small feed-forward classifier:
# TF-IDF features -> one hidden ReLU layer -> softmax over the one-hot label columns.

# The NumPy seed set at the top of the notebook does not seed TensorFlow's own
# random generator, so seed it here (same value) before any weights are created.
tf.random.set_seed(4)

model = Sequential()

# Take the input width from the training matrix instead of hard-coding the
# vocabulary size, which changes whenever the corpus or vectorizer settings change.
number_inputs = X_array.shape[1]
number_hidden_nodes = 3
model.add(Dense(units=number_hidden_nodes, activation='relu', input_dim=number_inputs))

# Likewise, size the output layer from the one-hot label matrix so it always
# matches the targets passed to fit().
number_classes = y_train.shape[1]
model.add(Dense(units=number_classes, activation='softmax'))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 3)                 13908     
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 24        
Total params: 13,932
Trainable params: 13,932
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Configure training: Adam optimiser, cross-entropy over the one-hot targets,
# and accuracy reported alongside the loss.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train on the dense training split for 20 epochs, reshuffling every epoch and
# logging one line per epoch.
fit_options = dict(epochs=20, shuffle=True, verbose=2)
model.fit(X_array, y_train, **fit_options)

Train on 2250 samples
Epoch 1/20
2250/2250 - 1s - loss: 1.7023 - accuracy: 0.5556
Epoch 2/20
2250/2250 - 0s - loss: 1.5032 - accuracy: 0.6187
Epoch 3/20
2250/2250 - 0s - loss: 1.3327 - accuracy: 0.6187
Epoch 4/20
2250/2250 - 0s - loss: 1.2113 - accuracy: 0.6187
Epoch 5/20
2250/2250 - 0s - loss: 1.1273 - accuracy: 0.6187
Epoch 6/20
2250/2250 - 0s - loss: 1.0633 - accuracy: 0.6187
Epoch 7/20
2250/2250 - 0s - loss: 1.0072 - accuracy: 0.6204
Epoch 8/20
2250/2250 - 0s - loss: 0.9550 - accuracy: 0.6391
Epoch 9/20
2250/2250 - 0s - loss: 0.9059 - accuracy: 0.6684
Epoch 10/20
2250/2250 - 0s - loss: 0.8605 - accuracy: 0.6991
Epoch 11/20
2250/2250 - 0s - loss: 0.8186 - accuracy: 0.7271
Epoch 12/20
2250/2250 - 0s - loss: 0.7800 - accuracy: 0.7427
Epoch 13/20
2250/2250 - 0s - loss: 0.7447 - accuracy: 0.7600
Epoch 14/20
2250/2250 - 0s - loss: 0.7122 - accuracy: 0.7729
Epoch 15/20
2250/2250 - 0s - loss: 0.6824 - accuracy: 0.7822
Epoch 16/20
2250/2250 - 0s - loss: 0.6548 - accuracy: 0.7871
Epoch 17/20

<tensorflow.python.keras.callbacks.History at 0x1f78cee94a8>

In [14]:
# Score the trained network on the held-out 25% split; evaluate() returns the loss
# followed by the metric requested at compile time (accuracy).
evaluation_scores = model.evaluate(X_test_array, y_test, verbose=2)
model_loss, model_accuracy = evaluation_scores
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

750/1 - 0s - loss: 0.6357 - accuracy: 0.6933
Loss: 0.8454714953104655, Accuracy: 0.6933333277702332
