In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('trainingData.csv', error_bad_lines=False)

In [3]:
df.head()

Unnamed: 0,title,Sentiment
0,Is 3M (MMM) A Good Stock To Buy Now?,1
1,3 Dividend Stocks That Should Pay You the Rest...,1
2,Starbucks and Disney Are Stock Stalwarts That ...,1
3,The 10 Most Reliable Value Stocks to Buy for 2021,1
4,Is GE Stock a Buy?,1


In [5]:
train_x = np.array(df['title'].values)
train_y = np.array(df['Sentiment'].values)

In [6]:
import numpy as np

import tensorflow_datasets as tfds
import tensorflow as tf

tfds.disable_progress_bar()
dataset, info = tfds.load('imdb_reviews', with_info=True,
                          as_supervised=True)

In [7]:
import matplotlib.pyplot as plt

def plot_graphs(history, metric):
  plt.plot(history.history[metric])
  plt.plot(history.history['val_'+metric], '')
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, 'val_'+metric])

In [8]:
dataset, info = tfds.load('imdb_reviews', with_info=True,
                          as_supervised=True)
train_dataset, test_dataset = dataset['train'], dataset['test']

print(train_dataset.element_spec)
# print(test_dataset.element_spec)

(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))


In [9]:
print(train_dataset)

<PrefetchDataset shapes: ((), ()), types: (tf.string, tf.int64)>


In [10]:
df.dtypes

title        object
Sentiment     int64
dtype: object

In [11]:
df['title'] = df['title'].astype(str)
# split = 8000
train_dataset = tf.data.Dataset.from_tensor_slices(
    ((df['title']),
    (df['Sentiment']))
)

# test_dataset = tf.data.Dataset.from_tensor_slices(
#     ((df['title'][split:]),
#     (df['Sentiment'][split:]))
# )

In [12]:
print(train_dataset)
# print(test_dataset)

<TensorSliceDataset shapes: ((), ()), types: (tf.string, tf.int64)>


In [13]:
print(train_dataset.element_spec)
# print(test_dataset.element_spec)

(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))


In [14]:
for example, label in train_dataset.take(1):
  print('text: ', example.numpy())
  print('label: ', label.numpy())

text:  b'Is 3M (MMM) A Good Stock To Buy Now?'
label:  1


In [15]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64

train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)
# test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)

for example, label in train_dataset.take(1):
  print('texts: ', example.numpy()[:3])
  print()
  print('labels: ', label.numpy()[:3])

texts:  [b'Is Corteva Inc (CTVA) Stock Near the Top of the Agricultural Inputs Industry?'
 b'Cramer Gives His Opinion On Barrick Gold, Las Vegas Sands And More'
 b'FMC Corp (FMC) Stock Increases 3.25% This Week; Should You Buy?']

labels:  [ 1 -1  1]


In [16]:
VOCAB_SIZE=5000
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(train_dataset.map(lambda text, label: text))

In [17]:
vocab = np.array(encoder.get_vocabulary())
vocab[:20]

array(['', '[UNK]', 'stock', 'to', 'the', 'is', 'a', 'stocks', 'buy',
       'for', 'in', 'and', 'inc', 'of', 'earnings', 'now', 'why', 'on',
       'good', 'you'], dtype='<U20')

In [18]:
encoded_example = encoder(example)[:3].numpy()
encoded_example

array([[   5,  906,   12, 1356,    2,  333,    4,   31,   13,    4, 4653,
           1,  138,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0],
       [ 153, 2072, 1083, 2591,   17,    1,  897, 2297, 1937, 3032,   11,
          56,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0],
       [ 536,   75,  536,    2, 1320, 3524,   26,   90,   20,   19,    8,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0]], dtype=int64)

In [19]:
for n in range(3):
  print("Original: ", example[n].numpy())
  print("Round-trip: ", " ".join(vocab[encoded_example[n]]))
  print()

Original:  b'Is Corteva Inc (CTVA) Stock Near the Top of the Agricultural Inputs Industry?'
Round-trip:  is corteva inc ctva stock near the top of the agricultural [UNK] industry             

Original:  b'Cramer Gives His Opinion On Barrick Gold, Las Vegas Sands And More'
Round-trip:  cramer gives his opinion on [UNK] gold las vegas sands and more              

Original:  b'FMC Corp (FMC) Stock Increases 3.25% This Week; Should You Buy?'
Round-trip:  fmc corp fmc stock increases 325 this week should you buy               



In [20]:
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(len(encoder.get_vocabulary()), 64, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,  return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1)
])

In [21]:
print([layer.supports_masking for layer in model.layers])

[False, True, True, True, True, True, True]


In [22]:
sample_text = ('It is a Bad stock')
predictions = model.predict(np.array([sample_text]))
print(predictions[0])

[-0.01273567]


In [23]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [24]:
history = model.fit(train_dataset, epochs=128)

Epoch 1/128
Epoch 2/128
Epoch 3/128
Epoch 4/128
Epoch 5/128
Epoch 6/128
Epoch 7/128
Epoch 8/128
Epoch 9/128
Epoch 10/128
Epoch 11/128
Epoch 12/128
Epoch 13/128
Epoch 14/128
Epoch 15/128
Epoch 16/128
Epoch 17/128
Epoch 18/128
Epoch 19/128
Epoch 20/128
Epoch 21/128
Epoch 22/128
Epoch 23/128
Epoch 24/128
Epoch 25/128
Epoch 26/128
Epoch 27/128
Epoch 28/128
Epoch 29/128
Epoch 30/128
Epoch 31/128
Epoch 32/128
Epoch 33/128
Epoch 34/128
Epoch 35/128
Epoch 36/128
Epoch 37/128
Epoch 38/128
Epoch 39/128
Epoch 40/128
Epoch 41/128
Epoch 42/128
Epoch 43/128
Epoch 44/128
Epoch 45/128
Epoch 46/128
Epoch 47/128
Epoch 48/128
Epoch 49/128
Epoch 50/128
Epoch 51/128
Epoch 52/128
Epoch 53/128
Epoch 54/128
Epoch 55/128
Epoch 56/128
Epoch 57/128
Epoch 58/128
Epoch 59/128
Epoch 60/128
Epoch 61/128
Epoch 62/128
Epoch 63/128
Epoch 64/128
Epoch 65/128
Epoch 66/128
Epoch 67/128
Epoch 68/128
Epoch 69/128
Epoch 70/128
Epoch 71/128
Epoch 72/128
Epoch 73/128
Epoch 74/128
Epoch 75/128
Epoch 76/128
Epoch 77/128
Epoch 78

In [25]:
# test_loss, test_acc = model.evaluate(test_dataset)

# print('Test Loss: {}'.format(test_loss))
# print('Test Accuracy: {}'.format(test_acc))

In [26]:
# plt.figure(figsize=(16,8))
# plt.subplot(1,2,1)
# plot_graphs(history, 'accuracy')
# plt.ylim(None,1)
# plt.subplot(1,2,2)
# plot_graphs(history, 'loss')
# plt.ylim(0,None)

In [27]:
sample_text = ('Bad Buy')
predictions = model.predict(np.array([sample_text]))

In [28]:
print(predictions)

[[-8766.751]]


In [32]:
model.save('model')

INFO:tensorflow:Assets written to: model\assets
INFO:tensorflow:Assets written to: model\assets
