In [9]:
import pandas as pd
import numpy as np

import keras as keras
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Conv1D, Flatten
from tensorflow.keras.layers import AveragePooling1D, MaxPool1D, GlobalMaxPool1D, AveragePooling1D

import pickle
from keras_pickle_wrapper import KerasPickleWrapper

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [41]:
news1 = pd.read_csv('./djia_news/djia_news copy.csv')
news2 = pd.read_csv('./nasdaq/nasdaq.csv')

combined_news = news1.append(news2)
combined_news = combined_news[['Label', 'Headline']]
len(combined_news)

15562

In [61]:
#This is important for splitting a one column classification to an n-column classification
y = pd.get_dummies(combined_news['Label'])
y.head()

Unnamed: 0,0,1,2
0,1,0,0
1,0,1,0
2,1,0,0
3,0,1,0
4,0,1,0


In [30]:
train_text, test_text, train_labels, test_labels = train_test_split(combined_news['Headline'].to_numpy(), y.to_numpy(), test_size = 0.2, random_state = 42)

In [54]:
print(len(test_text),len(test_labels))

3113 3113


In [45]:
combined_news.head()

Unnamed: 0,Label,Headline
0,0,Employer who stole nearly $3M in wages from 15...
1,1,Huge new Facebook data leak exposed intimate d...
2,0,A campaign has accelerated to turn a disused r...
3,1,Google launches global human trafficking helpl...
4,1,Over 3m Saudi Women Don’t Have ID Cards; Saudi...


In [46]:
type(train_text)

numpy.ndarray

In [47]:
vocab_size = 10000
emb_size = 128
max_length = 20000
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<oov>'

In [55]:
tokenizer = Tokenizer(num_words = vocab_size)
tokenizer.fit_on_texts(train_text)

train_sequences =  tokenizer.texts_to_sequences(train_text)
train_padded = pad_sequences(train_sequences, maxlen = max_length, padding = padding_type, truncating = trunc_type)

test_sequences = tokenizer.texts_to_sequences(test_text)
test_padded = pad_sequences(test_sequences, maxlen = max_length, padding = padding_type, truncating = trunc_type)

In [56]:
train_padded.shape[1]

20000

In [63]:
model = Sequential()
model.add(Embedding(vocab_size, emb_size, input_length = max_length))
model.add(Bidirectional(LSTM(32, return_sequences = True)))
model.add(GlobalMaxPool1D())
model.add(Dense(8, activation = 'relu'))
model.add(Dense(3,activation = 'softmax'))
          
#model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 20000, 128)        1280000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 20000, 64)         41216     
_________________________________________________________________
global_max_pooling1d_6 (Glob (None, 64)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 8)                 520       
_________________________________________________________________
dense_11 (Dense)             (None, 3)                 27        
Total params: 1,321,763
Trainable params: 1,321,763
Non-trainable params: 0
_________________________________________________________________


In [64]:
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [None]:
history = model.fit(train_padded,
                    train_labels,
                    epochs = 20,
                    validation_data = (test_padded, test_labels))

Train on 12449 samples, validate on 3113 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20