In [89]:
import gensim
#import fasttext as ft
import re
import itertools
from keras.layers.core import Activation, Dense, Dropout, SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.layers.wrappers import TimeDistributed
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import collections 
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import time
import sys
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint
# Wall-clock timestamp taken when the notebook starts; the final cell subtracts
# it from `end` to report the total run time.
start = time.time()

In [90]:
# Load the full corpus, then work on an independent copy of its first 100 rows.
# The explicit .copy() matters: a later cell assigns to `articles.content`, and
# assigning into a bare `iloc` slice triggers pandas' SettingWithCopyWarning.
articles1 = pd.read_csv('articles_with_topics.csv')
articles = articles1.iloc[:100, :].copy()

In [91]:
# Preview the first rows of the working sample.
articles.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content,topics
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...,"['republican', 'administration', 'health', 'su..."
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood...","['police', 'band', 'arrest', 'crime', 'percent..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri...","['1942', 'wong', 'artistic', 'miserable', 'chi..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t...","['george', 'led', 'death', '“the', 'were', 'zs..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ...","['ballistic', 'test', 'korea', 'missile', '1',..."


In [92]:
# Gather the values of the `content` column into a plain Python list.
dictionary = list(articles.content)
def joinStrings(dictionary):
    """Return the items of `dictionary` concatenated with single spaces.

    Despite its name, `dictionary` is any iterable of strings, not a dict.
    """
    separator = ' '
    return separator.join(dictionary)

# Merge the per-article texts into one space-separated corpus string.
# From here on `dictionary` is a str, not a list.
dictionary = joinStrings(dictionary)

In [93]:
# Lower-case the whole corpus before building the character vocabulary.
dictionary = dictionary.lower()
# Sorted list of distinct characters; a character's position is its integer code.
chars = sorted(set(dictionary))
char_to_int = {symbol: code for code, symbol in enumerate(chars)}

In [94]:
# Corpus length in characters and number of distinct characters.
n_chars, n_vocab = len(dictionary), len(chars)
print("Total characters:", n_chars)
print("Total Vocab:", n_vocab)

Total characters: 743978
Total Vocab: 74


In [95]:
seq_length = 100
# Encode the corpus once up front rather than re-encoding each character for
# every window that contains it.
encoded = [char_to_int[char] for char in dictionary]
window_starts = range(0, n_chars - seq_length)
# dataX[i] holds the seq_length codes starting at position i;
# dataY[i] is the code of the character that immediately follows that window.
dataX = [encoded[pos:pos + seq_length] for pos in window_starts]
dataY = [encoded[pos + seq_length] for pos in window_starts]
n_patterns = len(dataX)

In [96]:
# Arrange the encoded windows as (n_patterns, seq_length, 1) and normalize the
# integer codes by the vocabulary size.
X = np.reshape(dataX, (n_patterns, seq_length, 1)) / float(n_vocab)
# One-hot encode the next-character targets.
y = np_utils.to_categorical(dataY)

# Ordered 75/25 split: the leading three quarters train, the rest validates.
train_fraction = 0.75
x_cut = int(train_fraction * len(X))
y_cut = int(train_fraction * len(y))
X_train, X_val = X[:x_cut], X[x_cut:]
y_train, y_val = y[:y_cut], y[y_cut:]

In [97]:
# Shape check: rows are validation samples, columns are the one-hot classes
# produced by to_categorical.
y_val.shape

(185970, 74)

In [98]:
# Two stacked 256-unit LSTM layers, each followed by 20% dropout, feeding a
# softmax layer with one output per one-hot target class.
model = Sequential([
    # return_sequences=True passes the full output sequence to the second LSTM.
    LSTM(256, input_shape=X.shape[1:], return_sequences=True),
    Dropout(0.2),
    LSTM(256),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Checkpoint file-name pattern; the epoch number is substituted into it.
# (A variant that also embeds the loss: "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5")
filepath = "weights-improvement-{epoch:02d}.hdf5"
# Save weights only after epochs where the monitored training loss hits a new minimum.
# NOTE: a later cell loads "weights-improvement-10.hdf5", so that file is only
# present if epoch 10 improved on every earlier epoch.
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [None]:
# Fit for 10 epochs, checkpointing through `callbacks_list`, and time the run.
start_training = time.time()
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=128,
    callbacks=callbacks_list,
)
finish_training = time.time()
training_seconds = finish_training - start_training
print('Training time = ', training_seconds)

Train on 557908 samples, validate on 185970 samples
Epoch 1/10
  6144/557908 [..............................] - ETA: 50:04 - loss: 3.2262 - acc: 0.1460

In [None]:
# Restore the weights saved after the tenth epoch, then compile again with the
# same loss / optimizer / metric settings used when the model was built.
filename = "weights-improvement-10.hdf5"
model.load_weights(filename)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# Reverse lookup: integer code -> character (inverse of char_to_int).
int_to_char = dict(enumerate(chars))

In [None]:
# Pick a random training window as the seed text. A dedicated name is used so
# the notebook-level `start` timestamp (read by the final timing cell) is not
# overwritten. randint's upper bound is exclusive, so len(dataX) lets every
# window, including the last, be chosen.
seed_index = np.random.randint(0, len(dataX))
# Work on a copy: the loop appends to `pattern`, which must not modify dataX.
pattern = list(dataX[seed_index])
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# generate characters
for step in range(1000):
    # Same preprocessing as training: (1, seq_length, 1), scaled by the vocabulary size.
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    # Greedy decoding: always take the most probable next character.
    index = np.argmax(prediction)
    result = int_to_char[index]
    sys.stdout.write(result)
    # Slide the window: append the predicted code and drop the oldest one.
    pattern.append(index)
    pattern = pattern[1:]

In [None]:
def cleaner(x):
    """Tokenise `x` (coerced to str) with NLTK and return the lower-cased tokens.

    Stop words are kept; the stop-word filter is currently disabled.
    """
    return [token.lower() for token in nltk.word_tokenize(str(x))]

# Replace each article's raw text with its list of lower-cased tokens.
articles.content = articles.content.map(cleaner)

In [None]:
# Flatten the tokenised articles into one token stream (`word`) and collect the
# distinct tokens in first-seen order (`vocab`).
word = []
vocab = []
seen = set()  # constant-time membership test; scanning the `vocab` list per token is quadratic
for tokens in articles.content:
    for token in tokens:
        word.append(token)
        if token not in seen:
            seen.add(token)
            vocab.append(token)
# NOTE: rebinds the global seq_length (100 in the character-level cells).
seq_length = 1
print('Data length: {} words'.format(len(word)))
print('Vocabulary size: {} words'.format(len(vocab)))

# Index <-> token lookup tables over the vocabulary.
ix_to_word = dict(enumerate(vocab))
word_to_ix = {token: ix for ix, token in enumerate(vocab)}

In [None]:
# Ask the user for whitespace-separated keywords (or a whole sentence).
keywords = input("Enter some keywords or sentences. In case you're entering keywords, don't use comma separation.")

In [None]:
# Keep only the entered words that occur in the corpus vocabulary.
# A new list is built on purpose: calling keys.remove() while iterating over
# `keys` skips the element after each removal, so unknown words could survive.
# Keywords are lower-cased first because every vocabulary token is lower-case.
keys = [key for key in keywords.lower().split() if key in word_to_ix]

In [None]:
# For each recognised keyword, generate 1000 characters seeded with text taken
# from the articles whose topics mention that keyword.
article = []  # defined up front so the printing cell also works when nothing is generated
if not keys:  # `keys` is a list: test for emptiness (`keys is None` can never be true)
    print('Error! No keyword recognized!')
else:
    seq_length = 100
    for keyword in keys:
        # Collect the text of every article whose topics mention the keyword.
        # NOTE(review): `topics` looks like the string form of a list in the
        # data preview, which would make this a substring test - confirm.
        matching_texts = []
        for topics, content in zip(articles.topics, articles.content):
            if keyword in topics:
                # `content` is a token list once the tokenising cell has run;
                # raw strings are accepted as well.
                matching_texts.append(content if isinstance(content, str) else ' '.join(content))
        keyword_text = ' '.join(matching_texts).lower()
        # Encode with the training vocabulary, skipping characters it does not contain.
        encoded = [char_to_int[char] for char in keyword_text if char in char_to_int]
        if len(encoded) <= seq_length:
            print('Not enough text to seed generation for keyword:', keyword)
            continue
        # Random seq_length-long window of the keyword text (randint's upper
        # bound is exclusive). Named `seed_index` so the notebook-level `start`
        # timestamp used by the final timing cell is left untouched.
        seed_index = np.random.randint(0, len(encoded) - seq_length)
        pattern = encoded[seed_index:seed_index + seq_length]
        article.append(''.join([int_to_char[value] for value in pattern]))
        # generate characters
        for step in range(1000):
            x = np.reshape(pattern, (1, len(pattern), 1))
            x = x / float(n_vocab)
            prediction = model.predict(x, verbose=0)
            # Greedy decoding: always take the most probable next character.
            index = np.argmax(prediction)
            result = int_to_char[index]
            article.append(result)
            sys.stdout.write(result)
            # Slide the window: append the predicted code, drop the oldest.
            pattern.append(index)
            pattern = pattern[1:]

In [None]:
# Stitch the seed text and generated characters together and show the result.
generated_text = ''.join(article)
print('Generated article:', generated_text)

In [None]:
# Training curves: accuracy on top, loss below.
# Depending on the Keras release, accuracy is recorded as "acc"/"val_acc" or as
# "accuracy"/"val_accuracy"; use whichever key this run's history contains.
acc_key = "acc" if "acc" in history.history else "accuracy"

fig, (ax_acc, ax_loss) = plt.subplots(2, 1)

ax_acc.set_title("Accuracy")
ax_acc.plot(history.history[acc_key], color="g", label="Train")
ax_acc.plot(history.history["val_" + acc_key], color="b", label="Validation")
ax_acc.set_xlabel("Epoch")
ax_acc.legend(loc="best")

ax_loss.set_title("Loss")
ax_loss.plot(history.history["loss"], color="g", label="Train")
ax_loss.plot(history.history["val_loss"], color="b", label="Validation")
ax_loss.set_xlabel("Epoch")
ax_loss.legend(loc="best")

fig.tight_layout()
plt.show()

In [None]:
# Total notebook run time. This relies on `start` still holding the
# time.time() value from the first cell; any cell that rebinds `start`
# invalidates the figure.
end = time.time()
elapsed = end - start
print(elapsed, 'seconds')