In [None]:
import numpy as np
import pandas as pd
import os
from warnings import filterwarnings
# Install a global "ignore" filter so no warnings are shown in later cells.
filterwarnings('ignore')
# Let pandas display up to 1000 characters per cell so long headlines are shown in full.
pd.set_option('max_colwidth', 1000)


In [None]:
from google.colab import drive
# Mount at an absolute path so the mount point does not depend on the current
# working directory; the dataset is read from /content/gdrive/... afterwards.
drive.mount('/content/gdrive')

Mounted at gdrive


In [None]:
# Location of the headlines CSV on the mounted Drive.
csv_path = '/content/gdrive/MyDrive/Colab DataSets/abcnews-date-text.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting licence
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [None]:
# Print a summary of the frame: row count, column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1226258 entries, 0 to 1226257
Data columns (total 2 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   publish_date   1226258 non-null  int64 
 1   headline_text  1226258 non-null  object
dtypes: int64(1), object(1)
memory usage: 18.7+ MB


In [None]:
## Keep the 10000 longest headlines (ranked by character count)
# sort_values() returns row *labels*, so the rows must be selected with .loc.
# .iloc only picked the right rows while labels happened to equal positions,
# and raised IndexError if this cell was re-run on the already-trimmed frame.
headline_lengths = df['headline_text'].str.len()
longest_first = headline_lengths.sort_values(ascending = False).index
df = df.loc[longest_first[:10000]]
df.head()

Unnamed: 0,publish_date,headline_text
864170,20140331,tasmanian world heritage delisting breach of international obligations
998246,20151208,berri barmera council gets $3.5 million for multipurpose sports centre
748563,20121213,salvation army determined to deliver on nauru despite tough conditions
831712,20131105,johnson and johnson fined 2.2 billion for faulty drug claims kickbacks
1085421,20170610,daniel morcombe assistant commissioner mike conduct alleged misconduct


In [None]:
# Renumber the rows from 0 and keep only the headline text column.
df = df.reset_index(drop = True)
print(len(df))
df = df.drop(columns = ['publish_date'])
df.head()

10000


Unnamed: 0,headline_text
0,tasmanian world heritage delisting breach of international obligations
1,berri barmera council gets $3.5 million for multipurpose sports centre
2,salvation army determined to deliver on nauru despite tough conditions
3,johnson and johnson fined 2.2 billion for faulty drug claims kickbacks
4,daniel morcombe assistant commissioner mike conduct alleged misconduct


In [None]:
# Materialise the headline column as a plain Python list of strings.
headlines = df["headline_text"].tolist()
print(len(headlines))
headlines[:5]

10000


['tasmanian world heritage delisting breach of international obligations',
 'berri barmera council gets $3.5 million for multipurpose sports centre',
 'salvation army determined to deliver on nauru despite tough conditions',
 'johnson and johnson fined 2.2 billion for faulty drug claims kickbacks',
 'daniel morcombe assistant commissioner mike conduct alleged misconduct']

### Text Cleaning

In [None]:
import string
import warnings
# Ignore FutureWarning specifically (the first cell already installs a blanket 'ignore' filter).
warnings.simplefilter(action='ignore', category=FutureWarning)

def clean_text(headline):
    """Normalise one headline string for tokenisation.

    Removes every character listed in ``string.punctuation``, lower-cases the
    remainder, and drops any character that cannot be represented in ASCII.

    Note that digits on either side of a removed punctuation mark are joined,
    e.g. ``"$3.5"`` becomes ``"35"``.

    Parameters
    ----------
    headline : str
        Raw headline text.

    Returns
    -------
    str
        The cleaned, lower-case, ASCII-only text.
    """
    strip_punctuation = str.maketrans("", "", string.punctuation)
    lowered = headline.translate(strip_punctuation).lower()
    # Round-trip through bytes: non-ASCII bytes are silently discarded on decode.
    ascii_only = lowered.encode("utf8").decode("ascii", "ignore")
    return ascii_only

# Clean every headline; `corpus` keeps the same order as `headlines`.
corpus = list(map(clean_text, headlines))
corpus[:5]

['tasmanian world heritage delisting breach of international obligations',
 'berri barmera council gets 35 million for multipurpose sports centre',
 'salvation army determined to deliver on nauru despite tough conditions',
 'johnson and johnson fined 22 billion for faulty drug claims kickbacks',
 'daniel morcombe assistant commissioner mike conduct alleged misconduct']

In [None]:
# Show the last five cleaned headlines. A negative slice is used because
# len(corpus)-5 turns negative, and selects the wrong items, when the corpus
# holds fewer than five entries.
corpus[-5:]

['fears exploratory drilling will contaminate local water supplies',
 'barge carrying fireworks display catches fire off terrigal beach',
 'australia needs more than proteas thrashing to regain number one',
 'victoria seeks to follow queensland in criminalising bikie gangs',
 'russian tankers fuelled north korea via transfers at sea reports']

### Generating sequence of N-gram tokens

In [None]:
# Every whitespace-separated token in the corpus, in order of appearance (with repeats).
vocab = [token for line in corpus for token in line.split()]

# Distinct tokens only; its size is the vocabulary size.
vocablary = set(vocab)
print(len(vocablary))

15861


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Build the word -> integer index from the cleaned corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
# word2index maps each word to its integer id.
word2index = tokenizer.word_index
# Number of distinct words known to the tokenizer.
len(word2index)

15861

In [None]:
# Spot check: the integer id assigned to the word 'and' (None if the word is unknown).
word2index.get('and')

10

In [None]:
# Forward (word -> id) and reverse (id -> word) lookup tables built from the tokenizer index.
dictionary = dict(word2index)
rev_dictionary = {token_id: token for token, token_id in word2index.items()}

In [None]:
from pickle import dump
# saving the tokenizer
# The context manager guarantees the file is flushed and closed even if dump() raises.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

In [None]:
# Largest token id present in the reverse lookup table.
max(rev_dictionary)

15861

In [None]:
# Encode each cleaned headline as a list of integer token ids (one list per headline).
input_seqences = tokenizer.texts_to_sequences(corpus)
print(len(input_seqences))

10000


In [None]:
# Preview the first five encoded headlines.
input_seqences[:5]

[[97, 66, 495, 7768, 526, 3, 318, 7769],
 [7770, 7771, 113, 417, 2922, 94, 4, 7772, 1160, 136],
 [1015, 373, 2923, 1, 1617, 5, 840, 196, 1246, 1161],
 [473, 10, 473, 1357, 1618, 175, 4, 5381, 129, 137, 7773],
 [452, 2924, 2537, 550, 749, 1247, 90, 1481]]

In [None]:
# Walk-through on the first two sequences only: every prefix line[:cut] is an
# input and the token right after it, line[cut], is the expected output.
input_data = []
output_data = []
for line in input_seqences[:2]:
  print(len(line))
  for cut in range(1, len(line)):
    print(cut)
    input_data.append(line[:cut])
    output_data.append(line[cut])

print(input_data)

8
1
2
3
4
5
6
7
10
1
2
3
4
5
6
7
8
9
[[97], [97, 66], [97, 66, 495], [97, 66, 495, 7768], [97, 66, 495, 7768, 526], [97, 66, 495, 7768, 526, 3], [97, 66, 495, 7768, 526, 3, 318], [7770], [7770, 7771], [7770, 7771, 113], [7770, 7771, 113, 417], [7770, 7771, 113, 417, 2922], [7770, 7771, 113, 417, 2922, 94], [7770, 7771, 113, 417, 2922, 94, 4], [7770, 7771, 113, 417, 2922, 94, 4, 7772], [7770, 7771, 113, 417, 2922, 94, 4, 7772, 1160]]


In [None]:
# Next-token ids collected in the walk-through above, one per input prefix.
output_data

[66,
 495,
 7768,
 526,
 3,
 318,
 7769,
 7771,
 113,
 417,
 2922,
 94,
 4,
 7772,
 1160,
 136]

In [None]:
# For every headline, pair each prefix line[:cut] with the token that follows it.
prefix_next_pairs = [
    (line[:cut], line[cut])
    for line in input_seqences
    for cut in range(1, len(line))
]
input_data = [prefix for prefix, _ in prefix_next_pairs]
target = [next_token for _, next_token in prefix_next_pairs]

print(len(input_data) , len(target))
input_data[:10]

86758 86758


[[97],
 [97, 66],
 [97, 66, 495],
 [97, 66, 495, 7768],
 [97, 66, 495, 7768, 526],
 [97, 66, 495, 7768, 526, 3],
 [97, 66, 495, 7768, 526, 3, 318],
 [7770],
 [7770, 7771],
 [7770, 7771, 113]]

In [None]:
# First ten next-token ids, aligned with the first ten input prefixes.
target[:10]

[66, 495, 7768, 526, 3, 318, 7769, 7771, 113, 417]

In [None]:
# Length of the longest input prefix (0 when there are no prefixes at all).
MAX_LEN = max(map(len, input_data), default=0)
MAX_LEN

14

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [None]:
# Pad every prefix with trailing zeros up to MAX_LEN so the inputs form one 2-D array.
# NOTE(review): the Embedding layers below are created without masking, so with
# post-padding the recurrent layer appears to read the zero padding last;
# pre-padding is the more common choice for next-word prediction -- confirm
# before changing, since generate_seq pads the same way.
input_data = pad_sequences(input_data, maxlen=MAX_LEN, padding="post", truncating="post")
print(len(input_data[0]))
input_data.shape

14


(86758, 14)

In [None]:
from tensorflow.keras.utils import to_categorical


In [None]:
# +1 because tokenizer ids start at 1 and id 0 is what the padding uses.
vocab_size = len(word2index)+1
# One-hot encode the next-token ids: one row per sample, one column per token id.
# NOTE(review): the printed shape (86758, 15862) in float32 is roughly 5 GB of
# RAM; integer targets with a sparse categorical loss would presumably avoid
# this -- that needs matching changes in the model/compile cells.
target = to_categorical(target, num_classes=vocab_size )
print(target.shape)
target

(86758, 15862)


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [None]:
# Sequence length every model input is padded to.
print(MAX_LEN)

14


## Model Building

In [None]:

from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import *


In [None]:
# Sanity check: the one-hot width must equal the vocabulary size used by the Embedding layer.
print(vocab_size , target.shape[1])

15862 15862


In [None]:
## GRU model
gru_model = Sequential()
gru_model.add(Embedding(input_dim=vocab_size, output_dim=300, input_length=MAX_LEN))
gru_model.add(GRU(units=100))
gru_model.add(Dropout(rate=0.1))
gru_model.add(Dense(units=target.shape[1], activation="softmax"))

In [None]:
## LSTM model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=300, input_length=MAX_LEN))
model.add(Bidirectional(LSTM(300)))
model.add(Dropout(rate=0.1))
model.add(Dense(units=target.shape[1], activation="softmax"))

In [None]:
# Categorical cross-entropy matches the one-hot encoded targets; accuracy is tracked per epoch.
gru_model.compile(loss="categorical_crossentropy", optimizer="adam" , metrics=['accuracy'])
# Print the layer-by-layer architecture and parameter counts.
gru_model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 14, 300)           4758600   
_________________________________________________________________
gru (GRU)                    (None, 100)               120600    
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 15862)             1602062   
Total params: 6,481,262
Trainable params: 6,481,262
Non-trainable params: 0
_________________________________________________________________


In [None]:
from time import time
# Wall-clock start of the training run.
t = time()

gru_model.fit(input_data, target, batch_size=32, epochs=20, verbose=1)
# Elapsed training time in minutes, rounded to two decimals.
elapsed_minutes = np.round((time() - t) / 60, decimals = 2)
print("Total time taken to run : {} mins".format(elapsed_minutes))


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Total time taken to run : 46.28 mins


In [None]:
## Saving current model
# Persist the trained GRU model to an HDF5 file in the working directory.
gru_model.save('GRU_model.h5')

## Generating the text


In [None]:
from numpy.random import seed
# Use the same Keras distribution (tensorflow.keras) that built and saved the model.
from tensorflow.keras.models import load_model

# Seed NumPy's global random generator.
seed(1)

# load the model
# The result is bound to its own name: binding it to `GRU` would shadow the
# GRU layer class brought in by `from tensorflow.keras.layers import *`, which
# breaks the model-building cell if it is run again.
loaded_gru_model = load_model('GRU_model.h5')


In [None]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
	result = list()
	in_text = seed_text
	# generate a fixed number of words
	for _ in range(n_words):
		# encode the text as integer
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		# truncate sequences to a fixed length
		encoded = pad_sequences([encoded], maxlen=seq_length, padding='post')
		# predict probabilities for each word
		yhat = model.predict_classes(encoded, verbose=0)
		# map predicted word index to word
		out_word = ''
		for word, index in tokenizer.word_index.items():
			if index == yhat:
				out_word = word
				break
		# append to input
		in_text += ' ' + out_word
		result.append(out_word)
	return (' '.join(result) , in_text)

In [None]:
# Seed: a headline from the training data with its last two words left off.
text1 = "salvation army determined to deliver on nauru despite"
# Ask the in-memory GRU model for the next two words.
predicted , total_text = generate_seq(gru_model, tokenizer , MAX_LEN , text1 , 2)


In [None]:
# Seed text followed by the generated words.
print(total_text)

salvation army determined to deliver on nauru despite crisis boss


In [None]:
# Only the newly generated words.
print(predicted)

crisis boss


### The model needs more data and more training epochs to produce good predictions. However, the training run above already used nearly all of the RAM available on Colab, even with GPU support.