

> **libraries for data handling**



In [None]:
import numpy as np
import pandas as pd
import string
import tensorflow as tf
import os 

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
from tensorflow.python.framework.random_seed import set_random_seed

In [3]:
# set seeds for reproducability
from numpy.random import seed
set_random_seed(2)
seed(1)
# keras module for building LSTM 
from keras_preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku



**Loading the Dataset**



In [13]:
! pip install kaggle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [15]:
#!pip install -q kaggle

In [14]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle (5).json


{'kaggle.json': b'{"username":"asrargalt","key":"5ed745b7c3591129b660816a5d292acf"}'}

In [17]:
! cp kaggle.json ~/.kaggle/

In [19]:
! chmod 600 ~/.kaggle/kaggle.json

In [25]:
! kaggle datasets download -d aashita/nyt-comments

nyt-comments.zip: Skipping, found more recently modified local copy (use --force to force download)


In [26]:
! unzip /content/nyt-comments.zip

Archive:  /content/nyt-comments.zip
replace ArticlesApril2017.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: ArticlesApril2017.csv   
  inflating: ArticlesApril2018.csv   
  inflating: ArticlesFeb2017.csv     
  inflating: ArticlesFeb2018.csv     
  inflating: ArticlesJan2017.csv     
  inflating: ArticlesJan2018.csv     
  inflating: ArticlesMarch2017.csv   
  inflating: ArticlesMarch2018.csv   
  inflating: ArticlesMay2017.csv     
  inflating: CommentsApril2017.csv   
  inflating: CommentsApril2018.csv   
  inflating: CommentsFeb2017.csv     
  inflating: CommentsFeb2018.csv     
  inflating: CommentsJan2017.csv     
  inflating: CommentsJan2018.csv     
  inflating: CommentsMarch2017.csv   
  inflating: CommentsMarch2018.csv   
  inflating: CommentsMay2017.csv     


# Read Articals headlines

In [27]:
curr_dir = '/content/'
all_headlines = [] 
for filename in os.listdir(curr_dir):
 if 'Articles' in filename:
    article_df = pd.read_csv(curr_dir + filename) 
    all_headlines.extend(list(article_df.headline.values))
    break 
all_headlines = [h for h in all_headlines if h != "Unknown"] 
len(all_headlines)

829

In [28]:
all_headlines = [line for line in all_headlines if line!= "Unknown"]
print(all_headlines[:10])

['N.F.L. vs. Politics Has Been Battle All Season Long', 'Voice. Vice. Veracity.', 'A Stand-Up’s Downward Slide', 'New York Today: A Groundhog Has Her Day', 'A Swimmer’s Communion With the Ocean', 'Trail Activity', 'Super Bowl', 'Trump’s Mexican Shakedown', 'Pence’s Presidential Pet', 'Fruit of a Poison Tree']


# 1) Cleaning:

In [29]:
def clean_data(headlines):
    headlines = "".join(i for i in headlines if i not in string.punctuation).lower()
    headlines = headlines.encode("utf8").decode("ascii",'ignore')
    return headlines 
data = [clean_data(x) for x in all_headlines]

In [30]:
data[0:10]

['nfl vs politics has been battle all season long',
 'voice vice veracity',
 'a standups downward slide',
 'new york today a groundhog has her day',
 'a swimmers communion with the ocean',
 'trail activity',
 'super bowl',
 'trumps mexican shakedown',
 'pences presidential pet',
 'fruit of a poison tree']

# 2) Generating Sequence of N-Grams Tokens:

In [31]:
from keras.preprocessing.text import Tokenizer

In [32]:
tokenizer = Tokenizer()

In [33]:
def get_sequence_of_tokens(data):
    tokenizer.fit_on_texts(data)
    words = len(tokenizer.word_index)+1
    input_sequences = []
    for line in data:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1,len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences,words

In [34]:
sequences,words = get_sequence_of_tokens(data)

In [35]:
sequences[0:10]

[[660, 117],
 [660, 117, 72],
 [660, 117, 72, 73],
 [660, 117, 72, 73, 661],
 [660, 117, 72, 73, 661, 662],
 [660, 117, 72, 73, 661, 662, 63],
 [660, 117, 72, 73, 661, 662, 63, 29],
 [660, 117, 72, 73, 661, 662, 63, 29, 210],
 [211, 663],
 [211, 663, 664]]

# 3) Padding the Sequences:

In [36]:
from keras_preprocessing.sequence import pad_sequences

In [37]:
max_sequence_len = max([len(x) for x in sequences])
input_sequences = np.array(pad_sequences(sequences,maxlen=max_sequence_len,padding='pre'))

# 4) Creating Predictors and Targets:

In [38]:
predictors,label = input_sequences[:,:-1],input_sequences[:,-1]
label = tf.keras.utils.to_categorical(label,num_classes=words)

In [40]:
input_len = max_sequence_len - 1
model = tf.keras.Sequential()
# ----------Add Input Embedding Layer
model.add(tf.keras.layers.Embedding(words,50,input_length=input_len))
# ----------Add Hidden Layer 1 - LSTM Layer
model.add(tf.keras.layers.LSTM(500))
model.add(Dropout(0.1))
# ----------Add Output Layer
model.add(tf.keras.layers.Dense(words,activation='softmax'))
model.compile(loss='categorical_crossentropy',optimizer='adam')
model = create_model(max_sequence_len, words)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 16, 10)            22880     
                                                                 
 lstm_1 (LSTM)               (None, 100)               44400     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 2288)              231088    
                                                                 
Total params: 298,368
Trainable params: 298,368
Non-trainable params: 0
_________________________________________________________________




> **Training the model**









In [41]:
history = model.fit(predictors,label,epochs=800)

Epoch 1/800
Epoch 2/800
Epoch 3/800
Epoch 4/800
Epoch 5/800
Epoch 6/800
Epoch 7/800
Epoch 8/800
Epoch 9/800
Epoch 10/800
Epoch 11/800
Epoch 12/800
Epoch 13/800
Epoch 14/800
Epoch 15/800
Epoch 16/800
Epoch 17/800
Epoch 18/800
Epoch 19/800
Epoch 20/800
Epoch 21/800
Epoch 22/800
Epoch 23/800
Epoch 24/800
Epoch 25/800
Epoch 26/800
Epoch 27/800
Epoch 28/800
Epoch 29/800
Epoch 30/800
Epoch 31/800
Epoch 32/800
Epoch 33/800
Epoch 34/800
Epoch 35/800
Epoch 36/800
Epoch 37/800
Epoch 38/800
Epoch 39/800
Epoch 40/800
Epoch 41/800
Epoch 42/800
Epoch 43/800
Epoch 44/800
Epoch 45/800
Epoch 46/800
Epoch 47/800
Epoch 48/800
Epoch 49/800
Epoch 50/800
Epoch 51/800
Epoch 52/800
Epoch 53/800
Epoch 54/800
Epoch 55/800
Epoch 56/800
Epoch 57/800
Epoch 58/800
Epoch 59/800
Epoch 60/800
Epoch 61/800
Epoch 62/800
Epoch 63/800
Epoch 64/800
Epoch 65/800
Epoch 66/800
Epoch 67/800
Epoch 68/800
Epoch 69/800
Epoch 70/800
Epoch 71/800
Epoch 72/800
Epoch 73/800
Epoch 74/800
Epoch 75/800
Epoch 76/800
Epoch 77/800
Epoch 78



> **Text Generation (Prediction)**



In [42]:
def generate_text(seed_text,next_words,model,max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list],maxlen=max_sequence_len-1,padding='pre')
        predict_x=model.predict(token_list,verbose=0) 
        classes_x=np.argmax(predict_x,axis=1)
        output_word = ""
        for word,index in tokenizer.word_index.items():
            if index==classes_x:
                output_word = word
                break
        seed_text += " "+ output_word
    return seed_text.title()





> Prediction







In [43]:
print(generate_text("Political",8,model,max_sequence_len))

Political Support Divides Art Therapists The Same Friend Freely


In [44]:
print (generate_text("president trump", 3, model, max_sequence_len))
print (generate_text("united states", 4, model, max_sequence_len))
print (generate_text("donald trump", 2, model, max_sequence_len))
print (generate_text("new york", 3, model, max_sequence_len))
print (generate_text("science and technology", 5, model, max_sequence_len))

President Trump And Staff Rethink
United States Race To Pillars Of
Donald Trump Master Of
New York Today A Goodbye
Science And Technology Rules Of The Game Era
