In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'nyt-comments:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F19447%2F31436%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240402%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240402T053538Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Ddfbd7eae7df5c94269875e97db169d655a8227000fdd0a8ec13626b1142e4ae19b3eca610946b9097c68d7111195ff02f3ac857a2fabb47eb85e589a18aced2cf4618566d214b004fe277e3f02d1ce58086de8f2690240ac2131b906f00e455a42869d5ff93c129945dc23fae1929680de75ef9797c4bb56ffce043ee1c5a4f75586f04c5e0cbe3d858377a5543688437ba050bad8155c657ddef4d9f530d620dc467c2f9887cf06f54aceff92ccac968016ef23bab2af9b8f34e95c8df91c057d65915f3325b8dc6c537107a506abcb8bf049f4a4fae5657c1d610ad177ae86ba99bc534051c6ed6665b0c330c1d261043c40ff18df423b24ccf43210b75ed0'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading nyt-comments, 502973613 bytes compressed
Downloaded and uncompressed: nyt-comments
Data source import complete.


In [2]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

In [3]:
curr_dir = '/kaggle/input/nyt-comments/'
all_headlines = []
for filename in os.listdir(curr_dir):
    if 'Articles' in filename:
        article_df = pd.read_csv(curr_dir + filename)
        all_headlines.extend(list(article_df.headline.values))
        break
all_headlines = [line for line in all_headlines if line!= "Unknown"]
all_headlines[:5]

['Finding an Expansive View  of a Forgotten People in Niger',
 'And Now,  the Dreaded Trump Curse',
 'Venezuela’s Descent Into Dictatorship',
 'Stain Permeates Basketball Blue Blood',
 'Taking Things for Granted']

In [4]:
def clean_s1(txt):
    txt = "".join(t for t in txt if t not in string.punctuation).lower()
    txt = txt.encode("utf8").decode("ascii",'ignore')
    return txt
corpus = [clean_s1(x) for x in all_headlines]
corpus[:5]

['finding an expansive view  of a forgotten people in niger',
 'and now  the dreaded trump curse',
 'venezuelas descent into dictatorship',
 'stain permeates basketball blue blood',
 'taking things for granted']

In [5]:
tokenizer = Tokenizer()
def get_sequence_of_tokens(corpus):
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1

    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words

input_sequences, total_words = get_sequence_of_tokens(corpus)
input_sequences[:8]

[[169, 17],
 [169, 17, 665],
 [169, 17, 665, 367],
 [169, 17, 665, 367, 4],
 [169, 17, 665, 367, 4, 2],
 [169, 17, 665, 367, 4, 2, 666],
 [169, 17, 665, 367, 4, 2, 666, 170],
 [169, 17, 665, 367, 4, 2, 666, 170, 5]]

In [6]:
for j in range(8):
    s = input_sequences[j]
    for i in range(1, len(s)+1):
        print(tokenizer.index_word[s[i-1]], end = ' ')
    print()

finding an 
finding an expansive 
finding an expansive view 
finding an expansive view of 
finding an expansive view of a 
finding an expansive view of a forgotten 
finding an expansive view of a forgotten people 
finding an expansive view of a forgotten people in 


In [7]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = tf.keras.utils.to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

predictors, label, max_sequence = generate_padded_sequences(input_sequences)
max_sequence = max_sequence-1
print("Length of Max Sequence:",max_sequence)
for i in range(3):
    print(predictors[i], list(label[i]).index(1))

Length of Max Sequence: 18
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 169] 17
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 169  17] 665
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 169  17 665] 367


In [8]:
total_words

2422

In [9]:
model = Sequential()
model.add(Embedding(total_words,20,input_length=max_sequence))
model.add(Bidirectional(LSTM(units=100, return_sequences=True)))
model.add(LSTM(units=100))
model.add(Dropout(0.5))
model.add(Dense(units=total_words, activation='softmax'))

In [10]:
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 18, 20)            48440     
                                                                 
 bidirectional (Bidirection  (None, 18, 200)           96800     
 al)                                                             
                                                                 
 lstm_1 (LSTM)               (None, 100)               120400    
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 2422)              244622    
                                                                 
Total params: 510262 (1.95 MB)
Trainable params: 510262 (1.95 MB)
Non-trainable params: 0 (0.00 Byte)
____________________

In [11]:
model.fit(predictors,label,epochs=100,verbose=5)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x798e45303070>

In [12]:
def make_pred(sentence, limit):
    for i in range(limit):
        tokenized_words = tokenizer.texts_to_sequences([sentence])[0]
        tokenized_words = pad_sequences([tokenized_words], maxlen=max_sequence, padding='pre')
        pred_word = np.argmax(model.predict(tokenized_words))
        pred = tokenizer.index_word[pred_word]
        sentence += " " + pred
    return sentence

sentence = "I know how to"
length_words = 2
output_pred = make_pred(sentence,length_words)
print(output_pred)

I know how to ease college
