In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the /kaggle/ directory tree, which includes the input directory

import os
# Gather the full path of every file below /kaggle/, then print one path per line.
kaggle_files = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/')
    for name in names
]
for path in kaggle_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/lib/kaggle/gcp.py
/kaggle/input/scientificamerican/Policy  Ethics.xlsx
/kaggle/input/scientificamerican/Math.xlsx
/kaggle/input/scientificamerican/Chemistry.xlsx
/kaggle/input/scientificamerican/Arts  Culture.xlsx
/kaggle/input/scientificamerican/Physics.xlsx
/kaggle/input/scientificamerican/Evolution.xlsx
/kaggle/input/scientificamerican/Biology.xlsx
/kaggle/working/__notebook_source__.ipynb


In [1]:
pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.0.7-py2.py3-none-any.whl (243 kB)
[K     |████████████████████████████████| 243 kB 1.3 MB/s eta 0:00:01
[?25hCollecting et-xmlfile
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.0.7
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
# Load the Physics workbook into a DataFrame, reading it with the openpyxl engine
# installed by the pip cell above.
df = pd.read_excel('../input/scientificamerican/Physics.xlsx', engine='openpyxl') 

In [3]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 

# set seeds for reproducibility
from tensorflow.random import set_seed 
from numpy.random import seed
set_seed(2)
seed(1)

import pandas as pd
import numpy as np
import string, os 

import warnings
# Silence every warning for the rest of the session (this also hides deprecation notices).
# The first filter already ignores all categories, so the FutureWarning-specific
# filter that follows it is redundant.
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
    # Keep only the rows that have no missing values. `df` is rebound here, so the
    # unfiltered frame is no longer reachable after this cell runs.
    # NOTE(review): the leading indentation of this cell looks accidental — IPython
    # appears to tolerate it, but plain Python would not; confirm and dedent.
    df = df.dropna()

In [7]:
# One string per row: the title and the article body joined with ". ".
all_articles = [
    '. '.join(title_and_body)
    for title_and_body in zip(df['title'], df['articleText'])
]
# Preview the first 500 characters of the first combined article.
all_articles[0][:500]

'Quantum Astronomy Could Create Telescopes Hundreds of Kilometers Wide. A few years ago researchers using the radio-based Event Horizon Telescope (EHT) performed an extraordinary observation, the likes of which remains a dream for most other astronomers. The EHT team announced in April 2019 that it had successfully imaged the shadow of a supermassive black hole in a nearby galaxy by combining observations from eight different radio telescopes spread across our planet. This technique, called inter'

In [7]:
def clean_text(txt):
    """Normalize one text for tokenization.

    Removes every character listed in ``string.punctuation`` (without inserting
    a replacement, so ``"hard-luck"`` becomes ``"hardluck"``), lower-cases the
    result, and finally drops all non-ASCII characters.

    Args:
        txt: The text to normalize.

    Returns:
        The normalized, ASCII-only, lower-case string.
    """
    # Translation table that deletes each ASCII punctuation character.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    lowered = txt.translate(strip_punctuation).lower()
    # Round-trip through UTF-8 bytes so that anything outside ASCII is discarded.
    ascii_only = lowered.encode("utf8").decode("ascii", 'ignore')
    return ascii_only

# Normalize every combined "title. body" string with clean_text; this list is what
# the tokenizer is later fitted on.
corpus = [clean_text(x) for x in all_articles]

In [8]:
# Notebook-level tokenizer shared across cells: it is fitted inside
# get_sequence_of_tokens and reused by generate_text to encode the seed text.
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus, n=3):
    """Fit the shared tokenizer on ``corpus`` and build leading n-gram prefixes.

    For every text, the token prefixes of length 2, 3, ..., ``n`` are emitted.
    Only the START of each text is used; the remainder of the text does not
    contribute any training sequence.

    Texts with fewer than ``n`` tokens contribute only the prefixes that really
    exist (and none at all when they have fewer than two tokens). Previously
    such texts produced repeated, empty or single-token sequences, which turn
    into samples with an all-padding input once padded.

    Args:
        corpus: Iterable of already-cleaned text strings.
        n: Maximum prefix length (in tokens) to emit per text.

    Returns:
        A tuple ``(input_sequences, total_words)`` where ``input_sequences`` is
        a list of token-id lists and ``total_words`` is the vocabulary size
        plus one (index 0 is not assigned to any word by the tokenizer).
    """
    # Tokenization: builds/updates the vocabulary on the module-level tokenizer.
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1

    # Convert each text to token ids and collect its leading prefixes.
    input_sequences = []
    for token_list in tokenizer.texts_to_sequences(corpus):
        # Never ask for a prefix longer than the text itself.
        longest_prefix = min(n, len(token_list))
        for prefix_len in range(2, longest_prefix + 1):
            input_sequences.append(token_list[:prefix_len])
    return input_sequences, total_words

# n=5: use token prefixes of length 2 through 5 taken from the start of each article.
inp_sequences, total_words = get_sequence_of_tokens(corpus, 5)
# Peek at the first ten generated sequences.
inp_sequences[:10]

[[66, 849],
 [66, 849, 43],
 [66, 849, 43, 305],
 [66, 849, 43, 305, 558],
 [8, 1],
 [8, 1, 328],
 [8, 1, 328, 184],
 [8, 1, 328, 184, 2],
 [8280, 1742],
 [8280, 1742, 687]]

In [9]:
def generate_padded_sequences(input_sequences, num_classes=None):
    """Pad token sequences and split them into model inputs and one-hot labels.

    Every sequence is left-padded to the length of the longest one. The last
    token of each padded row becomes the label; all preceding tokens become the
    predictors.

    Args:
        input_sequences: List of token-id lists.
        num_classes: Width of the one-hot label vectors. When omitted, the
            notebook-level ``total_words`` is used, which is what this function
            always relied on before the parameter existed.

    Returns:
        A tuple ``(predictors, label, max_sequence_len)``.

    Raises:
        ValueError: If ``input_sequences`` is empty.
    """
    if len(input_sequences) == 0:
        raise ValueError("input_sequences is empty; nothing to pad")
    if num_classes is None:
        # Backward-compatible fallback to the global set by the tokenization cell.
        num_classes = total_words

    max_sequence_len = max(len(seq) for seq in input_sequences)
    # pad_sequences already returns a NumPy array, so no extra conversion is needed.
    padded = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

    predictors, label = padded[:, :-1], padded[:, -1]
    # NOTE(review): this materializes one row of width num_classes per sample,
    # which can be very large for a big vocabulary.
    label = ku.to_categorical(label, num_classes=num_classes)
    return predictors, label, max_sequence_len

# Pad the n-gram prefixes and split them into model inputs (all tokens but the last)
# and one-hot next-word labels (the last token).
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [10]:
def create_model(max_sequence_len, total_words,
                 embedding_dim=32, lstm_units=128, dropout_rate=0.2):
    """Build and compile the next-word prediction network.

    Architecture: Embedding -> LSTM -> Dropout -> Dense(softmax over the
    vocabulary), compiled with categorical cross-entropy and the Adam optimizer.

    Args:
        max_sequence_len: Length of the padded sequences INCLUDING the label
            token; the model input length is one less.
        total_words: Vocabulary size, used for both the embedding input
            dimension and the width of the softmax output.
        embedding_dim: Size of each word embedding vector.
        lstm_units: Number of units in the LSTM layer.
        dropout_rate: Dropout rate applied to the LSTM output.

    Returns:
        The compiled ``Sequential`` model.
    """
    # The last token of every padded sequence is the label, not an input.
    input_len = max_sequence_len - 1
    model = Sequential()

    # Input embedding layer
    model.add(Embedding(total_words, embedding_dim, input_length=input_len))

    # Hidden layer: LSTM followed by dropout
    model.add(LSTM(lstm_units))
    model.add(Dropout(dropout_rate))

    # Output layer: probability distribution over the vocabulary
    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

# Build the network sized to the padded sequence length and the vocabulary,
# then print its layer-by-layer summary.
model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 4, 32)             3730336   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               82432     
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 116573)            15037917  
Total params: 18,850,685
Trainable params: 18,850,685
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Stop once the training loss has not improved for several epochs instead of always
# running the full 500; the best weights seen are restored on stop. EarlyStopping is
# already imported in the imports cell.
early_stop = EarlyStopping(monitor='loss', patience=5, restore_best_weights=True)
model.fit(predictors, label, epochs=500, verbose=1, callbacks=[early_stop])

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f08d2664450>

In [12]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Extend ``seed_text`` by repeatedly appending the model's most likely next word.

    Uses the notebook-level ``tokenizer`` to encode the running text and to map
    the predicted class index back to a word.

    Args:
        seed_text: Text to start from.
        next_words: Number of words to append.
        model: Trained model whose ``predict`` returns one probability per
            vocabulary index.
        max_sequence_len: Padded sequence length used at training time
            (including the label token); inputs are padded/truncated to one less.

    Returns:
        The seed text followed by the generated words, title-cased.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

        # `predict_classes` no longer exists in recent Keras/TensorFlow releases;
        # taking the argmax of the predicted probabilities is the equivalent.
        probabilities = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        # index_word is the tokenizer's index -> word mapping; an index with no
        # word (e.g. the padding index 0) yields an empty string, as before.
        output_word = tokenizer.index_word.get(predicted, "")
        seed_text += " "+output_word
    return seed_text.title()

In [13]:
# Generate 400 words continuing the seed text "science" and print the title-cased result.
print (generate_text("science", 400, model, max_sequence_len))

 In February Airplane Telescopes Could Exoplanets Astronomer How Was To Gold To The Ice That Use Cool For Giant Planet For Science Of A New May Sheds The Most Distant Theory May Help New Fuel Bosons After That Risks From A Power Of Quantum Of Lunar May Could The Time For A Quantum Pioneer Unlocks Matters By Better For Better You Are You Hear Astronomical Neutrinos Particles May Be A Magnetic Gravitational For The Physics Of Diving Gannets From Bad To Worse Hardluck Nearly When For By Source Of The Future Of Gravitational Wave Illusions Of Hunting Black Made Of The Most Bang Physics May What To From Manmade And Time To Make To Black About Key To On The Universe May Be Expanding Black Hole More Slowly Better You Enjoyed Be Hunting Robot And A Graphic The Quantum Tantra On One The Hemispheres Of A Science Of The First Heat Of The Moment How To Packs Win More Fromck About Key To On The Universe May Be Expanding Black Hole More Slowly Better You Enjoyed Be Hunting Robot And A Graphic The Qu

In [14]:
# Nothing earlier in the notebook creates the target folder, so create it here;
# otherwise writing the HDF5 file fails on a fresh run.
# NOTE(review): the file name says em10/lstm100, but create_model uses an embedding
# size of 32 and 128 LSTM units — confirm the intended name.
model_path = '../output/kaggle/working/em10_lstm100_ep500.h5'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(filepath=model_path,
           overwrite=True,
           include_optimizer=True,
           save_traces=True)

In [15]:
import shutil
# Zip the whole model output folder; the archive is written relative to the current
# working directory and its path is shown as the cell result.
shutil.make_archive(
    base_name='em10_lstm100_ep500.h5',
    format='zip',
    root_dir='../output/kaggle/working/',
)

'/kaggle/working/em10_lstm100_ep500.h5.zip'