In [115]:
import pandas as pd
pd.set_option('display.max_colwidth', -1)

import numpy as np
import glob
import random

import nltk

from multi_rake import Rake
from tqdm import tqdm_notebook as tqdm

In [2]:
# glob returns paths in arbitrary, filesystem-dependent order; sort them so the
# concatenated rows (and everything derived from them) come out in the same order on every run.
all_csv = sorted(glob.glob('conference-data/csv/*.csv'))

In [60]:
# Read every yearly CSV and stack them into one frame.
# Note that 2008 did not have date or year for all entries, so had to be added manually

# pd.concat fails with an opaque "No objects to concatenate" on an empty input,
# so stop early with a message that points at the real problem.
if not all_csv:
    raise FileNotFoundError("all_csv is empty: no conference CSV files were found")

year_df = (pd.read_csv(f) for f in all_csv)
# sort=False: the yearly files do not share identical columns, and leaving `sort`
# unset triggers a FutureWarning on older pandas. Columns are only accessed by name
# below, so their order does not matter.
# ignore_index=False keeps each file's own row labels, so index values repeat across files.
df = pd.concat(year_df, ignore_index=False, sort=False)

# Normalise everything to text: missing cells become '' and every column becomes str.
df = df.fillna('')
df = df.astype(str)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """


In [68]:
# trying some predictive text
import tensorflow as tf
import numpy as np

import glob

from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from nltk.corpus import stopwords 
# English stop words held in a set for fast membership checks; used further down to
# filter seed words and to trim a trailing stop word from generated titles.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded already - confirm.
stopWords = set(stopwords.words('english'))

  from ._conv import register_converters as _register_converters


In [69]:
# Pull the two text columns out as plain Python lists of strings.
# Taking whole columns avoids building a Series per row with iterrows().
all_titles = df['title'].astype(str).tolist()
all_abstracts = df['abstract'].astype(str).tolist()

In [70]:
# Drop entries with no text (missing values were filled with '' when the data was loaded).
# Every element is a str, so an empty string is the only falsy value.
all_titles = [title for title in all_titles if title]
all_abstracts = [abstract for abstract in all_abstracts if abstract]

In [71]:
# Build the title vocabulary: fitting assigns an integer index to every distinct word.
tokenizer = Tokenizer()

tokenizer.fit_on_texts(all_titles)

# +1 because Keras word indexes start at 1, leaving 0 free for the padding value
# that pad_sequences inserts; the model's output layer needs one slot per index.
total_words = len(tokenizer.word_index) + 1
print(total_words)

In [73]:
# Build prefix n-grams of word indexes for every title,
# e.g. [4, 2], [4, 2, 17], [4, 2, 17, 36], etc.
# `title_corpus` is not defined anywhere in this notebook, so a fresh kernel raised
# NameError here; the tokenizer is fitted on `all_titles`, so iterate that list.

input_sentences = []

for line in all_titles:
    token_list = tokenizer.texts_to_sequences([line])[0]

    # Every prefix of length >= 2: the last token of a prefix becomes the label,
    # the tokens before it the context. Titles with fewer than two known words add nothing.
    for i in range(1, len(token_list)):
        n_gram_sequences = token_list[:i + 1]
        input_sentences.append(n_gram_sequences)

In [74]:
# Length of the longest n-gram prefix; every sequence is padded to this width.
max_sequence_len = max(map(len, input_sentences))
max_sequence_len

39

In [75]:
# pad sequences to longest length
# padding='pre' puts the zeros on the left, so the last column of every row
# holds that prefix's final (to-be-predicted) token.

input_sequences = np.array(pad_sequences(input_sentences, maxlen=max_sequence_len, padding='pre'))

In [76]:
# Preview the padded matrix: one row per n-gram prefix, left-filled with zeros.
input_sequences

array([[   0,    0,    0, ...,    0,    9,    1],
       [   0,    0,    0, ...,    9,    1,  481],
       [   0,    0,    0, ...,    1,  481,    3],
       ...,
       [   0,    0,    0, ..., 1431,    2,    1],
       [   0,    0,    0, ...,    2,    1,  366],
       [   0,    0,    0, ...,    0,  218, 1432]], dtype=int32)

In [77]:
# Turn each padded row into a supervised example:
# every column except the last is the model input (the context),
# and the last column is the label (the word to predict).
xs, labels = input_sequences[:, :-1], input_sequences[:, -1]

In [78]:
# encode the labels for tf
# creates "one-hot encoding": each label index becomes a row of length total_words
# with a 1 in that index's column and 0 everywhere else
# NOTE(review): this is a dense (n_samples, total_words) array, so memory grows with vocabulary size

ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

In [None]:
# Inspect one one-hot row: the position of the single 1 is the label's index in tokenizer.word_index.
ys[2]

In [142]:
# model - note some has been updated from 07 notebook
# Next-word classifier for titles: embedding -> bidirectional LSTM -> softmax over the vocabulary.

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.optimizers import Adam

model = Sequential()
# 100-dimensional embedding per vocabulary index; the input width is the padded
# length minus the label column that was split off into `labels`.
model.add(Embedding(total_words, 100, input_length=max_sequence_len - 1))
# using Bidirectional so the LSTM reads each padded prefix in both directions
model.add(Bidirectional(LSTM(50)))
# one probability per vocabulary index, matching the one-hot labels in `ys`
model.add(Dense(total_words, activation='softmax'))
# you can set the learning rate on the adam optimizer; `learning_rate` is the
# supported keyword (`lr` is a deprecated alias that newer Keras releases reject)
adam = Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
model.fit(xs, ys, epochs=18, verbose=1)


Train on 3367 samples
Epoch 1/18
Epoch 2/18
Epoch 3/18
Epoch 4/18
Epoch 5/18
Epoch 6/18
Epoch 7/18
Epoch 8/18
Epoch 9/18
Epoch 10/18
Epoch 11/18
Epoch 12/18
Epoch 13/18
Epoch 14/18
Epoch 15/18
Epoch 16/18
Epoch 17/18
Epoch 18/18


<tensorflow.python.keras.callbacks.History at 0x1a396e1630>

In [151]:
# predict the next n words after a seed word

# Pool of candidate seed words: unique, purely alphabetic title words that are not stop words.
# The stop-word list is lowercase, so compare on the lowercased word; otherwise capitalised
# stop words such as "The" or "And" slip through and get used as seeds.
# Sorting makes the pool's order independent of set iteration order.
seed_titles = ' '.join(sorted({t for t in ' '.join(all_titles).split() if t.isalpha() and t.lower() not in stopWords}))

def get_title(seed_text):
    """Extend ``seed_text`` with model-predicted words and print the result.

    Between 4 and 9 words are appended one at a time. Each step tokenizes the
    text built so far, left-pads it to the model's input width and takes the
    most probable next word. A single trailing stop word is dropped before the
    title is capitalized.

    Reads the notebook-level ``tokenizer``, ``model``, ``max_sequence_len`` and
    ``stopWords`` when called, so it uses whichever model was trained most recently.

    Args:
        seed_text: Starting word(s) of the title.

    Returns:
        The capitalized title that was printed.
    """
    # randint's upper bound is exclusive: 4..9 extra words
    next_words = np.random.randint(low=4, high=10)

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        # Sequential.predict_classes no longer exists in current TensorFlow;
        # argmax over the predicted probabilities gives the same class index.
        probabilities = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        # index_word is the tokenizer's reverse mapping; index 0 (padding) has no word
        output_word = tokenizer.index_word.get(predicted, "")
        if output_word:
            seed_text += " " + output_word

    # Drop one trailing stop word; guard against an empty result.
    words = seed_text.split()
    if words and words[-1] in stopWords:
        seed_text = ' '.join(words[:-1])

    title = seed_text.capitalize()
    print(title)
    return title

beg_words = []

# Split the seed pool once, then generate ten titles, each from a randomly chosen seed word.
seed_words = seed_titles.split()
for i in range(10):
    seed_text = random.choice(seed_words)
    get_title(seed_text)


Music database the project we
Accessible to the university
Worldcat discovery libraries museums the university
Core person entities such as xissn worldcat registry this
Projet commercial ocr standard is responsible for varyfrom benign
Halinet commercial ocr standard
Big data fangirl offers
Xmlmark commercial ocr standard
Comparative media studies and the library
Canadiennes commercial ocr standard is responsible


In [144]:
# abstracts
# Same pipeline as for titles, now fitted on the abstracts.
# NOTE: this rebinds `tokenizer` and `total_words`; get_title reads `tokenizer` at call
# time, so after this cell it works with the abstracts vocabulary.

tokenizer = Tokenizer()

tokenizer.fit_on_texts(all_abstracts)

# +1 because Keras word indexes start at 1, leaving 0 free for the padding value
total_words = len(tokenizer.word_index) + 1
print(total_words)

5834


In [145]:
# Expand every abstract into its prefix n-grams of word indexes:
# a token list [4, 2, 17, 36] yields [4, 2], [4, 2, 17], [4, 2, 17, 36].

input_sentences = []

for abstract in all_abstracts:
    token_ids = tokenizer.texts_to_sequences([abstract])[0]
    # prefixes of length 2 up to the full length; shorter token lists contribute nothing
    input_sentences.extend(token_ids[:end] for end in range(2, len(token_ids) + 1))

In [146]:
# Width of the longest abstract prefix; used as the padded sequence length.
max_sequence_len = max(map(len, input_sentences))
max_sequence_len

383

In [147]:
# pad sequences to longest length
# padding='pre' left-fills with zeros so each row's final column is its last real token

input_sequences = np.array(pad_sequences(input_sentences, maxlen=max_sequence_len, padding='pre'))

In [148]:
# Split the padded matrix column-wise into features and targets:
# the context is every token but the last, the label is the last token.
xs, labels = input_sequences[:, :-1], input_sequences[:, -1]

In [149]:
# encode the labels for tf
# creates "one-hot encoding": each label index becomes a row of length total_words
# with a single 1 in that index's column
# NOTE(review): dense (n_samples, total_words) array - noticeably larger here than for titles

ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

In [150]:
# model - note some has been updated from 07 notebook
# Next-word classifier for abstracts; same architecture as the title model.
# NOTE: this rebinds `model`, so get_title generates from abstracts once this cell has run.

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.optimizers import Adam

model = Sequential()
# 100-dimensional embedding per vocabulary index; input width excludes the label column
model.add(Embedding(total_words, 100, input_length=max_sequence_len - 1))
# using Bidirectional so the LSTM reads each padded prefix in both directions
model.add(Bidirectional(LSTM(50)))
# one probability per vocabulary index, matching the one-hot labels in `ys`
model.add(Dense(total_words, activation='softmax'))
# you can set the learning rate on the adam optimizer; `learning_rate` is the
# supported keyword (`lr` is a deprecated alias that newer Keras releases reject)
adam = Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
model.fit(xs, ys, epochs=18, verbose=1)


Train on 37922 samples
Epoch 1/18
Epoch 2/18
Epoch 3/18
Epoch 4/18
Epoch 5/18
Epoch 6/18
Epoch 7/18
Epoch 8/18
Epoch 9/18
Epoch 10/18
Epoch 11/18
Epoch 12/18
Epoch 13/18
Epoch 14/18
Epoch 15/18
Epoch 16/18
Epoch 17/18
Epoch 18/18


<tensorflow.python.keras.callbacks.History at 0x1a77df6be0>