In [15]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 


import pandas as pd
import numpy as np
import string, os 

import warnings
from tqdm import tqdm

In [3]:
# Root of the working directory; the subject-line parquet files are read from
# <ROOT>/data/subject_line.
ROOT = '/home/mluser/users'
data_subjects = os.path.join(ROOT, 'data', 'subject_line')
print(data_subjects)

# Collect the "subject" column of every file found under data_subjects.
result = []
for dirpath, _dirnames, filenames in os.walk(data_subjects):
    # Sort so the order of `result` does not depend on the directory listing order.
    for filename in tqdm(sorted(filenames)):
        # Only the "subject" column is used, so do not load the other columns.
        df = pd.read_parquet(os.path.join(dirpath, filename), columns=["subject"])
        result.extend(df["subject"].tolist())

/home/mluser/users/data/subject_line


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [02:04<00:00,  4.14s/it]


In [4]:
# Number of raw subject values collected; None entries have not been filtered out yet.
len(result)

28315441

### Preprocessing-1

In [5]:
import unicodedata
import re

def unicode_to_ascii(s):
    """Return ``s`` with combining marks (accents) removed.

    The string is decomposed with Unicode NFD normalization and every
    character whose category is ``Mn`` (nonspacing mark) is dropped. All other
    characters, including non-ASCII ones without a decomposition, are kept.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)
    
def preprocess_sentence(w):
    """Normalize one subject line into lowercase words and spaced punctuation.

    The input is lowercased, trimmed and stripped of accents via
    ``unicode_to_ascii``; the marks ``? . ! , ¿`` are padded with spaces; every
    run of any other non-letter character is replaced by one space; the result
    is trimmed again.
    """
    text = unicode_to_ascii(w.lower().strip())

    # Applied in order:
    #  1. put a space on both sides of each kept punctuation mark,
    #     eg: "he is a boy." => "he is a boy ."
    #     Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    #  2. collapse runs of spaces / double quotes into a single space
    #  3. replace everything except (a-z, A-Z, ".", "?", "!", ",", "¿") with a space
    substitutions = (
        (r"([?.!,¿])", r" \1 "),
        (r'[" "]+', " "),
        (r"[^a-zA-Z?.!,¿]+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text.strip()

In [6]:
# Normalize every subject line; None entries in `result` are skipped.
corpus = [preprocess_sentence(subject) for subject in result if subject is not None]

In [7]:
# Wrap the cleaned subject lines in a single-column DataFrame and preview the first rows.
df1 = pd.DataFrame(corpus)
df1.head()

Unnamed: 0,0
0,ag test
1,my test
2,bestatigen sie bitte ihr e mail abonnement
3,ck tudor
4,pagemodo tip grow your audience with a like gate


### Preprocessing-2

In [8]:
def clean_text(txt):
    """Return ``txt`` lowercased, without ASCII punctuation or non-ASCII characters.

    Characters listed in ``string.punctuation`` are removed first; the
    remainder is lowercased and every character that cannot be represented in
    ASCII is dropped.
    """
    without_punct = "".join(ch for ch in txt if ch not in string.punctuation)
    lowered = without_punct.lower()
    # Round-trip through UTF-8 bytes: decoding as ASCII with 'ignore' silently
    # discards every non-ASCII character.
    return lowered.encode("utf8").decode("ascii", 'ignore')

# `corpus` was already built with preprocess_sentence in the Preprocessing-1 section;
# rebuilding the identical list here would repeat a full pass over `result`.
# NOTE(review): clean_text defined in this cell is never applied - confirm whether
# this step was meant to use it.
print(corpus[:10])

['ag test', 'my test', 'bestatigen sie bitte ihr e mail abonnement', 'ck tudor', 'pagemodo tip grow your audience with a like gate', 'quick tip steps to a winning website', 'get design inspiration for your website', 'tips for creating an amazing facebook tab', 'please confirm your email subscription', 'feb c email']


In [9]:
# Number of cleaned subject lines (None entries of `result` were skipped when building corpus).
len(corpus)

27934628

In [None]:
# Write the cleaned corpus to CSV; the next cell reads this file back.
# Note: `corpus` is rebound from a list to a DataFrame here.
corpus = pd.DataFrame(corpus)
corpus_csv_path = os.path.join(ROOT, 'data', 'clean_subject_corpus.csv')
corpus.to_csv(corpus_csv_path)

In [17]:
# Reload the cleaned corpus from CSV.
# keep_default_na=False: by default read_csv parses empty fields and strings such as
# "nan" or "null" as NaN, and the later `.astype(str)` would turn those rows into the
# literal token "nan". Subject lines that are empty after cleaning, or that consist
# of one of those words, must come back as the strings that were written.
corpus = pd.read_csv(
    os.path.join(ROOT, 'data', 'clean_subject_corpus.csv'),
    keep_default_na=False,
)

In [19]:
# Alias only: df1 and corpus refer to the same DataFrame object (no copy is made).
df1 = corpus

### Tokenizing and Target Generation

In [20]:
import tensorflow as tf
def tokenize(encoder_text, decoder_text):
    """Fit one shared tokenizer on both text collections and pad their id sequences.

    Prints the size and first element of each collection, fits a single
    ``Tokenizer`` on the encoder texts and then on the decoder texts, converts
    each collection to integer id sequences and post-pads each one to the
    length of its own longest sequence.

    Returns:
        (padded_encoder_seq, padded_decoder_seq, tokenizer)
    """
    print(len(encoder_text), "Encoder Subject line example: {}".format(encoder_text[0]))
    print(len(decoder_text), "Decoder Subject line example: {}".format(decoder_text[0]))

    keras_text = tf.keras.preprocessing.text
    keras_sequence = tf.keras.preprocessing.sequence

    # One vocabulary for both sides: fit on the encoder texts, then the decoder texts.
    tokenizer = keras_text.Tokenizer()
    for texts in (encoder_text, decoder_text):
        tokenizer.fit_on_texts(texts)

    # texts_to_sequences maps each string (w1, ..., wn) to its list of word ids;
    # pad_sequences then appends zeros (padding='post') up to the longest
    # sequence of the collection it is given.
    padded_encoder_seq, padded_decoder_seq = (
        keras_sequence.pad_sequences(tokenizer.texts_to_sequences(texts), padding='post')
        for texts in (encoder_text, decoder_text)
    )

    return padded_encoder_seq, padded_decoder_seq, tokenizer

In [24]:
# Take the text column from `corpus` (the frame df1 was aliased to) instead of from df1
# itself, so this cell can be re-run: once df1 has become a Series, df1['0'] fails.
df1 = corpus['0'].astype(str)

In [26]:
# Fit a tokenizer on the subject lines and build padded id matrices; the same
# Series is passed as both the encoder and the decoder text.
padded_encoder_seq, padded_decoder_seq, tokenizer = tokenize(df1, df1)

27934628 Encoder Subject line example: ag test
27934628 Decoder Subject line example: ag test


In [None]:
# Module-level tokenizer used by get_sequence_of_tokens and generate_text below.
# Note: this rebinds `tokenizer`, replacing the instance returned by tokenize() earlier.
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    """Fit the module-level ``tokenizer`` and expand each line into n-gram prefixes.

    For a line with token ids ``[t1, t2, ..., tn]`` the prefixes
    ``[t1, t2]``, ``[t1, t2, t3]``, ..., ``[t1, ..., tn]`` are produced; lines
    with fewer than two tokens contribute nothing.

    Returns:
        (input_sequences, total_words), where ``total_words`` is the number of
        entries in ``tokenizer.word_index`` plus one.
    """
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1

    input_sequences = [
        token_ids[:end]
        for token_ids in (tokenizer.texts_to_sequences([line])[0] for line in corpus)
        for end in range(2, len(token_ids) + 1)
    ]
    return input_sequences, total_words

# Build the n-gram training sequences and the vocabulary size from the subject lines,
# then show the first ten sequences as a sanity check.
inp_sequences, total_words = get_sequence_of_tokens(df1)
print(inp_sequences[:10])

In [None]:
def generate_padded_sequences(input_sequences):
    """Pad the n-gram sequences and split them into predictors and one-hot labels.

    Every sequence is left-padded (``padding='pre'``) to the length of the
    longest one. The last column becomes the label, one-hot encoded over the
    module-level ``total_words`` classes; the remaining columns are the
    predictors.

    Returns:
        (predictors, label, max_sequence_len)
    """
    max_sequence_len = max(len(seq) for seq in input_sequences)
    padded = np.array(
        pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
    )

    # Last column is the word to predict; everything before it is the context.
    predictors = padded[:, :-1]
    label = ku.to_categorical(padded[:, -1], num_classes=total_words)
    return predictors, label, max_sequence_len

# Turn the n-gram sequences into model inputs (predictors) and one-hot targets (label).
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [None]:
def create_model(max_sequence_len, total_words):
    """Build and compile the next-word prediction network.

    Architecture: Embedding (10 dimensions) -> LSTM (100 units) ->
    Dropout (0.1) -> Dense softmax over ``total_words`` classes. The model is
    compiled with categorical cross-entropy loss and the Adam optimizer.

    Args:
        max_sequence_len: length of the padded sequences; the model input is
            one token shorter than this.
        total_words: vocabulary size, used for both the embedding input
            dimension and the output layer width.

    Returns:
        The compiled ``Sequential`` model.
    """
    input_len = max_sequence_len - 1

    model = Sequential([
        # Input embedding layer
        Embedding(total_words, 10, input_length=input_len),
        # Hidden layer: LSTM followed by dropout
        LSTM(100),
        Dropout(0.1),
        # Output layer: probability distribution over the vocabulary
        Dense(total_words, activation='softmax'),
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

In [None]:
# Instantiate the network for the computed sequence length and vocabulary size,
# then print its layer summary.
model = create_model(max_sequence_len, total_words)
model.summary()

In [None]:
# Train for 100 epochs. verbose=2 prints one summary line per epoch; the previous
# value (5) is not one of the documented verbosity levels (0, 1, 2).
model.fit(predictors, label, epochs=100, verbose=2)

In [None]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Extend ``seed_text`` by ``next_words`` greedily predicted words.

    On each step the current text is converted to token ids with the
    module-level ``tokenizer``, left-padded to ``max_sequence_len - 1``, and
    the most probable next word according to ``model`` is appended.

    Args:
        seed_text: starting text.
        next_words: number of words to append.
        model: trained model whose ``predict`` returns one probability
            distribution over the vocabulary per input row.
        max_sequence_len: padded sequence length used during training.

    Returns:
        The extended text in title case. A predicted id with no vocabulary
        entry (such as 0) appends an empty word.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

        # Sequential.predict_classes has been removed from Keras; the argmax of the
        # predicted distribution is the equivalent and works on all versions.
        probabilities = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        # index_word is the tokenizer's id -> word mapping; a direct lookup replaces
        # a scan over the whole vocabulary for every generated word.
        output_word = tokenizer.index_word.get(predicted, "")
        seed_text += " "+output_word
    return seed_text.title()

In [None]:
# (seed text, number of words to generate) pairs used to sample the trained model.
sample_prompts = [
    ("united states", 5),
    ("president trump", 4),  # seed typo "preident" fixed
    ("donald trump", 4),
    ("india and china", 4),
    ("new york", 4),
    ("science and technology", 5),
    ("BITS Pilani", 8),
]
for seed, n_words in sample_prompts:
    print(generate_text(seed, n_words, model, max_sequence_len))