## Import Libraries and Data

In [None]:
import numpy as np               #array processing
import pandas as pd              #data manipulation
import re                        #regular expression for text cleaning
import random

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
pd.set_option("display.max_colwidth", 200)
warnings.filterwarnings("ignore")

In [None]:
# Load the reviews dataset; later cells read its 'Review Text' and 'Title' columns.
data = pd.read_csv("Womens Clothing E-Commerce Reviews.csv")

Let's check the shape of the data.

In [None]:
# (rows, columns) of the raw dataset
data.shape

In [None]:
# preview the first rows of the raw dataset
data.head()

## Check Missing Data 

In [None]:
# number of missing values in each column
data.isnull().sum()

In [None]:
# drop every row that has a missing value in any column (modifies `data` in place)
data.dropna(axis=0,inplace=True)

In [None]:
# (rows, columns) remaining after the incomplete rows were dropped
data.shape

## Preprocess the Text Data

Before getting started with text summarization, let's preprocess the text in __Title__ and __Review Text__. The objective is to make the text suitable for modeling by removing as much noise as possible.

We will carry out the following preprocessing operations:

1. Convert text to lowercase
2. Expand the contractions ("isn't" to "is not")
3. Remove everything from the text except letters, '.' and ','
4. Remove single-character tokens

In [None]:
# Lookup table of English contractions and their expanded forms; text_cleaner
# uses it to expand contractions in the review and title text.
# NOTE(review): text_cleaner lowercases the text before the lookup, so the
# capitalized "I'..." keys below can never match; only their lowercase twins do
# (and there is no lowercase twin for "I'd've").
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",
                           "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",
                           "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
                           "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",
                           "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",
                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",
                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",
                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",
                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",
                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",
                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",
                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",
                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",
                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",
                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
                           "you're": "you are", "you've": "you have"}

Let's define a function that will preprocess and clean the text.

In [None]:
def text_cleaner(text):
    """Normalize a raw review or title for modeling.

    Steps, in order: lowercase the text, expand contractions found in the
    module-level ``contraction_mapping``, drop the possessive "'s", replace
    every character other than letters, '.' and ',' with a space, and discard
    single-character tokens.

    Args:
        text: Raw text (str).

    Returns:
        The cleaned text, with the surviving tokens joined by single spaces.
    """
    cleaned = text.lower()

    # Typographic apostrophes would otherwise never match the straight-quote
    # keys of contraction_mapping ("don’t" used to end up as "don").
    cleaned = cleaned.replace("\u2019", "'").replace("\u2018", "'")

    # Look up every run of letters/apostrophes instead of every whitespace
    # token, so contractions touching punctuation ("isn't," or "(don't") are
    # expanded too. Any whole token that matched before still matches.
    cleaned = re.sub(
        r"[a-z']+",
        lambda match: contraction_mapping.get(match.group(0), match.group(0)),
        cleaned,
    )

    # Remaining "'s" is a possessive (contractions were expanded above).
    cleaned = re.sub(r"'s\b", "", cleaned)
    cleaned = re.sub(r"[^a-zA-Z.,]", " ", cleaned)

    # Drop terms with length 1 and return the preprocessed text.
    return " ".join(token for token in cleaned.split() if len(token) > 1)

Now preprocess the text in the features __Review Text__ and __Title__.

In [None]:
# Clean the review bodies; the raw reviews are kept for display at inference time
reviews = np.array(data['Review Text'])
cleaned_text = [text_cleaner(review) for review in reviews]

# Clean the titles, which serve as the target summaries
summary = np.array(data['Title'])
cleaned_summary = [text_cleaner(title) for title in summary]

# Collect cleaned text, cleaned summary and raw review side by side
df = pd.DataFrame({'text': cleaned_text, 'summary': cleaned_summary, 'reviews': reviews})

<br>

## Text to Sequences

Let's check the distribution of the lengths of the reviews and the titles. This will help us choose suitable maximum lengths for the text fed to the encoder-decoder model.

In [None]:
# Number of whitespace-separated words in each cleaned review and cleaned title
text_word_count = [len(entry.split()) for entry in df['text']]
summary_word_count = [len(entry.split()) for entry in df['summary']]

text_length_df = pd.DataFrame({'text': text_word_count})
summary_length_df = pd.DataFrame({'summary': summary_word_count})

# One histogram per frame: reviews binned over 0-1000 words, titles over 0-100
text_length_df.hist(bins=30, range=(0, 1000))
summary_length_df.hist(bins=30, range=(0, 100))

plt.show()

In [None]:
# maximum length for text (user reviews); used as `maxlen` when the review
# sequences are padded and as the encoder input length
max_text_len = 50

# maximum length for summaries (titles); used as `maxlen` when the title
# sequences are padded and in the stop condition of decode_sequence
max_summary_len = 7

Let's add the __start-of-sentence__ and __end-of-sentence__ tokens ("sostok" and "eostok") to the summaries.

In [None]:
# Wrap every summary in the start/end markers the decoder is trained on
df['summary'] = 'sostok ' + df['summary'] + ' eostok'

In [None]:
# (rows, columns) of the cleaned frame
df.shape

<br>

### Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Split cleaned text, raw reviews and summaries in a single call: all three are
# partitioned with the same shuffled row selection (same ratio, same random state),
# so the cleaned, raw and target rows stay aligned.
x_tr, x_val, text_tr, text_val, y_tr, y_val = train_test_split(
    df['text'], df['reviews'], df['summary'],
    test_size=0.1, random_state=0, shuffle=True,
)

In [None]:
# number of training reviews
x_tr.shape

### Create a Keras Tokenizer for Reviews

In [None]:
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences

# Fit the review tokenizer on the training split only
x_tokenizer = Tokenizer(num_words=5000)
x_tokenizer.fit_on_texts(x_tr.tolist())

### Convert Text to Integer Sequences

In [None]:
# Map the words of each split to integer ids with the fitted review tokenizer
x_tr_seq, x_val_seq = (
    x_tokenizer.texts_to_sequences(split) for split in (x_tr, x_val)
)

# Zero-pad at the end up to max_text_len (x_tr / x_val become padded id arrays)
x_tr, x_val = (
    pad_sequences(seq, maxlen=max_text_len, padding='post')
    for seq in (x_tr_seq, x_val_seq)
)

# Vocabulary size handed to the encoder Embedding layer
x_voc = x_tokenizer.num_words + 1

### Create a Keras Tokenizer for Summaries / Titles

In [None]:
# Fit the summary tokenizer on the training summaries only
y_tokenizer = Tokenizer(num_words=3000)
y_tokenizer.fit_on_texts(y_tr.tolist())

In [None]:
# Keep the validation summaries and raw reviews as arrays for display at inference time
summary_val, text_val = np.array(y_val), np.array(text_val)

### Convert Summaries / Titles to Integer Sequences

In [None]:
# Convert word sequences to integer sequences
y_tr_seq = y_tokenizer.texts_to_sequences(y_tr) 
y_val_seq = y_tokenizer.texts_to_sequences(y_val) 

Eliminate the summary/title sequences that contain fewer than 4 tokens (the count includes the `sostok` and `eostok` markers).

In [None]:
# eliminate the sequences from training data
index=[]
for i in range(len(y_tr_seq)):
    if(len(y_tr_seq[i])<=3):
        index.append(i)
        
y_tr_seq = np.delete(y_tr_seq,index, axis=0)
x_tr = np.delete(x_tr,index, axis=0)

# eliminate the sequences from test data
index = []
for i in range(len(y_val_seq)):
    if(len(y_val_seq[i])<=3):
        index.append(i)
        
y_val_seq = np.delete(y_val_seq,index, axis=0)
x_val = np.delete(x_val,index, axis=0)
text_val = np.delete(text_val,index,axis=0)
summary_val = np.delete(summary_val,index,axis=0)

In [None]:
# Zero-pad the summary id sequences at the end up to max_summary_len.
# NOTE(review): no `truncating` argument is passed; if the Keras default trims
# from the front, summaries longer than max_summary_len lose their leading
# 'sostok' id — confirm against the pad_sequences documentation.
y_tr, y_val = (
    pad_sequences(seq, maxlen=max_summary_len, padding='post')
    for seq in (y_tr_seq, y_val_seq)
)

# Target vocabulary size, used by the decoder Embedding and the softmax output layer
y_voc = y_tokenizer.num_words + 1

<br>

## Build a Sequence-to-Sequence Model for Text Summarization

In [None]:
import keras
from keras.layers import Input, LSTM, Embedding, Dense, concatenate, TimeDistributed, Add, dot, Activation
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint

# No. of hidden nodes (size of both the encoder and the decoder LSTM state)
latent_dim = 300
# Dimension of embeddings (shared by the encoder and decoder Embedding layers)
embedding_dim=100

## Encoder-Decoder Structure for model training

# Encoder: padded review ids -> embeddings -> LSTM.
# mask_zero=True marks id 0 (the padding value) as masked.
encoder_inputs=Input(shape=(max_text_len,))
encoder_embedding = Embedding(x_voc,embedding_dim, trainable=True, mask_zero=True)(encoder_inputs)

# go_backwards=True is set, i.e. the LSTM is asked to read the review in reverse order.
# Its final hidden/cell states (state_h, state_c) seed the decoder below.
encoder_lstm = LSTM(latent_dim,return_sequences=True,return_state=True,go_backwards=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

# Decoder: summary ids of any length (shape=(None,)) -> embeddings -> LSTM
# initialised with the encoder's final states.
decoder_inputs=Input(shape=(None,))
dec_emb_layer = Embedding(y_voc, embedding_dim,trainable=True,mask_zero=True)
dec_emb = dec_emb_layer(decoder_inputs)

decoder_lstm = LSTM(latent_dim,return_sequences=True,return_state=True)
decoder_outputs, dec_state_h, dec_state_c = decoder_lstm(dec_emb,initial_state=[state_h, state_c])

# Dense layer applied to every decoder time step (tanh, latent_dim units)
dense_layer = TimeDistributed(Dense(latent_dim, activation="tanh"))
dense_outputs=dense_layer(decoder_outputs) 

# Output layer: per-time-step softmax over the y_voc target ids.
# Note that decoder_outputs is rebound here from the LSTM output to the softmax output.
output_layer = TimeDistributed(Dense(y_voc, activation="softmax"))
decoder_outputs=output_layer(dense_outputs)

# Training model: [review ids, decoder input ids] -> distribution over target ids per step
model = Model([encoder_inputs,decoder_inputs], decoder_outputs)
model.summary()

In [None]:
# The sparse variant of cross-entropy is used because the targets passed to
# model.fit are integer word ids rather than one-hot vectors.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

### Define Early Stopping and Save Model

In [None]:
# es: monitors val_loss (lower is better) with patience=2 and min_delta=0.0001
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2, min_delta=0.0001) 
# mc: writes 'best_model_9.hdf5', keeping only the best model by val_loss (save_best_only=True)
mc = ModelCheckpoint('best_model_9.hdf5', monitor='val_loss', verbose=1, save_best_only=True, mode='min')

In [None]:
# shape of the padded training reviews after the short-summary rows were removed
x_tr.shape

### Initiate Model Training

In [None]:
# Teacher forcing: the decoder input is each summary without its last position,
# and the target is the same summary shifted left by one position, with a
# trailing axis of size 1 added.
history = model.fit(
    [x_tr, y_tr[:, :-1]],
    y_tr[:, 1:, np.newaxis],
    epochs=50,
    callbacks=[es, mc],
    batch_size=32,
    validation_data=([x_val, y_val[:, :-1]], y_val[:, 1:, np.newaxis]),
)

<br>

### Load the Saved Model

In [None]:
# Restore the best checkpoint INTO the existing model rather than rebinding
# `model` to a freshly loaded copy: the inference encoder/decoder are built from
# the layer objects of this in-memory model, so a separately loaded model would
# never be used and inference would run with the last-epoch weights instead.
model.load_weights('best_model_9.hdf5')

In [None]:
from matplotlib import pyplot

# Training loss vs. validation loss per epoch
for metric, legend_label in (('loss', 'train'), ('val_loss', 'test')):
    pyplot.plot(history.history[metric], label=legend_label)

pyplot.legend()
pyplot.show()

## Model Inference 

In [None]:
# id -> word lookups (inverse of the tokenizers' word -> id maps), used to turn
# predicted / padded ids back into text
reverse_target_word_index=dict((v, k) for k, v in y_tokenizer.word_index.items())
reverse_source_word_index=dict((v, k) for k, v in x_tokenizer.word_index.items())

## Encoder-Decoder structure for model inference
# Both inference models are assembled from the tensors and layer objects created
# for the training model (encoder_inputs, dec_emb_layer, decoder_lstm,
# dense_layer, output_layer), so they use those layers' weights.

# Encode the input sequence to get the feature vector
encoder_model = Model(inputs=encoder_inputs,outputs=[encoder_outputs, state_h, state_c])

# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
# NOTE(review): this input is listed in decoder_model's inputs below but no layer
# consumes it; it only receives the encoder outputs passed in by decode_sequence.
decoder_hidden_state_input = Input(shape=(max_text_len,latent_dim))

# Get the embeddings of the decoder sequence
dec_emb= dec_emb_layer(decoder_inputs) 

# To predict the next word in the sequence, set the initial states to the states from the previous time step
dec_outputs, dec_h, dec_c = decoder_lstm(dec_emb, initial_state=[decoder_state_input_h, decoder_state_input_c])

#dense layer
dense_outputs=dense_layer(dec_outputs)

# A dense softmax layer to generate prob dist. over the target vocabulary
decoder_outputs2 = output_layer(dense_outputs) 

# Final decoder model:
#   inputs  = [decoder token ids, encoder outputs, previous h, previous c]
#   outputs = [token probabilities, new h, new c]
decoder_model = Model(
    [decoder_inputs] + [decoder_hidden_state_input,decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs2] + [dec_h, dec_c])

In [None]:
def decode_sequence(input_seq):
    """Greedily decode a summary for one encoded review.

    Runs ``encoder_model`` once, then feeds ``decoder_model`` one token at a
    time, starting from 'sostok' and always picking the most probable next id.
    Decoding stops at 'eostok', after ``max_summary_len - 1`` words, or when
    the predicted id has no word attached to it.

    Args:
        input_seq: Padded review ids, reshaped by the caller to
            (1, max_text_len).

    Returns:
        The predicted summary as a space-separated string (may be empty).
    """
    # Encode the input as state vectors.
    e_out, e_h, e_c = encoder_model.predict(input_seq)

    # Target sequence of length 1 holding the start word.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = y_tokenizer.word_index['sostok']

    decoded_words = []
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c])

        # Greedy choice: most probable id at the last time step.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])

        # Id 0 is the padding value and has no entry in the reverse index, so a
        # direct lookup would raise KeyError. Treat any id without a word like
        # the end marker and stop.
        sampled_token = reverse_target_word_index.get(sampled_token_index)
        if sampled_token is None or sampled_token == 'eostok':
            break

        decoded_words.append(sampled_token)

        # Exit condition: maximum summary length reached.
        if len(decoded_words) >= (max_summary_len - 1):
            break

        # Feed the chosen id back in and carry the decoder states forward.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        e_h, e_c = h, c

    return ' '.join(decoded_words)

### Convert Validation Integer Sequences back to Text

In [None]:
def seq2source(input_seq):
    """Turn a padded sequence of review ids back into text.

    Zeros (padding) are skipped; every other id is looked up in the
    module-level ``reverse_source_word_index``.

    Returns:
        The words joined by single spaces.
    """
    words = (reverse_source_word_index[token_id] for token_id in input_seq if token_id != 0)
    return ' '.join(words).strip()

In [None]:
# Text form of every padded validation review, as the model sees it
source = [seq2source(row) for row in x_val]

In [None]:
# Drop the first and last word of every reference summary (the 'sostok' / 'eostok' markers)
summary_val = [' '.join(words[1:-1]) for words in map(str.split, summary_val)]

In [None]:
# Positions of the validation reviews whose decoded text has at least 10 words
index = [pos for pos, review in enumerate(source) if len(review.split()) >= 10]

### Generate Some Summaries

In [None]:
# Show up to 20 random validation reviews of at least 10 words, with their
# actual and predicted summaries. `index` already holds the positions of those
# reviews, so sample from it directly: sampling positions from
# range(len(text_val) + 1) and then looking up index[i] raised IndexError
# whenever i was beyond the end of `index`.
for row in random.sample(index, min(20, len(index))):
    print("Review:",text_val[row],"\n")
    print("Actual summary:",summary_val[row])
    print("Predicted summary:",decode_sequence(x_val[row].reshape(1,max_text_len)))
    print("\n")