In [1]:
import numpy as np
from tqdm import tqdm
import email
import pandas as pd
from collections import Counter
import warnings
warnings.filterwarnings("ignore")
import re
import nltk
import pickle
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import concatenate
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split

# Loading all the pickled data 

In [2]:
X_train_Encoder_input=pickle.load(open("../input/final-trained-encoder-decoder-models/X_train_Encoder_input.pkl","rb"))
X_test_Encoder_input=pickle.load(open("../input/final-trained-encoder-decoder-models/X_test_Encoder_input.pkl","rb"))

In [3]:
X_train_Decoder_input=pickle.load(open("../input/final-trained-encoder-decoder-models/X_train_Decoder_input.pkl","rb"))
X_test_Decoder_input=pickle.load(open("../input/final-trained-encoder-decoder-models/X_test_Decoder_input.pkl","rb"))

In [4]:
X_train_Decoder_output=pickle.load(open("../input/final-trained-encoder-decoder-models/X_train_Decoder_output.pkl","rb"))
X_test_Decoder_output=pickle.load(open("../input/final-trained-encoder-decoder-models/X_test_Decoder_output.pkl","rb"))

In [5]:
print("train data Encoder input length {}".format(X_train_Encoder_input.shape[1]))
print("train data Deccoder input length {}".format(X_train_Decoder_input.shape[1]))
print("train data Deccoder output length {}".format(X_train_Decoder_output.shape[1]))

train data Encoder input length 21
train data Deccoder input length 20
train data Deccoder output length 20


In [6]:
print("test data Encoder input length {}".format(X_test_Encoder_input.shape[1]))
print("test data Deccoder input length {}".format(X_test_Decoder_input.shape[1]))
print("test data Deccoder output length {}".format(X_test_Decoder_output.shape[1]))

test data Encoder input length 21
test data Deccoder input length 20
test data Deccoder output length 20


In [7]:
Enc_Tokenizer=pickle.load(open("../input/final-trained-encoder-decoder-models/Enc_Tokenizer.pkl","rb"))
Dec_Tokenizer=pickle.load(open("../input/final-trained-encoder-decoder-models/Dec_Tokenizer.pkl","rb"))
Dec_Tokenizer_target=pickle.load(open("../input/final-trained-encoder-decoder-models/Dec_Tokenizer_target.pkl","rb"))

In [8]:
# Creating the dictiionary with output vocab words
target_token_index=Dec_Tokenizer_target.word_index
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())

In [9]:
input_vocab=len(Enc_Tokenizer.word_index)+1
output_vocab=len(Dec_Tokenizer_target.word_index)+1
print("input vocab length {}".format(input_vocab))
print("output vocab length {}".format(output_vocab))

input vocab length 19419
output vocab length 20955


# Defined the function to calculate the perplexity the dataset

In [10]:
def perplexity(y_true, y_pred):
    """
    this function will calculate perplexity metric by calculates the cross entropy loss and takes its exponent for train and test dataset
    """
    return keras.backend.exp(keras.backend.mean(keras.backend.sparse_categorical_crossentropy(y_true, y_pred)))

In [11]:
## defining some vairable like embedding size , number of gru or lstm units batch size and epochs we are going to run to train the model
embedding_dim=100    
latent_dim= 100      

# Bidirectional GRU based Encoder Decoder 

In [12]:
# Bidirectional GRU Encoder layer
encoder_in_layer = tf.keras.layers.Input(shape=(X_train_Encoder_input.shape[1],))

encoder_embedding = tf.keras.layers.Embedding(input_dim=input_vocab, output_dim=embedding_dim)

encoder_bi_gru = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(units=latent_dim, return_sequences=True, return_state=True))

# Discard the encoder output and use hidden states (h) of forward and backward layer
encoder_out, fstate_h,bstate_h= encoder_bi_gru(encoder_embedding(encoder_in_layer))

state_h = tf.keras.layers.Concatenate()([fstate_h, bstate_h])


# forward only GRU Decoder layer
decoder_in_layer = tf.keras.layers.Input(shape=(None,))

decoder_embedding = tf.keras.layers.Embedding(input_dim=output_vocab, output_dim=embedding_dim)

decoder_gru = tf.keras.layers.GRU(units=latent_dim*2, return_sequences=True, return_state=True)

# Discard internal states in training, keep only the output sequence
decoder_gru_out, _ = decoder_gru(decoder_embedding(decoder_in_layer), initial_state=state_h)

decoder_dense = tf.keras.layers.Dense(output_vocab, activation="softmax")

decoder_out_layer = decoder_dense(decoder_gru_out)

# Define the model that uses the Encoder and the Decoder
model2 = tf.keras.models.Model([encoder_in_layer, decoder_in_layer], decoder_out_layer)

model2.compile(optimizer='adam', loss="sparse_categorical_crossentropy", metrics=[perplexity])

model2.summary()

2022-06-28 14:09:19.269292: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 21)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 21, 100)      1941900     input_1[0][0]                    
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
bidirectional (Bidirectional)   [(None, 21, 200), (N 121200      embedding[0][0]                  
______________________________________________________________________________________________

# loading the pretained weighted which we have got during our training from best model

In [13]:
model2.load_weights('../input/final-trained-encoder-decoder-models/Bi_gru_best_model_weights.h5')

# loaded layer weights

In [14]:
model2.get_weights()[0]

array([[ 0.34457716,  0.27720726, -0.13754787, ...,  0.24860892,
        -0.10481738, -0.07865687],
       [ 0.3367375 , -0.02528957,  0.06972946, ...,  0.5417424 ,
        -0.00737609, -1.0066501 ],
       [ 0.18133155,  0.18576327, -0.01458529, ..., -0.12290533,
        -0.2844378 ,  0.12487155],
       ...,
       [-0.14527412, -0.08467149,  0.11735411, ..., -0.08907049,
        -0.11917455,  0.04306293],
       [-0.12630507, -0.00715517,  0.12349854, ...,  0.01107639,
         0.04716697,  0.09069911],
       [-0.12019105, -0.10082497,  0.17458011, ..., -0.073657  ,
         0.14752027,  0.03755322]], dtype=float32)

# Creating the inference for final prediction

In [15]:
 # Inference Encoder
encoder_model2 = tf.keras.models.Model(encoder_in_layer, state_h)

 # Inference Decoder
state_input_h = tf.keras.layers.Input(shape=(latent_dim*2,))
decoder_out, decoder_h = decoder_gru(decoder_embedding(decoder_in_layer), initial_state=state_input_h)
decoder_out = decoder_dense(decoder_out)
decoder_model2 = tf.keras.models.Model(inputs=[decoder_in_layer, state_input_h], 
                  outputs=[decoder_out, decoder_h])

# Defining all the preprocessing function that i needed before final prediction

In [16]:
def email_content(data):
    '''this function will get the email body text messsages'''
    mail_content = []
    for i in tqdm(data):
        message = email.message_from_string(i)
        mail_content.append(message.get_payload())
    return mail_content

CONTRACTION_MAP=pickle.load(open('../input/final-trained-encoder-decoder-models/CONTRACTION_MAP.pkl','rb'))# loading the contraction map 

def decontracted(text):
    '''this function will Replace all apostrophe/short words from text data'''
    for word in text.split():
        if word.lower() in CONTRACTION_MAP:
            text = text.replace(word, CONTRACTION_MAP[word.lower()])
    return text

def data_preprocess(text):
    '''This function will will preprocess the data by removing the puchuation digit email address and all non alphabet wrods from text.'''
    
    text=text.lower()
    text = re.sub(r'\.+', ".", text) #Replace multiple fullstops with single fullstop

    text = re.sub(r"[^a-zA-Z?.!,']+", " ", text)# replacing everything with space except (a-z, ".", "?", "!", ",", "'")

    # Compact spaces
    text = re.sub(r'[" "]+', " ", text)

    # Remove forwarded messages
    text = text.split('forwarded by')[0]

    final_text = text.strip()


    return final_text

# defined the functon which give predicted sequences 

In [17]:
def decode_sequence(input_seq):
    '''this function will predict the next words sequences given the input sequences'''
    
    if input_seq.split(' ')[0] != '<start>' and input_seq.split(' ')[-1] != '<end>':
        input_seq = '<start>'+ ' ' + input_seq + ' ' + '<end>'
    
    #print("input seq",input_seq)
    input_seq = Enc_Tokenizer.texts_to_sequences([str(input_seq)])
    input_seq = pad_sequences(input_seq, padding="post",maxlen= 21)
    
     # Encode the input as state vectors.
    state = encoder_model2.predict(input_seq)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = Dec_Tokenizer.word_index['<start>']
    
    # Encode the input as state vectors.
    curr_word = "<start>"
    decoded_sentence = ''

    i = 0
    while curr_word != "<end>" and i < (20 - 1):
        output_tokens, h = decoder_model2.predict([target_seq, state])

        curr_token = np.argmax(output_tokens[0, 0])

        if (curr_token == 0):
            break;

        curr_word = reverse_target_char_index[curr_token]

        decoded_sentence += ' ' + curr_word
        target_seq[0, 0] = curr_token
        state = h
        i += 1
    if curr_word != "<end>":
        decoded_sentence += ' ' + '<end>'
    return decoded_sentence

# Defining the function_1 which will take input here input can be single or set of datapoint and return the prediction of output

In [18]:
def function_1(test):
    """this function will take dataset as input(single of set of datapoint) and perfrom data preprocessing and return the predictions as output"""
    df_data=test.copy(deep=True) 
    
    mail_content=email_content(df_data.message.values)
    
    df_data["mail_content"]=mail_content
    
    df_data['mail_content_len'] = df_data['mail_content'].apply(len)
    
    
    if len(test)>1:
        emails_df_final = df_data[(df_data["mail_content_len"] > 0) & (df_data["mail_content_len"]< 5909)]
        emails_df_final = df_data[(df_data["mail_content_len"] > 0) & (df_data["mail_content_len"]> 58)]
    else:
        emails_df_final=df_data
    
    emails_df_final['clean_mail_content'] = emails_df_final.apply(lambda x: data_preprocess(x["mail_content"]),axis=1)
    
    
    emails_df_final['clean_mail_content'] = emails_df_final.apply(lambda x: decontracted(x["clean_mail_content"]),axis=1)
    
    #Droping the duplicates
    emails_df_final.drop_duplicates(subset = "clean_mail_content",inplace=True)
    
    if len(test)>1:    
        emails_df_final.drop(["mail_content","mail_content_len"],axis=1,inplace=True)

    
    emails_df_final['clean_mail_content_len'] = emails_df_final['clean_mail_content'].apply(lambda x:len(nltk.word_tokenize(x)))
    
    
    if len(test)>1: 
        emails_df_final=emails_df_final[emails_df_final["clean_mail_content_len"]<=20]
    
    output=[]
    for i in tqdm(emails_df_final["clean_mail_content"].values):
        token_list=i.split()
        for j in range(len(token_list)-1):
            data = []
            x_ngram =' '.join(token_list[:j+1]) 
            y_ngram =' '.join(token_list[j+1:])
            data.append(x_ngram)
            data.append(y_ngram)
            output.append(data)
    Final_df = pd.DataFrame(output, columns=['input','output'])
    
    
    # Add start and end tokens to target sequences in oder to use teacher forcing traning method
    Final_df["Encoder_input"] = Final_df.input.apply(lambda x : '<start> '+ x + ' <end>')
    
    sample_data=Final_df
    
    if len(test)==1:
        output=[]
        for i in range(len(Final_df)):
            data = []
            data.append(sample_data["Encoder_input"].iloc[i])
            data.append(decode_sequence(sample_data["Encoder_input"].iloc[i]))
            output.append(data)
        Predicted_df = pd.DataFrame(output, columns=['input','predicted'])
    else:
        output=[]
        for i in range(2000):
            data = []
            data.append(sample_data["Encoder_input"].iloc[i])
            data.append(decode_sequence(sample_data["Encoder_input"].iloc[i]))
            output.append(data)
        Predicted_df = pd.DataFrame(output, columns=['input','predicted'])
    
    return Predicted_df
 

# laoding the dataset

In [19]:
emails_df  = pd.read_csv('../input/enron-email-dataset/emails.csv')

In [20]:
emails_df.drop(['file'],axis=1,inplace=True)

In [21]:
emails_df.head()

Unnamed: 0,message
0,Message-ID: <18782981.1075855378110.JavaMail.e...
1,Message-ID: <15464986.1075855378456.JavaMail.e...
2,Message-ID: <24216240.1075855687451.JavaMail.e...
3,Message-ID: <13505866.1075863688222.JavaMail.e...
4,Message-ID: <30922949.1075863688243.JavaMail.e...


# passing the single data point to function_1

In [25]:
Predicted_df_single_data_pts=function_1(emails_df.iloc[[0],:])#this will give single row from dataset

100%|██████████| 1/1 [00:00<00:00, 1127.80it/s]
100%|██████████| 1/1 [00:00<00:00, 3771.86it/s]
2022-06-28 14:09:47.151975: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


# predicted output from our model

In [26]:
pd.set_option('max_colwidth', -1)
Predicted_df_single_data_pts

Unnamed: 0,input,predicted
0,<start> let <end>,me know if you are going to decide quickly. <end>
1,<start> let me <end>,"know if you have any questions. also, very important that this goes out today. kay <end>"
2,<start> let me know <end>,"if you have any questions. also, very important that this goes out today. <end>"
3,<start> let me know if <end>,you are read on this. <end>


# As in our orignal dataset we have almost 5 lakh row so here we are randomly sampling 2000 data points and pasing it functon_1

In [22]:
emails_df=emails_df.sample(2000)

In [27]:
Predicted_df=function_1(emails_df)

100%|██████████| 2000/2000 [00:00<00:00, 5439.32it/s]
100%|██████████| 208/208 [00:00<00:00, 15427.60it/s]


# Predction for 2000 datapoint from out model

In [28]:
pd.set_option('max_colwidth', -1)
Predicted_df.sample(10)

Unnamed: 0,input,predicted
1887,<start> don i have offers for carrie and <end>,"provide the interest for the new company? thanx, chris <end>"
1032,"<start> fyi also, mike and i are working with tim to <end>",take a look at this. <end>
617,<start> while i'm still making <end>,the letter of the month of recruiting for the gas trading agreement. <end>
859,<start> richard we actually need to duplicate his ena workstation as <end>,will not be able to read enron <end>
254,<start> ken <end>,lay's office n. sent from my blackberry wireless handheld www.blackberry.net <end>
1547,<start> did you have a chance to talk <end>,to lisa? <end>
1731,<start> update from cera on pricing and rest <end>,of the season. <end>
314,<start> attached is the weekly report for <end>,the week ending . morgan gottsponer <end>
729,"<start> guys, please take a read of the draft <end>",amendment for the review. thanks. mike green leslie <end>
1348,"<start> louise, attached <end>","is the spreadsheet that you requested. thanks, chris <end>"


# Defining the function which will take dataset with X and Y values and return the perplexity on same dataset

In [29]:
def function_2(test):
    """this function will take dataset (with x and y) as input and perfrom data preprocessing and return the perplexity of model"""
    df_data=test.copy(deep=True) 
    
    mail_content=email_content(df_data.message.values)
    
    df_data["mail_content"]=mail_content
    
    df_data['mail_content_len'] = df_data['mail_content'].apply(len)
    
    emails_df_final = df_data[(df_data["mail_content_len"] > 0) & (df_data["mail_content_len"]< 5909)]
    emails_df_final = df_data[(df_data["mail_content_len"] > 0) & (df_data["mail_content_len"]> 58)]
    
    emails_df_final['clean_mail_content'] = emails_df_final.apply(lambda x: data_preprocess(x["mail_content"]),axis=1)
    
    emails_df_final['clean_mail_content'] = emails_df_final.apply(lambda x: decontracted(x["clean_mail_content"]),axis=1)
    
    #Droping the duplicates
    emails_df_final.drop_duplicates(subset = "clean_mail_content",inplace=True)
    
        
    emails_df_final.drop(["mail_content","mail_content_len"],axis=1,inplace=True)
    
    emails_df_final['clean_mail_content_len'] = emails_df_final['clean_mail_content'].apply(lambda x:len(nltk.word_tokenize(x)))
    
    emails_df_final=emails_df_final[emails_df_final["clean_mail_content_len"]<=20]
    
    output=[]
    for i in tqdm(emails_df_final["clean_mail_content"].values):
        token_list=i.split()
        for j in range(len(token_list)-1):
            data = []
            x_ngram =' '.join(token_list[:j+1]) 
            y_ngram =' '.join(token_list[j+1:])
            data.append(x_ngram)
            data.append(y_ngram)
            output.append(data)
    Final_df = pd.DataFrame(output, columns=['input','output'])
    
    # Add start and end tokens to target sequences in oder to use teacher forcing traning method
    Final_df["Encoder_input"] = Final_df.input.apply(lambda x : '<start> '+ x + ' <end>')
    Final_df["Decoder_input"] = Final_df.output.apply(lambda x : '<start> '+ x)
    Final_df["Decoder_output"] = Final_df.output.apply(lambda x : x + ' <end>')
    
    Enc_Tokenizer=pickle.load(open("../input/final-trained-encoder-decoder-models/Enc_Tokenizer.pkl","rb"))
    Dec_Tokenizer=pickle.load(open("../input/final-trained-encoder-decoder-models/Dec_Tokenizer.pkl","rb"))
    Dec_Tokenizer_target=pickle.load(open("../input/final-trained-encoder-decoder-models/Dec_Tokenizer_target.pkl","rb"))
    
    X_test_Encoder_input = Enc_Tokenizer.texts_to_sequences( Final_df["Encoder_input"])
    X_test_Encoder_input = pad_sequences(X_test_Encoder_input, padding='post',maxlen= 21)
    
    X_test_Decoder_input = Dec_Tokenizer.texts_to_sequences( Final_df["Decoder_input"])
    X_test_Decoder_input = pad_sequences(X_test_Decoder_input, padding='post',maxlen= 20)
    
    X_test_Decoder_output = Dec_Tokenizer_target.texts_to_sequences( Final_df["Decoder_output"])
    X_test_Decoder_output = pad_sequences(X_test_Decoder_output, padding='post',maxlen= 20)
    
    perplexity_scores = model2.evaluate([X_test_Encoder_input, X_test_Decoder_input], X_test_Decoder_output)
    
    return perplexity_scores

# Again loading the dataset

In [30]:
emails_df  = pd.read_csv('../input/enron-email-dataset/emails.csv')

In [31]:
emails_df.drop(['file'],axis=1,inplace=True)

# passing the whole dataset to function_2

In [32]:
perplexity_scores=function_2(emails_df)

100%|██████████| 517401/517401 [01:38<00:00, 5241.63it/s]
100%|██████████| 19069/19069 [00:01<00:00, 14418.73it/s]




# we have got the perplexity score on whole dataset as 1.4156 and loss as 0.3389 which is very good