## Read file & Preprocessing

In [None]:
import pickle  
import numpy as np 
import os 
from keras.callbacks import ModelCheckpoint
from keras import optimizers
from keras.metrics import categorical_accuracy
from keras.preprocessing.text import Tokenizer

import Model
import operator

import preTool

# --- Notebook-wide configuration -------------------------------------------
# Location and file name of the pre-trained embedding file; both are passed,
# together with wordDim, to preTool.getEmbeddingMatrix.
embeddingPath = "Embedding/"
gloveFile = 'glove.6B.300d.txt'  
# Embedding dimension (presumably matching the "300d" in gloveFile — confirm).
wordDim = 300 

# Raw corpus: directory and file read by preTool.loadFile.
textPath = "text/"
orgFile = "test.txt"

# Prefix for everything this notebook writes (processed text and pickles).
processedPath = "processed/"
# NOTE(review): maskedFile is not used as a file name anywhere in the visible
# code; the name is later rebound to token data.
maskedFile = "masked.txt"
# Placeholder written over every word that is not in the frequent-word dict
# (see generateMaskedSentence).
marked_token = "maskedtoken"

# Number of most frequent words kept when the vocabulary pickle is loaded.
frequentVocabLimit = 2500
# NOTE(review): trainingSize is not referenced elsewhere in the visible code.
trainingSize = 0
# Length every token sequence is padded/truncated to (pad_sequences maxlen).
maxLength = 30
# Placeholder; recomputed from the tokenizer as len(t.word_index) + 2.
vocab_size = 0

# Fraction of sentences held out for validation; the split cell overwrites
# this with 0.01 before using it.
val_pert = 0.0001

# Output file names under processedPath. The first three are written with
# pickle.dump in binary mode despite the .txt extension; valiFile is plain text
# written under processedPath + textPath.
wordDicFile = 'wordDic.txt'
tokenizerFile = 'tokenizer.txt'
embeddingFile =  'embedding_matrix.txt'
valiFile = "val_set.txt"

## Run Once and get the results

In [None]:
# 1. Split the data into 10 groups + a validation dataset.
# This cell carves the validation set off the END of the raw corpus,
# preprocesses it and writes it to processedPath + textPath + valiFile.
# It leaves val_len and orgLength defined for the grouping cell.
orgSentence = preTool.loadFile(textPath,orgFile)
val_pert = 0.01
val_len = int(len(orgSentence)*val_pert)

# Split on an explicit index rather than with -val_len: when val_len is 0
# (fewer than 100 sentences) `[-0:]` is the whole list and `[:-0]` is empty,
# which would put every sentence in the validation set and none in training.
split_at = len(orgSentence) - val_len
val_set = orgSentence[split_at:]

# Preprocess val_set and save it to the processed directory.
orgSentence = orgSentence[:split_at]
orgLength = len(orgSentence)

val_set  = preTool.preprocessing(val_set)

# Written as UTF-8 explicitly: the loader in this file reads with
# encoding="utf-8", and the corpus contains non-ASCII characters such as ’.
with open(processedPath + textPath + valiFile, 'w', encoding="utf-8") as f:
    f.writelines("%s" % item for item in val_set)
print("Finish writing")

# Release the large lists before the next cell reloads the corpus.
del val_set
del orgSentence

In [None]:
# Split the training part of the corpus into 10 groups, written as
# group_0.txt .. group_9.txt under processedPath + textPath.
# Each group holds int(orgLength*0.1) sentences, so any remainder after
# 10*cut_pert sentences is not written to a group.
cut_pert = int(orgLength*0.1)
split_group = 10

for i in range(split_group):
    # The corpus is reloaded on every pass and immediately cut down to the
    # current group.
    orgSentence = preTool.loadFile(textPath,orgFile)
    # Drop the validation tail. Slicing to orgLength is equivalent to
    # [:-val_len] for val_len > 0, but does not collapse to an empty list
    # when val_len is 0.
    orgSentence = orgSentence[:orgLength]
    orgSentence = orgSentence[i*cut_pert:(i+1)*cut_pert]
    print("Start at {}, end at {}".format(i*cut_pert,min((i+1)*cut_pert,orgLength)))
    
    orgSentence = preTool.preprocessing(orgSentence)
    # UTF-8 explicitly, matching the encoding the loader reads with.
    with open(processedPath + textPath + "group_"+str(i)+".txt", 'w', encoding="utf-8") as f:
        f.writelines("%s" % item for item in orgSentence)
    print("Finish writing")

In [None]:
# Split the second corpus file into 10 more groups, written as
# group_10.txt .. group_19.txt. No validation tail is removed from this file.
orgFile = "books_large_p2.txt"
for i in range(split_group):
    # Use the preTool helpers, as the other preprocessing cells do; the bare
    # loadFile/preprocessing names are only defined in the last cell.
    orgSentence = preTool.loadFile(textPath,orgFile)
    sentenceLen = len(orgSentence)
    # Size the groups from THIS file. Reusing the cut_pert computed for the
    # first file would truncate a longer file or yield empty groups for a
    # shorter one.
    cut_pert = int(sentenceLen*0.1)
    orgSentence = orgSentence[i*cut_pert:(i+1)*cut_pert]
    print("Start at {}, end at {}".format(i*cut_pert,min((i+1)*cut_pert,sentenceLen)))
    
    orgSentence = preTool.preprocessing(orgSentence)
    # UTF-8 explicitly, matching the encoding the loader reads with.
    with open(processedPath + textPath + "group_"+str( i + 10)+".txt", 'w', encoding="utf-8") as f:
        f.writelines("%s" % item for item in orgSentence)
    print("Finish writing")

In [None]:
# Compute the vocab frequency & tokenizer & embedding_matrix, and pickle each
# of them under processedPath.
wordDic = {}
# Accumulate word counts over every processed .txt file (the group files and
# the validation file all live in this directory).
for file in os.listdir(processedPath+textPath):
    if file.endswith(".txt"):
        lines = preTool.loadFile(processedPath+textPath , file)
        lines = preTool.sentenceToWordList(lines)
        wordDic = preTool.getVocabFrequencyForText(lines,wordDic)
# Register the mask placeholder so the tokenizer fitted below knows it.
wordDic[marked_token] = 1 

with open(processedPath +  wordDicFile, "wb") as internal_filename:
    pickle.dump(wordDic, internal_filename)


# Fit the tokenizer on the vocabulary itself (one "text" per distinct word).
t = Tokenizer()
t.fit_on_texts(wordDic.keys())
with open(processedPath +  tokenizerFile, "wb") as internal_filename:
    pickle.dump(t, internal_filename)

vocab_size = len(t.word_index) + 2

# Bind the result to `embedding_matrix`, the name that is pickled below and
# used by the modelling cells. The original assigned to `embeddings_matrix`
# and then dumped `embedding_matrix`, which is undefined on a fresh run.
embedding_matrix = preTool.getEmbeddingMatrix(embeddingPath,gloveFile,marked_token,wordDim,vocab_size)

with open(processedPath +  embeddingFile, "wb") as internal_filename:
    pickle.dump(embedding_matrix, internal_filename)

## Modeling

In [None]:
# Load the vocabulary, tokenizer, embedding matrix and validation dataset
# once, outside of the training for-loop.

import copy

# NOTE: pickle.load can execute arbitrary code; only load the files written
# by the preprocessing cell of this notebook.
with open(processedPath + wordDicFile , "rb") as input_file:
    wholeVocab = pickle.load(input_file)
# Sort ascending by count and keep the tail, i.e. the frequentVocabLimit most
# frequent words. `dic` is the membership table used for masking.
wholeVocab = sorted(wholeVocab.items(), key=operator.itemgetter(1))
wholeVocab = wholeVocab[-frequentVocabLimit:]
dic = {word: True for word, _ in wholeVocab}

with open(processedPath + tokenizerFile , "rb") as input_file:
    t = pickle.load(input_file)
    
with open(processedPath + embeddingFile , "rb") as input_file:
    embedding_matrix = pickle.load(input_file)

# Use dedicated names for the validation token data. The original reused and
# then deleted `orgFile`/`maskedFile`, which removed the configuration
# variables of the same name and broke re-running the split cells.
valTokens,valMasked = sentenceToTokenData(processedPath + textPath,valiFile,t,dic)
input_X_vali,input_masked_first_vali,input_masked_second_vali,input_masked_third_vali,input_Y_vali = generateTrainingDataSet(valTokens,valMasked)

del valTokens,valMasked

In [None]:
import glob

# Arguments forwarded to Model.buildModel: two entries of width 2*wordDim
# and a regularisation coefficient (how Model uses them is not visible here).
num_neurons = [wordDim*2,wordDim*2]
kernel_reg = 0.001
# Passed to model.fit in the training loop.
batch_size = 400
epochs = 1

# Incremented once per outer pass of the training loop.
count = 0
# Forwarded to Model.buildModel.
merge_mode = 'concat'
# Layer names handed to Model.extractHiddenState.
extractLayer = ['X_input_second','X_input_mask_2']
# Checkpoint file-name template; the {…} fields are the metric names filled in
# when a checkpoint is saved.
# NOTE(review): checkPointPath is not assigned anywhere in the visible file —
# confirm it is defined before this cell runs, otherwise this line raises NameError.
checkPoint = checkPointPath+ "weights-loss-{loss:.4f}-cata_acc-{categorical_accuracy:.4f}-val_loss-{val_loss:.4f}-cate_acc_val-{val_categorical_accuracy:.4f}.hdf5"
# When True, the training loop loads the newest *.hdf5 under checkPointPath
# once, before its first fit call, and then resets this flag.
restoreCheckpoint = False
# Same formula as in the vocabulary cell, recomputed from the loaded tokenizer.
vocab_size = len(t.word_index) + 2

In [None]:
# Build the network from the project Model module, compile it for categorical
# cross-entropy with the Adam optimiser, and snapshot the untrained weights.
model = Model.buildModel(
    input_X_vali,
    kernel_reg,
    num_neurons,
    merge_mode,
    vocab_size,
    maxLength,
    wordDim,
    embedding_matrix,
)

Adam = optimizers.Adam(lr=5e-4)
model.compile(
    optimizer=Adam,
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

# Weights as initialised, saved before any training step.
model.save_weights("random.hdf5")

In [None]:
# Train for 12 passes; each pass fits the model on every processed group file
# in turn (the validation file is skipped) and validates on the held-out set.
for i in range(12):
    for file in os.listdir(processedPath+textPath):
        if not file.endswith(".txt") or file == valiFile:
            continue
        print("\n*******************  Epoch:{}, Training on file:{}  ***********************************************\n".format(i+1,file))
        # Dedicated names: the original rebound the configuration variables
        # orgFile/maskedFile to token arrays here.
        trainTokens,trainMasked = sentenceToTokenData(processedPath + textPath,file,t,dic)
        input_X_train,input_masked_first_train,input_masked_second_train,input_masked_third_train,input_Y_train = generateTrainingDataSet(trainTokens,trainMasked)

        if restoreCheckpoint:
            # Resume once from the most recently created checkpoint, if any.
            list_of_files = glob.glob( checkPointPath + '*.hdf5')
            if list_of_files:
                latest_file = max(list_of_files, key=os.path.getctime)
                print("Load model:" + latest_file)
                model.load_weights(latest_file)
            else:
                print("No pre-trained model to load")
            restoreCheckpoint = False

        # Pass the file-name template `checkPoint`. The original passed
        # `checkpointPath`, a name that is never defined (NameError).
        checkpoint =  ModelCheckpoint(checkPoint, monitor='val_loss', verbose=0,
                              save_best_only=False, save_weights_only=False, mode='auto', period=1)

        model.fit([input_X_train,input_masked_first_train,input_masked_second_train,input_masked_third_train],input_Y_train,
                  validation_data=([input_X_vali,input_masked_first_vali,input_masked_second_vali,input_masked_third_vali],input_Y_vali),
                  shuffle=True,epochs=epochs,batch_size=batch_size,callbacks=[checkpoint])
    count += 1

In [None]:
# Run the project helper on the validation inputs for the layers named in
# extractLayer.
# NOTE(review): the structure of the returned value is defined in Model and is
# not visible here; the next cell indexes it with [0] and [1].
output = Model.extractHiddenState(extractLayer,model,input_X_vali)

In [None]:
# Join the first two extracted outputs along axis 1 into a single array.
output =  np.concatenate((output[0],output[1]),axis=1)

## Extract hidden layer 

In [None]:
# Same extraction and axis-1 concatenation as for the validation set, applied to input_X.
# NOTE(review): input_X is not assigned at notebook level anywhere in the
# visible file — confirm where it is expected to come from.
output =  Model.extractHiddenState(extractLayer,model,input_X)
results1  =  np.concatenate((output[0],output[1]),axis=1)

In [None]:
# Get the hidden layer for SentEval: rebuild the model on a dummy input,
# load saved weights and extract the layers named in extractLayer.
# NOTE(review): this cell looks out of sync with the rest of the notebook —
#   * buildModel / extractHiddenState are called without the `Model.` prefix
#     and buildModel gets 4 arguments, while the training cell passes 8;
#   * `checkpoint` is bound to a ModelCheckpoint callback by the training
#     loop, not to a weights file path;
#   * input_X is not assigned at notebook level in the visible file.
# Confirm against the Model module before running.
x =  np.zeros((10,maxLength,wordDim))
model = buildModel(x,kernel_reg,num_neurons,merge_mode)
model.load_weights(checkpoint)
output = extractHiddenState(layerName=extractLayer,model=model, predict_input=input_X)

In [None]:
# Warning: normally the apostrophe is ' but in the text it is the typographic ’
def decontracted(phrase):
    """Expand English contractions in *phrase* and return the new string.

    Handles contractions written with the typographic apostrophe (’),
    including the tokenised forms "wo n’t" / "ca n’t", plus a few
    apostrophe-less spellings ("wont", "cant", "wouldnt", "couldnt").
    ASCII-apostrophe contractions are left untouched.
    """
    # Irregular tokenised forms must run before the generic "n’t" rule.
    phrase = re.sub(r"wo n\’t", "will not", phrase)
    phrase = re.sub(r"ca n\’t", "can not", phrase)
    # Apostrophe-less spellings: anchor on word boundaries so the pattern
    # does not fire inside longer words (e.g. "significant", "wonton").
    phrase = re.sub(r"\bwont\b", "will not", phrase)
    phrase = re.sub(r"\bcant\b", "can not", phrase)
    phrase = re.sub(r"\bwouldnt\b", "would not", phrase)
    phrase = re.sub(r"\bcouldnt\b", "could not", phrase)
    # general
    phrase = re.sub(r"n\’t", " not", phrase)
    phrase = re.sub(r"\’re", " are", phrase)
    phrase = re.sub(r"\’s", " is", phrase)
    phrase = re.sub(r"\’d", " would", phrase)
    phrase = re.sub(r"\’ll", " will", phrase)
    phrase = re.sub(r"\’t", " not", phrase)
    phrase = re.sub(r"\’ve", " have", phrase)
    phrase = re.sub(r"\’m", " am", phrase)
    return phrase

def generateMaskedSentence(processedSentence,frequentDic):
    """Replace, in place, every word missing from *frequentDic* with marked_token.

    processedSentence is a list of word lists; the same (mutated) list is returned.
    """
    print("Generate Maksed Sentence")
    for words in processedSentence:
        for position, word in enumerate(words):
            if word in frequentDic:
                continue
            # Word is not among the frequent ones: overwrite it with the placeholder.
            words[position] = marked_token
    return processedSentence

def generateIndexForDataset(dataSize):
    """Build shuffled (anchor, candidates, labels) index triples.

    For every anchor index i in range(dataSize - 1) three candidate indices
    are chosen: the following sentence i + 1 (label 1) and two random
    distractors (label 0) that differ from each other, from i and from i + 1.
    The three candidates are shuffled.

    Returns a list of [i, [c0, c1, c2], [l0, l1, l2]] rows, where lk is the
    label of candidate ck. Returns [] when dataSize <= 1.

    Raises ValueError when dataSize is 2 or 3: four distinct indices are
    needed per row, so the sampling below could never terminate.
    """
    # Local import keeps the function self-contained; `random` is not imported
    # in the visible part of this file.
    import random

    print("Generate Index For Dataset")
    if 1 < dataSize < 4:
        raise ValueError(
            "dataSize must be at least 4 to draw two distinct distractors, got {}".format(dataSize)
        )

    data_index = []
    for i in range(dataSize-1):
        z = i+1
        # Rejection-sample the two distractors, uniformly over valid pairs.
        x = random.randrange(dataSize)
        while x == i or x == z:
            x = random.randrange(dataSize)
        y = random.randrange(dataSize)
        while y == i or y == z or y == x:
            y = random.randrange(dataSize)

        index = [[x,0],[y,0],[z,1]]
        random.shuffle(index)
        x_masked = [pair[0] for pair in index]
        y_output = [pair[1] for pair in index]
        data_index.append([i,x_masked,y_output])

    print("Finish Generating index")
    return data_index

def loadFile(textPath,orgFile):
    """Read textPath + orgFile as UTF-8 and return its lines, newline-only lines removed.

    Returned lines keep their trailing newline characters.
    """
    fullName = textPath + orgFile
    print("Load:" + fullName)
    with open(fullName, encoding="utf-8") as handle:
        # Only lines that are exactly "\n" are dropped; whitespace-only lines stay.
        return [row for row in handle if row != '\n']
    
def preprocessing(lines): 
    """Normalise every line of *lines* in place and return the same list.

    Each line is lower-cased, URL-like tokens are replaced by "link",
    a trailing ".jpg" token by "photo", and contractions are expanded
    with decontracted(). Splitting into words is done separately by
    sentenceToWordList().
    """
    print("Pre processing Data")
    for position, line in enumerate(lines):
        # Lower-case first so the patterns below only need one spelling.
        line = line.lower()
        # Replace URLs and file references with placeholder words.
        line = re.sub(r"http\S+", "link", line)
        line = re.sub(r"\S+html", "link", line)
        # The dot is escaped so that only a literal ".com" / ".jpg" suffix
        # matches; unescaped, "." matched any character and e.g. a trailing
        # "telecom" was rewritten to "link".
        line = re.sub(r"\S+\.com$", "link", line)
        line = re.sub(r"\S+\.jpg$", "photo", line)
        lines[position] = decontracted(line)
    return lines

def getVocabFrequencyForText(text, dic):
    """Add the word counts of *text* (an iterable of word sequences) to *dic*.

    *dic* is updated in place and also returned, so counts accumulate across calls.
    """
    for sentence in text:
        for word in sentence:
            dic[word] = dic.get(word, 0) + 1
    return dic

def sentenceToWordList(lines):
    """Tokenise each string of *lines* in place and return the same list.

    A token is either a run of word characters/apostrophes or a single one of
    the punctuation characters ( ) . , : ! ? ; ' $ & — everything else is dropped.
    """
    print("Convert sentence to word list")
    splitter = re.compile(r"[\w']+|[().,:!?;'$&]")
    for position, sentence in enumerate(lines):
        lines[position] = splitter.findall(sentence)
    return lines


def sentenceToTokenData(textPath,orgFile,t,dic):
    """Load a processed text file and return (original, masked) padded token arrays.

    The file textPath + orgFile is read and split into word lists. A deep
    copy has every word missing from *dic* replaced by the mask token. Both
    versions are converted with t.texts_to_sequences and padded/truncated to
    maxLength with padding='post'.

    *t* is expected to provide texts_to_sequences (presumably the pickled
    Keras Tokenizer — confirm against the caller).
    """
    processedLine = sentenceToWordList(loadFile(textPath,orgFile))
    # Deep copy: masking mutates the word lists and the originals must survive.
    maskedLine = generateMaskedSentence(copy.deepcopy(processedLine),dic)
    # Debug sample of the second sentence; skipped when the file has fewer
    # than two lines so the print cannot raise IndexError.
    if len(processedLine) > 1:
        print("--Test-- Process:{} ; masked:{}".format(processedLine[1],maskedLine[1]))

    processedLine = t.texts_to_sequences(processedLine)
    maskedLine = t.texts_to_sequences(maskedLine)

    processedLine = pad_sequences(processedLine, maxlen=maxLength, padding='post')
    maskedLine = pad_sequences(maskedLine, maxlen=maxLength, padding='post')
    return processedLine,maskedLine 

def generateTrainingDataSet(processedLine,maskedLine):
    """Assemble model inputs from the rows produced by generateIndexForDataset.

    Each index row is [anchor, [c0, c1, c2], labels]. The anchor selects a row
    of processedLine, the three candidates select rows of maskedLine, and the
    labels become the target.

    Returns (input_X, input_masked_first, input_masked_second,
    input_masked_third, input_Y), each converted with asarray.
    """
    dataIndex = generateIndexForDataset(len(processedLine))

    anchors = [processedLine[row[0]] for row in dataIndex]
    firstCandidates = [maskedLine[row[1][0]] for row in dataIndex]
    secondCandidates = [maskedLine[row[1][1]] for row in dataIndex]
    thirdCandidates = [maskedLine[row[1][2]] for row in dataIndex]
    targets = [row[2] for row in dataIndex]

    input_X = asarray(anchors)
    input_masked_first = asarray(firstCandidates)
    input_masked_second = asarray(secondCandidates)
    input_masked_third = asarray(thirdCandidates)
    input_Y = asarray(targets)
    return input_X,input_masked_first,input_masked_second,input_masked_third,input_Y