In [41]:
import os
import pandas as pd
import keras
os.environ['KERAS_BACKEND'] = 'tensorflow'
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import argparse
import pickle
import sys
import jsonlines
#import torch
from random import randint

from allennlp.commands.elmo import ElmoEmbedder

class preProcessing():


    '''
    token_type can be claim or sentence
    this function takens all the claims/sentences and perform tokenization
    '''

    def tokenize_data(self, data, token_type, dataset_name, max_tokens=None):

        tokens_list = [text_to_word_sequence(data[str(token_type)].iloc[i], lower=False) for i in range(len(data))]
       
        return tokens_list

    def create_elmo_embeddings(self, elmo, claims_tokens, sents_tokens, documents, dataset_name):

#         num_sentences = min(max_sentences, len(documents)) if max_sentences > 0 else len(documents)
#         print("\n\n:: Lookup of "+str(num_sentences)+" ELMo representations. This takes a while ::")
        claim_embeddings = []
        sentence_embeddings = []

        labels = []

        documentIdx = 0
        for elmo_embedding in elmo.embed_sentences(claims_tokens):  
            
            claim_document = documents["claim"].iloc[documentIdx]
            # Average the 3 layers returned from ELMo
            avg_elmo_embedding = np.average(elmo_embedding, axis=0)

            claim_embeddings.append(avg_elmo_embedding)        
            labels.append(documents['label'].iloc[documentIdx])

            documentIdx += 1
            
        documentIdx = 0
        batch_size = 16
        # embed_sentences(tokens, batch_size)
        for elmo_embedding in elmo.embed_sentences(sents_tokens, batch_size):  
            
            if dataset_name == "birth_place" or dataset_name == "institution":
                sent_document = documents["body"].iloc[documentIdx]
            else:
                
                sent_document = documents["sentence"].iloc[documentIdx]
#                 print ("sentence document ", sent_document)
#             print (sent_document, "\n")
#             print (documentIdx)

            try:
        
                # Average the 3 layers returned from ELMo
                avg_elmo_embedding = np.average(elmo_embedding, axis=0)
#                 print ("avg elmo embedding ", avg_elmo_embedding.shape)
            #because some sents have just punc ' (' due to which there is no embeddings
            except ZeroDivisionError:
                random_number = randint(4,15)
                avg_elmo_embedding = np.zeros((random_number, 1024)) 
                
            sentence_embeddings.append(avg_elmo_embedding)

            # Some progress info
            documentIdx += 1
            
        return claim_embeddings, sentence_embeddings, labels

    
    def load_datafiles(self, dataset_params):


        data = dict()
        for p in dataset_params:
            open_data = open(p['EXP_FOLDER'] + p['DATASET'], "rb")
            dataframe = pickle.load(open_data)
            data[str(p['DATASET'][0:-4])] = dataframe # keys are dataset names w/o extension

        return data

    
    def filter_proper_claims_sents(self, datasets):

        #because in fever rej some sents jus have punctuations
        return (datasets[datasets['sentence'].str.len() > 2])


    def to_padding(self, claims, sentences, labels, max_claims_length, max_sents_length):

#         print ("claims data ")
        claims_data = pad_sequences(claims, maxlen=max_claims_length)  #returns array of data
        print ("claims data ")
        sents_data = pad_sequences(sentences, maxlen=max_sents_length)
        print ("sents data ")
        labels = np.asarray(labels)

        return (claims_data, sents_data, labels)


if __name__ == '__main__':

#     dataset_name = 'test_fever_3'
    dataset_name = 'fever_full_binary_dev'
    # dataset_path = [{'EXP_FOLDER': './lstm/' , 'DATASET': 'train_fever_rej.pkl'}],
#     dataset_path = [{'EXP_FOLDER': './datasets/' , 'DATASET': 'train_'+str(dataset_name)+'.pkl'}]
    
    
    preprocess = preProcessing()
    
    
    if dataset_name == 'fever_full_binary_train' or dataset_name == 'fever_full_binary_dev':
        
        dataset_path = "/home/kkuma12s/thesis/Proof_Extraction/data/fever-full/"+dataset_name+".jsonl"
        with jsonlines.open(dataset_path, mode='r') as f:
            claims = []
            sents = []
            labels = []
            for example in f:
                claims.append(example["claim"])
                sents.append(example["sentence"])
                labels.append(example["label"])
            
            tmp_dict = {'claim':claims, 'sentence':sents, 'label': labels}
            dataframe = pd.DataFrame(data=tmp_dict)
                
    else:
        # print (len(dataset["train_fever_rej"]))
        # print (dataset["train_fever_rej"]["sentence"].iloc[0])
        dataset_path = [{'EXP_FOLDER': './datasets/' , 'DATASET': str(dataset_name)+'.pkl'}]
        dataset = preprocess.load_datafiles(dataset_path)
        dataframe = dataset[str(dataset_name)]

        if dataset_name == 'birth_place' or dataset_name == 'institution':

            dataframe["claim"]= dataframe["claim"].apply(lambda x: ','.join(map(str, x)))
            max_length_claims = max([len(i.split()) for i in dataframe["claim"].tolist()])
            max_length_sents = max([len(i.split()) for i in dataframe["body"].tolist()])
            print (max_length_claims)
            print (max_length_sents)

    claims_tokens = preprocess.tokenize_data(dataframe, "claim", dataset_name)
    sents_tokens = preprocess.tokenize_data(dataframe, "sentence", dataset_name)

        
    elmo = ElmoEmbedder(cuda_device=1)

    claim_embeddings, sent_embeddings, train_label = preprocess.create_elmo_embeddings(elmo, claims_tokens, sents_tokens, dataframe, dataset_name)
    pickle.dump(claim_embeddings, open("./embeddings/test_claim_elmo_emb_fever_full_binary_dev.pkl", "wb"))
    pickle.dump(sent_embeddings, open("./embeddings/test_sents_elmo_emb_fever_full_binary_dev.pkl", "wb"))


In [42]:
max_claims_length = 65
max_sents_length = 300
claims_data, sents_data, labels = preprocess.to_padding(claim_embeddings, sent_embeddings, train_label, max_claims_length, max_sents_length)


claims data 
sents data 


In [5]:
print (claims_data.shape)
print (sents_data.shape)
print (labels.shape)

(568, 15, 1024)
(568, 20, 1024)
(568,)


In [166]:
# print (claim_embeddings[0].shape)



In [167]:

claim_embeddings = pickle.load(open("claim_elmo_emb.pkl", "rb"))
sent_embeddings = pickle.load(open("sent_elmo_emb.pkl", "rb"))
def pad_x_matrix(x_embeddings, max_length):

    count = 0
    for i in range(len(x_embeddings)):
        embedding = x_embeddings[i]
#         print (embedding)
        embedding_vec = np.array(embedding, dtype=np.float32)
#         print (embedding_vec.shape)
        
        print (embedding_vec.shape)
        padding_length = max_length - embedding_vec.shape[0] 

        if padding_length > 0:
            x_embeddings[i] = np.append(embedding, np.zeros((padding_length, embedding_vec.shape[1])), axis=0)
            
        else:
            x_embeddings[i] = np.delete(embedding, np.s_[max_length:], axis = 0)            
    
#     print (x_embeddings.shape)
    embeddings_vector = np.array(x_embeddings, dtype=np.float32)

    return embeddings_vector

# train_claim_embedded = pad_x_matrix(claim_embeddings, 12)
train_sents_embedded = pad_x_matrix(sent_embeddings, 25)

# train_y = np.array(train_label)

# test_x = pad_x_matrix(test_x)
# test_y = np.array(test_y)



(18, 1024)
padding length  7
(21, 1024)
padding length  4
(7, 1024)
padding length  18
(33, 1024)
padding length  -8
(21, 1024)
padding length  4
(36, 1024)
padding length  -11
(17, 1024)
padding length  8
(15, 1024)
padding length  10
(19, 1024)
padding length  6
(19, 1024)
padding length  6
(14, 1024)
padding length  11
(62, 1024)
padding length  -37
(29, 1024)
padding length  -4
(23, 1024)
padding length  2
(39, 1024)
padding length  -14
(19, 1024)
padding length  6
(26, 1024)
padding length  -1
(15, 1024)
padding length  10
(35, 1024)
padding length  -10
(21, 1024)
padding length  4
(10, 1024)
padding length  15
(36, 1024)
padding length  -11
(15, 1024)
padding length  10
(37, 1024)
padding length  -12
(31, 1024)
padding length  -6
(13, 1024)
padding length  12
(31, 1024)
padding length  -6
(36, 1024)
padding length  -11
(24, 1024)
padding length  1
(32, 1024)
padding length  -7
(23, 1024)
padding length  2
(27, 1024)
padding length  -2
(40, 1024)
padding length  -15
(30, 1024)
pad

In [153]:
print (train_x.shape)

(5124, 8, 1024)


In [24]:
print (claim_embeddings[0].shape)
print (sent_embeddings[0].shape)

(8, 1024)
(18, 1024)


In [12]:
print (type(claim_embeddings))
# print (len(sent_embeddings[143]))
print (len(claim_embeddings[0]))

<class 'list'>
8


In [139]:
a = np.array([[1,2,3],[4,5,6], [7,8,9], [10, 11, 12], [13, 14,15]])
# np.delete(a, (2), axis=1)
np.delete(a, np.s_[2:], axis = 0)


array([[1, 2, 3],
       [4, 5, 6]])

In [1]:
l = [0] * 2000
for i in range()

50