# In [None]:
import os
import string

import numpy as np
import cv2
import sklearn
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,LabelBinarizer
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential,Model,load_model
from keras.layers import Input, Dense, Activation, Dropout, LSTM, Flatten, Embedding, merge,TimeDistributed,concatenate,Bidirectional,Reshape
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences





############################################################################################################################

# Each line of the text file contains three pieces of information: the image ID, the question and the answer, separated by "|".
# For example, in "synpic41148|what kind of image is this?|cta - ct angiography", "synpic41148" is the image ID, "what kind of image is this?" is the question and "cta - ct angiography" is the answer.
# The separate function stores these three pieces of information separately.



def separate(Text_file_name_location):
    """Parse a "|"-separated question file into questions, image names and answers.

    Every non-blank line is expected to have the form
    ``image_id|question|answer``. Blank lines (for example the empty string
    left by a trailing newline) are skipped.

    Processing applied to each line:
      * question and answer: runs of whitespace are collapsed to a single
        space, the text is lower-cased, then every character in
        ``string.punctuation`` is removed. Because punctuation is removed
        after the whitespace is collapsed, "cta - ct" becomes "cta  ct"
        (two spaces).
      * image id: lower-cased and suffixed with ".jpg".

    Args:
        Text_file_name_location: path of the text file to read.

    Returns:
        A dict with the keys "Question", "Image_id" and "Answer". Each value
        is a list with one entry per non-blank line, in file order.

    Raises:
        IndexError: if a non-blank line has fewer than three "|" fields.
    """
    # Read the file once; the context manager guarantees the handle is closed.
    with open(Text_file_name_location, "r") as text_file:
        lines = text_file.read().split("\n")

    # Build the translation table once instead of once per line.
    strip_punctuation = str.maketrans("", "", string.punctuation)

    questions = []
    image_files = []
    answers = []

    for line in lines:
        if not line.strip():
            continue  # nothing to parse on a blank line

        fields = line.split("|")
        image_id, raw_question, raw_answer = fields[0], fields[1], fields[2]

        question = " ".join(raw_question.split()).lower()
        answer = " ".join(raw_answer.split()).lower()

        questions.append(question.translate(strip_punctuation))
        answers.append(answer.translate(strip_punctuation))
        # The ".jpg" extension lets the id be used directly as a file name later.
        image_files.append(image_id.lower() + ".jpg")

    dictionary = dict()
    dictionary["Question"] = questions
    dictionary["Image_id"] = image_files
    dictionary["Answer"] = answers

    return dictionary




###################################################################################################################


# The image data has three parts, namely Training, Validation and Testing.
# The Training folder has 3200 images, the Validation folder has 500 images and the Testing folder has 125 images.

# The image_preprocess function resizes all the images to (224,224,3).


# The order of the images in the image folder differs from their order in the question text file.
# The images must be loaded in the order given by the question text file. That is why we need to provide image_id_list.

# Image pre-processing resizes each image to the shape (224,224,3).

def image_preprocess(folder_location,image_id_list):
    """Load the listed images from a folder and resize each one to 224x224.

    Args:
        folder_location: directory that holds the image files.
        image_id_list: image file names, in the order the output should follow.

    Returns:
        A numpy array built from the list of resized images, in the same
        order as ``image_id_list``. An empty list yields an empty array.

    NOTE(review): ``cv2.imread`` is not checked for failure here; an image
    that cannot be read is passed straight to ``cv2.resize``.
    """
    resized_images = []

    for file_name in image_id_list:
        image_path = os.path.join(folder_location, file_name)
        image = cv2.imread(image_path)
        resized_images.append(cv2.resize(image, (224, 224)))

    return np.array(resized_images)


######################################################################################################



## Here we convert all answers into both integer labels and one-hot encodings.

# The encoders take two arguments:

# reference_label-----> the labels on which the encoder is fitted
# target_label--------> the labels that we want to transform



class LabelPreprocess:
    """Encode answer labels as integer ids or as one-hot vectors.

    Both encoders are fitted on ``reference_label`` and then applied to
    ``target_label``. They are static methods, so they can be called either
    on the class (``LabelPreprocess.integer_encoding(t, r)``) or on an
    instance; neither reads the attributes stored by ``__init__``.
    """

    def __init__(self,target_label,reference_label):
        # Stored for callers' convenience only; the encoders below take the
        # labels as explicit arguments.
        self.target_label=target_label
        self.reference_label=reference_label

    @staticmethod
    def integer_encoding(target_label,reference_label):
        """Return the integer class id of every entry of ``target_label``.

        Args:
            target_label: labels to transform.
            reference_label: labels the ``LabelEncoder`` is fitted on.

        Returns:
            The result of ``LabelEncoder.transform(target_label)``.
        """
        label_encoder = LabelEncoder()
        label_encoder.fit(np.array(reference_label))
        # An extra class "zero" is appended after fitting; the original
        # comment says it stands for unknown labels. Nothing in this method
        # maps unseen labels onto it.
        # NOTE(review): appending may leave classes_ unsorted; confirm that
        # transform() does not rely on sorted classes for these inputs.
        label_encoder.classes_ = np.append(label_encoder.classes_, 'zero')

        return label_encoder.transform(target_label)

    @staticmethod
    def onehot_encoding(target_label,reference_label):
        """Return the one-hot encoding of every entry of ``target_label``.

        The labels are first converted to integer ids exactly as in
        ``integer_encoding``; the ``OneHotEncoder`` is then fitted on the
        ids of ``reference_label`` and applied to the ids of
        ``target_label``.

        Args:
            target_label: labels to transform.
            reference_label: labels both encoders are fitted on.

        Returns:
            The dense output of ``OneHotEncoder.transform`` with one row per
            target label.
        """
        label_encoder = LabelEncoder()
        label_encoder.fit(np.array(reference_label))
        label_encoder.classes_ = np.append(label_encoder.classes_, 'zero')

        # OneHotEncoder expects a 2-D column of ids.
        reference_ids = label_encoder.transform(np.array(reference_label)).reshape(-1, 1)
        target_ids = label_encoder.transform(np.array(target_label)).reshape(-1, 1)

        # scikit-learn renamed ``sparse`` to ``sparse_output`` (1.2) and later
        # removed the old name; try the new keyword first and fall back for
        # older releases that do not know it.
        try:
            onehot_encoder = OneHotEncoder(sparse_output=False)
        except TypeError:
            onehot_encoder = OneHotEncoder(sparse=False)

        onehot_encoder.fit(reference_ids)

        return onehot_encoder.transform(target_ids)
    
    
#############################################################################################################################

## question (sentence) pre-process for LSTM MODEL

# reference_question-----> the questions on which the tokenizer is fitted
# target_question--------> the questions that we want to transform

class QuestionPreprocess:
    """Tokenize, index and pad questions with the Keras text utilities.

    All helpers are static methods: they keep no state and can be called on
    the class or on an instance.
    """

    @staticmethod
    def tokenize_question(question):
        """Fit a tokenizer on ``question`` and return its word index.

        Args:
            question: the texts to fit the tokenizer on.

        Returns:
            The ``word_index`` attribute of the fitted ``Tokenizer``. The
            tokenizer is created with ``oov_token="unk"``.
        """
        token=Tokenizer(oov_token="unk")
        token.fit_on_texts(question)

        return token.word_index

    @staticmethod
    def sequence_question(target_question,reference_question):
        """Convert ``target_question`` into integer token sequences.

        Args:
            target_question: the questions to convert.
            reference_question: the questions the tokenizer is fitted on
                (the original comment says this is the training data).

        Returns:
            The result of ``Tokenizer.texts_to_sequences(target_question)``.
        """
        token=Tokenizer(oov_token="unk")
        token.fit_on_texts(reference_question)
        return token.texts_to_sequences(target_question)

    @staticmethod
    def padding_question(sequence_of_the_question,maxlength,padding_criterion):
        """Pad the sequences so that every question has the same length.

        Args:
            sequence_of_the_question: integer sequences to pad.
            maxlength: forwarded to ``pad_sequences`` as ``maxlen``.
            padding_criterion: forwarded to ``pad_sequences`` as ``padding``.

        Returns:
            The output of ``pad_sequences``.
        """
        return pad_sequences(sequence_of_the_question,maxlen=maxlength,padding=padding_criterion)






##################################################################################################################



## question (sentence) pre-process for BERT MODEL

# '/home/local/AD/asarkar2/BERT'

# bert_path-----> location for bert folder where all files for pretrained bert, namely 'bert_config.json', 'model.ckpt-150000', 'vocab.txt' are stored

def bert_question_preprocess(question_seq_length,bert_path,question,*,train_size=3200,valid_size=500):
    """Tokenize all questions for a BERT model and split them into three sets.

    ``question`` is expected to hold the training, validation and test
    questions concatenated in that order.

    NOTE(review): ``load_trained_model_from_checkpoint`` and ``k_Tokenizer``
    are not defined in the visible part of this file; presumably they come
    from keras-bert - confirm they are imported before this is called.

    Args:
        question_seq_length: sequence length passed to the model loader and
            used as ``max_len`` when encoding each question.
        bert_path: folder containing 'bert_config.json', 'model.ckpt-150000'
            and 'vocab.txt'.
        question: all questions (train + valid + test), in that order.
        train_size: number of leading questions that form the training set.
        valid_size: number of questions after the training set that form the
            validation set; everything after that is the test set.

    Returns:
        A tuple ``(train_x, valid_x, test_x)``. Each element is a two-item
        list ``[token_ids, zeros]`` where ``zeros`` is an all-zero array of
        the same shape as ``token_ids``.
    """
    SEQ_LEN = question_seq_length

    config_path = os.path.join(bert_path, 'bert_config.json')
    checkpoint_path = os.path.join(bert_path, 'model.ckpt-150000')
    vocab_path = os.path.join(bert_path, 'vocab.txt')

    # NOTE(review): the loaded model is never used or returned by this
    # function. The call is kept so existing side effects are unchanged;
    # consider removing it if it is not needed.
    model_bert = load_trained_model_from_checkpoint(
          config_path,
          checkpoint_path,
          training=True,
          trainable=True,
          seq_len=SEQ_LEN,
      )

    # Map every vocabulary token to its line number in vocab.txt.
    token_dict = {}
    with open(vocab_path, 'r', encoding='utf8') as reader:
        for line in reader:
            token_dict[line.strip()] = len(token_dict)

    tokenizer = k_Tokenizer(token_dict)

    # Only the token ids are kept; the segment ids returned by encode() are
    # discarded and replaced by all-zero arrays below.
    token_ids = []
    for ques in question:
        ids, _segments = tokenizer.encode(ques, max_len=SEQ_LEN)
        token_ids.append(ids)

    # Separate train, valid and test questions.
    valid_end = train_size + valid_size
    token_train = np.array(token_ids[:train_size])
    token_valid = np.array(token_ids[train_size:valid_end])
    token_test = np.array(token_ids[valid_end:])

    train_x = [token_train, np.zeros_like(token_train)]
    valid_x = [token_valid, np.zeros_like(token_valid)]
    test_x = [token_test, np.zeros_like(token_test)]

    return train_x,valid_x,test_x






