# Text Auto-encoder

In [0]:
from google.colab import drive
import os
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layers

import json 


# Mount Google Drive inside the Colab runtime, then work from the project
# folder so the relative paths used below (Data/, weights/, ...) resolve.
drive.mount('/content/Drive', force_remount=True)
os.chdir('/content/Drive/My Drive/Projet 3A : PIR/')

# Show the working directory and its content on one comma-separated line.
print(os.getcwd())
print(",".join(os.listdir()))

# Data access check: prints True when Data/annotations/ holds at least one entry.
print("Test d'acces aux données:", end=" ")
print(bool(os.listdir("Data/annotations/")))

Mounted at /content/Drive
/content/Drive/My Drive/Projet 3A : PIR
weights,Data,poster,output,m_image.py,utils.py,m_text.py,__pycache__,image_autoencoding.ipynb,text_autoencoder.ipynb
Test d'acces aux données: True


## Preprocessing of the inputs


In [0]:
# Caption annotations to load ("dataset_flickr30k.json" is the alternative
# file name left by the original author).
file_name="Data/annotations/dataset_coco.json" #dataset_flickr30k.json"

# Parse the file inside a context manager so the handle is closed as soon as
# the JSON is loaded (it was previously opened and never closed).
with open(file_name) as annotation_file:
  json_file=json.load(annotation_file)

# Number of sentences seen for each value of image['split'].
split={'restval':0, 'test':0, 'train': 0, 'val': 0}
validation_set=[]
training_set=[]

# Every sentence of every image becomes one sample (a list of tokens).
# Sentences of "test" images form the validation set; every other split
# ("train", "restval" and also "val") goes to the training set.
# NOTE(review): "val" images end up in training_set -- confirm this is intended.
# To rebuild the reduced GloVe vocabulary, also collect each `tokens` list in a
# corpus list (named `lst` in the embedding cell) inside this loop.
for image in json_file["images"]:
  for i in range(len(image["sentids"])):
    # Strip '(' and then ')' characters from both ends of each token.
    tokens=[word.strip('(').strip(')') for word in image["sentences"][i]["tokens"]]
    if image['split'] == "test":
      validation_set.append(tokens)
    else:
      training_set.append(tokens)
    split[image['split']]+=1

# Shuffle in place; no seed is set, so the order changes from run to run.
np.random.shuffle(validation_set)
np.random.shuffle(training_set)

size=len(validation_set)+len(training_set)

# Token-count statistics of the training sentences (lengths computed once).
sentence_lengths=[len(tmp) for tmp in training_set]
men_size=np.mean(sentence_lengths)
var_size=np.std(sentence_lengths)  # standard deviation, despite the name

# "\\pm" prints a literal "\pm"; the bare "\p" used before is an invalid
# escape sequence in a normal string literal.
print('Using {} sentences of mean length {} \\pm {} with the following repartition :'.format(size,men_size,var_size))
for key,count in split.items():
  print( "\t{} : {}".format(key,count))



Using 616767 sentences of mean length 10.465468089097383 \pm 2.398318430809373 with the following repartition :
	restval : 152634
	test : 25010
	train : 414113
	val : 25010


In [0]:
# Preview the first four training sentences, one numbered line each.
for i,sentence in enumerate(training_set[:4]):
  print(str(i)+": "+" ".join(map(str,sentence)))

0: a man is riding a white bike down the street
1: a man riding skis down a snow covered slope
2: a parked motorcycle sitting next to a bunch of cloths
3: there are artifacts such as rocks and ceramics


### Initialize the embedding with GloVe

In [0]:
!pip install glove_python

Collecting glove_python
[?25l  Downloading https://files.pythonhosted.org/packages/3e/79/7e7e548dd9dcb741935d031117f4bed133276c2a047aadad42f1552d1771/glove_python-0.1.0.tar.gz (263kB)
[K    100% |████████████████████████████████| 266kB 14.3MB/s 
Building wheels for collected packages: glove-python
  Building wheel for glove-python (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/88/4b/6d/10c0d2ad32c9d9d68beec9694a6f0b6e83ab1662a90a089a4b
Successfully built glove-python
Installing collected packages: glove-python
Successfully installed glove-python-0.1.0


In [0]:
from glove import Corpus, Glove
from utils import create_embedding

# Locations of the two embedding files kept under weights/.
STANFORD_GLOVE_PATH="weights/glove.6B.100d.txt"
SUB_GLOVE_PATH="weights/sub_glove.10k.100d.txt"

# Full pre-trained vectors, read from the Stanford text format.
glove=Glove.load_stanford(STANFORD_GLOVE_PATH)
# Reduced model written earlier with Glove.save (recipe below).
sub_glove=Glove.load(SUB_GLOVE_PATH)

# Disabled recipe that produced the reduced model. `lst` is presumably the
# list of tokenised sentences of the corpus -- it must be built before use:
#   sub_glove=create_embedding(glove,corpus=lst,n_word=10000)
#   sub_glove.save("weights/sub_glove.10k.100d.txt")
# Leftover alternative (constructs a new Glove model instead of loading one):
#   glove = Glove(no_components=100, learning_rate=0.05)

In [0]:
# Quick inspection of both vocabularies, using "hello" as the probe word.
probe_word="hello"
# Index of the probe word in the full and in the reduced dictionary.
print(glove.dictionary[probe_word],sub_glove.dictionary[probe_word])
# Number of rows of the full vector table and length of its first row.
print(len(glove.word_vectors),len(glove.word_vectors[0]))
# Nearest neighbours of the probe word according to the full model.
print(glove.most_similar(probe_word))
# Word stored at index 10000 of the full model.
print(glove.inverse_dictionary[10000])


13075 2946
400000 100
[('goodbye', 0.7905023817864941), ('hey', 0.7171452903620842), ('!', 0.6594691265760626), ('yeah', 0.6267022357975351)]
persecution


### Decode/encode sentences


In [0]:
from utils import encode_sentences,decode_sentence

# Round-trip a toy sentence through the encoder and decoder helpers.
# The recorded output shows "n&&b" coming back as '<unk>' and the sentence
# padded with ' ' entries up to 20 tokens.
s="hello n&&b".split()
print(s)

# encode_sentences receives a list of sentences; only one is passed here.
encoded=encode_sentences([s],sub_glove,maxlen=20)
round_trip=decode_sentence(encoded[0],sub_glove)
print(round_trip)

['hello', 'n&&b']
['hello', '<unk>', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']


### Text Generator


In [0]:
import os.path
from utils import TextSequence
      
# Both generators share the same embedding model and size settings.
sequence_options=dict(glove=sub_glove,input_dim=10000,max_len=20)
training_sequence=TextSequence(sentences=training_set,**sequence_options)
validation_sequence=TextSequence(sentences=validation_set,**sequence_options)

# Fetch item 0 of the training generator and display the shapes of its two
# arrays (presumably model inputs and targets -- verify in utils.TextSequence).
batch_x,batch_y=training_sequence[0]
print(batch_x.shape,batch_y.shape)

## Building the model

In [0]:
import m_text as text

# Fixed hyper-parameters of the auto-encoder.
latent_dim=1024
sentence_lenght=20

# Sizes read from the reduced GloVe table so the model matches it.
input_dim=len(sub_glove.word_vectors)          # number of rows (one per word)
embeddings_dim=len(sub_glove.word_vectors[0])  # length of one word vector

# Encoder and decoder are built separately, then chained for training.
encoder=text.get_encoder(sentence_lenght,input_dim,embeddings_dim,latent_dim)
decoder=text.get_decoder(sentence_lenght,input_dim,latent_dim)
model=text.get_training_model(input_shape=(sentence_lenght,),encoder=encoder,decoder=decoder)

# Print a heading followed by the Keras summary of each of the three models.
for heading,network in (("Encoder: \n ",encoder),("\nDecoder: \n ",decoder),("\nTraining model:\n",model)):
  print(heading)
  network.summary()


Instructions for updating:
Colocations handled automatically by placer.
Encoder: 
 
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 20)                0         
_________________________________________________________________
Embedding (Embedding)        (None, 20, 100)           1000000   
_________________________________________________________________
bi_LSTM (Bidirectional)      (None, 1024)              9224192   
Total params: 10,224,192
Trainable params: 9,224,192
Non-trainable params: 1,000,000
_________________________________________________________________

Decoder: 
 
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
txt (InputLayer)             (None, 1024)              0         
_________________________________________________________________
repeat_vector (RepeatVecto

In [0]:
# Restore previously stored weights for both halves of the auto-encoder.
for network,weights_path in ((encoder,'weights/text/encoder.h'),(decoder,'weights/text/decoder.h')):
  network.load_weights(weights_path)

# To retrain instead of loading, run the following (kept disabled):
#   model.fit_generator(training_sequence, epochs=10, verbose=1,workers=4,validation_data=validation_sequence)
#   encoder.save('weights/text/encoder.h')
#   decoder.save('weights/text/decoder.h')

## Results

In [0]:
import time

def _as_text(encoded_sentence):
  """Decode one encoded sentence with sub_glove and join its tokens with spaces."""
  return ' '.join(decode_sentence(encoded_sentence,sub_glove))


# Take item 10 of the validation generator (training_sequence[0] is the
# alternative left by the author); only its first array is used.
input_encoder,_=validation_sequence[10]

# Encode the batch to latent vectors, then decode those vectors again.
enc=encoder.predict(input_encoder)
dec=decoder.predict(enc)

# Print each input sentence next to its reconstruction.
for i,sentence in enumerate(input_encoder):
  print('In : ',_as_text(sentence),sep='')
  print('Out : ',_as_text(dec[i]),sep='')
  print()


In : several boats sailing on a beautiful day                          
Out : several boats displayed on a beautiful day                          

In : the toothbrushes with toothpaste on them are next to box <unk>                  
Out : the toothbrushes with dog on them are next to local <unk>                  

In : a plate topped with small sandwiches with meat and veggies                    
Out : a plate topped with small sandwiches with meat and veggies                    

In : a bird standing next to a large body of water                    
Out : a bird standing next to a large body of water                    

In : an empty tiled bathroom with no toilet paper                        
Out : an empty tiled bathroom with no toilet paper                        

In : the man holding a beverage is about to throw a frisbee                  
Out : the man holding a beverage is about to throw a frisbee                  

In : three young guys standing around each other in shirts   