In [None]:
import os
import string

In [None]:
# Load doc into memory
def load_doc(filename, encoding=None):
    """Read an entire text file and return its contents as one string.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding passed to ``open``. The default ``None``
            lets ``open`` pick the platform's default encoding; pass e.g.
            ``'utf-8'`` to pin it explicitly.

    Returns:
        The complete contents of the file as a ``str``.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the bytes are not valid in the encoding used.
    """
    # The context manager guarantees the handle is closed even when read()
    # raises, so no file descriptor is leaked on a failed read.
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()
 
# Caption file to parse; the path is relative to the current working directory.
filename = 'Flickr8k_text/Flickr8k.token.txt'

# Read the whole caption file into a single string. `doc` is the raw text that
# load_descriptions() parses further down.
doc = load_doc(filename)

In [None]:
# Extract descriptions for images
def load_descriptions(doc):
    """Build a mapping from image id to the list of its captions.

    ``doc`` is processed line by line. Each line is split on whitespace: the
    first token is the image identifier and the remaining tokens, re-joined
    with single spaces, form the caption. Everything from the first ``'.'``
    of the identifier onwards is dropped (e.g. ``'img1.jpg#0'`` -> ``'img1'``),
    so all captions of one image end up under the same key.

    Lines with fewer than two tokens (blank lines, whitespace-only lines, or
    an identifier without any caption text) are skipped.

    Args:
        doc: The raw caption text, one ``<identifier> <caption>`` per line.

    Returns:
        A ``dict`` mapping each image id to a ``list`` of caption strings, in
        the order the captions appear in ``doc``.
    """
    mapping = dict()

    for line in doc.split('\n'):
        # Split line by white space
        tokens = line.split()

        # Guard on the token count rather than the raw line length: a line
        # made only of whitespace can be long yet yield no tokens, and
        # indexing tokens[0] on it would raise IndexError.
        if len(tokens) < 2:
            continue

        # Take the first token as the image id, the rest as the description
        image_id, image_desc = tokens[0], tokens[1:]
        # Keep only the part of the id before the first '.'
        image_id = image_id.split('.')[0]
        # Create the list on first sight of this id, then store the caption
        mapping.setdefault(image_id, []).append(' '.join(image_desc))

    return mapping
 
# Parse the raw text into a dict of image id -> list of captions.
# The printed number is the count of distinct image ids (dict keys),
# not the total number of captions.
descriptions = load_descriptions(doc)
print('Loaded: %d ' % len(descriptions))

In [None]:
def clean_descriptions(descriptions):
    """Normalise every caption stored in ``descriptions``, in place.

    Each caption is split on whitespace and every token is lower-cased and
    stripped of all ``string.punctuation`` characters. A token is kept only
    if, after that, it is longer than one character and purely alphabetic.
    The surviving tokens are re-joined with single spaces and written back
    to the same position of the caption list.

    Args:
        descriptions: Mapping of image id to a list of caption strings. The
            lists are modified in place.

    Returns:
        None.
    """
    # Deletion-only translation table: maps every punctuation character to
    # nothing.
    strip_punct = str.maketrans('', '', string.punctuation)

    def _normalise(caption):
        # Lower-case, strip punctuation, then drop one-character leftovers
        # (e.g. a dangling 's' or 'a') and anything that is not alphabetic.
        kept = []
        for raw in caption.split():
            token = raw.lower().translate(strip_punct)
            if len(token) > 1 and token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    for captions in descriptions.values():
        for idx, caption in enumerate(captions):
            captions[idx] = _normalise(caption)

# Normalise the captions. The function returns nothing and rewrites the lists
# inside `descriptions` in place, so the raw captions are no longer available
# in this variable afterwards.
clean_descriptions(descriptions)

In [None]:
# Building the vocabulary of words from descriptions
def to_vocabulary(descriptions):
    """Collect the set of distinct words used across all captions.

    Args:
        descriptions: Mapping of image id to a list of caption strings.

    Returns:
        A ``set`` containing every whitespace-separated token that occurs in
        any caption. Empty when there are no captions.
    """
    all_desc = set()

    # A plain loop instead of a comprehension: update() is called purely for
    # its side effect, so there is no list worth building.
    for desc_list in descriptions.values():
        for desc in desc_list:
            all_desc.update(desc.split())

    return all_desc

# Collect the distinct words found in the captions currently held in
# `descriptions` and report how many there are.
vocabulary = to_vocabulary(descriptions)
print('Vocabulary Size: %d' % len(vocabulary))

In [None]:
# Saving descriptions to file
def save_descriptions(descriptions, filename, encoding=None):
    """Write all captions to a text file, one caption per line.

    Every caption is written as ``<image id> <caption>``. Lines are joined
    with ``'\\n'`` and no trailing newline is added after the last one. An
    existing file at ``filename`` is overwritten.

    Args:
        descriptions: Mapping of image id to a list of caption strings.
        filename: Path of the file to write.
        encoding: Text encoding passed to ``open``. The default ``None``
            lets ``open`` pick the platform's default encoding; pass e.g.
            ``'utf-8'`` to pin it explicitly.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    lines = [
        key + ' ' + desc
        for key, desc_list in descriptions.items()
        for desc in desc_list
    ]

    # The context manager closes (and flushes) the handle even when write()
    # raises, so a failed write does not leak the file descriptor.
    with open(filename, 'w', encoding=encoding) as file:
        file.write('\n'.join(lines))

# Write the captions held in `descriptions` to 'descriptions.txt' (a path
# relative to the current working directory), one '<image id> <caption>' line
# per caption. The file is opened in 'w' mode, so an existing file is replaced.
save_descriptions(descriptions, 'descriptions.txt')