In [4]:
# all our imports
import pandas as pd
import tensorflow as tf
from tensorflow.keras import Model
from os import listdir
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.models import Model
from pickle import dump
from pickle import load
import emoji
import advertools as adv
from sklearn.model_selection import train_test_split

In [5]:
def extract_features(directory):
    model = tf.keras.applications.VGG16()
    model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
    print(model.summary())
    features = dict()
    for name in listdir(directory):
        filename = directory + '/' + name
        image = load_img(filename, target_size=(224, 224))
        image = img_to_array(image)
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    image = preprocess_input(image)
    feature = model.predict(image, verbose=0)
    image_id = name.split('.')[0]
    features[image_id] = feature
    return features

directory = 'instagram_data/img'
features = extract_features(directory)
print('Extracted Features: %d' % len(features))
dump(features, open('features.pkl', 'wb'))

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

In [6]:
print(len(features['insta8959']))

1


In [7]:
import pandas as pd
def load_doc(filename):
    df = pd.read_csv(filename)
    # Remove  na values
    df = df.dropna()
    # Drop the first column
    df = df.drop(df.columns[0], axis=1)
    # Rename Image File column to image_id
    df = df.rename(columns={"Image File": "image_id"})
    # In the image_id column, remove the file extension
    df['image_id'] = df['image_id'].str.replace('img/', '')
    # keep only the emojis in the caption
    df["Caption"] = adv.extract_emoji(df["Caption"])["emoji"]

    # Turn the dataframe into a dictionary with the image_id as the key and the caption as the value
    captions = dict()
    for i in range(len(df)):
        image_id = df.iloc[i]['image_id']
        caption = df.iloc[i]['Caption']
        captions[image_id] = caption

    return captions
    

filename = 'instagram_data/captions_csv.csv'
descriptions = load_doc(filename)
print(descriptions)

{'insta2': [], 'insta3': [], 'insta4': ['🤍'], 'insta5': [], 'insta6': [], 'insta7': [], 'insta8': ['🦋', '🌈', '💫'], 'insta9': ['🤎'], 'insta10': ['🦋'], 'insta11': [], 'insta12': ['🌸', '🦒'], 'insta13': [], 'insta14': ['🌈', '🌸', '💐'], 'insta15': [], 'insta16': [], 'insta17': [], 'insta18': [], 'insta19': [], 'insta20': [], 'insta21': [], 'insta22': [], 'insta23': ['🥴'], 'insta24': [], 'insta25': [], 'insta26': ['💚'], 'insta27': ['🌼'], 'insta28': ['🖤'], 'insta29': [], 'insta30': ['🍝'], 'insta31': ['❤'], 'insta32': ['🖤'], 'insta33': ['❤️'], 'insta34': ['💙'], 'insta35': ['💙'], 'insta36': [], 'insta37': [], 'insta38': [], 'insta39': ['🎃', '🕸', '🖤'], 'insta40': ['😋'], 'insta41': ['🥰'], 'insta42': ['❤️'], 'insta43': [], 'insta44': [], 'insta45': [], 'insta46': [], 'insta47': [], 'insta48': [], 'insta49': [], 'insta50': [], 'insta51': [], 'insta52': ['❤️'], 'insta53': ['🦈'], 'insta54': ['💜', '🌈'], 'insta55': ['❤️'], 'insta56': [], 'insta57': [], 'insta58': ['☀️'], 'insta59': ['✨', '🌈', '💕'], 'ins

In [10]:
def to_vocabulary(descriptions):
    # build a list of all description strings
    all_desc = set()
    # Split the descriptions into words and add them to the set
    for key in descriptions.keys():
        [all_desc.update(d.split()) for d in descriptions[key].split()]
    return all_desc
 
# summarize vocabulary
vocabulary = to_vocabulary(descriptions)
print(vocabulary)
print(len(vocabulary))

AttributeError: 'list' object has no attribute 'split'

In [48]:
def save_descriptions(descriptions, filename):
	lines = list()
	
	for key, desc in descriptions.items():
		new_desc = ""
		if desc == []:
			continue
		else:
			for element in desc:
				new_desc += element
			lines.append(key + " " + new_desc)

	data = '\n'.join(lines)
	file = open(filename, 'w')
	file.write(data)
	file.close()

save_descriptions(descriptions, 'descriptions.txt')

In [70]:
desc = []
file = open("descriptions.txt")
for element in file:
    identifier = element.split(" ")[0]
    desc.append(identifier)
train, test = train_test_split(desc, test_size = 0.20, random_state = 10)
print(train)
len(train)

['insta9426', 'insta17665', 'insta2803', 'insta1477', 'insta19697', 'insta6149', 'insta8835', 'insta2120', 'insta4798', 'insta18172', 'insta7649', 'insta20118', 'insta8174', 'insta9454', 'insta8469', 'insta2391', 'insta10356', 'insta3526', 'insta20120', 'insta6286', 'insta19473', 'insta18093', 'insta2926', 'insta5017', 'insta19905', 'insta9693', 'insta6084', 'insta10336', 'insta6417', 'insta8291', 'insta2969', 'insta3419', 'insta911', 'insta7246', 'insta2862', 'insta6221', 'insta17805', 'insta2392', 'insta8543', 'insta2705', 'insta2610', 'insta17447', 'insta17879', 'insta4787', 'insta5710', 'insta6783', 'insta8193', 'insta19794', 'insta9038', 'insta8192', 'insta4743', 'insta11160', 'insta3505', 'insta3696', 'insta1123', 'insta9708', 'insta11904', 'insta11468', 'insta18161', 'insta723', 'insta7857', 'insta309', 'insta8869', 'insta20079', 'insta3265', 'insta2925', 'insta19687', 'insta8195', 'insta167', 'insta2879', 'insta17366', 'insta741', 'insta6707', 'insta7683', 'insta7808', 'insta91

4396

In [2]:
import os   # handling the files
import pickle # storing numpy features
import numpy as np
from tensorflow.keras.applications.vgg16 import VGG16 , preprocess_input # extract features from image data.
from tensorflow.keras.preprocessing.image import load_img , img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input , Dense , LSTM , Embedding , Dropout , add

In [3]:
base_direct = "archive"
working_direct = "/kaggle/working"

In [4]:
vgg = VGG16()

model = Model(inputs = vgg.inputs, outputs = vgg.layers[-2].output)
print(model.summary())

2024-02-28 22:01:25.568292: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2024-02-28 22:01:25.568352: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2024-02-28 22:01:25.568369: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2024-02-28 22:01:25.568590: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-02-28 22:01:25.568955: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

In [5]:
features = {}
directory = os.path.join(base_direct, "Images")

for img_name in os.listdir(directory):
    img_path = directory + "/" + img_name
    image = load_img(img_path, target_size = (224, 224))
    image = img_to_array(image)
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    image = preprocess_input(image)
    feature = model.predict(image, verbose = 0)
    image_id = img_name.split(".")[0]
    features[image_id] = feature

2024-02-28 22:01:34.797199: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


In [6]:
pickle.dump(features, open("features.pkl", "wb"))
with open("features.pkl", "rb") as f:
    features = pickle.load(f)

In [12]:
with open(os.path.join(base_direct, 'captions.txt')) as f:
    next(f)
    captions_doc = f.read()

mapping = {}
for line in captions_doc.split("\n"):
    tokens = line.split(",")
    if len(line) < 2:
        continue
    image_id, caption = tokens[0], tokens[1:]
    image_id = image_id.split(".")[0]
    caption = " ".join(caption)
    if image_id not in mapping:
        mapping[image_id] = []
    mapping[image_id].append(caption)
print(len(mapping))

8091


In [13]:
def clean(mapping):
    for id, captions in mapping.items():
        for x in range(len(captions)):
            caption = captions[x]
            caption = caption.lower()
            # delete digits, special chars, etc., 
            caption = caption.replace('[^A-Za-z]', '')
            caption = " ".join(caption.split())
            caption = " ".join([word for word in caption.split() if len(word)>1])
            captions[x] = caption
mapping['1000268201_693b08cb0e']
clean(mapping)
mapping['1000268201_693b08cb0e']

['child in pink dress is climbing up set of stairs in an entry way',
 'girl going into wooden building',
 'little girl climbing into wooden playhouse',
 'little girl climbing the stairs to her playhouse',
 'little girl in pink dress going into wooden cabin']

In [14]:
all_cap = []
for key in mapping:
    for caption in mapping[key]:
        all_cap.append(caption)

In [15]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_cap)
vocab_size = len(tokenizer.word_index) + 1
vocab_size

8483

In [16]:
max_length = max(len(caption.split()) for caption in all_cap)
max_length

33