In [8]:
# all our imports
import pandas as pd
import tensorflow as tf
from tensorflow.keras import Model
from os import listdir
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.models import Model
from pickle import dump
from pickle import load
import emoji
import advertools as adv

In [7]:
def extract_features(directory):
    model = tf.keras.applications.VGG16()
    model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
    print(model.summary())
    features = dict()
    for name in listdir(directory):
        filename = directory + '/' + name
        image = load_img(filename, target_size=(224, 224))
        image = img_to_array(image)
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    image = preprocess_input(image)
    feature = model.predict(image, verbose=0)
    image_id = name.split('.')[0]
    features[image_id] = feature
    return features

directory = 'instagram_data/img'
features = extract_features(directory)
print('Extracted Features: %d' % len(features))
dump(features, open('features.pkl', 'wb'))

2024-02-27 20:14:03.314369: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2024-02-27 20:14:03.314401: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2024-02-27 20:14:03.314409: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2024-02-27 20:14:03.314688: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-02-27 20:14:03.315073: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

2024-02-27 20:18:04.025889: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Extracted Features: 1


In [5]:
print(len(features['insta8959']))

1


In [24]:
import pandas as pd
def load_doc(filename):
    df = pd.read_csv(filename)
    # Remove  na values
    df = df.dropna()
    # Drop the first column
    df = df.drop(df.columns[0], axis=1)
    # Rename Image File column to image_id
    df = df.rename(columns={"Image File": "image_id"})
    # In the image_id column, remove the file extension
    df['image_id'] = df['image_id'].str.replace('img/', '')
    # keep only the emojis in the caption
    df["Caption"] = adv.extract_emoji(df["Caption"])["emoji"]

    # Turn the dataframe into a dictionary with the image_id as the key and the caption as the value
    captions = dict()
    for i in range(len(df)):
        image_id = df.iloc[i]['image_id']
        caption = df.iloc[i]['Caption']
        captions[image_id] = caption

    return captions
    

filename = 'instagram_data/captions_csv.csv'
descriptions = load_doc(filename)
print(descriptions)

{'insta2': [], 'insta3': [], 'insta4': ['🤍'], 'insta5': [], 'insta6': [], 'insta7': [], 'insta8': ['🦋', '🌈', '💫'], 'insta9': ['🤎'], 'insta10': ['🦋'], 'insta11': [], 'insta12': ['🌸', '🦒'], 'insta13': [], 'insta14': ['🌈', '🌸', '💐'], 'insta15': [], 'insta16': [], 'insta17': [], 'insta18': [], 'insta19': [], 'insta20': [], 'insta21': [], 'insta22': [], 'insta23': ['🥴'], 'insta24': [], 'insta25': [], 'insta26': ['💚'], 'insta27': ['🌼'], 'insta28': ['🖤'], 'insta29': [], 'insta30': ['🍝'], 'insta31': ['❤'], 'insta32': ['🖤'], 'insta33': ['❤️'], 'insta34': ['💙'], 'insta35': ['💙'], 'insta36': [], 'insta37': [], 'insta38': [], 'insta39': ['🎃', '🕸', '🖤'], 'insta40': ['😋'], 'insta41': ['🥰'], 'insta42': ['❤️'], 'insta43': [], 'insta44': [], 'insta45': [], 'insta46': [], 'insta47': [], 'insta48': [], 'insta49': [], 'insta50': [], 'insta51': [], 'insta52': ['❤️'], 'insta53': ['🦈'], 'insta54': ['💜', '🌈'], 'insta55': ['❤️'], 'insta56': [], 'insta57': [], 'insta58': ['☀️'], 'insta59': ['✨', '🌈', '💕'], 'ins

In [12]:
def to_vocabulary(descriptions):
    # build a list of all description strings
    all_desc = set()
    # Split the descriptions into words and add them to the set
    for key in descriptions.keys():
        [all_desc.update(d.split()) for d in descriptions[key].split()]
    return all_desc
 
# summarize vocabulary
vocabulary = to_vocabulary(descriptions)
print(vocabulary)
print(len(vocabulary))

20675


In [32]:
def save_descriptions(descriptions, filename):
	lines = list()
	
	for key, desc in descriptions.items():
		new_desc = ""
		if desc == []:
			continue
		else:
			for element in desc:
				new_desc += element
			lines.append(key + " " + new_desc)

	data = '\n'.join(lines)
	file = open(filename, 'w')
	file.write(data)
	file.close()

save_descriptions(descriptions, 'descriptions.txt')

In [9]:
def load_doc(filename): # there are two load_doc maybe we should change it
	file = open(filename)
	text = file.read()
	file.close()
	return text

def load_set(filename):
	doc = load_doc(filename)
	dataset = list()
	for line in doc.split('\n'):
		if len(line) < 1:
			continue
		identifier = line.split('.')[0]
		dataset.append(identifier)
	return set(dataset)

def load_clean_descriptions(filename, dataset):
	doc = load_doc(filename)
	descriptions = dict()
	for line in doc.split('\n'):
		tokens = line.split()
		image_id, image_desc = tokens[0], tokens[1:]
		if image_id in dataset:
			if image_id not in descriptions:
				descriptions[image_id] = list()
			desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
			descriptions[image_id].append(desc)
	return descriptions

In [10]:
# load photo features
def load_photo_features(filename, dataset):
	# load all features
	all_features = load(open(filename, 'rb'))
	# filter features
	features = {k: all_features[k] for k in dataset}
	return features

filename = 'instagram_data/captions_csv.csv'
train = load_set(filename)
print('Dataset: %d' % len(train))
train_descriptions = load_clean_descriptions('descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))
print(train_descriptions)
# # photo features
# train_features = load_photo_features('features.pkl', train)
# print('Photos: train=%d' % len(train_features))


Dataset: 20516
Descriptions: train=0
{}
