## Pre-indexing the Flickr8k dataset to speed up model API startup

In [1]:
%%capture
!pip install chromadb

In [2]:
!head Flickr8k.token.txt

1000268201_693b08cb0e.jpg#0	A child in a pink dress is climbing up a set of stairs in an entry way .
1000268201_693b08cb0e.jpg#1	A girl going into a wooden building .
1000268201_693b08cb0e.jpg#2	A little girl climbing into a wooden playhouse .
1000268201_693b08cb0e.jpg#3	A little girl climbing the stairs to her playhouse .
1000268201_693b08cb0e.jpg#4	A little girl in a pink dress going into a wooden cabin .
1001773457_577c3a7d70.jpg#0	A black dog and a spotted dog are fighting
1001773457_577c3a7d70.jpg#1	A black dog and a tri-colored dog playing with each other on the road .
1001773457_577c3a7d70.jpg#2	A black dog and a white dog with brown spots are staring at each other in the street .
1001773457_577c3a7d70.jpg#3	Two dogs of different breeds looking at each other on the road .
1001773457_577c3a7d70.jpg#4	Two dogs on pavement moving toward each other .


In [2]:
import pandas as pd

In [3]:
# The token file is tab-separated and has no header row, so the columns come
# back labelled 0 (image file name + '#' + caption number) and 1 (caption text).
df = pd.read_table("Flickr8k.token.txt", header=None)

In [5]:
# Preview the first five parsed rows.
df.head()

Unnamed: 0,0,1
0,1000268201_693b08cb0e.jpg#0,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg#1,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg#2,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg#3,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg#4,A little girl in a pink dress going into a woo...


In [4]:
# Directory prefix prepended to every image file name to build its path.
prefix = "Flicker8k_Dataset/"

In [5]:
# Column 1 holds the caption text; give it a readable name.
df.rename({1: 'Caption'}, axis='columns', inplace=True)
# Column 0 looks like '<file>.jpg#<n>': keep the part before '#' and prepend
# the image directory to get the image path.
df['Image'] = [prefix + image_key.split('#')[0] for image_key in df[0]]

In [8]:
# Show the frame with the new 'Caption' and 'Image' columns
# (pandas elides the middle rows of a long frame).
df

Unnamed: 0,0,Caption,Image
0,1000268201_693b08cb0e.jpg#0,A child in a pink dress is climbing up a set o...,Flicker8k_Dataset/1000268201_693b08cb0e.jpg
1,1000268201_693b08cb0e.jpg#1,A girl going into a wooden building .,Flicker8k_Dataset/1000268201_693b08cb0e.jpg
2,1000268201_693b08cb0e.jpg#2,A little girl climbing into a wooden playhouse .,Flicker8k_Dataset/1000268201_693b08cb0e.jpg
3,1000268201_693b08cb0e.jpg#3,A little girl climbing the stairs to her playh...,Flicker8k_Dataset/1000268201_693b08cb0e.jpg
4,1000268201_693b08cb0e.jpg#4,A little girl in a pink dress going into a woo...,Flicker8k_Dataset/1000268201_693b08cb0e.jpg
...,...,...,...
40455,997722733_0cb5439472.jpg#0,A man in a pink shirt climbs a rock face,Flicker8k_Dataset/997722733_0cb5439472.jpg
40456,997722733_0cb5439472.jpg#1,A man is rock climbing high in the air .,Flicker8k_Dataset/997722733_0cb5439472.jpg
40457,997722733_0cb5439472.jpg#2,A person in a red shirt climbing up a rock fac...,Flicker8k_Dataset/997722733_0cb5439472.jpg
40458,997722733_0cb5439472.jpg#3,A rock climber in a red shirt .,Flicker8k_Dataset/997722733_0cb5439472.jpg


In [6]:
# Keep only the rows whose key ends in '#1', and select just 'Caption' and
# 'Image' so the raw key column 0 is left out of the result.
is_caption_one = [image_key.split('#')[1] == '1' for image_key in df[0]]
df2 = df.loc[is_caption_one, ['Caption', 'Image']]

In [10]:
# Show the filtered frame: the '#1' caption and the image path for each row kept.
df2

Unnamed: 0,Caption,Image
1,A girl going into a wooden building .,Flicker8k_Dataset/1000268201_693b08cb0e.jpg
6,A black dog and a tri-colored dog playing with...,Flicker8k_Dataset/1001773457_577c3a7d70.jpg
11,A little girl is sitting in front of a large p...,Flicker8k_Dataset/1002674143_1b742ab4b8.jpg
16,A man lays on the bench to which a white dog i...,Flicker8k_Dataset/1003163366_44323f5815.jpg
21,A man wears an orange hat and glasses .,Flicker8k_Dataset/1007129816_e794419615.jpg
...,...,...
40436,A man is doing a wheelie on a mountain bike .,Flicker8k_Dataset/990890291_afc72be141.jpg
40441,A group of people sit atop a snowy mountain .,Flicker8k_Dataset/99171998_7cc800ceef.jpg
40446,A large bird stands in the water on the beach .,Flicker8k_Dataset/99679241_adc853a5c0.jpg
40451,a woman behind a scrolled wall is writing,Flicker8k_Dataset/997338199_7343367d7f.jpg


In [7]:
import chromadb
from chromadb.config import Settings

In [8]:
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
import torch

# Embed on the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs (more slowly) on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
embedding_function = SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2", device=device)

In [9]:
persist_directory = 'chromadb_index'

!rm -rf ./chromadb_index

In [10]:
# Open the on-disk Chroma store under persist_directory, with anonymized
# telemetry switched off.
db_client = chromadb.PersistentClient(
    path=persist_directory,
    settings=Settings(anonymized_telemetry=False),
)
# Create the collection, passing the sentence-transformer embedding function
# configured earlier.
collection = db_client.create_collection(
    embedding_function=embedding_function,
    name="flicker8k",
)

In [13]:
# The image path serves as the record id; the caption is the document text.
ids = list(df2['Image'])
docs = list(df2['Caption'])

In [12]:
# Sanity check: the freshly created collection should be empty.
print(collection.count())

0


In [14]:
# Index the first 4000 captions; the remaining ones are added by a second call.
collection.add(documents=docs[0:4000], ids=ids[0:4000])

In [15]:
# Confirm the first batch landed (4000 records expected).
print(collection.count())

4000


In [16]:
# Index everything from position 4000 onward (the rest of the captions).
collection.add(documents=docs[4000:], ids=ids[4000:])

In [17]:
# Confirm the total number of indexed records after both batches.
print(collection.count())

8092


In [19]:
# Open a second client on the same directory and read the collection back
# through it, to check that the index can be reloaded from disk.
db_client2 = chromadb.PersistentClient(path=persist_directory, settings=Settings(anonymized_telemetry=False))
# Bug fix: this lookup previously went through `db_client`, so the newly opened
# `db_client2` was never actually exercised.
collection2 = db_client2.get_or_create_collection(name="flicker8k", embedding_function=embedding_function)

In [20]:
# The reloaded collection should report the same record count as the original.
print(collection2.count())

8092


In [22]:
!zip -r chromadb_index.zip chromadb_index

  adding: chromadb_index/ (stored 0%)
  adding: chromadb_index/chroma.sqlite3 (deflated 36%)
  adding: chromadb_index/664baed4-7f5b-4316-9968-a3177f2441a0/ (stored 0%)
  adding: chromadb_index/664baed4-7f5b-4316-9968-a3177f2441a0/index_metadata.pickle (deflated 68%)
  adding: chromadb_index/664baed4-7f5b-4316-9968-a3177f2441a0/data_level0.bin (deflated 26%)
  adding: chromadb_index/664baed4-7f5b-4316-9968-a3177f2441a0/link_lists.bin (deflated 77%)
  adding: chromadb_index/664baed4-7f5b-4316-9968-a3177f2441a0/length.bin (deflated 100%)
  adding: chromadb_index/664baed4-7f5b-4316-9968-a3177f2441a0/header.bin (deflated 55%)


In [24]:
import shutil

In [None]:
# Pure-Python alternative to the `zip -r` shell command: writes
# <persist_directory>.zip with the index directory as its top-level folder.
# The previous arguments (`output_filename`, `dir_name`) were never defined,
# so this cell raised NameError.
shutil.make_archive(persist_directory, 'zip', root_dir='.', base_dir=persist_directory)

In [None]:
# Example of restoring the index from the archive. It is unpacked under
# restored_index/ so the live index directory is not overwritten.
# The previous arguments (`filename`, `extract_dir`) were never defined,
# so this cell raised NameError.
shutil.unpack_archive(f'{persist_directory}.zip', 'restored_index', 'zip')