In [1]:
from tqdm.notebook import tqdm, trange

In [2]:
tqdm.pandas()

In [3]:
import sys

In [4]:
import glob
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [5]:
import os

# General packages
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.express as px
from PIL import Image

from IPython.display import Image as IImage
from IPython.display import display
import warnings
warnings.filterwarnings("ignore")

In [6]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/barzamini/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/barzamini/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
def find_gpus(nums=6):
    os.system('nvidia-smi -q -d Memory |grep -A4 GPU|grep Free >tmp_free_gpus')
    with open('tmp_free_gpus', 'r') as lines_txt:
        frees = lines_txt.readlines()
        idx_freeMemory_pair = [ (idx,int(x.split()[2]))
                              for idx,x in enumerate(frees) ]
    idx_freeMemory_pair.sort(key=lambda my_tuple:my_tuple[1],reverse=True)
    usingGPUs = [str(idx_memory_pair[0])
                    for idx_memory_pair in idx_freeMemory_pair[:nums] ]
    usingGPUs =  ','.join(usingGPUs)
    print('using GPU idx: #', usingGPUs)
    return usingGPUs

In [8]:
os.environ['CUDA_VISIBLE_DEVICES'] = find_gpus(nums=2)

using GPU idx: # 1,0


In [9]:
from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

## get topics

In [10]:
from bertopic import BERTopic

In [11]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [12]:
lemmatizer = WordNetLemmatizer()

In [13]:
def get_topics(concept='pedestrian'):
    if concept == 'pedestrian':
        model_path = 'ped_auto_model'
    elif concept == 'aircraft':
        model_path = 'aircraft_auto_model'
    elif concept == 'car':
        model_path = 'car_auto_model'
    auto_model = BERTopic(nr_topics="auto")
    auto_model = BERTopic.load(model_path)
    similar_topics, similarity = auto_model.find_topics(concept, top_n=50)

    topics = dict()
    for t in similar_topics:
        l1 = auto_model.get_topic(t)
        el_list = []
        el_list.append(concept)
        for el in l1:
            w = lemmatizer.lemmatize(el[0], get_wordnet_pos(el[0]))
            el_list.append(w)
        topics[t]= set(el_list)
    return topics

# Load files

In [14]:
print(os.listdir('/raid/AISSEL/Hamed/datasets/wit/'))

['wit_v1.train.all-00007-of-00010_context_caption_en_sbert_c.tsv', 'wit_v1.train.all-00009-of-00010_context_caption_en_sbert_c.tsv', '__MACOSX', 'wit_v1.train.all-1percent_sample.tsv', 'wit_v1.train.all-00000-of-00010_en_csim.tsv', 'wit_v1.train.all-00006-of-00010_context_caption_en_sbert_cpa.tsv', 'wit_v1.train.all-00008-of-00010_context_caption_en_sbert_cpa.tsv', 'images', 'wit_v1.train.all-00000-of-00010_context_caption_en_sbert_cpa.tsv', 'wit_v1.train.all-00001-of-00010_context_caption_en_sbert_cpa.tsv', 'wit_v1.train.all-00002-of-00010_context_caption_en_sbert_cpa.tsv', 'wit_v1.train.all-00003-of-00010_context_caption_en_sbert_cpa.tsv', 'wit_v1.train.all-00004-of-00010_context_caption_en_sbert_cpa.tsv', 'wit_v1.train.all-00005-of-00010_context_caption_en_sbert_cpa.tsv', 'wit_v1.train.all-00007-of-00010_context_caption_en_sbert_cpa.tsv', 'wit_v1.train.all-00009-of-00010_context_caption_en_sbert_cpa.tsv', 'wit_v1.train.all-00000-of-00010_context_caption_en_sbert_c.tsv', 'wit_v1.trai

In [15]:
root_path = '/raid/AISSEL/Hamed/datasets/wit'

In [16]:
f_names =[el for el in os.listdir('/raid/AISSEL/Hamed/datasets/wit/') if el.endswith('_en_sbert_cpa.tsv')]
f_names

['wit_v1.train.all-00006-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00008-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00000-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00001-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00002-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00003-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00004-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00005-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00007-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00009-of-00010_context_caption_en_sbert_cpa.tsv']

In [17]:
df = pd.read_csv(f'{root_path}/{f_names[0]}', sep='\t')
df = df.drop('Unnamed: 0', 1)
df = df[df["language"]=='en']
df

Unnamed: 0,index,language,page_url,image_url,page_title,section_title,hierarchical_section_title,caption_reference_description,caption_attribution_description,caption_alt_text_description,...,296_context_score,144_context_score,12_context_score,224_context_score,250_context_score,376_context_score,concept_caption_score,concept_context_score,car_caption_score,aircraft_caption_score
0,14,en,https://en.wikipedia.org/wiki/LibreOffice,https://upload.wikimedia.org/wikipedia/commons...,LibreOffice,Included applications,LibreOffice / Features / Included applications,LibreOffice Math running on Ubuntu,English: Screenshots of LibreOffice Math 6.4 r...,,...,-0.037750,-0.025831,-0.045994,-0.067079,0.128117,-0.067173,-0.040416,-0.020415,-0.032282,-0.006119
1,28,en,https://en.wikipedia.org/wiki/Dalian,https://upload.wikimedia.org/wikipedia/commons...,Dalian,Research centres,Dalian / Education / Research centres,"Dalian Institute of Chemical Physics, of the C...",,,...,0.070782,0.014916,0.060768,0.137923,0.051047,0.125953,0.004426,0.083636,0.018341,0.077601
2,41,en,https://en.wikipedia.org/wiki/FMW_7th_Annivers...,https://upload.wikimedia.org/wikipedia/commons...,FMW 7th Anniversary Show,,FMW 7th Anniversary Show,Kawasaki Stadium,English: kawasaki_fujimi Stadium 日本語: 川崎富士見球技場...,,...,0.048340,0.031444,-0.178475,0.042817,0.119373,0.045854,0.090404,-0.012327,0.029681,0.181104
3,50,en,https://en.wikipedia.org/wiki/List_of_Bermuda_...,https://upload.wikimedia.org/wikipedia/commons...,List of Bermuda hurricanes,1960s,List of Bermuda hurricanes / List of storms / ...,"Hurricane Inga, one of the longest-lived Atlan...",Saffir-Simpson Hurricane ScaleTDTS12345 Englis...,Map showing the path and intensity of Hurrican...,...,-0.104979,-0.117618,-0.052297,-0.031839,-0.167693,-0.054938,0.005365,-0.095421,0.068992,0.103214
4,52,en,https://en.wikipedia.org/wiki/Parimelalhagar,https://upload.wikimedia.org/wikipedia/commons...,Parimelalhagar,Early life,Parimelalhagar / Early life,A page from the Parimelalhagar's commentary on...,English: A page from Arumuka Navalar's 1861 ed...,,...,0.013991,-0.050420,-0.043610,-0.030726,0.036060,-0.098819,0.025255,-0.061983,-0.008257,0.057020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540382,3704673,en,https://en.wikipedia.org/wiki/Standard_Electri...,https://upload.wikimedia.org/wikipedia/commons...,Standard Electric Time Company,,Standard Electric Time Company,A Standard 200177 fire alarm pull station,English: A Standard 200177 pull station in Har...,,...,-0.033670,-0.089871,-0.052723,-0.037692,-0.037566,0.038821,0.130745,-0.079322,0.099379,0.133040
540383,3704674,en,https://en.wikipedia.org/wiki/Malaysia_Airline...,https://upload.wikimedia.org/wikipedia/commons...,Malaysia Airlines Flight 370,Analysis,Malaysia Airlines Flight 370 / Investigation /...,A heat map indicating the probable location of...,English: Probability of the location where Mal...,,...,-0.060926,-0.013728,-0.057422,0.144767,-0.017854,-0.034770,-0.007270,0.046857,0.060193,0.218703
540384,3704675,en,https://en.wikipedia.org/wiki/Jackson_Plan,https://upload.wikimedia.org/wikipedia/commons...,Jackson Plan,Overall layout,Jackson Plan / Layout and effect of the plan /...,"Map of Singapore in 1914, the layout is now mo...","English: Map of the city of Singapore, ca 1914...",,...,0.071714,0.023376,0.013262,0.339486,0.056978,0.012006,0.018857,0.090633,-0.003516,-0.021940
540385,3704677,en,https://en.wikipedia.org/wiki/Candiacervus,https://upload.wikimedia.org/wikipedia/commons...,Candiacervus,Taxonomy,Candiacervus / Taxonomy,Hippopotamus creutzburgi and C. cretensis,English: My drawings of the two subspecies of ...,,...,0.145484,-0.006134,0.115790,-0.039335,0.091558,0.093865,0.052330,0.046950,0.023961,-0.041651


In [18]:
concept_col = ['concept_caption_score', 'car_caption_score', 'aircraft_caption_score']
new_cols = [el for el in list(df.columns) if '_score' in el]
org_cols = [el for el in list(df.columns) if not '_score' in el]

In [19]:
caption_score_list = [el for el in df.columns if 'caption_score' in el]
print(caption_score_list)

['196_caption_score', '412_caption_score', '172_caption_score', '363_caption_score', '198_caption_score', '114_caption_score', '419_caption_score', '294_caption_score', '388_caption_score', '314_caption_score', '98_caption_score', '80_caption_score', '134_caption_score', '432_caption_score', '394_caption_score', '53_caption_score', '56_caption_score', '150_caption_score', '278_caption_score', '-1_caption_score', '43_caption_score', '265_caption_score', '345_caption_score', '378_caption_score', '11_caption_score', '331_caption_score', '202_caption_score', '169_caption_score', '327_caption_score', '309_caption_score', '68_caption_score', '216_caption_score', '40_caption_score', '99_caption_score', '426_caption_score', '344_caption_score', '47_caption_score', '8_caption_score', '416_caption_score', '112_caption_score', '137_caption_score', '441_caption_score', '192_caption_score', '20_caption_score', '296_caption_score', '144_caption_score', '12_caption_score', '224_caption_score', '250_c

In [20]:
caption_score_list = caption_score_list[:-3]
print(caption_score_list)

['196_caption_score', '412_caption_score', '172_caption_score', '363_caption_score', '198_caption_score', '114_caption_score', '419_caption_score', '294_caption_score', '388_caption_score', '314_caption_score', '98_caption_score', '80_caption_score', '134_caption_score', '432_caption_score', '394_caption_score', '53_caption_score', '56_caption_score', '150_caption_score', '278_caption_score', '-1_caption_score', '43_caption_score', '265_caption_score', '345_caption_score', '378_caption_score', '11_caption_score', '331_caption_score', '202_caption_score', '169_caption_score', '327_caption_score', '309_caption_score', '68_caption_score', '216_caption_score', '40_caption_score', '99_caption_score', '426_caption_score', '344_caption_score', '47_caption_score', '8_caption_score', '416_caption_score', '112_caption_score', '137_caption_score', '441_caption_score', '192_caption_score', '20_caption_score', '296_caption_score', '144_caption_score', '12_caption_score', '224_caption_score', '250_c

In [21]:
context_score_list = [el for el in df.columns if 'context_score' in el]
context_score_list = context_score_list[:-1]
print(context_score_list)

['196_context_score', '412_context_score', '172_context_score', '363_context_score', '198_context_score', '114_context_score', '419_context_score', '294_context_score', '388_context_score', '314_context_score', '98_context_score', '80_context_score', '134_context_score', '432_context_score', '394_context_score', '53_context_score', '56_context_score', '150_context_score', '278_context_score', '-1_context_score', '43_context_score', '265_context_score', '345_context_score', '378_context_score', '11_context_score', '331_context_score', '202_context_score', '169_context_score', '327_context_score', '309_context_score', '68_context_score', '216_context_score', '40_context_score', '99_context_score', '426_context_score', '344_context_score', '47_context_score', '8_context_score', '416_context_score', '112_context_score', '137_context_score', '441_context_score', '192_context_score', '20_context_score', '296_context_score', '144_context_score', '12_context_score', '224_context_score', '250_c

In [23]:
# b, f, g, i, m

In [24]:
im_c = ['image_url', 'caption', 'context']
cap_s = ['concept_caption_score']
con_s = ['concept_context_score']
plus_col = ['car_caption_score', 'aircraft_caption_score']
col_list = im_c + cap_s + con_s + caption_score_list + context_score_list

In [25]:
csim_df = pd.DataFrame()
df = df.reset_index(drop=True)
csim_df = pd.concat([csim_df, df], ignore_index=True)
csim_df

Unnamed: 0,index,language,page_url,image_url,page_title,section_title,hierarchical_section_title,caption_reference_description,caption_attribution_description,caption_alt_text_description,...,296_context_score,144_context_score,12_context_score,224_context_score,250_context_score,376_context_score,concept_caption_score,concept_context_score,car_caption_score,aircraft_caption_score
0,14,en,https://en.wikipedia.org/wiki/LibreOffice,https://upload.wikimedia.org/wikipedia/commons...,LibreOffice,Included applications,LibreOffice / Features / Included applications,LibreOffice Math running on Ubuntu,English: Screenshots of LibreOffice Math 6.4 r...,,...,-0.037750,-0.025831,-0.045994,-0.067079,0.128117,-0.067173,-0.040416,-0.020415,-0.032282,-0.006119
1,28,en,https://en.wikipedia.org/wiki/Dalian,https://upload.wikimedia.org/wikipedia/commons...,Dalian,Research centres,Dalian / Education / Research centres,"Dalian Institute of Chemical Physics, of the C...",,,...,0.070782,0.014916,0.060768,0.137923,0.051047,0.125953,0.004426,0.083636,0.018341,0.077601
2,41,en,https://en.wikipedia.org/wiki/FMW_7th_Annivers...,https://upload.wikimedia.org/wikipedia/commons...,FMW 7th Anniversary Show,,FMW 7th Anniversary Show,Kawasaki Stadium,English: kawasaki_fujimi Stadium 日本語: 川崎富士見球技場...,,...,0.048340,0.031444,-0.178475,0.042817,0.119373,0.045854,0.090404,-0.012327,0.029681,0.181104
3,50,en,https://en.wikipedia.org/wiki/List_of_Bermuda_...,https://upload.wikimedia.org/wikipedia/commons...,List of Bermuda hurricanes,1960s,List of Bermuda hurricanes / List of storms / ...,"Hurricane Inga, one of the longest-lived Atlan...",Saffir-Simpson Hurricane ScaleTDTS12345 Englis...,Map showing the path and intensity of Hurrican...,...,-0.104979,-0.117618,-0.052297,-0.031839,-0.167693,-0.054938,0.005365,-0.095421,0.068992,0.103214
4,52,en,https://en.wikipedia.org/wiki/Parimelalhagar,https://upload.wikimedia.org/wikipedia/commons...,Parimelalhagar,Early life,Parimelalhagar / Early life,A page from the Parimelalhagar's commentary on...,English: A page from Arumuka Navalar's 1861 ed...,,...,0.013991,-0.050420,-0.043610,-0.030726,0.036060,-0.098819,0.025255,-0.061983,-0.008257,0.057020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540382,3704673,en,https://en.wikipedia.org/wiki/Standard_Electri...,https://upload.wikimedia.org/wikipedia/commons...,Standard Electric Time Company,,Standard Electric Time Company,A Standard 200177 fire alarm pull station,English: A Standard 200177 pull station in Har...,,...,-0.033670,-0.089871,-0.052723,-0.037692,-0.037566,0.038821,0.130745,-0.079322,0.099379,0.133040
540383,3704674,en,https://en.wikipedia.org/wiki/Malaysia_Airline...,https://upload.wikimedia.org/wikipedia/commons...,Malaysia Airlines Flight 370,Analysis,Malaysia Airlines Flight 370 / Investigation /...,A heat map indicating the probable location of...,English: Probability of the location where Mal...,,...,-0.060926,-0.013728,-0.057422,0.144767,-0.017854,-0.034770,-0.007270,0.046857,0.060193,0.218703
540384,3704675,en,https://en.wikipedia.org/wiki/Jackson_Plan,https://upload.wikimedia.org/wikipedia/commons...,Jackson Plan,Overall layout,Jackson Plan / Layout and effect of the plan /...,"Map of Singapore in 1914, the layout is now mo...","English: Map of the city of Singapore, ca 1914...",,...,0.071714,0.023376,0.013262,0.339486,0.056978,0.012006,0.018857,0.090633,-0.003516,-0.021940
540385,3704677,en,https://en.wikipedia.org/wiki/Candiacervus,https://upload.wikimedia.org/wikipedia/commons...,Candiacervus,Taxonomy,Candiacervus / Taxonomy,Hippopotamus creutzburgi and C. cretensis,English: My drawings of the two subspecies of ...,,...,0.145484,-0.006134,0.115790,-0.039335,0.091558,0.093865,0.052330,0.046950,0.023961,-0.041651


In [26]:
csim_df = pd.DataFrame()
for idx, fn in tqdm(enumerate(f_names)):
#     if idx == 2:
#         break
    df = pd.read_csv(f'{root_path}/{fn}', sep='\t')
    df = df.drop('Unnamed: 0', 1)
    df = df[df["language"]=='en']
    df = df[col_list]
#     df_1 = df_1.sort_values(by=['196_score'], ascending=False)
    df = df.reset_index(drop=True)
    csim_df = pd.concat([csim_df, df], ignore_index=True)
    

0it [00:00, ?it/s]

In [27]:
# csim_df.sort_values(by=['196_score'], ascending=False)
csim_df

Unnamed: 0,image_url,caption,context,concept_caption_score,concept_context_score,196_caption_score,412_caption_score,172_caption_score,363_caption_score,198_caption_score,...,137_context_score,441_context_score,192_context_score,20_context_score,296_context_score,144_context_score,12_context_score,224_context_score,250_context_score,376_context_score
0,https://upload.wikimedia.org/wikipedia/commons...,LibreOffice Math running on Ubuntu English: Sc...,LibreOffice is a free and open-source office s...,-0.040416,-0.020415,0.002686,0.042616,0.010594,-0.072686,-0.080525,...,-0.026707,0.017691,-0.051926,-0.086811,-0.037750,-0.025831,-0.045994,-0.067079,0.128117,-0.067173
1,https://upload.wikimedia.org/wikipedia/commons...,"Dalian Institute of Chemical Physics, of the C...",Dalian is a major sub-provincial port city in ...,0.004426,0.083636,-0.045319,-0.014597,0.050622,-0.052078,0.002163,...,0.090900,0.139327,0.108564,0.032219,0.070782,0.014916,0.060768,0.137923,0.051047,0.125953
2,https://upload.wikimedia.org/wikipedia/commons...,Kawasaki Stadium English: kawasaki_fujimi Stad...,FMW 7th Anniversary Show was a professional wr...,0.090404,-0.012327,-0.049097,-0.018047,-0.039384,-0.064661,0.037314,...,-0.093685,-0.231178,-0.015962,0.008826,0.048340,0.031444,-0.178475,0.042817,0.119373,0.045854
3,https://upload.wikimedia.org/wikipedia/commons...,"Hurricane Inga, one of the longest-lived Atlan...",The British Overseas Territory of Bermuda has ...,0.005365,-0.095421,-0.057836,-0.039939,-0.033494,-0.033917,-0.042589,...,-0.134550,-0.122756,-0.088162,-0.107879,-0.104979,-0.117618,-0.052297,-0.031839,-0.167693,-0.054938
4,https://upload.wikimedia.org/wikipedia/commons...,A page from the Parimelalhagar's commentary on...,"Parimelalhagar, also known as Vanthuvarai Peru...",0.025255,-0.061983,0.028717,-0.008050,0.003633,-0.019985,-0.025944,...,-0.037593,0.069883,-0.044136,-0.007787,0.013991,-0.050420,-0.043610,-0.030726,0.036060,-0.098819
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5411973,https://upload.wikimedia.org/wikipedia/commons...,"English: Jia Jingde, politician of China. 中文:...",This is a list of Vice Presidents of the Exami...,0.156600,0.093868,-0.021200,0.070536,0.045870,-0.022279,0.064005,...,0.064688,-0.003800,0.079544,0.097345,0.022783,0.035481,-0.009618,0.142356,0.097828,0.056724
5411974,https://upload.wikimedia.org/wikipedia/commons...,Beyer in 2013 DSC_9909.jpg,Brennen Beyer is an American football outside ...,0.185849,0.191958,0.046954,0.072107,0.109573,0.070777,0.164449,...,0.028834,0.043354,0.121015,0.170634,0.032242,0.086187,0.108909,-0.038890,0.102163,-0.002512
5411975,https://upload.wikimedia.org/wikipedia/commons...,Kannagi in Tamil Nadu. English: Idol of Kannak...,"Kannagi, sometimes spelled Kannaki, is a legen...",0.065777,0.075305,-0.023543,-0.038281,0.073440,0.010564,0.030676,...,0.136019,0.181244,0.122867,0.101799,0.063076,0.023685,0.042369,0.136377,0.022068,-0.019773
5411976,https://upload.wikimedia.org/wikipedia/commons...,English: Landscape with stormy clouds and a p...,"Don Det, is an island in the Mekong River in t...",0.051398,0.021199,0.041571,0.041349,0.022871,-0.075316,-0.023433,...,-0.032127,0.073285,0.013086,-0.011777,0.021279,0.020210,0.024772,0.081596,0.068120,0.066233


In [28]:
csim_df.describe()

Unnamed: 0,concept_caption_score,concept_context_score,196_caption_score,412_caption_score,172_caption_score,363_caption_score,198_caption_score,114_caption_score,419_caption_score,294_caption_score,...,137_context_score,441_context_score,192_context_score,20_context_score,296_context_score,144_context_score,12_context_score,224_context_score,250_context_score,376_context_score
count,5411978.0,5411978.0,5411978.0,5411978.0,5411978.0,5411978.0,5411978.0,5411978.0,5411978.0,5411978.0,...,5411978.0,5411978.0,5411978.0,5411978.0,5411978.0,5411978.0,5411978.0,5411978.0,5411978.0,5411978.0
mean,0.08732864,0.02756854,0.03316916,0.02898722,0.04938595,0.009565755,0.03607464,0.04620839,0.05406117,0.03718228,...,0.0139156,-0.003944045,0.003846679,0.006175797,0.03927394,0.000752269,0.01043332,0.01692591,0.01978357,0.007244231
std,0.06997118,0.06684423,0.06863361,0.06478608,0.06895159,0.06071932,0.07447689,0.06934013,0.07638213,0.06107245,...,0.07867535,0.07710654,0.061788,0.06737285,0.0873531,0.06067205,0.07671057,0.08884871,0.06969397,0.07776987
min,-0.2394902,-0.2688895,-0.2543995,-0.2410043,-0.2478864,-0.2473019,-0.2808687,-0.2716275,-0.2415859,-0.2307905,...,-0.3086933,-0.3402204,-0.2946304,-0.2907839,-0.2930294,-0.2660354,-0.3073893,-0.328301,-0.2828747,-0.384506
25%,0.03980459,-0.01782694,-0.01334092,-0.01534086,0.003201978,-0.03199529,-0.01532192,-0.001223383,0.002334806,-0.003728528,...,-0.03933429,-0.05569053,-0.03841725,-0.03938309,-0.02109603,-0.04113004,-0.04181451,-0.04457409,-0.02812582,-0.04579195
50%,0.086847,0.02613318,0.02696826,0.02532269,0.04495488,0.007484896,0.03195039,0.04330221,0.04744296,0.03457904,...,0.009939842,-0.005078442,0.002965325,0.00436971,0.0324168,-0.001201854,0.006485071,0.01087677,0.01813048,0.004907532
75%,0.1341138,0.07054175,0.0721949,0.06901627,0.08990105,0.04876487,0.08285133,0.09029126,0.09732897,0.07442215,...,0.06223764,0.0457986,0.0449262,0.04921226,0.0911472,0.04067654,0.05814612,0.07209412,0.06618579,0.05781632
max,0.7286295,0.6892114,0.7550066,0.6103925,0.6149387,0.5102714,0.61045,0.7142168,0.6140311,0.6116345,...,0.5473793,0.5563506,0.5750273,0.5424188,0.5708867,0.4586499,0.6041094,0.589149,0.4600561,0.5546166


In [29]:
# df_p = csim_df[csim_df['concept_caption_score'] >= csim_df['concept_caption_score'].mean()]
df_p = csim_df[csim_df['concept_caption_score'] >= 0.134]
df_p = df_p.reset_index(drop=True)
df_p

Unnamed: 0,image_url,caption,context,concept_caption_score,concept_context_score,196_caption_score,412_caption_score,172_caption_score,363_caption_score,198_caption_score,...,137_context_score,441_context_score,192_context_score,20_context_score,296_context_score,144_context_score,12_context_score,224_context_score,250_context_score,376_context_score
0,https://upload.wikimedia.org/wikipedia/commons...,Menachem Mendel Schneerson English: Menachem M...,1994 was a common year starting on Saturday of...,0.166272,0.094395,0.078204,0.065567,0.080967,0.041644,0.075154,...,-0.012260,0.019606,0.024275,0.076669,-0.067856,0.096434,-0.048101,-0.041107,0.026869,0.011452
1,https://upload.wikimedia.org/wikipedia/commons...,English: Image cropped from a baseball card o...,"Donald Robert ""Duffy"" Dyer is an American form...",0.162247,-0.034380,0.052320,0.042743,-0.008497,0.038228,0.071760,...,-0.118144,-0.099278,-0.026665,-0.015405,-0.002073,-0.119949,-0.022791,-0.088388,0.176188,0.028713
2,https://upload.wikimedia.org/wikipedia/commons...,"The special theory of relativity, formulated i...","In relativistic physics, a velocity-addition f...",0.142336,0.106813,0.129890,0.102847,0.105520,0.096102,0.069470,...,-0.017552,0.020111,0.036183,0.007514,0.020004,0.120406,0.044195,0.014316,-0.009373,0.032084
3,https://upload.wikimedia.org/wikipedia/commons...,English: Randy Stonehill Signature,Randall Evan Stonehill is an American singer a...,0.223324,0.092803,0.123127,0.087765,0.120790,0.042908,0.068349,...,0.108193,-0.031020,-0.006661,0.128765,0.160655,-0.025057,0.193492,-0.012994,0.125541,0.050723
4,https://upload.wikimedia.org/wikipedia/commons...,English: United States Senator William Proxmi...,Edward William Proxmire was an American politi...,0.171389,0.067607,0.028057,0.090903,0.127647,0.020825,0.029832,...,0.066258,-0.010974,-0.011341,0.085126,-0.066214,-0.029877,0.043140,-0.093784,0.008349,-0.041261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1355833,https://upload.wikimedia.org/wikipedia/commons...,Massachusetts Bay Transportation Authority 170...,The Urban Transportation Development Corporati...,0.147268,0.091816,0.183336,0.155670,0.233472,0.025314,0.184065,...,0.122498,0.018380,0.130792,-0.011854,0.102683,0.041830,0.039997,0.298343,0.100381,0.069348
1355834,https://upload.wikimedia.org/wikipedia/commons...,A map showing the path of State Highway 22 in...,State Highway 22 is a state highway in Oklahom...,0.207656,0.078112,0.255292,0.206986,0.297474,0.104774,0.176442,...,0.217521,-0.047160,0.008157,0.072613,0.177827,-0.073539,0.180896,0.140562,-0.008423,0.056389
1355835,https://upload.wikimedia.org/wikipedia/commons...,English: CIPET,Central Institute of Petrochemical Engineering...,0.142362,-0.005783,0.031377,0.048821,0.112177,-0.011580,0.040081,...,0.087363,-0.050901,0.013602,0.041040,-0.003768,-0.039447,0.023529,0.135025,-0.000438,-0.107296
1355836,https://upload.wikimedia.org/wikipedia/commons...,"English: Jia Jingde, politician of China. 中文:...",This is a list of Vice Presidents of the Exami...,0.156600,0.093868,-0.021200,0.070536,0.045870,-0.022279,0.064005,...,0.064688,-0.003800,0.079544,0.097345,0.022783,0.035481,-0.009618,0.142356,0.097828,0.056724


In [30]:
df_p.describe()

Unnamed: 0,concept_caption_score,concept_context_score,196_caption_score,412_caption_score,172_caption_score,363_caption_score,198_caption_score,114_caption_score,419_caption_score,294_caption_score,...,137_context_score,441_context_score,192_context_score,20_context_score,296_context_score,144_context_score,12_context_score,224_context_score,250_context_score,376_context_score
count,1355838.0,1355838.0,1355838.0,1355838.0,1355838.0,1355838.0,1355838.0,1355838.0,1355838.0,1355838.0,...,1355838.0,1355838.0,1355838.0,1355838.0,1355838.0,1355838.0,1355838.0,1355838.0,1355838.0,1355838.0
mean,0.1764577,0.06276443,0.08559561,0.08686949,0.1113165,0.05740445,0.09280093,0.1027839,0.1135433,0.09396847,...,0.0429175,0.01191672,0.01923718,0.02708928,0.05571533,0.02264773,0.03161171,0.04298287,0.03614905,0.02557106
std,0.03635343,0.06590349,0.07012287,0.05866437,0.06498859,0.05323604,0.07046865,0.06140767,0.07860132,0.05407793,...,0.08439863,0.07665261,0.06044843,0.06729007,0.08930775,0.05915104,0.07877738,0.09391404,0.06996516,0.07838058
min,0.134,-0.2344599,-0.1492556,-0.1255416,-0.1148668,-0.1664211,-0.1644311,-0.1255432,-0.1382757,-0.1218244,...,-0.2908896,-0.3307963,-0.2700528,-0.2722588,-0.2784553,-0.2538928,-0.265859,-0.3016184,-0.2675191,-0.2956381
25%,0.148891,0.01922829,0.03577911,0.04545975,0.06611484,0.02048744,0.04381821,0.06141226,0.05866365,0.05715708,...,-0.0143626,-0.03960708,-0.02188648,-0.01804999,-0.005748722,-0.01798432,-0.0225525,-0.02260899,-0.01217066,-0.02787367
50%,0.1672995,0.06164784,0.07626972,0.08135188,0.1036905,0.05396095,0.08852602,0.09911543,0.1033881,0.08842104,...,0.03792316,0.01096774,0.01854405,0.02545261,0.04851583,0.02138509,0.02751547,0.03533991,0.03457581,0.02304182
75%,0.194579,0.1050258,0.1254801,0.122635,0.1480672,0.09003481,0.1365627,0.1395021,0.1575852,0.1243169,...,0.09446006,0.0617887,0.05941956,0.06989236,0.1075833,0.06125178,0.08111304,0.1001635,0.08310743,0.07678358
max,0.7286295,0.6892114,0.7550066,0.6103925,0.6149387,0.5102714,0.61045,0.7142168,0.6140311,0.6116345,...,0.5473793,0.5563506,0.5363311,0.5145766,0.5608774,0.4586499,0.6041094,0.589149,0.4173216,0.5398021


In [31]:
# ground truth
df_dict = dict()
for col in caption_score_list:
    num, _ = col.split('_', 1)
    c_list = [col] + [f'{num}_context_score'] + im_c + cap_s + con_s
    df = df_p[c_list]
    df = df.sort_values(by=[col], ascending=False)
    df_dict[col] = df

In [32]:
df_dict = dict()
for col in caption_score_list:
    num, _ = col.split('_', 1)
    c_list = [col] + [f'{num}_context_score'] + im_c + cap_s + con_s
    df = df_p[c_list]
    df = df[df[col] > df[col].quantile(0.9999)]
#     df = df[df[col] > 0.55]
    df = df.sort_values(by=[col], ascending=False)
    df_dict[col] = df

In [33]:
df_dict[col]

Unnamed: 0,376_caption_score,376_context_score,image_url,caption,context,concept_caption_score,concept_context_score
869924,0.567895,0.407775,https://upload.wikimedia.org/wikipedia/commons...,English: Recreation of Minnie Mouse's signatu...,This is a list of characters that have appeare...,0.189813,0.156888
788798,0.567709,0.407775,https://upload.wikimedia.org/wikipedia/commons...,English: Recreation of Mickey Mouse's signatu...,This is a list of characters that have appeare...,0.180582,0.156888
841795,0.543113,0.423695,https://upload.wikimedia.org/wikipedia/commons...,Cars 623 and 717 passing on the Red Car Troll...,"The Red Car Trolley is a 1,000 mm metre gauge ...",0.200099,0.132485
667637,0.537630,0.350208,https://upload.wikimedia.org/wikipedia/commons...,Disney villains at Disneyland's Mickey's Hallo...,Mickey's Halloween Party was an annual Hallowe...,0.199137,0.043375
184343,0.533986,0.539802,https://upload.wikimedia.org/wikipedia/commons...,Theatrical release poster English: Poster for ...,Trolley Troubles is a 1927 animated short subj...,0.172679,0.117095
...,...,...,...,...,...,...,...
597534,0.441235,0.146075,https://upload.wikimedia.org/wikipedia/commons...,English: Iconic Toy Train Station,"The Railway Museum at Mysore, India is an outd...",0.241279,0.132612
220985,0.440948,0.407775,https://upload.wikimedia.org/wikipedia/commons...,English: Recreation of Tweedledee's signature...,This is a list of characters that have appeare...,0.151732,0.156888
20310,0.440487,0.061867,https://upload.wikimedia.org/wikipedia/commons...,"English: A Disney bus in Walt Disney World, F...",The Low Floor Series bus is a series of transi...,0.176181,0.056294
937662,0.439831,-0.123846,https://upload.wikimedia.org/wikipedia/en/9/93...,"The official school mascot, Willy. He is the o...","ICCT Colleges Foundation Inc., is a tertiary e...",0.211186,-0.042735


In [34]:
from IPython.display import display

In [35]:
for el in df_dict:
    t,_ = el.split('_', 1)
#     print(t)
    df_dict[el].columns = [f'{t}_caption_sim', f'{t}_context_sim', 'image_url',
                          'caption', 'context', f'ped_caption_sim', f'ped_context_sim']
    display(df_dict[el].describe().round(2))
    print('\n')

Unnamed: 0,196_caption_sim,196_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.56,0.3,0.44,0.23
std,0.05,0.24,0.11,0.19
min,0.51,-0.14,0.24,-0.06
25%,0.51,0.11,0.35,0.08
50%,0.54,0.25,0.43,0.16
75%,0.57,0.46,0.52,0.36
max,0.76,0.75,0.73,0.69






Unnamed: 0,412_caption_sim,412_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.46,0.19,0.52,0.19
std,0.04,0.16,0.08,0.17
min,0.43,-0.09,0.2,-0.07
25%,0.44,0.06,0.48,0.07
50%,0.45,0.17,0.53,0.13
75%,0.47,0.3,0.56,0.26
max,0.61,0.65,0.73,0.69






Unnamed: 0,172_caption_sim,172_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.5,0.18,0.5,0.17
std,0.03,0.16,0.09,0.17
min,0.46,-0.07,0.25,-0.08
25%,0.47,0.05,0.44,0.06
50%,0.49,0.14,0.52,0.11
75%,0.51,0.3,0.55,0.25
max,0.61,0.56,0.73,0.69






Unnamed: 0,363_caption_sim,363_context_sim,ped_caption_sim,ped_context_sim
count,135.0,135.0,135.0,135.0
mean,0.37,0.18,0.37,0.17
std,0.02,0.12,0.14,0.13
min,0.35,-0.06,0.14,-0.06
25%,0.35,0.09,0.28,0.09
50%,0.37,0.18,0.32,0.15
75%,0.38,0.28,0.48,0.21
max,0.51,0.5,0.73,0.68






Unnamed: 0,198_caption_sim,198_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.49,0.21,0.36,0.14
std,0.03,0.15,0.12,0.1
min,0.45,-0.14,0.15,-0.04
25%,0.46,0.11,0.27,0.08
50%,0.48,0.18,0.33,0.13
75%,0.5,0.31,0.4,0.21
max,0.61,0.61,0.73,0.5






Unnamed: 0,114_caption_sim,114_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.62,0.47,0.29,0.16
std,0.03,0.09,0.04,0.07
min,0.59,0.06,0.17,-0.08
25%,0.6,0.42,0.27,0.11
50%,0.61,0.48,0.29,0.17
75%,0.62,0.52,0.33,0.2
max,0.71,0.63,0.38,0.27






Unnamed: 0,419_caption_sim,419_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.52,0.27,0.33,0.15
std,0.02,0.15,0.13,0.12
min,0.5,-0.06,0.14,-0.07
25%,0.5,0.16,0.24,0.08
50%,0.51,0.27,0.28,0.13
75%,0.53,0.39,0.4,0.21
max,0.61,0.55,0.73,0.69






Unnamed: 0,294_caption_sim,294_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.46,0.21,0.45,0.19
std,0.04,0.16,0.15,0.15
min,0.42,-0.09,0.14,-0.06
25%,0.43,0.08,0.31,0.09
50%,0.44,0.2,0.5,0.15
75%,0.47,0.36,0.55,0.24
max,0.61,0.61,0.73,0.69






Unnamed: 0,388_caption_sim,388_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.39,0.17,0.33,0.11
std,0.04,0.14,0.15,0.11
min,0.34,-0.14,0.14,-0.14
25%,0.35,0.06,0.22,0.05
50%,0.37,0.15,0.29,0.11
75%,0.4,0.26,0.42,0.16
max,0.54,0.5,0.73,0.5






Unnamed: 0,314_caption_sim,314_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.53,0.23,0.33,0.15
std,0.03,0.17,0.13,0.13
min,0.49,-0.03,0.14,-0.08
25%,0.5,0.11,0.23,0.07
50%,0.52,0.2,0.29,0.13
75%,0.54,0.32,0.36,0.19
max,0.68,0.67,0.73,0.69






Unnamed: 0,98_caption_sim,98_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.58,0.29,0.37,0.17
std,0.03,0.17,0.11,0.12
min,0.55,-0.05,0.21,-0.06
25%,0.56,0.16,0.3,0.1
50%,0.57,0.24,0.34,0.13
75%,0.6,0.45,0.41,0.21
max,0.73,0.63,0.73,0.68






Unnamed: 0,80_caption_sim,80_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.42,0.24,0.31,0.17
std,0.03,0.15,0.14,0.11
min,0.38,-0.1,0.13,-0.05
25%,0.39,0.13,0.21,0.08
50%,0.41,0.26,0.26,0.16
75%,0.43,0.35,0.34,0.23
max,0.51,0.59,0.73,0.5






Unnamed: 0,134_caption_sim,134_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.57,0.38,0.26,0.11
std,0.02,0.13,0.1,0.07
min,0.54,0.0,0.13,-0.02
25%,0.55,0.31,0.2,0.08
50%,0.56,0.4,0.23,0.1
75%,0.58,0.48,0.29,0.15
max,0.66,0.62,0.67,0.46






Unnamed: 0,432_caption_sim,432_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.45,0.2,0.22,0.08
std,0.02,0.14,0.12,0.08
min,0.42,-0.07,0.13,-0.07
25%,0.43,0.09,0.16,0.02
50%,0.44,0.21,0.18,0.07
75%,0.46,0.31,0.23,0.14
max,0.53,0.61,0.73,0.45






Unnamed: 0,394_caption_sim,394_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.52,0.22,0.44,0.13
std,0.03,0.14,0.13,0.11
min,0.5,-0.07,0.17,-0.07
25%,0.5,0.12,0.37,0.06
50%,0.51,0.21,0.46,0.11
75%,0.53,0.31,0.53,0.17
max,0.68,0.59,0.73,0.57






Unnamed: 0,53_caption_sim,53_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.45,0.22,0.38,0.13
std,0.03,0.15,0.16,0.13
min,0.41,-0.1,0.13,-0.09
25%,0.42,0.09,0.24,0.05
50%,0.43,0.19,0.36,0.1
75%,0.47,0.35,0.53,0.18
max,0.57,0.54,0.73,0.52






Unnamed: 0,56_caption_sim,56_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.56,0.32,0.29,0.14
std,0.02,0.13,0.09,0.06
min,0.53,-0.06,0.14,-0.05
25%,0.54,0.23,0.23,0.1
50%,0.55,0.34,0.27,0.14
75%,0.56,0.41,0.32,0.17
max,0.67,0.58,0.7,0.3






Unnamed: 0,150_caption_sim,150_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.51,0.27,0.35,0.17
std,0.03,0.14,0.14,0.13
min,0.48,-0.13,0.14,-0.06
25%,0.49,0.18,0.23,0.09
50%,0.5,0.26,0.32,0.15
75%,0.53,0.38,0.46,0.22
max,0.64,0.53,0.73,0.69






Unnamed: 0,278_caption_sim,278_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.57,0.19,0.38,0.1
std,0.03,0.14,0.15,0.11
min,0.54,-0.08,0.16,-0.09
25%,0.55,0.09,0.25,0.04
50%,0.56,0.17,0.34,0.09
75%,0.59,0.24,0.53,0.13
max,0.71,0.59,0.73,0.57






Unnamed: 0,-1_caption_sim,-1_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.46,0.14,0.31,0.07
std,0.02,0.09,0.15,0.08
min,0.45,-0.08,0.14,-0.07
25%,0.45,0.08,0.19,0.03
50%,0.46,0.13,0.25,0.07
75%,0.47,0.2,0.4,0.11
max,0.57,0.39,0.73,0.5






Unnamed: 0,43_caption_sim,43_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.51,0.34,0.28,0.18
std,0.03,0.14,0.08,0.09
min,0.48,-0.11,0.13,-0.03
25%,0.49,0.27,0.25,0.12
50%,0.51,0.35,0.27,0.19
75%,0.53,0.46,0.31,0.26
max,0.64,0.58,0.73,0.33






Unnamed: 0,265_caption_sim,265_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.43,0.22,0.32,0.12
std,0.04,0.16,0.16,0.12
min,0.39,-0.05,0.14,-0.08
25%,0.4,0.1,0.19,0.05
50%,0.41,0.2,0.25,0.1
75%,0.43,0.33,0.47,0.16
max,0.56,0.57,0.73,0.68






Unnamed: 0,345_caption_sim,345_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.47,0.24,0.39,0.17
std,0.04,0.17,0.15,0.13
min,0.43,-0.12,0.14,-0.06
25%,0.44,0.13,0.27,0.08
50%,0.46,0.22,0.36,0.13
75%,0.49,0.35,0.53,0.24
max,0.67,0.66,0.73,0.69






Unnamed: 0,378_caption_sim,378_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.53,0.33,0.29,0.13
std,0.02,0.12,0.11,0.09
min,0.51,0.01,0.14,-0.07
25%,0.52,0.27,0.26,0.08
50%,0.52,0.34,0.26,0.11
75%,0.53,0.41,0.26,0.15
max,0.63,0.54,0.67,0.45






Unnamed: 0,11_caption_sim,11_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.54,0.32,0.24,0.12
std,0.02,0.13,0.11,0.08
min,0.52,0.01,0.14,-0.02
25%,0.52,0.22,0.18,0.07
50%,0.53,0.34,0.21,0.1
75%,0.54,0.42,0.26,0.15
max,0.61,0.54,0.73,0.45






Unnamed: 0,331_caption_sim,331_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.5,0.23,0.35,0.1
std,0.02,0.13,0.15,0.08
min,0.47,-0.05,0.14,-0.07
25%,0.48,0.14,0.21,0.06
50%,0.49,0.23,0.32,0.1
75%,0.51,0.34,0.48,0.14
max,0.61,0.55,0.73,0.34






Unnamed: 0,202_caption_sim,202_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.45,0.22,0.38,0.17
std,0.03,0.15,0.15,0.13
min,0.42,-0.08,0.14,-0.05
25%,0.42,0.11,0.25,0.09
50%,0.44,0.21,0.37,0.15
75%,0.46,0.31,0.52,0.22
max,0.62,0.57,0.73,0.69






Unnamed: 0,169_caption_sim,169_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.48,0.28,0.35,0.14
std,0.03,0.19,0.15,0.12
min,0.45,-0.02,0.14,-0.07
25%,0.45,0.11,0.23,0.07
50%,0.47,0.26,0.3,0.14
75%,0.49,0.45,0.51,0.19
max,0.62,0.62,0.73,0.68






Unnamed: 0,327_caption_sim,327_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.54,0.27,0.24,0.1
std,0.02,0.15,0.04,0.07
min,0.52,-0.04,0.13,-0.07
25%,0.53,0.15,0.22,0.05
50%,0.54,0.28,0.24,0.11
75%,0.55,0.39,0.26,0.16
max,0.63,0.58,0.39,0.28






Unnamed: 0,309_caption_sim,309_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.52,0.36,0.24,0.1
std,0.02,0.12,0.12,0.06
min,0.49,-0.06,0.14,-0.04
25%,0.5,0.28,0.17,0.06
50%,0.51,0.38,0.21,0.1
75%,0.53,0.44,0.26,0.15
max,0.6,0.61,0.73,0.33






Unnamed: 0,68_caption_sim,68_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.47,0.26,0.25,0.11
std,0.02,0.13,0.1,0.08
min,0.44,-0.08,0.14,-0.07
25%,0.45,0.18,0.19,0.06
50%,0.46,0.28,0.23,0.11
75%,0.48,0.35,0.28,0.15
max,0.57,0.49,0.7,0.48






Unnamed: 0,216_caption_sim,216_context_sim,ped_caption_sim,ped_context_sim
count,135.0,135.0,135.0,135.0
mean,0.53,0.32,0.23,0.11
std,0.03,0.15,0.09,0.08
min,0.49,-0.04,0.13,-0.1
25%,0.51,0.2,0.17,0.05
50%,0.52,0.35,0.2,0.11
75%,0.54,0.45,0.26,0.15
max,0.63,0.63,0.67,0.41






Unnamed: 0,40_caption_sim,40_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.54,0.36,0.29,0.11
std,0.02,0.11,0.1,0.07
min,0.52,-0.03,0.16,-0.03
25%,0.53,0.29,0.24,0.06
50%,0.54,0.38,0.27,0.1
75%,0.55,0.43,0.31,0.15
max,0.61,0.53,0.73,0.41






Unnamed: 0,99_caption_sim,99_context_sim,ped_caption_sim,ped_context_sim
count,135.0,135.0,135.0,135.0
mean,0.42,0.17,0.36,0.12
std,0.04,0.14,0.16,0.12
min,0.39,-0.1,0.14,-0.14
25%,0.4,0.07,0.21,0.06
50%,0.41,0.15,0.33,0.09
75%,0.43,0.28,0.51,0.16
max,0.57,0.49,0.73,0.5






Unnamed: 0,426_caption_sim,426_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.37,0.17,0.32,0.11
std,0.02,0.11,0.15,0.1
min,0.35,-0.04,0.14,-0.08
25%,0.35,0.08,0.2,0.05
50%,0.36,0.17,0.25,0.09
75%,0.38,0.25,0.43,0.15
max,0.48,0.52,0.73,0.54






Unnamed: 0,344_caption_sim,344_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.48,0.23,0.33,0.09
std,0.03,0.12,0.16,0.11
min,0.45,-0.04,0.14,-0.11
25%,0.46,0.15,0.19,0.02
50%,0.47,0.25,0.27,0.07
75%,0.48,0.3,0.46,0.14
max,0.58,0.52,0.73,0.56






Unnamed: 0,47_caption_sim,47_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.6,0.54,0.22,0.17
std,0.03,0.12,0.06,0.05
min,0.57,0.13,0.13,-0.01
25%,0.58,0.46,0.19,0.14
50%,0.59,0.58,0.22,0.17
75%,0.62,0.63,0.25,0.2
max,0.74,0.74,0.44,0.3






Unnamed: 0,8_caption_sim,8_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.47,0.27,0.31,0.16
std,0.02,0.14,0.08,0.08
min,0.44,-0.09,0.15,-0.02
25%,0.45,0.16,0.27,0.1
50%,0.45,0.3,0.27,0.16
75%,0.48,0.38,0.34,0.2
max,0.58,0.59,0.7,0.42






Unnamed: 0,416_caption_sim,416_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.36,0.18,0.34,0.14
std,0.05,0.15,0.16,0.14
min,0.33,-0.06,0.13,-0.07
25%,0.33,0.06,0.2,0.05
50%,0.35,0.17,0.27,0.11
75%,0.36,0.29,0.49,0.19
max,0.66,0.55,0.73,0.68






Unnamed: 0,112_caption_sim,112_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.51,0.25,0.37,0.13
std,0.03,0.13,0.14,0.09
min,0.48,-0.08,0.14,-0.04
25%,0.49,0.17,0.25,0.09
50%,0.5,0.27,0.32,0.12
75%,0.53,0.34,0.48,0.16
max,0.62,0.55,0.73,0.69






Unnamed: 0,137_caption_sim,137_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.52,0.23,0.4,0.14
std,0.02,0.13,0.14,0.1
min,0.5,-0.09,0.19,-0.03
25%,0.51,0.13,0.27,0.08
50%,0.52,0.24,0.39,0.12
75%,0.54,0.33,0.52,0.16
max,0.61,0.53,0.73,0.69






Unnamed: 0,441_caption_sim,441_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.43,0.19,0.21,0.06
std,0.04,0.16,0.12,0.09
min,0.39,-0.07,0.13,-0.11
25%,0.4,0.06,0.15,0.02
50%,0.41,0.17,0.17,0.06
75%,0.45,0.3,0.2,0.1
max,0.57,0.56,0.73,0.69






Unnamed: 0,192_caption_sim,192_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.37,0.22,0.23,0.12
std,0.05,0.13,0.09,0.1
min,0.31,-0.05,0.14,-0.09
25%,0.32,0.13,0.18,0.05
50%,0.35,0.22,0.22,0.1
75%,0.43,0.33,0.25,0.19
max,0.48,0.54,0.62,0.54






Unnamed: 0,20_caption_sim,20_context_sim,ped_caption_sim,ped_context_sim
count,135.0,135.0,135.0,135.0
mean,0.45,0.24,0.25,0.1
std,0.03,0.15,0.11,0.09
min,0.42,-0.06,0.14,-0.06
25%,0.43,0.11,0.18,0.05
50%,0.44,0.26,0.22,0.08
75%,0.47,0.36,0.26,0.14
max,0.62,0.48,0.7,0.48






Unnamed: 0,296_caption_sim,296_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.55,0.34,0.2,0.08
std,0.03,0.14,0.07,0.06
min,0.52,-0.03,0.14,-0.09
25%,0.53,0.26,0.16,0.03
50%,0.54,0.38,0.18,0.08
75%,0.56,0.45,0.22,0.12
max,0.67,0.55,0.7,0.32






Unnamed: 0,144_caption_sim,144_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.37,0.07,0.32,0.09
std,0.02,0.09,0.14,0.1
min,0.35,-0.12,0.15,-0.23
25%,0.35,0.02,0.22,0.04
50%,0.36,0.06,0.28,0.07
75%,0.38,0.12,0.41,0.13
max,0.44,0.46,0.73,0.54






Unnamed: 0,12_caption_sim,12_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.47,0.23,0.3,0.08
std,0.04,0.13,0.14,0.09
min,0.43,-0.07,0.14,-0.06
25%,0.44,0.12,0.2,0.02
50%,0.46,0.23,0.26,0.07
75%,0.48,0.33,0.35,0.12
max,0.63,0.49,0.73,0.46






Unnamed: 0,224_caption_sim,224_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.5,0.32,0.27,0.12
std,0.03,0.12,0.11,0.07
min,0.48,0.02,0.14,-0.02
25%,0.49,0.26,0.19,0.07
50%,0.49,0.34,0.22,0.12
75%,0.51,0.4,0.29,0.16
max,0.64,0.59,0.67,0.37






Unnamed: 0,250_caption_sim,250_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.34,0.16,0.23,0.07
std,0.02,0.09,0.12,0.09
min,0.31,-0.08,0.13,-0.08
25%,0.32,0.1,0.16,0.02
50%,0.33,0.16,0.19,0.06
75%,0.34,0.21,0.24,0.11
max,0.45,0.37,0.65,0.5






Unnamed: 0,376_caption_sim,376_context_sim,ped_caption_sim,ped_context_sim
count,136.0,136.0,136.0,136.0
mean,0.47,0.3,0.19,0.1
std,0.03,0.15,0.04,0.07
min,0.44,-0.12,0.14,-0.15
25%,0.45,0.19,0.16,0.05
50%,0.46,0.37,0.18,0.1
75%,0.48,0.41,0.21,0.16
max,0.57,0.54,0.3,0.27






In [36]:
topics = get_topics(concept='pedestrian')

In [37]:
topics

{196: {'cross',
  'crossing',
  'crosswalk',
  'pedestrian',
  'signal',
  'stripe',
  'traffic'},
 412: {'arrest',
  'pedestrian',
  'perp',
  'pers',
  'walk',
  'walkability',
  'walkable'},
 172: {'car',
  'lane',
  'pedestrian',
  'road',
  'street',
  'traffic',
  'transportation',
  'vehicle'},
 363: {'assault',
  'bystander',
  'intervene',
  'intervention',
  'pedestrian',
  'stalk',
  'stalker',
  'victim',
  'witness'},
 198: {'car',
  'garage',
  'park',
  'parking',
  'parkjockey',
  'pedestrian',
  'tow',
  'vehicle'},
 114: {'athlete',
  'disability',
  'paralympic',
  'paralympics',
  'pedestrian',
  'sport',
  'sportspeople',
  'wheelchair'},
 419: {'avenue',
  'boulevard',
  'intersection',
  'manhattan',
  'pedestrian',
  'road',
  'street',
  'suffix'},
 294: {'drive',
  'fine',
  'licence',
  'offence',
  'pedestrian',
  'penalty',
  'reckless',
  'speed',
  'ticket',
  'traffic',
  'violation'},
 388: {'brownian',
  'diffusion',
  'distribution',
  'markov',
  'pe

In [38]:
def simple_search(context, t):
    percentage = 0
    words = list(topics[t])
    for w in words:
        if w in context:
            percentage = percentage + 1
    return float(percentage/len(words))

In [39]:
list(topics[376])

['lucy',
 'cartoon',
 'charlie',
 'bear',
 'snoopy',
 'pedestrian',
 'disney',
 'mickey',
 'trolley',
 'peanut']

In [40]:
df_dict[el]['context'].tolist()[0]

'This is a list of characters that have appeared as interactive characters in Disney theme parks.\nAtmosphere characters in Disney theme parks have oversized heads and cannot talk to guests, except for very uncommon articulated characters.'

In [41]:
print(el)
simple_search(df_dict[el]['context'].tolist()[0], 376)

376_caption_score


0.0

In [42]:
for t in topics:
    df_key = str(t) + '_caption_score'
    df_dict[df_key]['context_gt'] = df_dict[df_key]['context'].apply(simple_search, t=t)


In [43]:
df_dict['412_caption_score'].describe().round(2)

Unnamed: 0,412_caption_sim,412_context_sim,ped_caption_sim,ped_context_sim,context_gt
count,136.0,136.0,136.0,136.0,136.0
mean,0.46,0.19,0.52,0.19,0.12
std,0.04,0.16,0.08,0.17,0.15
min,0.43,-0.09,0.2,-0.07,0.0
25%,0.44,0.06,0.48,0.07,0.0
50%,0.45,0.17,0.53,0.13,0.07
75%,0.47,0.3,0.56,0.26,0.14
max,0.61,0.65,0.73,0.69,0.57


In [44]:
df_dict['412_caption_score']

Unnamed: 0,412_caption_sim,412_context_sim,image_url,caption,context,ped_caption_sim,ped_context_sim,context_gt
874765,0.610392,0.199226,https://upload.wikimedia.org/wikipedia/commons...,Pedestrian trail English: Pedestrian trail,The Blue Water River Walk is a nearly one mile...,0.704843,0.105616,0.142857
479397,0.586082,0.008303,https://upload.wikimedia.org/wikipedia/commons...,Pedestrian walk in Ferizaj English: Ferizaj City,"Ferizaj, in southeastern Kosovo, is its third-...",0.595274,0.053994,0.000000
160058,0.573594,0.481619,https://upload.wikimedia.org/wikipedia/commons...,Prohibition of pedestrians (includes any kind ...,Jaywalking occurs when a pedestrian walks in o...,0.548525,0.483317,0.428571
496246,0.565820,0.190722,https://upload.wikimedia.org/wikipedia/commons...,English: Pedestrian crossing,Road signs used by countries in the Americas a...,0.728629,0.157267,0.000000
812796,0.565820,0.160237,https://upload.wikimedia.org/wikipedia/commons...,English: Pedestrian crossing,Road signs in Malaysia are standardised road s...,0.728629,0.152377,0.000000
...,...,...,...,...,...,...,...,...
276205,0.426847,0.367209,https://upload.wikimedia.org/wikipedia/commons...,Construction of the new Covered Pedestrian Br...,The Riverview Covered Pedestrian Bridge is a w...,0.552095,0.452756,0.142857
335778,0.426847,0.367209,https://upload.wikimedia.org/wikipedia/commons...,Construction of the new Covered Pedestrian Br...,The Riverview Covered Pedestrian Bridge is a w...,0.552095,0.452756,0.142857
194,0.426847,0.367209,https://upload.wikimedia.org/wikipedia/commons...,Construction of the new Covered Pedestrian Br...,The Riverview Covered Pedestrian Bridge is a w...,0.552095,0.452756,0.142857
814077,0.426273,-0.059679,https://upload.wikimedia.org/wikipedia/commons...,Pedestrian route through the external rotunda,"The Neue Staatsgalerie in Stuttgart, Germany, ...",0.578300,-0.016740,0.000000


In [45]:
pg_dict = dict()
pg_dict['a'] = [68, 112, 56, 224]
pg_dict['b'] = [309, 344, 47, -1, 53]
pg_dict['c'] = [314, 331, 419, 394]
pg_dict['d'] = [98, 196, 202, 294]
pg_dict['e'] = [172, 137, 11, 134, 150, 216]
pg_dict['f'] = [363, 40, 43]
pg_dict['g'] = [441, 376]
pg_dict['h'] = [144, 416, 327]
pg_dict['i'] = [192, 20, 114]
pg_dict['j'] = [345]
pg_dict['k'] = [8, 265]
pg_dict['l'] = [250, 80, 426]
pg_dict['m'] = [169, 99, 378]
pg_dict['n'] = [432, 388, 412]
pg_dict['o'] = [12, 278, 198, 296] 

In [46]:
k_name = ['image_url', 'caption', 'context', 'concept2caption_sim', 
          'concept2context_sim', 'context_gt']
for el in pg_dict:
    pg_dict[el] = [f'{i}_caption_score' for i in pg_dict[el]]
    pg_dict[el] = [df_dict[i] for i in pg_dict[el]]
    pg_dict[el] = [df_.reset_index(drop=True) for df_ in pg_dict[el]]
    

In [47]:
for el in pg_dict:
    t_df = pd.DataFrame(columns = ['topic2caption_sim', 'topic2context_sim'] + k_name)
    for df_ in pg_dict[el]:
        df_.columns = ['topic2caption_sim', 'topic2context_sim'] + k_name
        t_df = pd.concat([t_df, df_], ignore_index=True)
    pg_dict[el] = t_df

In [48]:
for el in pg_dict:
    print(el)
    display(pg_dict[el].describe().round(2))


a


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,544.0,544.0,544.0,544.0,544.0
mean,0.51,0.29,0.29,0.12,0.19
std,0.04,0.13,0.12,0.07,0.16
min,0.44,-0.08,0.14,-0.07,0.0
25%,0.48,0.21,0.21,0.08,0.1
50%,0.5,0.31,0.26,0.12,0.12
75%,0.54,0.38,0.32,0.16,0.3
max,0.67,0.59,0.73,0.69,0.78


b


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,680.0,680.0,680.0,680.0,680.0
mean,0.5,0.3,0.3,0.11,0.17
std,0.06,0.19,0.14,0.1,0.16
min,0.41,-0.1,0.13,-0.11,0.0
25%,0.46,0.14,0.19,0.05,0.0
50%,0.48,0.29,0.24,0.1,0.11
75%,0.54,0.43,0.37,0.17,0.3
max,0.74,0.74,0.73,0.56,0.7


c


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,544.0,544.0,544.0,544.0,544.0
mean,0.52,0.24,0.36,0.13,0.15
std,0.03,0.15,0.14,0.11,0.19
min,0.47,-0.07,0.14,-0.08,0.0
25%,0.5,0.13,0.24,0.07,0.0
50%,0.51,0.23,0.32,0.11,0.11
75%,0.53,0.35,0.48,0.18,0.22
max,0.68,0.67,0.73,0.69,1.0


d


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,544.0,544.0,544.0,544.0,544.0
mean,0.51,0.26,0.41,0.19,0.22
std,0.07,0.19,0.13,0.15,0.24
min,0.42,-0.14,0.14,-0.06,0.0
25%,0.44,0.11,0.29,0.09,0.0
50%,0.51,0.22,0.41,0.15,0.14
75%,0.56,0.38,0.53,0.24,0.36
max,0.76,0.75,0.73,0.69,1.0


e


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,815.0,815.0,815.0,815.0,815.0
mean,0.53,0.28,0.33,0.14,0.21
std,0.04,0.15,0.15,0.11,0.2
min,0.46,-0.13,0.13,-0.1,0.0
25%,0.5,0.16,0.21,0.07,0.0
50%,0.52,0.29,0.27,0.11,0.14
75%,0.55,0.41,0.46,0.17,0.3
max,0.66,0.63,0.73,0.69,0.88


f


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,407.0,407.0,407.0,407.0,407.0
mean,0.48,0.29,0.31,0.16,0.19
std,0.08,0.15,0.11,0.1,0.18
min,0.35,-0.11,0.13,-0.06,0.0
25%,0.38,0.18,0.25,0.08,0.0
50%,0.51,0.31,0.28,0.14,0.12
75%,0.53,0.41,0.33,0.21,0.3
max,0.64,0.58,0.73,0.68,0.75


g


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,272.0,272.0,272.0,272.0,272.0
mean,0.45,0.24,0.2,0.08,0.04
std,0.04,0.16,0.09,0.08,0.07
min,0.39,-0.12,0.13,-0.15,0.0
25%,0.41,0.11,0.16,0.03,0.0
50%,0.45,0.26,0.18,0.08,0.0
75%,0.48,0.41,0.21,0.13,0.1
max,0.57,0.56,0.73,0.69,0.33


h


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,408.0,408.0,408.0,408.0,408.0
mean,0.42,0.18,0.3,0.11,0.12
std,0.09,0.16,0.13,0.11,0.11
min,0.33,-0.12,0.13,-0.23,0.0
25%,0.35,0.05,0.21,0.04,0.0
50%,0.37,0.14,0.26,0.1,0.1
75%,0.53,0.29,0.34,0.16,0.2
max,0.66,0.58,0.73,0.68,0.7


i


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,407.0,407.0,407.0,407.0,407.0
mean,0.48,0.31,0.26,0.13,0.16
std,0.11,0.16,0.09,0.09,0.17
min,0.31,-0.06,0.14,-0.09,0.0
25%,0.42,0.18,0.2,0.06,0.0
50%,0.45,0.33,0.25,0.12,0.12
75%,0.6,0.45,0.3,0.19,0.29
max,0.71,0.63,0.7,0.54,0.86


j


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,136.0,136.0,136.0,136.0,136.0
mean,0.47,0.24,0.39,0.17,0.16
std,0.04,0.17,0.15,0.13,0.16
min,0.43,-0.12,0.14,-0.06,0.0
25%,0.44,0.13,0.27,0.08,0.0
50%,0.46,0.22,0.36,0.13,0.12
75%,0.49,0.35,0.53,0.24,0.25
max,0.67,0.66,0.73,0.69,0.75


k


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,272.0,272.0,272.0,272.0,272.0
mean,0.45,0.25,0.31,0.14,0.21
std,0.04,0.16,0.13,0.1,0.2
min,0.39,-0.09,0.14,-0.08,0.0
25%,0.41,0.12,0.23,0.08,0.0
50%,0.45,0.26,0.27,0.12,0.11
75%,0.46,0.36,0.36,0.19,0.33
max,0.58,0.59,0.73,0.68,0.89


l


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,408.0,408.0,408.0,408.0,408.0
mean,0.37,0.19,0.29,0.12,0.07
std,0.04,0.13,0.14,0.11,0.11
min,0.31,-0.1,0.13,-0.08,0.0
25%,0.34,0.1,0.19,0.04,0.0
50%,0.36,0.18,0.23,0.09,0.0
75%,0.4,0.27,0.33,0.17,0.1
max,0.51,0.59,0.73,0.54,0.6


m


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,407.0,407.0,407.0,407.0,407.0
mean,0.48,0.26,0.34,0.13,0.19
std,0.05,0.17,0.15,0.11,0.19
min,0.39,-0.1,0.14,-0.14,0.0
25%,0.43,0.12,0.23,0.07,0.0
50%,0.47,0.27,0.26,0.11,0.12
75%,0.52,0.39,0.46,0.17,0.3
max,0.63,0.62,0.73,0.68,0.8


n


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,408.0,408.0,408.0,408.0,408.0
mean,0.43,0.19,0.36,0.13,0.08
std,0.05,0.15,0.17,0.13,0.12
min,0.34,-0.14,0.13,-0.14,0.0
25%,0.4,0.07,0.2,0.05,0.0
50%,0.44,0.17,0.33,0.1,0.0
75%,0.46,0.29,0.52,0.17,0.14
max,0.61,0.65,0.73,0.69,0.57


o


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,544.0,544.0,544.0,544.0,544.0
mean,0.52,0.24,0.31,0.1,0.14
std,0.05,0.15,0.14,0.1,0.17
min,0.43,-0.14,0.14,-0.09,0.0
25%,0.47,0.12,0.2,0.04,0.0
50%,0.52,0.23,0.27,0.09,0.11
75%,0.56,0.37,0.39,0.14,0.25
max,0.71,0.61,0.73,0.57,0.88


In [49]:
pg_dict[el]

Unnamed: 0,topic2caption_sim,topic2context_sim,image_url,caption,context,concept2caption_sim,concept2context_sim,context_gt
0,0.628946,0.295938,https://upload.wikimedia.org/wikipedia/commons...,Pedestrian trail English: Pedestrian trail,The Blue Water River Walk is a nearly one mile...,0.704843,0.105616,0.125000
1,0.591550,0.071280,https://upload.wikimedia.org/wikipedia/commons...,Walking Trail,"Bagh-e-Jinnah, formerly known as Lawrence Gard...",0.422199,0.072473,0.000000
2,0.591550,0.071280,https://upload.wikimedia.org/wikipedia/commons...,Walking Trail,"Bagh-e-Jinnah, formerly known as Lawrence Gard...",0.422199,0.072473,0.000000
3,0.591550,0.071280,https://upload.wikimedia.org/wikipedia/commons...,Walking Trail,"Bagh-e-Jinnah, formerly known as Lawrence Gard...",0.422199,0.072473,0.000000
4,0.589132,0.471493,https://upload.wikimedia.org/wikipedia/commons...,Hikers awaiting southbound train English: Look...,The Appalachian Trail station is a commuter ra...,0.300308,0.143705,0.250000
...,...,...,...,...,...,...,...,...
539,0.520411,0.294029,https://upload.wikimedia.org/wikipedia/commons...,J.B. Starkey Wilderness Trail English: J.B. St...,The Veterans Expressway and Suncoast Parkway i...,0.223965,0.118015,0.444444
540,0.520411,0.370678,https://upload.wikimedia.org/wikipedia/commons...,J.B. Starkey Wilderness Trail English: J.B. St...,Starkey Wilderness Preserve is a public recrea...,0.223965,0.015963,0.333333
541,0.519816,0.452994,https://upload.wikimedia.org/wikipedia/commons...,"English: National Park Meadow, Madison Juncti...","Mount Haynes el. 8,218 feet is a prominent pea...",0.138482,0.101505,0.111111
542,0.519740,0.444490,http://upload.wikimedia.org/wikipedia/commons/...,English: The terrain park at Great Bear in Si...,"Great Bear Recreation Park, more commonly refe...",0.181156,0.079883,0.000000


## select image for retrain from wit 
```['b', 'f', 'g', 'i', 'm']``` 
consiedered as missinig topics

In [72]:
from os.path import exists
import base64


In [73]:
missed_topics = ['b', 'f', 'g', 'i', 'm']
im_root_path = '/raid/AISSEL/Hamed/datasets/wit/images'

In [74]:
def b64(link):
    link = link.encode("utf-8")
    im_path = f'{im_root_path}/{str(base64.b64encode(link))[2:-1]}.jpg'
    if exists(im_path):
        return im_path
    return 
    

In [75]:
def link_to_b64(link):
    im_root_path = '/raid/AISSEL/Hamed/datasets/wit/images'
    link = link.encode("utf-8")
    _, pos = str(link)[:-1].rsplit('.', 1)
    if pos == 'svg':
        return
    im_path = f'{str(base64.b64encode(link))[2:-1]}'
    if exists(f'{im_root_path}/{im_path}.jpg'):
        return f'{im_path}.jpg'
    return 
    

In [76]:
missed_topics_dict = dict()
for el in missed_topics:
    missed_topics_dict[el] = pg_dict[el]
#     missed_topics_dict[el] = pg_dict[el][pg_dict[el]['topic2caption_sim'] >= 0.5]
    missed_topics_dict[el]['image_path'] = missed_topics_dict[el]['image_url'].apply(link_to_b64)
#     missed_topics_dict[el]['image_path'] = missed_topics_dict[el]['image_url'].apply(b64)
    missed_topics_dict[el] = missed_topics_dict[el][missed_topics_dict[el].image_path.notnull()]
    missed_topics_dict[el] = missed_topics_dict[el].reset_index(drop=True)



In [77]:
missed_topics_dict[el]

Unnamed: 0,topic2caption_sim,topic2context_sim,image_url,caption,context,concept2caption_sim,concept2context_sim,context_gt,image_path
0,0.620225,0.369207,https://upload.wikimedia.org/wikipedia/commons...,Homeless man homeless,Spirituality affects both mental and physical ...,0.327160,0.185558,0.200,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
1,0.571124,0.236815,http://upload.wikimedia.org/wikipedia/commons/...,Public housing blocks,The architecture of Singapore displays a range...,0.303036,0.065947,0.300,aHR0cDovL3VwbG9hZC53aWtpbWVkaWEub3JnL3dpa2lwZW...
2,0.547283,0.452512,https://upload.wikimedia.org/wikipedia/commons...,Homeless and poor man sleeping on the street. ...,Crisis accommodation is accommodation provided...,0.322983,0.144760,0.500,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
3,0.535434,0.512017,https://upload.wikimedia.org/wikipedia/commons...,"Homeless man in Fresno, California. English: H...",Homelessness in the United States has occurred...,0.343352,0.210216,0.500,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
4,0.533519,0.153052,https://upload.wikimedia.org/wikipedia/commons...,Abandoned homeless shelter using plastic tarp ...,"A tarpaulin or tarp, is a large sheet of stron...",0.224892,0.111633,0.000,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
...,...,...,...,...,...,...,...,...,...
321,0.511245,0.428121,https://upload.wikimedia.org/wikipedia/commons...,Millennium Park seen from 340 on the Park in 2...,Millennium Park is a public park located in th...,0.182702,0.074538,0.375,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
322,0.510789,0.374991,https://upload.wikimedia.org/wikipedia/commons...,English: path in Chicago's Portage Park,There are 95 sites in the National Register of...,0.315369,0.099449,0.125,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
323,0.509164,0.015824,https://upload.wikimedia.org/wikipedia/commons...,"a folk art wall in Lincoln Park, Chicago 655 w...",Folk art covers all forms of visual art made i...,0.250170,0.031279,0.000,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
324,0.509164,0.067301,https://upload.wikimedia.org/wikipedia/commons...,"a folk art wall in Lincoln Park, Chicago 655 w...",Folk and traditional arts are rooted in and re...,0.250170,0.043453,0.375,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...


In [78]:
missed_topics_dict[el].at[0,'image_path']

'aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy84Lzg2L0hvbWVsZXNzXyUyODgzMjk5MjQ1NTclMjkuanBn.jpg'

In [79]:
missed_topics_dict['f'].at[0,'image_path']

'aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy81LzU4L0RPSl9TdGFsa2luZ19ieV9UZWFtc19vcl9Hcm91cHNfRk9JQV9kb2N1bWVudHMtUGFnZTJvZjMuanBn.jpg'

In [80]:
#select just caption and image_path
selected_col = ['caption', 'image_path']
for el in missed_topics_dict:
    missed_topics_dict[el] = missed_topics_dict[el][selected_col]
    missed_topics_dict[el] = missed_topics_dict[el].drop_duplicates(subset=selected_col, keep=False)
    missed_topics_dict[el] = missed_topics_dict[el].reset_index(drop=True)

In [81]:
missed_topics_dict[el]

Unnamed: 0,caption,image_path
0,Homeless man homeless,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
1,Public housing blocks,aHR0cDovL3VwbG9hZC53aWtpbWVkaWEub3JnL3dpa2lwZW...
2,Homeless and poor man sleeping on the street. ...,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
3,"Homeless man in Fresno, California. English: H...",aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
4,Abandoned homeless shelter using plastic tarp ...,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
...,...,...
225,English: Chicago and Alton Depot This is an i...,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
226,English: Chicago Apartments This is an image ...,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
227,Millennium Park seen from 340 on the Park in 2...,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
228,English: path in Chicago's Portage Park,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...


### get percantage like OFa data

In [82]:
ofa_col = ['uniq_id', 'image_id', 'caption', 'labels', 'image']
data_root = '/raid/AISSEL/Hamed/datasets/caption_data_org'
f_names =[el for el in os.listdir(data_root) if el.endswith('.tsv')]
f_names

['caption_stage1_train.tsv',
 'caption_stage2_train.tsv',
 'caption_test.tsv',
 'caption_val.tsv']

In [61]:
ofa_len = []
for i in f_names:
    df_ofa = pd.read_csv(f'{data_root}/{i}', sep='\t', names=ofa_col)
    ofa_len.append(len(df_ofa))
    df_ofa = ""

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [116]:
print(ofa_len)

[566747, 113287, 5000, 5000]


In [118]:
for i in ofa_len:
    print((i/sum(ofa_len))* 100)

82.13319923366095
16.41759681407003
0.7246019761345093
0.7246019761345093


In [83]:

# set order based on number images in related google image so oreder is: g, l, f, d, a  
missed_topics_dict['b']

Unnamed: 0,caption,image_path
0,V/Line passenger trains at Geelong Railway Sta...,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
1,Crossing plains near Bannockburn Geelong - Bal...,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
2,Train on the Belair line passing the underpass...,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
3,Tram lines along the Flinders Street facade Fl...,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
4,"English: Ouyen Railway Station, looking towar...",aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
...,...,...
552,"A machine laying asphalt concrete, fed from a ...",aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
553,Pedestrian crosswalk English: Pedestrian cross...,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
554,Bidar - Cement murals on main roads Bidar Cem...,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
555,Pedestrian walk in Ferizaj English: Ferizaj City,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...


In [84]:
# l - g
l_minus_g = missed_topics_dict['b'].merge(missed_topics_dict['f'], how = 'outer' ,indicator=True).loc[lambda x : x['_merge']=='right_only']
l_minus_g = l_minus_g[selected_col].reset_index(drop=True)
l_minus_g

Unnamed: 0,caption,image_path
0,"English: By letter dated March 22, 2010, The ...",aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
1,Kober at the 2015 Walker Stalker Con English: ...,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
2,"A pedestrian in Toronto walks across a street,...",aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
3,"Cantillo at Walker Stalker Con, San Francisco,...",aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
4,Bystander intervention aims to teach people to...,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
...,...,...
287,Protest in San Diego on May 31 English: Black ...,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
288,Protest in Chicago on January 20 2I4A7398,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
289,Downtown State College was the location of the...,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
290,"A protester holding a flyer with the words ""Ge...",aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...


In [85]:
# remove thos images wh
def df2_df1(df1, df2):
    df = df1.merge(df2, how = 'outer' ,indicator=True).loc[lambda x : x['_merge']=='right_only']
    df = df[selected_col].reset_index(drop=True)
    return df

In [86]:
for el in missed_topics_dict:
    print(el, len(missed_topics_dict[el]))

b 557
f 300
g 240
i 262
m 230


In [87]:
# 'm', 'g', 'i', 'f', 'b' 
print(f'm: {len(missed_topics_dict["m"])}')
# g = g - m
g = df2_df1(missed_topics_dict['m'], missed_topics_dict['g'])
print(f'g1: {len(g)}')
#i = i - m
i = df2_df1(missed_topics_dict['m'], missed_topics_dict['i'])
print(f'i1: {len(i)}')
# f = f - m
f = df2_df1(missed_topics_dict['m'], missed_topics_dict['f'])
print(f'f1: {len(f)}')
# b = b - m
b = df2_df1(missed_topics_dict['m'], missed_topics_dict['b'])
print(f'b1: {len(b)}')

# 'g', 'i', 'f', 'b'
# i = i - g
i = df2_df1(g, i)
print(f'i2: {len(i)}')
# f = f - g
f = df2_df1(g, f)
print(f'f2: {len(f)}')
# b = b - g
b = df2_df1(g, b)
print(f'b2: {len(b)}')

# 'i', 'f', 'b' 
# f= f - i
f = df2_df1(i, f)
print(f'f3: {len(f)}')
# b = b - i
b = df2_df1(i, b)
print(f'b3: {len(b)}')

# 'f', 'b'
# b = b - f
b = df2_df1(f, b)
print(f'b4: {len(b)}')

m: 230
g1: 238
i1: 261
f1: 292
b1: 546
i2: 260
f2: 292
b2: 545
f3: 286
b3: 542
b4: 539


In [88]:
for el in missed_topics_dict:
    print(el, len(missed_topics_dict[el]))

b 557
f 300
g 240
i 262
m 230


In [89]:
print(f'm: {len(missed_topics_dict["m"])}')
print(f'g: {len(g)}')
print(f'i: {len(i)}')
print(f'f: {len(f)}')
print(f'b: {len(b)}')


m: 230
g: 238
i: 260
f: 286
b: 539


In [71]:
#intersection
missed_topics_dict['m'].merge(missed_topics_dict['b'], how = 'inner' ,indicator=False)

Unnamed: 0,caption,image_path
0,Pedestrian crosswalk English: Pedestrian cross...,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
1,English: Hubbard Park pedestrian access,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
2,Pedestrian walk in Ferizaj English: Ferizaj City,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
3,Traffic along Persiaran Kewajipan Subang Jaya ...,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
4,Pedestrian passage in Ma'alot Dafna עברית: שכו...,aHR0cDovL3VwbG9hZC53aWtpbWVkaWEub3JnL3dpa2lwZW...
5,A: Pedestrian\nB: Two-wheel traffic\nC: Physic...,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
6,English: Pedestrian crossing greek signΕλληνι...,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
7,Construction of polymer cement overlay to chan...,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
8,English: 1978 view from pedestrian bridge,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
9,"Brick arches over pedestrian path, Earlwood Ea...",aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...


In [90]:
missed_topics_dict["g"] = g
missed_topics_dict["i"] = i
missed_topics_dict["f"] = f
missed_topics_dict["b"] = b

In [91]:
for el in missed_topics_dict:
    print(el, len(missed_topics_dict[el]))

b 539
f 286
g 238
i 260
m 230


In [93]:
#intersection
missed_topics_dict['b'].merge(missed_topics_dict['f'], how = 'inner' ,indicator=False)

Unnamed: 0,caption,image_path


### create OFA style data

In [94]:
from io import BytesIO
import pillow_avif
from svglib.svglib import svg2rlg
from reportlab.graphics import renderPM

In [95]:
im_test = '/raid/AISSEL/Hamed/datasets/wit/images'

In [96]:
def b64(fn):
    im_test = '/raid/AISSEL/Hamed/datasets/wit/images'
    fn = f'{im_test}/{fn}'
    base64_str = None
    if exists(fn):
        try:
            img = Image.open(fn)
            img_buffer = BytesIO()
            img.save(img_buffer, format=img.format)
            byte_data = img_buffer.getvalue()
            base64_str = str(base64.b64encode(byte_data))[2:-1]
        except:
            base64_str = None
            print(fn)

    return base64_str

In [97]:
def remove_special(input_string):
    final_string = ""
    for character in input_string:
        if  character == " ":
            final_string = final_string + character
        else:
            if(character.isalnum()):
                final_string = final_string + character
    return final_string

In [98]:
def create_data(df, count):
    col = ['uniq_id', 'image_id', 'caption', 'labels', 'image']
    t_df = pd.DataFrame(columns=col)
    t_df['caption'] = df['caption'].apply(remove_special)
    t_df['uniq_id'] = df.index + count
    t_df['image_id'] = df.index + count
    t_df['image'] = df['image_path'].apply(b64)
    t_df['labels'] = ' '
    t_df = t_df[t_df.image.notnull()]
    t_df = t_df.reset_index(drop=True)
    
    return t_df

In [100]:
len(missed_topics_dict['f'])

286

In [101]:
missed_topics_with_images = dict()
count = 0
for el in missed_topics_dict:
    missed_topics_with_images[el] = create_data(missed_topics_dict[el], count)
    count = len(missed_topics_with_images[el])

/raid/AISSEL/Hamed/datasets/wit/images/aHR0cDovL3VwbG9hZC53aWtpbWVkaWEub3JnL3dpa2lwZWRpYS9jb21tb25zLzUvNTEvU2VudGllcm9fZGVsbGVfRm9ycmVfZGlfU2FuX1JvbWVkaW8uSlBH.jpg
/raid/AISSEL/Hamed/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy83Lzc1L1RoZV9VbmRlcmdyb3VuZF9ha2FfVHViZV8lMjg0NDcxNzMzNDIxMCUyOS5qcGc=.jpg
/raid/AISSEL/Hamed/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy8xLzE2L0F1cmF0X01hcmNoXzIwMjBfZC5qcGc=.jpg
/raid/AISSEL/Hamed/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy9mL2ZjL1VTX0ltbWlncmF0aW9uX2FuZF9DdXN0b21zX0VuZm9yY2VtZW50X2FycmVzdC5qcGc=.jpg
/raid/AISSEL/Hamed/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy9lL2VhL1BvbGljZW1lbl9pbl9mb3JtYWxfdW5pZm9ybV8lMjg4NjU3ODMyMDU1JTI5LmpwZw==.jpg
/raid/AISSEL/Hamed/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy8wLzBlL01vZF9wbG9kLmpwZw==.jpg
/raid/AISSEL/Hamed/datas

In [102]:
missed_topics_with_images['f']

Unnamed: 0,uniq_id,image_id,caption,labels,image
0,537,537,English By letter dated March 22 2010 The US ...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1,538,538,Kober at the 2015 Walker Stalker Con English W...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
2,539,539,Cantillo at Walker Stalker Con San Francisco F...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
3,540,540,Bystander intervention aims to teach people to...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
4,541,541,Pedestrian accident location sign in Stuttgart...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
...,...,...,...,...,...
276,818,818,Protest in San Diego on May 31 English Black L...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
277,819,819,Protest in Chicago on January 20 2I4A7398,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
278,820,820,Downtown State College was the location of the...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
279,821,821,A protester holding a flyer with the words Gen...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


In [103]:
saved_path = '/raid/AISSEL/Hamed/datasets/missing_topic'

In [104]:
for el in missed_topics_with_images:
    missed_topics_with_images[el].to_csv(f'{saved_path}/{el}.tsv', sep="\t", index=False, header=False)
    print(el, len(missed_topics_with_images[el]))

b 537
f 281
g 199
i 259
m 226


### get portion for dataset 
```stage 1: 75%, stage 2: %15, test: %5, validation % 5```

In [105]:
data_dict = dict()

In [106]:
def get_portion(df, p1=75, p2=15, p3=5, p4=5):
    s1 = df.sample(frac = p1/100)
    rest_part_1 = df.drop(s1.index)
    s2 = rest_part_1.sample(frac = p2/(100-p1))
    rest_part_2 = rest_part_1.drop(s2.index)
    s3 = rest_part_2.sample(frac = p3/(100 - p1 - p2))
    s4 = rest_part_2.drop(s3.index)
    return s1, s2, s3, s4

In [108]:
s1 = missed_topics_with_images[el].sample(frac = 0.75)
len(s1) 

170

In [109]:
rest_part_1 = missed_topics_with_images[el].drop(s1.index)

In [110]:
s2 = rest_part_1.sample(frac = 15/(100-75))
len(s2) 

34

In [111]:
rest_part_2 = rest_part_1.drop(s2.index)

In [112]:
s3 = rest_part_2.sample(frac = 5/(100 - 75 - 15))
len(s3) 

11

In [113]:
s4 = rest_part_2.drop(s3.index)
len(s4)

11

In [114]:
name_lsit = ['stage1_train', 'stage2_train', 'val', 'test']

In [115]:
por_dict = dict()
data_dict = dict()
for n in name_lsit:
    data_dict[n] = pd.DataFrame() 
for el in missed_topics_with_images:
    s1, s2, v, t = get_portion(missed_topics_with_images[el])
    
    s1 = s1.reset_index(drop=True)
    data_dict['stage1_train'] = pd.concat([data_dict['stage1_train'], s1], ignore_index=True)
    
    s2 = s2.reset_index(drop=True)
    data_dict['stage2_train'] = pd.concat([data_dict['stage2_train'], s2], ignore_index=True)
    
    t = t.reset_index(drop=True)
    data_dict['test'] = pd.concat([data_dict['test'], t], ignore_index=True)
    
    v = v.reset_index(drop=True)
    data_dict['val'] = pd.concat([data_dict['val'], v], ignore_index=True)
#     data_dict[el] = por_dict

In [116]:
data_dict['val']

Unnamed: 0,uniq_id,image_id,caption,labels,image
0,57,57,VLine operated Sprinter railcar at North Shore...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1,25,25,The former Melbourne Line near Melbourne stati...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
2,411,411,View along Water Street in Cambridge Iowa Engl...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
3,189,189,A soldier from the 2nd Engineer Regiment patro...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
4,103,103,Railway siding with novelty passengers Decembe...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
...,...,...,...,...,...
70,275,275,Housing,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
71,343,343,Traffic along Persiaran Kewajipan Subang Jaya ...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
72,356,356,Homeless children in the United States59 The n...,,iVBORw0KGgoAAAANSUhEUgAAAaAAAAFfCAIAAACHkI/CAA...
73,466,466,English Chicago Park Boulevard System Histori...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


In [117]:
data_dict['test']

Unnamed: 0,uniq_id,image_id,caption,labels,image
0,1,1,Crossing plains near Bannockburn Geelong Ball...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1,18,18,Trams pass trains on the Flinders Street Viadu...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
2,19,19,Bluestone road overbridge near Lethbridge Near...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
3,149,149,Map of Roccadaspide and Castel San Lorenzo Eng...,,iVBORw0KGgoAAAANSUhEUgAAB9AAAAfQCAYAAACaOMR5AA...
4,153,153,A paved Roman road in Pompeii English Street i...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
...,...,...,...,...,...
70,360,360,Homeless man in New York English A homeless ma...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
71,368,368,English Concrete housing construction in Vene...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
72,376,376,Rooftop water towers atop apartment buildings ...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
73,406,406,Common insulation applications in an apartment...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


In [118]:
data_dict['stage2_train']

Unnamed: 0,uniq_id,image_id,caption,labels,image
0,404,404,Water fountain downtown La Crosse English La C...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1,315,315,2014 2 Broadway Manhattan,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
2,436,436,Local protests such as here at the end of the ...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
3,93,93,Southbound view from Platform 1 in November 20...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
4,530,530,New road distribution in Davis CA Now pedestri...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
...,...,...,...,...,...
220,274,274,A tent city at Oakland California E 12th Stree...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
221,353,353,A homeless man sleeping in Tokyo English Elder...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
222,478,478,English Building in Edison Park park in the n...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
223,292,292,Skid Row Los Angeles contains one of the large...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


In [119]:
data_dict['stage1_train']

Unnamed: 0,uniq_id,image_id,caption,labels,image
0,266,266,Station entrance at Lincoln Road on the south ...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1,326,326,English Images related to the Broadway LIRR s...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
2,200,200,Italiano Mappa della metropolitana di Roma,,iVBORw0KGgoAAAANSUhEUgAABd0AAAL6CAYAAADDpf//AA...
3,374,374,The Nanjing Pedestrian Street in the evening w...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
4,193,193,View of a road in the old town Italiano Scorci...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
...,...,...,...,...,...
1122,409,409,English Image from the roof of the Steger amp...,,iVBORw0KGgoAAAANSUhEUgAAA84AAAJlCAIAAACE2zyWAA...
1123,273,273,English nice pedestrian place,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1124,261,261,Homeless and poor man sleeping on the street E...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1125,473,473,Park Avenue Building Park Avenue Building Detr...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


In [1]:
(75 + 75 + 1127 + 225)

1502

In [123]:
for el in data_dict:
    name = f'caption_{el}.tsv'
    data_dict[el].to_csv(f'{saved_path}/{name}', sep="\t", index=False, header=False)

    print(name)


caption_stage1_train.tsv
caption_stage2_train.tsv
caption_val.tsv
caption_test.tsv
