In [1]:
from tqdm.notebook import tqdm, trange

In [2]:
tqdm.pandas()

In [3]:
import sys

In [4]:
import glob
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [5]:
import os

# General packages
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.express as px
from PIL import Image

from IPython.display import Image as IImage
from IPython.display import display
import warnings
warnings.filterwarnings("ignore")

In [6]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/test/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/test/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
def find_gpus(nums=6):
    os.system('nvidia-smi -q -d Memory |grep -A4 GPU|grep Free >tmp_free_gpus')
    with open('tmp_free_gpus', 'r') as lines_txt:
        frees = lines_txt.readlines()
        idx_freeMemory_pair = [ (idx,int(x.split()[2]))
                              for idx,x in enumerate(frees) ]
    idx_freeMemory_pair.sort(key=lambda my_tuple:my_tuple[1],reverse=True)
    usingGPUs = [str(idx_memory_pair[0])
                    for idx_memory_pair in idx_freeMemory_pair[:nums] ]
    usingGPUs =  ','.join(usingGPUs)
    print('using GPU idx: #', usingGPUs)
    return usingGPUs

In [8]:
os.environ['CUDA_VISIBLE_DEVICES'] = find_gpus(nums=2)

using GPU idx: # 1,0


In [9]:
from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

## get topics

In [9]:
from bertopic import BERTopic

In [10]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [11]:
lemmatizer = WordNetLemmatizer()

In [12]:
def get_topics(concept='pedestrian'):
    if concept == 'pedestrian':
        model_path = '../ped_auto_model'
    elif concept == 'aircraft':
        model_path = '../aircraft_auto_model'
    elif concept == 'car':
        model_path = '../car_auto_model'
    auto_model = BERTopic(nr_topics="auto")
    auto_model = BERTopic.load(model_path)
    similar_topics, similarity = auto_model.find_topics(concept, top_n=50)

    topics = dict()
    for t in similar_topics:
        l1 = auto_model.get_topic(t)
        el_list = []
        el_list.append(concept)
        for el in l1:
            w = lemmatizer.lemmatize(el[0], get_wordnet_pos(el[0]))
            el_list.append(w)
        topics[t]= set(el_list)
    return topics

# Load files

In [22]:
os.listdir('/raid/AISSEL/htest/datasets/wit/')

['wit_v1.train.all-00007-of-00010_context_caption_en_sbert_c.tsv',
 'wit_v1.train.all-00009-of-00010_context_caption_en_sbert_c.tsv',
 '__MACOSX',
 'wit_v1.train.all-1percent_sample.tsv',
 'wit_v1.train.all-00000-of-00010_en_csim.tsv',
 'wit_v1.train.all-00006-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00008-of-00010_context_caption_en_sbert_cpa.tsv',
 'aircraft_q75_en_sbert.tsv',
 'images',
 'wit_v1.train.all-00000-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00001-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00002-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00003-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00004-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00005-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00007-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00009-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00000-of-00010

In [23]:
root_path = '/raid/AISSEL/htest/datasets/wit'

In [24]:
f_names =[el for el in os.listdir('/raid/AISSEL/htest/datasets/wit/') if el.endswith('_en_sbert_cpa.tsv')]
f_names

['wit_v1.train.all-00006-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00008-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00000-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00001-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00002-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00003-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00004-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00005-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00007-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00009-of-00010_context_caption_en_sbert_cpa.tsv']

In [25]:
df = pd.read_csv(f'{root_path}/{f_names[0]}', sep='\t')
df = df.drop('Unnamed: 0', 1)
df = df[df["language"]=='en']
df

Unnamed: 0,index,language,page_url,image_url,page_title,section_title,hierarchical_section_title,caption_reference_description,caption_attribution_description,caption_alt_text_description,...,296_context_score,144_context_score,12_context_score,224_context_score,250_context_score,376_context_score,concept_caption_score,concept_context_score,car_caption_score,aircraft_caption_score
0,14,en,https://en.wikipedia.org/wiki/LibreOffice,https://upload.wikimedia.org/wikipedia/commons...,LibreOffice,Included applications,LibreOffice / Features / Included applications,LibreOffice Math running on Ubuntu,English: Screenshots of LibreOffice Math 6.4 r...,,...,-0.037750,-0.025831,-0.045994,-0.067079,0.128117,-0.067173,-0.040416,-0.020415,-0.032282,-0.006119
1,28,en,https://en.wikipedia.org/wiki/Dalian,https://upload.wikimedia.org/wikipedia/commons...,Dalian,Research centres,Dalian / Education / Research centres,"Dalian Institute of Chemical Physics, of the C...",,,...,0.070782,0.014916,0.060768,0.137923,0.051047,0.125953,0.004426,0.083636,0.018341,0.077601
2,41,en,https://en.wikipedia.org/wiki/FMW_7th_Annivers...,https://upload.wikimedia.org/wikipedia/commons...,FMW 7th Anniversary Show,,FMW 7th Anniversary Show,Kawasaki Stadium,English: kawasaki_fujimi Stadium 日本語: 川崎富士見球技場...,,...,0.048340,0.031444,-0.178475,0.042817,0.119373,0.045854,0.090404,-0.012327,0.029681,0.181104
3,50,en,https://en.wikipedia.org/wiki/List_of_Bermuda_...,https://upload.wikimedia.org/wikipedia/commons...,List of Bermuda hurricanes,1960s,List of Bermuda hurricanes / List of storms / ...,"Hurricane Inga, one of the longest-lived Atlan...",Saffir-Simpson Hurricane ScaleTDTS12345 Englis...,Map showing the path and intensity of Hurrican...,...,-0.104979,-0.117618,-0.052297,-0.031839,-0.167693,-0.054938,0.005365,-0.095421,0.068992,0.103214
4,52,en,https://en.wikipedia.org/wiki/Parimelalhagar,https://upload.wikimedia.org/wikipedia/commons...,Parimelalhagar,Early life,Parimelalhagar / Early life,A page from the Parimelalhagar's commentary on...,English: A page from Arumuka Navalar's 1861 ed...,,...,0.013991,-0.050420,-0.043610,-0.030726,0.036060,-0.098819,0.025255,-0.061983,-0.008257,0.057020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540382,3704673,en,https://en.wikipedia.org/wiki/Standard_Electri...,https://upload.wikimedia.org/wikipedia/commons...,Standard Electric Time Company,,Standard Electric Time Company,A Standard 200177 fire alarm pull station,English: A Standard 200177 pull station in Har...,,...,-0.033670,-0.089871,-0.052723,-0.037692,-0.037566,0.038821,0.130745,-0.079322,0.099379,0.133040
540383,3704674,en,https://en.wikipedia.org/wiki/Malaysia_Airline...,https://upload.wikimedia.org/wikipedia/commons...,Malaysia Airlines Flight 370,Analysis,Malaysia Airlines Flight 370 / Investigation /...,A heat map indicating the probable location of...,English: Probability of the location where Mal...,,...,-0.060926,-0.013728,-0.057422,0.144767,-0.017854,-0.034770,-0.007270,0.046857,0.060193,0.218703
540384,3704675,en,https://en.wikipedia.org/wiki/Jackson_Plan,https://upload.wikimedia.org/wikipedia/commons...,Jackson Plan,Overall layout,Jackson Plan / Layout and effect of the plan /...,"Map of Singapore in 1914, the layout is now mo...","English: Map of the city of Singapore, ca 1914...",,...,0.071714,0.023376,0.013262,0.339486,0.056978,0.012006,0.018857,0.090633,-0.003516,-0.021940
540385,3704677,en,https://en.wikipedia.org/wiki/Candiacervus,https://upload.wikimedia.org/wikipedia/commons...,Candiacervus,Taxonomy,Candiacervus / Taxonomy,Hippopotamus creutzburgi and C. cretensis,English: My drawings of the two subspecies of ...,,...,0.145484,-0.006134,0.115790,-0.039335,0.091558,0.093865,0.052330,0.046950,0.023961,-0.041651


In [26]:
concept_col = ['concept_caption_score', 'car_caption_score', 'aircraft_caption_score']
new_cols = [el for el in list(df.columns) if '_score' in el]
org_cols = [el for el in list(df.columns) if not '_score' in el]

In [27]:
caption_score_list = [el for el in df.columns if 'caption_score' in el]
print(caption_score_list)

['196_caption_score', '412_caption_score', '172_caption_score', '363_caption_score', '198_caption_score', '114_caption_score', '419_caption_score', '294_caption_score', '388_caption_score', '314_caption_score', '98_caption_score', '80_caption_score', '134_caption_score', '432_caption_score', '394_caption_score', '53_caption_score', '56_caption_score', '150_caption_score', '278_caption_score', '-1_caption_score', '43_caption_score', '265_caption_score', '345_caption_score', '378_caption_score', '11_caption_score', '331_caption_score', '202_caption_score', '169_caption_score', '327_caption_score', '309_caption_score', '68_caption_score', '216_caption_score', '40_caption_score', '99_caption_score', '426_caption_score', '344_caption_score', '47_caption_score', '8_caption_score', '416_caption_score', '112_caption_score', '137_caption_score', '441_caption_score', '192_caption_score', '20_caption_score', '296_caption_score', '144_caption_score', '12_caption_score', '224_caption_score', '250_c

In [28]:
caption_score_list = caption_score_list[:-3]
print(caption_score_list)

['196_caption_score', '412_caption_score', '172_caption_score', '363_caption_score', '198_caption_score', '114_caption_score', '419_caption_score', '294_caption_score', '388_caption_score', '314_caption_score', '98_caption_score', '80_caption_score', '134_caption_score', '432_caption_score', '394_caption_score', '53_caption_score', '56_caption_score', '150_caption_score', '278_caption_score', '-1_caption_score', '43_caption_score', '265_caption_score', '345_caption_score', '378_caption_score', '11_caption_score', '331_caption_score', '202_caption_score', '169_caption_score', '327_caption_score', '309_caption_score', '68_caption_score', '216_caption_score', '40_caption_score', '99_caption_score', '426_caption_score', '344_caption_score', '47_caption_score', '8_caption_score', '416_caption_score', '112_caption_score', '137_caption_score', '441_caption_score', '192_caption_score', '20_caption_score', '296_caption_score', '144_caption_score', '12_caption_score', '224_caption_score', '250_c

In [29]:
context_score_list = [el for el in df.columns if 'context_score' in el]
context_score_list = context_score_list[:-1]
print(context_score_list)

['196_context_score', '412_context_score', '172_context_score', '363_context_score', '198_context_score', '114_context_score', '419_context_score', '294_context_score', '388_context_score', '314_context_score', '98_context_score', '80_context_score', '134_context_score', '432_context_score', '394_context_score', '53_context_score', '56_context_score', '150_context_score', '278_context_score', '-1_context_score', '43_context_score', '265_context_score', '345_context_score', '378_context_score', '11_context_score', '331_context_score', '202_context_score', '169_context_score', '327_context_score', '309_context_score', '68_context_score', '216_context_score', '40_context_score', '99_context_score', '426_context_score', '344_context_score', '47_context_score', '8_context_score', '416_context_score', '112_context_score', '137_context_score', '441_context_score', '192_context_score', '20_context_score', '296_context_score', '144_context_score', '12_context_score', '224_context_score', '250_c

In [30]:
# b, f, g, i, m

In [31]:
im_c = ['image_url', 'caption', 'context']
cap_s = ['concept_caption_score']
con_s = ['concept_context_score']
plus_col = ['car_caption_score', 'aircraft_caption_score']
col_list = im_c + cap_s + con_s + caption_score_list + context_score_list

In [32]:
col_list

['image_url',
 'caption',
 'context',
 'concept_caption_score',
 'concept_context_score',
 '196_caption_score',
 '412_caption_score',
 '172_caption_score',
 '363_caption_score',
 '198_caption_score',
 '114_caption_score',
 '419_caption_score',
 '294_caption_score',
 '388_caption_score',
 '314_caption_score',
 '98_caption_score',
 '80_caption_score',
 '134_caption_score',
 '432_caption_score',
 '394_caption_score',
 '53_caption_score',
 '56_caption_score',
 '150_caption_score',
 '278_caption_score',
 '-1_caption_score',
 '43_caption_score',
 '265_caption_score',
 '345_caption_score',
 '378_caption_score',
 '11_caption_score',
 '331_caption_score',
 '202_caption_score',
 '169_caption_score',
 '327_caption_score',
 '309_caption_score',
 '68_caption_score',
 '216_caption_score',
 '40_caption_score',
 '99_caption_score',
 '426_caption_score',
 '344_caption_score',
 '47_caption_score',
 '8_caption_score',
 '416_caption_score',
 '112_caption_score',
 '137_caption_score',
 '441_caption_score',

In [33]:
csim_df = pd.DataFrame()
df = df.reset_index(drop=True)
csim_df = pd.concat([csim_df, df], ignore_index=True)
csim_df

Unnamed: 0,index,language,page_url,image_url,page_title,section_title,hierarchical_section_title,caption_reference_description,caption_attribution_description,caption_alt_text_description,...,296_context_score,144_context_score,12_context_score,224_context_score,250_context_score,376_context_score,concept_caption_score,concept_context_score,car_caption_score,aircraft_caption_score
0,14,en,https://en.wikipedia.org/wiki/LibreOffice,https://upload.wikimedia.org/wikipedia/commons...,LibreOffice,Included applications,LibreOffice / Features / Included applications,LibreOffice Math running on Ubuntu,English: Screenshots of LibreOffice Math 6.4 r...,,...,-0.037750,-0.025831,-0.045994,-0.067079,0.128117,-0.067173,-0.040416,-0.020415,-0.032282,-0.006119
1,28,en,https://en.wikipedia.org/wiki/Dalian,https://upload.wikimedia.org/wikipedia/commons...,Dalian,Research centres,Dalian / Education / Research centres,"Dalian Institute of Chemical Physics, of the C...",,,...,0.070782,0.014916,0.060768,0.137923,0.051047,0.125953,0.004426,0.083636,0.018341,0.077601
2,41,en,https://en.wikipedia.org/wiki/FMW_7th_Annivers...,https://upload.wikimedia.org/wikipedia/commons...,FMW 7th Anniversary Show,,FMW 7th Anniversary Show,Kawasaki Stadium,English: kawasaki_fujimi Stadium 日本語: 川崎富士見球技場...,,...,0.048340,0.031444,-0.178475,0.042817,0.119373,0.045854,0.090404,-0.012327,0.029681,0.181104
3,50,en,https://en.wikipedia.org/wiki/List_of_Bermuda_...,https://upload.wikimedia.org/wikipedia/commons...,List of Bermuda hurricanes,1960s,List of Bermuda hurricanes / List of storms / ...,"Hurricane Inga, one of the longest-lived Atlan...",Saffir-Simpson Hurricane ScaleTDTS12345 Englis...,Map showing the path and intensity of Hurrican...,...,-0.104979,-0.117618,-0.052297,-0.031839,-0.167693,-0.054938,0.005365,-0.095421,0.068992,0.103214
4,52,en,https://en.wikipedia.org/wiki/Parimelalhagar,https://upload.wikimedia.org/wikipedia/commons...,Parimelalhagar,Early life,Parimelalhagar / Early life,A page from the Parimelalhagar's commentary on...,English: A page from Arumuka Navalar's 1861 ed...,,...,0.013991,-0.050420,-0.043610,-0.030726,0.036060,-0.098819,0.025255,-0.061983,-0.008257,0.057020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540382,3704673,en,https://en.wikipedia.org/wiki/Standard_Electri...,https://upload.wikimedia.org/wikipedia/commons...,Standard Electric Time Company,,Standard Electric Time Company,A Standard 200177 fire alarm pull station,English: A Standard 200177 pull station in Har...,,...,-0.033670,-0.089871,-0.052723,-0.037692,-0.037566,0.038821,0.130745,-0.079322,0.099379,0.133040
540383,3704674,en,https://en.wikipedia.org/wiki/Malaysia_Airline...,https://upload.wikimedia.org/wikipedia/commons...,Malaysia Airlines Flight 370,Analysis,Malaysia Airlines Flight 370 / Investigation /...,A heat map indicating the probable location of...,English: Probability of the location where Mal...,,...,-0.060926,-0.013728,-0.057422,0.144767,-0.017854,-0.034770,-0.007270,0.046857,0.060193,0.218703
540384,3704675,en,https://en.wikipedia.org/wiki/Jackson_Plan,https://upload.wikimedia.org/wikipedia/commons...,Jackson Plan,Overall layout,Jackson Plan / Layout and effect of the plan /...,"Map of Singapore in 1914, the layout is now mo...","English: Map of the city of Singapore, ca 1914...",,...,0.071714,0.023376,0.013262,0.339486,0.056978,0.012006,0.018857,0.090633,-0.003516,-0.021940
540385,3704677,en,https://en.wikipedia.org/wiki/Candiacervus,https://upload.wikimedia.org/wikipedia/commons...,Candiacervus,Taxonomy,Candiacervus / Taxonomy,Hippopotamus creutzburgi and C. cretensis,English: My drawings of the two subspecies of ...,,...,0.145484,-0.006134,0.115790,-0.039335,0.091558,0.093865,0.052330,0.046950,0.023961,-0.041651


In [34]:
csim_df = pd.DataFrame()
for idx, fn in tqdm(enumerate(f_names)):
#     if idx == 2:
#         break
    df = pd.read_csv(f'{root_path}/{fn}', sep='\t')
    df = df.drop('Unnamed: 0', 1)
    df = df[df["language"]=='en']
    df = df[col_list]
#     df_1 = df_1.sort_values(by=['196_score'], ascending=False)
    df = df.reset_index(drop=True)
    csim_df = pd.concat([csim_df, df], ignore_index=True)
    

0it [00:00, ?it/s]

In [35]:
# csim_df.sort_values(by=['196_score'], ascending=False)
csim_df

Unnamed: 0,image_url,caption,context,concept_caption_score,concept_context_score,196_caption_score,412_caption_score,172_caption_score,363_caption_score,198_caption_score,...,137_context_score,441_context_score,192_context_score,20_context_score,296_context_score,144_context_score,12_context_score,224_context_score,250_context_score,376_context_score
0,https://upload.wikimedia.org/wikipedia/commons...,LibreOffice Math running on Ubuntu English: Sc...,LibreOffice is a free and open-source office s...,-0.040416,-0.020415,0.002686,0.042616,0.010594,-0.072686,-0.080525,...,-0.026707,0.017691,-0.051926,-0.086811,-0.037750,-0.025831,-0.045994,-0.067079,0.128117,-0.067173
1,https://upload.wikimedia.org/wikipedia/commons...,"Dalian Institute of Chemical Physics, of the C...",Dalian is a major sub-provincial port city in ...,0.004426,0.083636,-0.045319,-0.014597,0.050622,-0.052078,0.002163,...,0.090900,0.139327,0.108564,0.032219,0.070782,0.014916,0.060768,0.137923,0.051047,0.125953
2,https://upload.wikimedia.org/wikipedia/commons...,Kawasaki Stadium English: kawasaki_fujimi Stad...,FMW 7th Anniversary Show was a professional wr...,0.090404,-0.012327,-0.049097,-0.018047,-0.039384,-0.064661,0.037314,...,-0.093685,-0.231178,-0.015962,0.008826,0.048340,0.031444,-0.178475,0.042817,0.119373,0.045854
3,https://upload.wikimedia.org/wikipedia/commons...,"Hurricane Inga, one of the longest-lived Atlan...",The British Overseas Territory of Bermuda has ...,0.005365,-0.095421,-0.057836,-0.039939,-0.033494,-0.033917,-0.042589,...,-0.134550,-0.122756,-0.088162,-0.107879,-0.104979,-0.117618,-0.052297,-0.031839,-0.167693,-0.054938
4,https://upload.wikimedia.org/wikipedia/commons...,A page from the Parimelalhagar's commentary on...,"Parimelalhagar, also known as Vanthuvarai Peru...",0.025255,-0.061983,0.028717,-0.008050,0.003633,-0.019985,-0.025944,...,-0.037593,0.069883,-0.044136,-0.007787,0.013991,-0.050420,-0.043610,-0.030726,0.036060,-0.098819
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5411973,https://upload.wikimedia.org/wikipedia/commons...,"English: Jia Jingde, politician of China. 中文:...",This is a list of Vice Presidents of the Exami...,0.156600,0.093868,-0.021200,0.070536,0.045870,-0.022279,0.064005,...,0.064688,-0.003800,0.079544,0.097345,0.022783,0.035481,-0.009618,0.142356,0.097828,0.056724
5411974,https://upload.wikimedia.org/wikipedia/commons...,Beyer in 2013 DSC_9909.jpg,Brennen Beyer is an American football outside ...,0.185849,0.191958,0.046954,0.072107,0.109573,0.070777,0.164449,...,0.028834,0.043354,0.121015,0.170634,0.032242,0.086187,0.108909,-0.038890,0.102163,-0.002512
5411975,https://upload.wikimedia.org/wikipedia/commons...,Kannagi in Tamil Nadu. English: Idol of Kannak...,"Kannagi, sometimes spelled Kannaki, is a legen...",0.065777,0.075305,-0.023543,-0.038281,0.073440,0.010564,0.030676,...,0.136019,0.181244,0.122867,0.101799,0.063076,0.023685,0.042369,0.136377,0.022068,-0.019773
5411976,https://upload.wikimedia.org/wikipedia/commons...,English: Landscape with stormy clouds and a p...,"Don Det, is an island in the Mekong River in t...",0.051398,0.021199,0.041571,0.041349,0.022871,-0.075316,-0.023433,...,-0.032127,0.073285,0.013086,-0.011777,0.021279,0.020210,0.024772,0.081596,0.068120,0.066233


In [36]:
csim_df.describe()

Unnamed: 0,concept_caption_score,concept_context_score,196_caption_score,412_caption_score,172_caption_score,363_caption_score,198_caption_score,114_caption_score,419_caption_score,294_caption_score,...,137_context_score,441_context_score,192_context_score,20_context_score,296_context_score,144_context_score,12_context_score,224_context_score,250_context_score,376_context_score
count,5411978.0,5411978.0,5411978.0,5411978.0,5411978.0,5411978.0,5411978.0,5411978.0,5411978.0,5411978.0,...,5411978.0,5411978.0,5411978.0,5411978.0,5411978.0,5411978.0,5411978.0,5411978.0,5411978.0,5411978.0
mean,0.08732864,0.02756854,0.03316916,0.02898722,0.04938595,0.009565755,0.03607464,0.04620839,0.05406117,0.03718228,...,0.0139156,-0.003944045,0.003846679,0.006175797,0.03927394,0.000752269,0.01043332,0.01692591,0.01978357,0.007244231
std,0.06997118,0.06684423,0.06863361,0.06478608,0.06895159,0.06071932,0.07447689,0.06934013,0.07638213,0.06107245,...,0.07867535,0.07710654,0.061788,0.06737285,0.0873531,0.06067205,0.07671057,0.08884871,0.06969397,0.07776987
min,-0.2394902,-0.2688895,-0.2543995,-0.2410043,-0.2478864,-0.2473019,-0.2808687,-0.2716275,-0.2415859,-0.2307905,...,-0.3086933,-0.3402204,-0.2946304,-0.2907839,-0.2930294,-0.2660354,-0.3073893,-0.328301,-0.2828747,-0.384506
25%,0.03980459,-0.01782694,-0.01334092,-0.01534086,0.003201978,-0.03199529,-0.01532192,-0.001223383,0.002334806,-0.003728528,...,-0.03933429,-0.05569053,-0.03841725,-0.03938309,-0.02109603,-0.04113004,-0.04181451,-0.04457409,-0.02812582,-0.04579195
50%,0.086847,0.02613318,0.02696826,0.02532269,0.04495488,0.007484896,0.03195039,0.04330221,0.04744296,0.03457904,...,0.009939842,-0.005078442,0.002965325,0.00436971,0.0324168,-0.001201854,0.006485071,0.01087677,0.01813048,0.004907532
75%,0.1341138,0.07054175,0.0721949,0.06901627,0.08990105,0.04876487,0.08285133,0.09029126,0.09732897,0.07442215,...,0.06223764,0.0457986,0.0449262,0.04921226,0.0911472,0.04067654,0.05814612,0.07209412,0.06618579,0.05781632
max,0.7286295,0.6892114,0.7550066,0.6103925,0.6149387,0.5102714,0.61045,0.7142168,0.6140311,0.6116345,...,0.5473793,0.5563506,0.5750273,0.5424188,0.5708867,0.4586499,0.6041094,0.589149,0.4600561,0.5546166


In [37]:
csim_df[['concept_caption_score']].describe().round(3)

Unnamed: 0,concept_caption_score
count,5411978.0
mean,0.087
std,0.07
min,-0.239
25%,0.04
50%,0.087
75%,0.134
max,0.729


In [70]:
# df_p = csim_df[csim_df['concept_caption_score'] >= csim_df['concept_caption_score'].mean()]
df_p = csim_df[csim_df['concept_caption_score'] >= 0.134]
df_p = df_p.reset_index(drop=True)
df_p

Unnamed: 0,image_url,caption,context,concept_caption_score,concept_context_score,196_caption_score,412_caption_score,172_caption_score,363_caption_score,198_caption_score,...,137_context_score,441_context_score,192_context_score,20_context_score,296_context_score,144_context_score,12_context_score,224_context_score,250_context_score,376_context_score
0,https://upload.wikimedia.org/wikipedia/commons...,Menachem Mendel Schneerson English: Menachem M...,1994 was a common year starting on Saturday of...,0.166272,0.094395,0.078204,0.065567,0.080967,0.041644,0.075154,...,-0.012260,0.019606,0.024275,0.076669,-0.067856,0.096434,-0.048101,-0.041107,0.026869,0.011452
1,https://upload.wikimedia.org/wikipedia/commons...,English: Image cropped from a baseball card o...,"Donald Robert ""Duffy"" Dyer is an American form...",0.162247,-0.034380,0.052320,0.042743,-0.008497,0.038228,0.071760,...,-0.118144,-0.099278,-0.026665,-0.015405,-0.002073,-0.119949,-0.022791,-0.088388,0.176188,0.028713
2,https://upload.wikimedia.org/wikipedia/commons...,"The special theory of relativity, formulated i...","In relativistic physics, a velocity-addition f...",0.142336,0.106813,0.129890,0.102847,0.105520,0.096102,0.069470,...,-0.017552,0.020111,0.036183,0.007514,0.020004,0.120406,0.044195,0.014316,-0.009373,0.032084
3,https://upload.wikimedia.org/wikipedia/commons...,English: Randy Stonehill Signature,Randall Evan Stonehill is an American singer a...,0.223324,0.092803,0.123127,0.087765,0.120790,0.042908,0.068349,...,0.108193,-0.031020,-0.006661,0.128765,0.160655,-0.025057,0.193492,-0.012994,0.125541,0.050723
4,https://upload.wikimedia.org/wikipedia/commons...,English: United States Senator William Proxmi...,Edward William Proxmire was an American politi...,0.171389,0.067607,0.028057,0.090903,0.127647,0.020825,0.029832,...,0.066258,-0.010974,-0.011341,0.085126,-0.066214,-0.029877,0.043140,-0.093784,0.008349,-0.041261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1355833,https://upload.wikimedia.org/wikipedia/commons...,Massachusetts Bay Transportation Authority 170...,The Urban Transportation Development Corporati...,0.147268,0.091816,0.183336,0.155670,0.233472,0.025314,0.184065,...,0.122498,0.018380,0.130792,-0.011854,0.102683,0.041830,0.039997,0.298343,0.100381,0.069348
1355834,https://upload.wikimedia.org/wikipedia/commons...,A map showing the path of State Highway 22 in...,State Highway 22 is a state highway in Oklahom...,0.207656,0.078112,0.255292,0.206986,0.297474,0.104774,0.176442,...,0.217521,-0.047160,0.008157,0.072613,0.177827,-0.073539,0.180896,0.140562,-0.008423,0.056389
1355835,https://upload.wikimedia.org/wikipedia/commons...,English: CIPET,Central Institute of Petrochemical Engineering...,0.142362,-0.005783,0.031377,0.048821,0.112177,-0.011580,0.040081,...,0.087363,-0.050901,0.013602,0.041040,-0.003768,-0.039447,0.023529,0.135025,-0.000438,-0.107296
1355836,https://upload.wikimedia.org/wikipedia/commons...,"English: Jia Jingde, politician of China. 中文:...",This is a list of Vice Presidents of the Exami...,0.156600,0.093868,-0.021200,0.070536,0.045870,-0.022279,0.064005,...,0.064688,-0.003800,0.079544,0.097345,0.022783,0.035481,-0.009618,0.142356,0.097828,0.056724


In [39]:
df_p.describe().round(3)

Unnamed: 0,concept_caption_score,concept_context_score,196_caption_score,412_caption_score,172_caption_score,363_caption_score,198_caption_score,114_caption_score,419_caption_score,294_caption_score,...,137_context_score,441_context_score,192_context_score,20_context_score,296_context_score,144_context_score,12_context_score,224_context_score,250_context_score,376_context_score
count,1355838.0,1355838.0,1355838.0,1355838.0,1355838.0,1355838.0,1355838.0,1355838.0,1355838.0,1355838.0,...,1355838.0,1355838.0,1355838.0,1355838.0,1355838.0,1355838.0,1355838.0,1355838.0,1355838.0,1355838.0
mean,0.176,0.063,0.086,0.087,0.111,0.057,0.093,0.103,0.114,0.094,...,0.043,0.012,0.019,0.027,0.056,0.023,0.032,0.043,0.036,0.026
std,0.036,0.066,0.07,0.059,0.065,0.053,0.07,0.061,0.079,0.054,...,0.084,0.077,0.06,0.067,0.089,0.059,0.079,0.094,0.07,0.078
min,0.134,-0.234,-0.149,-0.126,-0.115,-0.166,-0.164,-0.126,-0.138,-0.122,...,-0.291,-0.331,-0.27,-0.272,-0.278,-0.254,-0.266,-0.302,-0.268,-0.296
25%,0.149,0.019,0.036,0.045,0.066,0.02,0.044,0.061,0.059,0.057,...,-0.014,-0.04,-0.022,-0.018,-0.006,-0.018,-0.023,-0.023,-0.012,-0.028
50%,0.167,0.062,0.076,0.081,0.104,0.054,0.089,0.099,0.103,0.088,...,0.038,0.011,0.019,0.025,0.049,0.021,0.028,0.035,0.035,0.023
75%,0.195,0.105,0.125,0.123,0.148,0.09,0.137,0.14,0.158,0.124,...,0.094,0.062,0.059,0.07,0.108,0.061,0.081,0.1,0.083,0.077
max,0.729,0.689,0.755,0.61,0.615,0.51,0.61,0.714,0.614,0.612,...,0.547,0.556,0.536,0.515,0.561,0.459,0.604,0.589,0.417,0.54


In [40]:
df_dict = dict()
for col in caption_score_list:
    num, _ = col.split('_', 1)
    c_list = [col] + [f'{num}_context_score'] + im_c + cap_s + con_s
    df = df_p[c_list]
    
    df = df[df[col] >= 0.39]
    df = df[df['concept_caption_score'] >= 0.29]
#     df = df[df[col] > 0.55]
    df = df.sort_values(by=[col], ascending=False)
    df_dict[col] = df

In [41]:
df_dict[col]

Unnamed: 0,376_caption_score,376_context_score,image_url,caption,context,concept_caption_score,concept_context_score
985720,0.46348,0.196054,http://upload.wikimedia.org/wikipedia/commons/...,English: Fox Walker 0-6-0ST 358/1877 Minnie a...,The Mangapps Railway Museum is a heritage rail...,0.304707,0.115252
306610,0.46348,0.261957,http://upload.wikimedia.org/wikipedia/commons/...,English: Fox Walker 0-6-0ST 358/1877 Minnie a...,"List of Peckett and Sons railway locomotives, ...",0.304707,0.075167
717229,0.439399,0.357037,https://upload.wikimedia.org/wikipedia/commons...,Main Street at Disneyland as seen from a Horse...,"Disneyland Park, originally Disneyland, is the...",0.290381,-0.061103
582445,0.430533,0.252759,https://upload.wikimedia.org/wikipedia/commons...,English: Bumper cars in Pripyat's amusement park,Bumper cars or dodgems is the generic name for...,0.328375,0.248971
308373,0.430315,0.045216,https://upload.wikimedia.org/wikipedia/commons...,A pedestrian-only section of East Nanjing Road,The 2010 census put Shanghai's total populatio...,0.66941,0.067717
233344,0.429185,0.318452,https://upload.wikimedia.org/wikipedia/commons...,Pepe the Frog as he appears in graffiti from t...,Pepe the Frog is an internet meme. He is an an...,0.29665,0.118757
855275,0.426821,-0.005306,https://upload.wikimedia.org/wikipedia/commons...,Trolleybus on the street,"Chișinău, also known as Kishinev, is the capit...",0.404396,0.10286
266208,0.424288,0.357476,https://upload.wikimedia.org/wikipedia/commons...,Bugs' star on the Hollywood Walk of Fame. Bugs...,"Bugs Bunny is an animated cartoon character, c...",0.305238,0.011324
124005,0.422475,0.115039,https://upload.wikimedia.org/wikipedia/commons...,Elephant walking,Gait is the pattern of movement of the limbs o...,0.346736,0.169604
885724,0.421779,0.016103,https://upload.wikimedia.org/wikipedia/commons...,English: Santa Clara's pedestrian boulevard,Villa Clara is one of the provinces of Cuba. I...,0.558971,0.081253


In [42]:
col

'376_caption_score'

In [43]:
from IPython.display import display

In [44]:
for el in df_dict:
    t,_ = el.split('_', 1)
#     print(t)
    df_dict[el].columns = [f'{t}_caption_sim', f'{t}_context_sim', 'image_url',
                          'caption', 'context', f'ped_caption_sim', f'ped_context_sim']
    display(df_dict[el].describe().round(2))
    print('\n')

Unnamed: 0,196_caption_sim,196_context_sim,ped_caption_sim,ped_context_sim
count,670.0,670.0,670.0,670.0
mean,0.45,0.22,0.41,0.17
std,0.06,0.17,0.09,0.14
min,0.39,-0.14,0.29,-0.12
25%,0.41,0.09,0.34,0.08
50%,0.43,0.21,0.41,0.13
75%,0.47,0.31,0.48,0.23
max,0.76,0.75,0.73,0.69






Unnamed: 0,412_caption_sim,412_context_sim,ped_caption_sim,ped_context_sim
count,302.0,302.0,302.0,302.0
mean,0.43,0.17,0.48,0.17
std,0.04,0.15,0.08,0.15
min,0.39,-0.09,0.31,-0.08
25%,0.4,0.06,0.42,0.07
50%,0.42,0.14,0.48,0.13
75%,0.45,0.27,0.53,0.24
max,0.61,0.57,0.73,0.69






Unnamed: 0,172_caption_sim,172_context_sim,ped_caption_sim,ped_context_sim
count,735.0,735.0,735.0,735.0
mean,0.43,0.17,0.41,0.15
std,0.04,0.14,0.09,0.13
min,0.39,-0.13,0.29,-0.13
25%,0.4,0.08,0.33,0.06
50%,0.42,0.15,0.41,0.12
75%,0.45,0.26,0.48,0.19
max,0.61,0.56,0.73,0.69






Unnamed: 0,363_caption_sim,363_context_sim,ped_caption_sim,ped_context_sim
count,19.0,19.0,19.0,19.0
mean,0.41,0.16,0.43,0.18
std,0.01,0.12,0.11,0.12
min,0.39,-0.01,0.3,0.03
25%,0.39,0.06,0.33,0.09
50%,0.4,0.16,0.42,0.15
75%,0.42,0.26,0.54,0.23
max,0.45,0.31,0.61,0.46






Unnamed: 0,198_caption_sim,198_context_sim,ped_caption_sim,ped_context_sim
count,386.0,386.0,386.0,386.0
mean,0.43,0.19,0.4,0.15
std,0.04,0.14,0.1,0.12
min,0.39,-0.1,0.29,-0.07
25%,0.4,0.07,0.32,0.07
50%,0.42,0.17,0.37,0.12
75%,0.45,0.29,0.47,0.2
max,0.61,0.56,0.73,0.68






Unnamed: 0,114_caption_sim,114_context_sim,ped_caption_sim,ped_context_sim
count,236.0,236.0,236.0,236.0
mean,0.51,0.36,0.35,0.17
std,0.08,0.16,0.08,0.08
min,0.39,-0.09,0.29,-0.06
25%,0.44,0.28,0.3,0.12
50%,0.51,0.41,0.32,0.17
75%,0.6,0.48,0.34,0.21
max,0.65,0.64,0.73,0.48






Unnamed: 0,419_caption_sim,419_context_sim,ped_caption_sim,ped_context_sim
count,1149.0,1149.0,1149.0,1149.0
mean,0.43,0.18,0.37,0.13
std,0.03,0.14,0.08,0.11
min,0.39,-0.16,0.29,-0.15
25%,0.4,0.07,0.31,0.06
50%,0.42,0.18,0.34,0.11
75%,0.45,0.29,0.41,0.19
max,0.61,0.55,0.73,0.69






Unnamed: 0,294_caption_sim,294_context_sim,ped_caption_sim,ped_context_sim
count,227.0,227.0,227.0,227.0
mean,0.43,0.18,0.47,0.18
std,0.04,0.16,0.1,0.15
min,0.39,-0.11,0.29,-0.07
25%,0.4,0.06,0.4,0.08
50%,0.42,0.14,0.48,0.14
75%,0.45,0.29,0.54,0.24
max,0.61,0.61,0.73,0.69






Unnamed: 0,388_caption_sim,388_context_sim,ped_caption_sim,ped_context_sim
count,13.0,13.0,13.0,13.0
mean,0.42,0.12,0.5,0.1
std,0.02,0.11,0.12,0.1
min,0.39,0.01,0.35,-0.04
25%,0.4,0.04,0.4,0.05
50%,0.41,0.12,0.46,0.08
75%,0.43,0.17,0.6,0.11
max,0.46,0.34,0.7,0.33






Unnamed: 0,314_caption_sim,314_context_sim,ped_caption_sim,ped_context_sim
count,599.0,599.0,599.0,599.0
mean,0.44,0.19,0.4,0.15
std,0.04,0.13,0.09,0.12
min,0.39,-0.18,0.29,-0.13
25%,0.41,0.1,0.32,0.07
50%,0.42,0.18,0.38,0.12
75%,0.45,0.27,0.47,0.2
max,0.68,0.67,0.73,0.69






Unnamed: 0,98_caption_sim,98_context_sim,ped_caption_sim,ped_context_sim
count,1462.0,1462.0,1462.0,1462.0
mean,0.46,0.22,0.37,0.15
std,0.05,0.16,0.08,0.11
min,0.39,-0.15,0.29,-0.15
25%,0.42,0.09,0.31,0.08
50%,0.45,0.2,0.35,0.14
75%,0.47,0.37,0.4,0.18
max,0.73,0.64,0.73,0.69






Unnamed: 0,80_caption_sim,80_context_sim,ped_caption_sim,ped_context_sim
count,48.0,48.0,48.0,48.0
mean,0.42,0.2,0.45,0.2
std,0.02,0.16,0.14,0.13
min,0.39,-0.1,0.29,0.02
25%,0.4,0.1,0.33,0.1
50%,0.41,0.17,0.38,0.18
75%,0.43,0.34,0.57,0.26
max,0.5,0.59,0.73,0.5






Unnamed: 0,134_caption_sim,134_context_sim,ped_caption_sim,ped_context_sim
count,1286.0,1286.0,1286.0,1286.0
mean,0.44,0.21,0.37,0.13
std,0.04,0.13,0.08,0.11
min,0.39,-0.15,0.29,-0.13
25%,0.4,0.11,0.31,0.06
50%,0.42,0.21,0.34,0.11
75%,0.45,0.3,0.42,0.18
max,0.62,0.59,0.73,0.69






Unnamed: 0,432_caption_sim,432_context_sim,ped_caption_sim,ped_context_sim
count,38.0,38.0,38.0,38.0
mean,0.42,0.12,0.46,0.14
std,0.03,0.14,0.14,0.12
min,0.39,-0.13,0.29,-0.03
25%,0.4,0.02,0.32,0.07
50%,0.42,0.09,0.46,0.12
75%,0.43,0.23,0.59,0.18
max,0.52,0.44,0.73,0.46






Unnamed: 0,394_caption_sim,394_context_sim,ped_caption_sim,ped_context_sim
count,1521.0,1521.0,1521.0,1521.0
mean,0.43,0.19,0.38,0.13
std,0.04,0.12,0.08,0.11
min,0.39,-0.11,0.29,-0.13
25%,0.4,0.1,0.32,0.05
50%,0.42,0.19,0.36,0.11
75%,0.45,0.27,0.42,0.18
max,0.68,0.58,0.73,0.69






Unnamed: 0,53_caption_sim,53_context_sim,ped_caption_sim,ped_context_sim
count,126.0,126.0,126.0,126.0
mean,0.43,0.18,0.48,0.16
std,0.04,0.15,0.1,0.14
min,0.39,-0.14,0.29,-0.07
25%,0.41,0.08,0.4,0.07
50%,0.42,0.16,0.49,0.12
75%,0.46,0.28,0.55,0.24
max,0.57,0.54,0.73,0.52






Unnamed: 0,56_caption_sim,56_context_sim,ped_caption_sim,ped_context_sim
count,942.0,942.0,942.0,942.0
mean,0.44,0.24,0.37,0.14
std,0.04,0.13,0.09,0.09
min,0.39,-0.14,0.29,-0.12
25%,0.41,0.15,0.31,0.08
50%,0.43,0.25,0.33,0.13
75%,0.46,0.34,0.42,0.18
max,0.63,0.56,0.73,0.69






Unnamed: 0,150_caption_sim,150_context_sim,ped_caption_sim,ped_context_sim
count,699.0,699.0,699.0,699.0
mean,0.43,0.21,0.39,0.16
std,0.04,0.14,0.09,0.13
min,0.39,-0.13,0.29,-0.13
25%,0.4,0.11,0.32,0.08
50%,0.42,0.2,0.37,0.13
75%,0.45,0.31,0.46,0.21
max,0.64,0.53,0.73,0.69






Unnamed: 0,278_caption_sim,278_context_sim,ped_caption_sim,ped_context_sim
count,1980.0,1980.0,1980.0,1980.0
mean,0.44,0.19,0.37,0.13
std,0.04,0.12,0.07,0.11
min,0.39,-0.14,0.29,-0.15
25%,0.41,0.11,0.31,0.06
50%,0.43,0.19,0.35,0.11
75%,0.46,0.27,0.42,0.18
max,0.71,0.56,0.73,0.69






Unnamed: 0,-1_caption_sim,-1_context_sim,ped_caption_sim,ped_context_sim
count,504.0,504.0,504.0,504.0
mean,0.42,0.12,0.39,0.12
std,0.02,0.09,0.08,0.11
min,0.39,-0.12,0.29,-0.15
25%,0.4,0.06,0.33,0.05
50%,0.41,0.11,0.38,0.1
75%,0.43,0.17,0.43,0.17
max,0.57,0.41,0.73,0.69






Unnamed: 0,43_caption_sim,43_context_sim,ped_caption_sim,ped_context_sim
count,367.0,367.0,367.0,367.0
mean,0.43,0.24,0.36,0.17
std,0.04,0.14,0.09,0.1
min,0.39,-0.16,0.29,-0.08
25%,0.4,0.13,0.31,0.1
50%,0.42,0.26,0.33,0.17
75%,0.45,0.35,0.37,0.22
max,0.64,0.52,0.73,0.69






Unnamed: 0,265_caption_sim,265_context_sim,ped_caption_sim,ped_context_sim
count,56.0,56.0,56.0,56.0
mean,0.43,0.19,0.5,0.17
std,0.04,0.16,0.11,0.16
min,0.39,-0.02,0.3,-0.06
25%,0.4,0.08,0.41,0.08
50%,0.41,0.15,0.52,0.12
75%,0.44,0.28,0.57,0.21
max,0.54,0.55,0.73,0.68






Unnamed: 0,345_caption_sim,345_context_sim,ped_caption_sim,ped_context_sim
count,240.0,240.0,240.0,240.0
mean,0.43,0.21,0.44,0.17
std,0.04,0.14,0.1,0.14
min,0.39,-0.12,0.29,-0.12
25%,0.4,0.11,0.34,0.08
50%,0.42,0.18,0.44,0.13
75%,0.45,0.31,0.53,0.23
max,0.67,0.66,0.73,0.69






Unnamed: 0,378_caption_sim,378_context_sim,ped_caption_sim,ped_context_sim
count,566.0,566.0,566.0,566.0
mean,0.43,0.21,0.4,0.14
std,0.04,0.13,0.09,0.12
min,0.39,-0.2,0.29,-0.15
25%,0.4,0.12,0.32,0.06
50%,0.42,0.2,0.38,0.12
75%,0.45,0.3,0.47,0.19
max,0.63,0.54,0.73,0.69






Unnamed: 0,11_caption_sim,11_context_sim,ped_caption_sim,ped_context_sim
count,669.0,669.0,669.0,669.0
mean,0.43,0.22,0.38,0.14
std,0.04,0.13,0.09,0.11
min,0.39,-0.12,0.29,-0.13
25%,0.4,0.13,0.31,0.07
50%,0.42,0.22,0.35,0.13
75%,0.45,0.32,0.43,0.18
max,0.61,0.53,0.73,0.69






Unnamed: 0,331_caption_sim,331_context_sim,ped_caption_sim,ped_context_sim
count,797.0,797.0,797.0,797.0
mean,0.43,0.19,0.4,0.13
std,0.03,0.12,0.09,0.12
min,0.39,-0.14,0.29,-0.13
25%,0.4,0.11,0.32,0.05
50%,0.42,0.19,0.38,0.1
75%,0.44,0.28,0.46,0.16
max,0.61,0.55,0.73,0.69






Unnamed: 0,202_caption_sim,202_context_sim,ped_caption_sim,ped_context_sim
count,159.0,159.0,159.0,159.0
mean,0.43,0.17,0.47,0.16
std,0.03,0.13,0.1,0.14
min,0.39,-0.1,0.29,-0.06
25%,0.4,0.09,0.39,0.08
50%,0.42,0.16,0.47,0.12
75%,0.44,0.23,0.53,0.19
max,0.62,0.57,0.73,0.69






Unnamed: 0,169_caption_sim,169_context_sim,ped_caption_sim,ped_context_sim
count,254.0,254.0,254.0,254.0
mean,0.43,0.17,0.45,0.14
std,0.04,0.14,0.1,0.13
min,0.39,-0.06,0.29,-0.08
25%,0.4,0.07,0.35,0.06
50%,0.42,0.13,0.46,0.11
75%,0.45,0.22,0.54,0.18
max,0.62,0.62,0.73,0.69






Unnamed: 0,327_caption_sim,327_context_sim,ped_caption_sim,ped_context_sim
count,262.0,262.0,262.0,262.0
mean,0.44,0.22,0.33,0.13
std,0.04,0.14,0.06,0.09
min,0.39,-0.08,0.29,-0.07
25%,0.41,0.1,0.3,0.07
50%,0.43,0.22,0.31,0.12
75%,0.46,0.33,0.34,0.18
max,0.58,0.5,0.7,0.5






Unnamed: 0,309_caption_sim,309_context_sim,ped_caption_sim,ped_context_sim
count,365.0,365.0,365.0,365.0
mean,0.43,0.23,0.41,0.13
std,0.03,0.12,0.1,0.11
min,0.39,-0.09,0.29,-0.12
25%,0.4,0.15,0.32,0.07
50%,0.42,0.24,0.39,0.11
75%,0.45,0.32,0.47,0.17
max,0.56,0.52,0.73,0.69






Unnamed: 0,68_caption_sim,68_context_sim,ped_caption_sim,ped_context_sim
count,114.0,114.0,114.0,114.0
mean,0.42,0.22,0.38,0.13
std,0.03,0.12,0.11,0.1
min,0.39,-0.03,0.29,-0.09
25%,0.4,0.14,0.3,0.07
50%,0.42,0.21,0.33,0.12
75%,0.44,0.31,0.42,0.19
max,0.51,0.43,0.73,0.5






Unnamed: 0,216_caption_sim,216_context_sim,ped_caption_sim,ped_context_sim
count,412.0,412.0,412.0,412.0
mean,0.43,0.2,0.39,0.13
std,0.04,0.11,0.09,0.1
min,0.39,-0.08,0.29,-0.08
25%,0.4,0.13,0.31,0.07
50%,0.41,0.21,0.35,0.12
75%,0.44,0.27,0.45,0.18
max,0.61,0.48,0.73,0.69






Unnamed: 0,40_caption_sim,40_context_sim,ped_caption_sim,ped_context_sim
count,502.0,502.0,502.0,502.0
mean,0.45,0.22,0.39,0.14
std,0.05,0.14,0.1,0.12
min,0.39,-0.14,0.29,-0.07
25%,0.41,0.09,0.31,0.07
50%,0.44,0.22,0.33,0.12
75%,0.47,0.33,0.48,0.18
max,0.6,0.53,0.73,0.69






Unnamed: 0,99_caption_sim,99_context_sim,ped_caption_sim,ped_context_sim
count,73.0,73.0,73.0,73.0
mean,0.43,0.15,0.49,0.16
std,0.05,0.14,0.1,0.13
min,0.39,-0.07,0.29,-0.06
25%,0.4,0.06,0.44,0.08
50%,0.41,0.12,0.5,0.11
75%,0.44,0.23,0.55,0.18
max,0.57,0.45,0.73,0.5






Unnamed: 0,426_caption_sim,426_context_sim,ped_caption_sim,ped_context_sim
count,13.0,13.0,13.0,13.0
mean,0.42,0.16,0.5,0.15
std,0.01,0.18,0.13,0.19
min,0.39,-0.03,0.32,-0.08
25%,0.41,0.02,0.39,0.02
50%,0.42,0.15,0.56,0.07
75%,0.43,0.21,0.6,0.24
max,0.44,0.49,0.7,0.5






Unnamed: 0,344_caption_sim,344_context_sim,ped_caption_sim,ped_context_sim
count,241.0,241.0,241.0,241.0
mean,0.43,0.17,0.43,0.11
std,0.03,0.1,0.11,0.13
min,0.39,-0.06,0.29,-0.08
25%,0.41,0.1,0.32,0.04
50%,0.43,0.17,0.42,0.08
75%,0.45,0.24,0.51,0.15
max,0.58,0.52,0.73,0.68






Unnamed: 0,47_caption_sim,47_context_sim,ped_caption_sim,ped_context_sim
count,1001.0,1001.0,1001.0,1001.0
mean,0.44,0.26,0.36,0.13
std,0.04,0.14,0.08,0.09
min,0.39,-0.12,0.29,-0.1
25%,0.41,0.18,0.3,0.08
50%,0.43,0.27,0.32,0.13
75%,0.46,0.34,0.39,0.18
max,0.71,0.71,0.73,0.68






Unnamed: 0,8_caption_sim,8_context_sim,ped_caption_sim,ped_context_sim
count,181.0,181.0,181.0,181.0
mean,0.43,0.25,0.36,0.17
std,0.04,0.16,0.08,0.09
min,0.39,-0.15,0.29,-0.08
25%,0.41,0.12,0.31,0.11
50%,0.42,0.26,0.34,0.17
75%,0.45,0.39,0.38,0.23
max,0.58,0.53,0.73,0.5






Unnamed: 0,416_caption_sim,416_context_sim,ped_caption_sim,ped_context_sim
count,9.0,9.0,9.0,9.0
mean,0.49,0.22,0.54,0.19
std,0.1,0.23,0.12,0.19
min,0.39,-0.03,0.36,0.02
25%,0.43,0.1,0.47,0.05
50%,0.46,0.13,0.56,0.11
75%,0.54,0.4,0.6,0.24
max,0.66,0.55,0.7,0.5






Unnamed: 0,112_caption_sim,112_context_sim,ped_caption_sim,ped_context_sim
count,567.0,567.0,567.0,567.0
mean,0.44,0.2,0.4,0.14
std,0.04,0.13,0.09,0.11
min,0.39,-0.12,0.29,-0.12
25%,0.4,0.11,0.32,0.07
50%,0.43,0.21,0.38,0.12
75%,0.46,0.3,0.47,0.17
max,0.62,0.55,0.73,0.69






Unnamed: 0,137_caption_sim,137_context_sim,ped_caption_sim,ped_context_sim
count,1174.0,1174.0,1174.0,1174.0
mean,0.44,0.18,0.39,0.14
std,0.04,0.12,0.08,0.11
min,0.39,-0.14,0.29,-0.13
25%,0.41,0.09,0.32,0.06
50%,0.43,0.17,0.37,0.12
75%,0.46,0.27,0.45,0.18
max,0.61,0.55,0.73,0.69






Unnamed: 0,441_caption_sim,441_context_sim,ped_caption_sim,ped_context_sim
count,14.0,14.0,14.0,14.0
mean,0.4,0.09,0.52,0.15
std,0.01,0.12,0.15,0.17
min,0.39,-0.03,0.31,-0.03
25%,0.4,0.02,0.43,0.06
50%,0.4,0.04,0.52,0.11
75%,0.41,0.12,0.63,0.16
max,0.44,0.41,0.73,0.69






Unnamed: 0,192_caption_sim,192_context_sim,ped_caption_sim,ped_context_sim
count,1.0,1.0,1.0,1.0
mean,0.47,0.02,0.43,0.14
std,,,,
min,0.47,0.02,0.43,0.14
25%,0.47,0.02,0.43,0.14
50%,0.47,0.02,0.43,0.14
75%,0.47,0.02,0.43,0.14
max,0.47,0.02,0.43,0.14






Unnamed: 0,20_caption_sim,20_context_sim,ped_caption_sim,ped_context_sim
count,73.0,73.0,73.0,73.0
mean,0.42,0.18,0.4,0.16
std,0.04,0.13,0.12,0.11
min,0.39,-0.08,0.29,-0.02
25%,0.4,0.08,0.33,0.1
50%,0.41,0.14,0.34,0.13
75%,0.43,0.32,0.45,0.2
max,0.62,0.44,0.73,0.5






Unnamed: 0,296_caption_sim,296_context_sim,ped_caption_sim,ped_context_sim
count,337.0,337.0,337.0,337.0
mean,0.43,0.21,0.37,0.11
std,0.04,0.11,0.09,0.08
min,0.39,-0.08,0.29,-0.07
25%,0.4,0.12,0.3,0.06
50%,0.42,0.21,0.33,0.1
75%,0.44,0.3,0.4,0.15
max,0.61,0.44,0.73,0.52






Unnamed: 0,144_caption_sim,144_context_sim,ped_caption_sim,ped_context_sim
count,11.0,11.0,11.0,11.0
mean,0.42,0.14,0.49,0.21
std,0.02,0.18,0.1,0.16
min,0.39,-0.12,0.3,0.05
25%,0.41,0.03,0.42,0.1
50%,0.41,0.11,0.44,0.13
75%,0.43,0.14,0.58,0.28
max,0.44,0.46,0.65,0.5






Unnamed: 0,12_caption_sim,12_context_sim,ped_caption_sim,ped_context_sim
count,116.0,116.0,116.0,116.0
mean,0.44,0.16,0.44,0.13
std,0.05,0.12,0.11,0.14
min,0.39,-0.13,0.29,-0.06
25%,0.4,0.07,0.35,0.03
50%,0.42,0.16,0.42,0.08
75%,0.46,0.26,0.52,0.21
max,0.63,0.48,0.73,0.55






Unnamed: 0,224_caption_sim,224_context_sim,ped_caption_sim,ped_context_sim
count,423.0,423.0,423.0,423.0
mean,0.43,0.24,0.39,0.14
std,0.03,0.11,0.1,0.1
min,0.39,-0.11,0.29,-0.07
25%,0.4,0.17,0.31,0.08
50%,0.42,0.26,0.36,0.12
75%,0.44,0.33,0.47,0.17
max,0.54,0.46,0.73,0.69






Unnamed: 0,250_caption_sim,250_context_sim,ped_caption_sim,ped_context_sim
count,0.0,0.0,0.0,0.0
mean,,,,
std,,,,
min,,,,
25%,,,,
50%,,,,
75%,,,,
max,,,,






Unnamed: 0,376_caption_sim,376_context_sim,ped_caption_sim,ped_context_sim
count,32.0,32.0,32.0,32.0
mean,0.41,0.16,0.38,0.1
std,0.02,0.11,0.11,0.1
min,0.39,-0.13,0.29,-0.15
25%,0.4,0.11,0.31,0.08
50%,0.41,0.17,0.34,0.11
75%,0.42,0.25,0.41,0.14
max,0.46,0.36,0.7,0.34






In [45]:
topics = get_topics(concept='pedestrian')

In [46]:
topics

{196: {'cross',
  'crossing',
  'crosswalk',
  'pedestrian',
  'signal',
  'stripe',
  'traffic'},
 412: {'arrest',
  'pedestrian',
  'perp',
  'pers',
  'walk',
  'walkability',
  'walkable'},
 172: {'car',
  'lane',
  'pedestrian',
  'road',
  'street',
  'traffic',
  'transportation',
  'vehicle'},
 363: {'assault',
  'bystander',
  'intervene',
  'intervention',
  'pedestrian',
  'stalk',
  'stalker',
  'victim',
  'witness'},
 198: {'car',
  'garage',
  'park',
  'parking',
  'parkjockey',
  'pedestrian',
  'tow',
  'vehicle'},
 114: {'athlete',
  'disability',
  'paralympic',
  'paralympics',
  'pedestrian',
  'sport',
  'sportspeople',
  'wheelchair'},
 419: {'avenue',
  'boulevard',
  'intersection',
  'manhattan',
  'pedestrian',
  'road',
  'street',
  'suffix'},
 294: {'drive',
  'fine',
  'licence',
  'offence',
  'pedestrian',
  'penalty',
  'reckless',
  'speed',
  'ticket',
  'traffic',
  'violation'},
 388: {'brownian',
  'diffusion',
  'distribution',
  'markov',
  'pe

In [47]:
def simple_search(context, t):
    percentage = 0
    words = list(topics[t])
    for w in words:
        if w in context:
            percentage = percentage + 1
    return float(percentage/len(words))

In [48]:
list(topics[376])

['cartoon',
 'charlie',
 'disney',
 'trolley',
 'mickey',
 'lucy',
 'snoopy',
 'bear',
 'peanut',
 'pedestrian']

In [49]:
df_dict[el]['context'].tolist()[0]

'The Mangapps Railway Museum is a heritage railway centre located near Burnham-on-Crouch in Essex, England. The 0.75 miles of standard gauge running line and museum are owned and operated by the Jolly family assisted by volunteers.\n'

In [50]:
print(el)
simple_search(df_dict[el]['context'].tolist()[0], 376)

376_caption_score


0.0

In [51]:
for t in topics:
    df_key = str(t) + '_caption_score'
    df_dict[df_key]['context_gt'] = df_dict[df_key]['context'].apply(simple_search, t=t)


In [52]:
df_dict['412_caption_score'].describe().round(2)

Unnamed: 0,412_caption_sim,412_context_sim,ped_caption_sim,ped_context_sim,context_gt
count,302.0,302.0,302.0,302.0,302.0
mean,0.43,0.17,0.48,0.17,0.11
std,0.04,0.15,0.08,0.15,0.14
min,0.39,-0.09,0.31,-0.08,0.0
25%,0.4,0.06,0.42,0.07,0.0
50%,0.42,0.14,0.48,0.13,0.0
75%,0.45,0.27,0.53,0.24,0.14
max,0.61,0.57,0.73,0.69,0.57


In [53]:
df_dict['412_caption_score']

Unnamed: 0,412_caption_sim,412_context_sim,image_url,caption,context,ped_caption_sim,ped_context_sim,context_gt
874765,0.610392,0.199226,https://upload.wikimedia.org/wikipedia/commons...,Pedestrian trail English: Pedestrian trail,The Blue Water River Walk is a nearly one mile...,0.704843,0.105616,0.142857
479397,0.586082,0.008303,https://upload.wikimedia.org/wikipedia/commons...,Pedestrian walk in Ferizaj English: Ferizaj City,"Ferizaj, in southeastern Kosovo, is its third-...",0.595274,0.053994,0.000000
160058,0.573594,0.481619,https://upload.wikimedia.org/wikipedia/commons...,Prohibition of pedestrians (includes any kind ...,Jaywalking occurs when a pedestrian walks in o...,0.548525,0.483317,0.428571
496246,0.565820,0.190722,https://upload.wikimedia.org/wikipedia/commons...,English: Pedestrian crossing,Road signs used by countries in the Americas a...,0.728629,0.157267,0.000000
812796,0.565820,0.160237,https://upload.wikimedia.org/wikipedia/commons...,English: Pedestrian crossing,Road signs in Malaysia are standardised road s...,0.728629,0.152377,0.000000
...,...,...,...,...,...,...,...,...
970576,0.391278,0.305109,https://upload.wikimedia.org/wikipedia/commons...,Dalton Road entrance to Portland Walk in 2012 ...,Portland Walk is a shopping centre in Barrow-i...,0.310865,0.180440,0.000000
261512,0.391050,0.137319,https://upload.wikimedia.org/wikipedia/commons...,English: Surtees Bridge pedestrian cycleway.,The Surtees Bridge is a road bridge carrying t...,0.480469,0.071229,0.000000
299516,0.390983,0.060743,http://upload.wikimedia.org/wikipedia/commons/...,Station and the connecting pedestrian walkway ...,Utsunomiya Station is a railway station in the...,0.411879,0.036971,0.000000
221112,0.390926,0.018218,https://upload.wikimedia.org/wikipedia/commons...,English: 2016 Walking Liberty Centennial Obverse,"Although technically a circulating coin, no do...",0.389392,0.002155,0.000000


In [54]:
pg_dict = dict()
pg_dict['a'] = [68, 112, 56, 224]
pg_dict['b'] = [309, 344, 47, -1, 53]
pg_dict['c'] = [314, 331, 419, 394]
pg_dict['d'] = [98, 196, 202, 294]
pg_dict['e'] = [172, 137, 11, 134, 150, 216]
pg_dict['f'] = [363, 40, 43]
pg_dict['g'] = [441, 376]
pg_dict['h'] = [144, 416, 327]
pg_dict['i'] = [192, 20, 114]
pg_dict['j'] = [345]
pg_dict['k'] = [8, 265]
pg_dict['l'] = [250, 80, 426]
pg_dict['m'] = [169, 99, 378]
pg_dict['n'] = [432, 388, 412]
pg_dict['o'] = [12, 278, 198, 296] 

In [55]:
k_name = ['image_url', 'caption', 'context', 'concept2caption_sim', 
          'concept2context_sim', 'context_gt']
# k_name = ['image_url', 'caption', 'context', 'concept2caption_sim', 
#           'concept2context_sim']
for el in pg_dict:
    pg_dict[el] = [f'{i}_caption_score' for i in pg_dict[el]]
    pg_dict[el] = [df_dict[i] for i in pg_dict[el]]
    pg_dict[el] = [df_.reset_index(drop=True) for df_ in pg_dict[el]]
    

In [56]:
for el in pg_dict:
    t_df = pd.DataFrame(columns = ['topic2caption_sim', 'topic2context_sim'] + k_name)
    for df_ in pg_dict[el]:
        df_.columns = ['topic2caption_sim', 'topic2context_sim'] + k_name
        t_df = pd.concat([t_df, df_], ignore_index=True)
    pg_dict[el] = t_df

In [57]:
for el in pg_dict:
    print(el)
    display(pg_dict[el].describe().round(2))


a


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,2046.0,2046.0,2046.0,2046.0,2046.0
mean,0.44,0.23,0.38,0.14,0.13
std,0.04,0.13,0.09,0.1,0.14
min,0.39,-0.14,0.29,-0.12,0.0
25%,0.4,0.14,0.31,0.08,0.0
50%,0.42,0.24,0.34,0.13,0.11
75%,0.45,0.33,0.45,0.17,0.2
max,0.63,0.56,0.73,0.69,0.78


b


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,2237.0,2237.0,2237.0,2237.0,2237.0
mean,0.43,0.21,0.39,0.13,0.12
std,0.04,0.13,0.09,0.11,0.12
min,0.39,-0.14,0.29,-0.15,0.0
25%,0.4,0.11,0.31,0.06,0.0
50%,0.42,0.21,0.36,0.11,0.1
75%,0.45,0.31,0.45,0.17,0.2
max,0.71,0.71,0.73,0.69,0.7


c


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,4066.0,4066.0,4066.0,4066.0,4066.0
mean,0.43,0.19,0.38,0.13,0.09
std,0.04,0.13,0.08,0.12,0.13
min,0.39,-0.18,0.29,-0.15,0.0
25%,0.4,0.09,0.31,0.06,0.0
50%,0.42,0.18,0.36,0.11,0.0
75%,0.45,0.28,0.43,0.18,0.14
max,0.68,0.67,0.73,0.69,1.0


d


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,2518.0,2518.0,2518.0,2518.0,2518.0
mean,0.45,0.21,0.4,0.16,0.17
std,0.05,0.16,0.09,0.13,0.19
min,0.39,-0.15,0.29,-0.15,0.0
25%,0.41,0.09,0.32,0.08,0.0
50%,0.44,0.19,0.36,0.14,0.1
75%,0.47,0.32,0.46,0.2,0.3
max,0.76,0.75,0.73,0.69,1.0


e


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,4975.0,4975.0,4975.0,4975.0,4975.0
mean,0.43,0.2,0.39,0.14,0.13
std,0.04,0.13,0.09,0.12,0.16
min,0.39,-0.15,0.29,-0.13,0.0
25%,0.4,0.1,0.32,0.07,0.0
50%,0.42,0.2,0.36,0.12,0.09
75%,0.45,0.29,0.45,0.19,0.2
max,0.64,0.59,0.73,0.69,1.0


f


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,888.0,888.0,888.0,888.0,888.0
mean,0.44,0.23,0.38,0.15,0.15
std,0.04,0.14,0.1,0.11,0.15
min,0.39,-0.16,0.29,-0.08,0.0
25%,0.41,0.11,0.31,0.08,0.0
50%,0.43,0.24,0.33,0.13,0.12
75%,0.46,0.34,0.43,0.21,0.25
max,0.64,0.53,0.73,0.69,0.75


g


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,46.0,46.0,46.0,46.0,46.0
mean,0.41,0.14,0.42,0.12,0.02
std,0.02,0.12,0.14,0.12,0.05
min,0.39,-0.13,0.29,-0.15,0.0
25%,0.4,0.05,0.32,0.08,0.0
50%,0.41,0.15,0.36,0.11,0.0
75%,0.42,0.24,0.49,0.15,0.0
max,0.46,0.41,0.73,0.69,0.22


h


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,282.0,282.0,282.0,282.0,282.0
mean,0.44,0.22,0.34,0.13,0.1
std,0.05,0.15,0.08,0.1,0.11
min,0.39,-0.12,0.29,-0.07,0.0
25%,0.41,0.1,0.3,0.07,0.0
50%,0.43,0.21,0.31,0.12,0.1
75%,0.46,0.33,0.34,0.18,0.2
max,0.66,0.55,0.7,0.5,0.5


i


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,310.0,310.0,310.0,310.0,310.0
mean,0.49,0.32,0.36,0.17,0.16
std,0.08,0.17,0.09,0.09,0.16
min,0.39,-0.09,0.29,-0.06,0.0
25%,0.41,0.17,0.31,0.11,0.0
50%,0.47,0.34,0.33,0.16,0.12
75%,0.59,0.47,0.36,0.21,0.25
max,0.65,0.64,0.73,0.5,0.62


j


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,240.0,240.0,240.0,240.0,240.0
mean,0.43,0.21,0.44,0.17,0.14
std,0.04,0.14,0.1,0.14,0.14
min,0.39,-0.12,0.29,-0.12,0.0
25%,0.4,0.11,0.34,0.08,0.0
50%,0.42,0.18,0.44,0.13,0.12
75%,0.45,0.31,0.53,0.23,0.25
max,0.67,0.66,0.73,0.69,0.75


k


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,237.0,237.0,237.0,237.0,237.0
mean,0.43,0.24,0.39,0.17,0.23
std,0.04,0.16,0.11,0.11,0.24
min,0.39,-0.15,0.29,-0.08,0.0
25%,0.41,0.11,0.32,0.1,0.0
50%,0.42,0.23,0.35,0.16,0.11
75%,0.45,0.38,0.44,0.23,0.33
max,0.58,0.55,0.73,0.68,0.89


l


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,61.0,61.0,61.0,61.0,61.0
mean,0.42,0.19,0.46,0.19,0.1
std,0.02,0.16,0.14,0.14,0.13
min,0.39,-0.1,0.29,-0.08,0.0
25%,0.41,0.07,0.33,0.07,0.0
50%,0.42,0.15,0.42,0.17,0.1
75%,0.43,0.34,0.58,0.25,0.2
max,0.5,0.59,0.73,0.5,0.6


m


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,893.0,893.0,893.0,893.0,893.0
mean,0.43,0.19,0.42,0.14,0.14
std,0.04,0.14,0.1,0.13,0.15
min,0.39,-0.2,0.29,-0.15,0.0
25%,0.4,0.09,0.33,0.06,0.0
50%,0.42,0.17,0.42,0.12,0.12
75%,0.45,0.28,0.51,0.19,0.25
max,0.63,0.62,0.73,0.69,0.75


n


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,353.0,353.0,353.0,353.0,353.0
mean,0.43,0.17,0.48,0.16,0.1
std,0.04,0.15,0.09,0.15,0.13
min,0.39,-0.13,0.29,-0.08,0.0
25%,0.4,0.05,0.42,0.07,0.0
50%,0.42,0.14,0.48,0.12,0.0
75%,0.45,0.26,0.54,0.23,0.14
max,0.61,0.57,0.73,0.69,0.57


o


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,2819.0,2819.0,2819.0,2819.0,2819.0
mean,0.44,0.19,0.38,0.13,0.08
std,0.04,0.12,0.08,0.11,0.12
min,0.39,-0.14,0.29,-0.15,0.0
25%,0.41,0.1,0.31,0.06,0.0
50%,0.43,0.19,0.35,0.11,0.0
75%,0.46,0.28,0.42,0.18,0.12
max,0.71,0.56,0.73,0.69,0.88


In [58]:
df__ = pd.DataFrame()
for el in pg_dict:
    df__ = pd.concat([df_, pg_dict[el]], ignore_index=True)
display(df__.describe().round(2))

Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,3156.0,3156.0,3156.0,3156.0,3156.0
mean,0.44,0.19,0.38,0.13,0.09
std,0.04,0.12,0.08,0.11,0.12
min,0.39,-0.14,0.29,-0.15,0.0
25%,0.41,0.1,0.31,0.06,0.0
50%,0.42,0.19,0.35,0.11,0.0
75%,0.46,0.28,0.42,0.18,0.12
max,0.71,0.56,0.73,0.69,0.88


In [59]:
for el in pg_dict:
    print(el)
    display(pg_dict[el].describe().round(2))


a


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,2046.0,2046.0,2046.0,2046.0,2046.0
mean,0.44,0.23,0.38,0.14,0.13
std,0.04,0.13,0.09,0.1,0.14
min,0.39,-0.14,0.29,-0.12,0.0
25%,0.4,0.14,0.31,0.08,0.0
50%,0.42,0.24,0.34,0.13,0.11
75%,0.45,0.33,0.45,0.17,0.2
max,0.63,0.56,0.73,0.69,0.78


b


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,2237.0,2237.0,2237.0,2237.0,2237.0
mean,0.43,0.21,0.39,0.13,0.12
std,0.04,0.13,0.09,0.11,0.12
min,0.39,-0.14,0.29,-0.15,0.0
25%,0.4,0.11,0.31,0.06,0.0
50%,0.42,0.21,0.36,0.11,0.1
75%,0.45,0.31,0.45,0.17,0.2
max,0.71,0.71,0.73,0.69,0.7


c


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,4066.0,4066.0,4066.0,4066.0,4066.0
mean,0.43,0.19,0.38,0.13,0.09
std,0.04,0.13,0.08,0.12,0.13
min,0.39,-0.18,0.29,-0.15,0.0
25%,0.4,0.09,0.31,0.06,0.0
50%,0.42,0.18,0.36,0.11,0.0
75%,0.45,0.28,0.43,0.18,0.14
max,0.68,0.67,0.73,0.69,1.0


d


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,2518.0,2518.0,2518.0,2518.0,2518.0
mean,0.45,0.21,0.4,0.16,0.17
std,0.05,0.16,0.09,0.13,0.19
min,0.39,-0.15,0.29,-0.15,0.0
25%,0.41,0.09,0.32,0.08,0.0
50%,0.44,0.19,0.36,0.14,0.1
75%,0.47,0.32,0.46,0.2,0.3
max,0.76,0.75,0.73,0.69,1.0


e


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,4975.0,4975.0,4975.0,4975.0,4975.0
mean,0.43,0.2,0.39,0.14,0.13
std,0.04,0.13,0.09,0.12,0.16
min,0.39,-0.15,0.29,-0.13,0.0
25%,0.4,0.1,0.32,0.07,0.0
50%,0.42,0.2,0.36,0.12,0.09
75%,0.45,0.29,0.45,0.19,0.2
max,0.64,0.59,0.73,0.69,1.0


f


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,888.0,888.0,888.0,888.0,888.0
mean,0.44,0.23,0.38,0.15,0.15
std,0.04,0.14,0.1,0.11,0.15
min,0.39,-0.16,0.29,-0.08,0.0
25%,0.41,0.11,0.31,0.08,0.0
50%,0.43,0.24,0.33,0.13,0.12
75%,0.46,0.34,0.43,0.21,0.25
max,0.64,0.53,0.73,0.69,0.75


g


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,46.0,46.0,46.0,46.0,46.0
mean,0.41,0.14,0.42,0.12,0.02
std,0.02,0.12,0.14,0.12,0.05
min,0.39,-0.13,0.29,-0.15,0.0
25%,0.4,0.05,0.32,0.08,0.0
50%,0.41,0.15,0.36,0.11,0.0
75%,0.42,0.24,0.49,0.15,0.0
max,0.46,0.41,0.73,0.69,0.22


h


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,282.0,282.0,282.0,282.0,282.0
mean,0.44,0.22,0.34,0.13,0.1
std,0.05,0.15,0.08,0.1,0.11
min,0.39,-0.12,0.29,-0.07,0.0
25%,0.41,0.1,0.3,0.07,0.0
50%,0.43,0.21,0.31,0.12,0.1
75%,0.46,0.33,0.34,0.18,0.2
max,0.66,0.55,0.7,0.5,0.5


i


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,310.0,310.0,310.0,310.0,310.0
mean,0.49,0.32,0.36,0.17,0.16
std,0.08,0.17,0.09,0.09,0.16
min,0.39,-0.09,0.29,-0.06,0.0
25%,0.41,0.17,0.31,0.11,0.0
50%,0.47,0.34,0.33,0.16,0.12
75%,0.59,0.47,0.36,0.21,0.25
max,0.65,0.64,0.73,0.5,0.62


j


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,240.0,240.0,240.0,240.0,240.0
mean,0.43,0.21,0.44,0.17,0.14
std,0.04,0.14,0.1,0.14,0.14
min,0.39,-0.12,0.29,-0.12,0.0
25%,0.4,0.11,0.34,0.08,0.0
50%,0.42,0.18,0.44,0.13,0.12
75%,0.45,0.31,0.53,0.23,0.25
max,0.67,0.66,0.73,0.69,0.75


k


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,237.0,237.0,237.0,237.0,237.0
mean,0.43,0.24,0.39,0.17,0.23
std,0.04,0.16,0.11,0.11,0.24
min,0.39,-0.15,0.29,-0.08,0.0
25%,0.41,0.11,0.32,0.1,0.0
50%,0.42,0.23,0.35,0.16,0.11
75%,0.45,0.38,0.44,0.23,0.33
max,0.58,0.55,0.73,0.68,0.89


l


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,61.0,61.0,61.0,61.0,61.0
mean,0.42,0.19,0.46,0.19,0.1
std,0.02,0.16,0.14,0.14,0.13
min,0.39,-0.1,0.29,-0.08,0.0
25%,0.41,0.07,0.33,0.07,0.0
50%,0.42,0.15,0.42,0.17,0.1
75%,0.43,0.34,0.58,0.25,0.2
max,0.5,0.59,0.73,0.5,0.6


m


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,893.0,893.0,893.0,893.0,893.0
mean,0.43,0.19,0.42,0.14,0.14
std,0.04,0.14,0.1,0.13,0.15
min,0.39,-0.2,0.29,-0.15,0.0
25%,0.4,0.09,0.33,0.06,0.0
50%,0.42,0.17,0.42,0.12,0.12
75%,0.45,0.28,0.51,0.19,0.25
max,0.63,0.62,0.73,0.69,0.75


n


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,353.0,353.0,353.0,353.0,353.0
mean,0.43,0.17,0.48,0.16,0.1
std,0.04,0.15,0.09,0.15,0.13
min,0.39,-0.13,0.29,-0.08,0.0
25%,0.4,0.05,0.42,0.07,0.0
50%,0.42,0.14,0.48,0.12,0.0
75%,0.45,0.26,0.54,0.23,0.14
max,0.61,0.57,0.73,0.69,0.57


o


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim,context_gt
count,2819.0,2819.0,2819.0,2819.0,2819.0
mean,0.44,0.19,0.38,0.13,0.08
std,0.04,0.12,0.08,0.11,0.12
min,0.39,-0.14,0.29,-0.15,0.0
25%,0.41,0.1,0.31,0.06,0.0
50%,0.43,0.19,0.35,0.11,0.0
75%,0.46,0.28,0.42,0.18,0.12
max,0.71,0.56,0.73,0.69,0.88


In [60]:
pg_dict[el]

Unnamed: 0,topic2caption_sim,topic2context_sim,image_url,caption,context,concept2caption_sim,concept2context_sim,context_gt
0,0.628946,0.295938,https://upload.wikimedia.org/wikipedia/commons...,Pedestrian trail English: Pedestrian trail,The Blue Water River Walk is a nearly one mile...,0.704843,0.105616,0.125000
1,0.591550,0.071280,https://upload.wikimedia.org/wikipedia/commons...,Walking Trail,"Bagh-e-Jinnah, formerly known as Lawrence Gard...",0.422199,0.072473,0.000000
2,0.591550,0.071280,https://upload.wikimedia.org/wikipedia/commons...,Walking Trail,"Bagh-e-Jinnah, formerly known as Lawrence Gard...",0.422199,0.072473,0.000000
3,0.591550,0.071280,https://upload.wikimedia.org/wikipedia/commons...,Walking Trail,"Bagh-e-Jinnah, formerly known as Lawrence Gard...",0.422199,0.072473,0.000000
4,0.589132,0.471493,https://upload.wikimedia.org/wikipedia/commons...,Hikers awaiting southbound train English: Look...,The Appalachian Trail station is a commuter ra...,0.300308,0.143705,0.250000
...,...,...,...,...,...,...,...,...
2814,0.390792,0.012666,http://upload.wikimedia.org/wikipedia/commons/...,Pedestrian boulevard at night Lietuvių: Šiauli...,Šiauliai is the fourth largest city in Lithuan...,0.534422,0.013801,0.111111
2815,0.390645,0.119095,https://upload.wikimedia.org/wikipedia/commons...,English: A train crossing through Lawachara N...,"Bangladesh Railway Class 2900 is, as of 2020, ...",0.305481,0.108808,0.000000
2816,0.390449,0.086269,https://upload.wikimedia.org/wikipedia/commons...,"Pedestrian bridge as seen from platform, 2008 ...",Metro Nativitas is a station on Line 2 of the ...,0.524615,0.145939,0.000000
2817,0.390249,0.165292,https://upload.wikimedia.org/wikipedia/commons...,Standing Rock in Steedman English: Standing Ro...,Steedman is an unincorporated community in sou...,0.309894,0.136679,0.000000


## select image for retrain from wit 


In [61]:
from io import BytesIO
import pillow_avif
from svglib.svglib import svg2rlg
from reportlab.graphics import renderPM
im_test = '/raid/AISSEL/htest/datasets/wit/images'

In [62]:
from os.path import exists
import base64
im_root_path = '/raid/AISSEL/htest/datasets/wit/images'

In [63]:
# def b64(link):
#     link = link.encode("utf-8")
#     im_path = f'{im_root_path}/{str(base64.b64encode(link))[2:-1]}.jpg'
#     if exists(im_path):
#         return im_path
#     return 
    

In [64]:
def link_to_b64(link):
    im_root_path = '/raid/AISSEL/htest/datasets/wit/images'
    link = link.encode("utf-8")
    _, pos = str(link)[:-1].rsplit('.', 1)
    if pos == 'svg':
        return
    im_path = f'{str(base64.b64encode(link))[2:-1]}'
    if exists(f'{im_root_path}/{im_path}.jpg'):
        return f'{im_path}.jpg'
    return 
    

In [65]:
def b64(fn):
    im_test = '/raid/AISSEL/htest/datasets/wit/images'
    fn = f'{im_test}/{fn}'
    base64_str = None
    if exists(fn):
        try:
            img = Image.open(fn)
            img_buffer = BytesIO()
            img.save(img_buffer, format=img.format)
            byte_data = img_buffer.getvalue()
            base64_str = str(base64.b64encode(byte_data))[2:-1]
        except:
            base64_str = None
            print(fn)

    return base64_str


def remove_special(input_string):
    final_string = ""
    for character in input_string:
        if  character == " ":
            final_string = final_string + character
        else:
            if(character.isalnum()):
                final_string = final_string + character
    return final_string


def remove_special(input_string):
    final_string = ""
    for character in input_string:
        if  character == " ":
            final_string = final_string + character
        else:
            if(character.isalnum()):
                final_string = final_string + character
    return final_string



def create_data(df, count):
#     col = ['uniq_id', 'image_id', 'caption', 'context', 'topic_id', 'labels', 'image']
    col = ['uniq_id', 'image_id', 'caption', 'labels', 'image', 'topic2caption_sim', 'concept2caption_sim']
    t_df = pd.DataFrame(columns=col)
    t_df['caption'] = df['caption'].apply(remove_special)
#     t_df['context'] = df['context'].apply(remove_special)
#     t_df['topic_id'] = df['topic_id']
    t_df['topic2caption_sim'] = df['topic2caption_sim']
    t_df['concept2caption_sim'] = df['concept2caption_sim']
    t_df['topic2context_sim'] = df['topic2context_sim']
    t_df['uniq_id'] = df.index + count
    t_df['image_id'] = df.index + count
    t_df['image'] = df['image_path'].apply(b64)
    t_df['labels'] = ' '
    t_df = t_df[t_df.image.notnull()]
    t_df = t_df.reset_index(drop=True)
    
    return t_df

In [66]:
def df2_df1(df1, df2):
    df = df1.merge(df2, how = 'outer' ,indicator=True).loc[lambda x : x['_merge']=='right_only']
    df = df[selected_col].reset_index(drop=True)
    return df

In [67]:
def df2_df1(df1, df2):
    cond = df2['image_path'].isin(df1['image_path'])
    df2.drop(df2[cond].index, inplace = True)
    return df2

In [68]:
def get_portion(df, p1=75, p2=15, p3=5, p4=5):
    s1 = df.sample(frac = p1/100)
    rest_part_1 = df.drop(s1.index)
    s2 = rest_part_1.sample(frac = p2/(100-p1))
    rest_part_2 = rest_part_1.drop(s2.index)
    s3 = rest_part_2.sample(frac = p3/(100 - p1 - p2))
    s4 = rest_part_2.drop(s3.index)
    return s1, s2, s3, s4

### Consider All topics

In [69]:
missed_topics_dict = dict()
for el in pg_dict:
    missed_topics_dict[el] = pg_dict[el]
    missed_topics_dict[el]['image_path'] = missed_topics_dict[el]['image_url'].apply(link_to_b64)
    missed_topics_dict[el] = missed_topics_dict[el][missed_topics_dict[el].image_path.notnull()]
    missed_topics_dict[el] = missed_topics_dict[el].reset_index(drop=True)

KeyboardInterrupt: 

In [None]:
missed_topics_dict[el]

In [None]:
# selected_col = ['caption', 'context', 'topic_id', 'image_path']
selected_col = ['image_path','topic2caption_sim', 'concept2caption_sim', 'topic2context_sim', 'context_gt', 'caption']
for el in missed_topics_dict:
    missed_topics_dict[el] = missed_topics_dict[el][selected_col]
    missed_topics_dict[el] = missed_topics_dict[el].drop_duplicates(subset=selected_col, keep=False)
    missed_topics_dict[el] = missed_topics_dict[el].reset_index(drop=True)

In [None]:
for el in missed_topics_dict:
#     missed_topics_dict[el] = missed_topics_dict[el][selected_col]
    missed_topics_dict[el] = missed_topics_dict[el].drop_duplicates(keep=False)
    missed_topics_dict[el] = missed_topics_dict[el].reset_index(drop=True)

In [None]:
missed_topics_dict['a']

In [None]:
for el in missed_topics_dict:
    print(el, len(missed_topics_dict[el]))

In [67]:
sorted_key = []
for k in sorted(missed_topics_dict, key=lambda k: len(missed_topics_dict[k])):
    print(k, len(missed_topics_dict[k]))
    sorted_key.append(k)

g 46
l 58
k 223
j 233
h 282
i 282
n 347
f 873
m 883
a 2020
b 2206
d 2398
o 2784
c 3930
e 4700


In [68]:
sorted_key

['g', 'l', 'k', 'j', 'h', 'i', 'n', 'f', 'm', 'a', 'b', 'd', 'o', 'c', 'e']

In [69]:
def df2_df1(df1, df2):
    cond = df2['image_path'].isin(df1['image_path'])
    df2.drop(df2[cond].index, inplace = True)
    return df2

In [70]:
for idx, k in enumerate(sorted_key):
    print(sorted_key[idx:], k)
    for el in sorted_key[idx:]:
        if el == k:
            continue
        print(f'we are going to calc: {el} - {k}')
        missed_topics_dict[el] = df2_df1(missed_topics_dict[k], missed_topics_dict[el])
#     break

['g', 'l', 'k', 'j', 'h', 'i', 'n', 'f', 'm', 'a', 'b', 'd', 'o', 'c', 'e'] g
we are going to calc: l - g
we are going to calc: k - g
we are going to calc: j - g
we are going to calc: h - g
we are going to calc: i - g
we are going to calc: n - g
we are going to calc: f - g
we are going to calc: m - g
we are going to calc: a - g
we are going to calc: b - g
we are going to calc: d - g
we are going to calc: o - g
we are going to calc: c - g
we are going to calc: e - g
['l', 'k', 'j', 'h', 'i', 'n', 'f', 'm', 'a', 'b', 'd', 'o', 'c', 'e'] l
we are going to calc: k - l
we are going to calc: j - l
we are going to calc: h - l
we are going to calc: i - l
we are going to calc: n - l
we are going to calc: f - l
we are going to calc: m - l
we are going to calc: a - l
we are going to calc: b - l
we are going to calc: d - l
we are going to calc: o - l
we are going to calc: c - l
we are going to calc: e - l
['k', 'j', 'h', 'i', 'n', 'f', 'm', 'a', 'b', 'd', 'o', 'c', 'e'] k
we are going to calc: j -

In [71]:
sorted_key = []
for k in sorted(missed_topics_dict, key=lambda k: len(missed_topics_dict[k])):
    print(k, len(missed_topics_dict[k]))
    sorted_key.append(k)

g 46
l 52
j 176
n 193
k 206
i 250
h 266
e 404
m 500
f 644
o 888
c 921
d 1024
b 1052
a 1228


In [72]:
missed_topics_dict[el]

Unnamed: 0,image_path,topic2caption_sim,concept2caption_sim,topic2context_sim,context_gt,caption
244,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.438041,0.297401,0.111466,0.125,1978 Dodge Street Van 1978 Dodge Street Van dr...
258,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.434571,0.294126,-0.049544,0.000,English: lane
273,aHR0cDovL3VwbG9hZC53aWtpbWVkaWEub3JnL3dpa2lwZW...,0.432757,0.331775,0.363471,0.125,Brick Lane
274,aHR0cDovL3VwbG9hZC53aWtpbWVkaWEub3JnL3dpa2lwZW...,0.432757,0.331775,0.127627,0.000,Brick Lane
302,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.426759,0.311529,0.042451,0.000,English: Old Lane from High Street
...,...,...,...,...,...,...
4665,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.392725,0.342140,0.358445,0.200,English: An emergency escape ramp on AH1 road...
4669,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.392393,0.367683,0.319098,0.000,Aerial view of Wellington in 2012 showing the ...
4688,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.390739,0.374302,0.343242,0.100,English: Pedestrian bridge under construction...
4689,aHR0cDovL3VwbG9hZC53aWtpbWVkaWEub3JnL3dpa2lwZW...,0.390648,0.291587,0.235879,0.000,"English: Mechanics Hall, Main Street, Worcest..."


In [73]:
missed_topics_with_images = dict()
count = 0
for el in missed_topics_dict:
    missed_topics_with_images[el] = create_data(missed_topics_dict[el], count)
    count = len(missed_topics_with_images[el])

/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy9mL2ZiLyVEMCU5RiVEMCVCMCVEMCVCRCVEMCVCRSVEMSU4MCVEMCVCMCVEMCVCQyVEMCVCMF8yLmpwZw==.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy9mL2ZiLyVEMCU5RiVEMCVCMCVEMCVCRCVEMCVCRSVEMSU4MCVEMCVCMCVEMCVCQyVEMCVCMF8yLmpwZw==.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvZW4vOS85Zi9TeWRuZXlQYXJhZGVQbGFxdWUuanBn.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy9mL2ZjL1dOWUNfVHJhbnNtaXR0ZXJfUGFya19zdHJlZXRfYXJ0XyUyODM0NDI5MDc2NDExJTI5LmpwZw==.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy9lL2U0LyUyODElMjlXYWl0YXJhX1RyYWluX1N0YXRpb24uanBn.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy8xLzE1L1N0cmVldF8tX3Bhbm9yYW1pb18lMjg5OSUyO

/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy9mL2YyL0JpZ19Db3VudHJ5X2Zvcl9hX0xvbmVfSGlrZXJfJTI4MjAyNTc1OTA0ODQlMjkuanBn.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy8zLzNlL1Jvc2hhbmFyYV9FYnJhaGltX3dhbGtpbmdfb25fdGhlX2NhdHdhbGsuanBn.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvZW4vOC84NS9Db25leV9IYWxsX3JvdW5kYWJvdXQuanBn.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy81LzU5L1BBU01PX0NhcmQuanBn.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy9lL2UxL1BlZGVzdHJpYW5fVmlld19vZl9SZXRyYWN0YWJsZV9VbWJyZWxsYXMuanBn.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvZW4vOS85ZC9QcmF0dF9TdHJlZXRfSGFydGZvcmRfMS5qcGc=.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud

In [74]:
missed_topics_with_images[el]

Unnamed: 0,uniq_id,image_id,caption,labels,image,topic2caption_sim,concept2caption_sim,topic2context_sim
0,195,195,Hiker at Wisner Park in Elmira NY,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.545356,0.324049,0.270464
1,196,196,Hiker at Tompkinsville Park in Staten NY,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.531638,0.352481,0.320010
2,197,197,Hiker at Garfield Square in Pottsville PA,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.530654,0.336373,0.270464
3,198,198,Hiker at Fisher Veterans Park in Lebanon PA,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.524322,0.375746,0.270464
4,200,200,Wilderness Road,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.497447,0.384842,0.393688
...,...,...,...,...,...,...,...,...
876,2961,2961,English Billy Goat C Trail approach from west...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.392449,0.330498,0.280384
877,2963,2963,English I like this photos for its alone agai...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.391950,0.315300,0.296100
878,2967,2967,English A wall of Georgia National Guardsmen ...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.391661,0.315743,0.120356
879,2970,2970,English A train crossing through Lawachara Na...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.390645,0.305481,0.119095


In [75]:
s = 0
for el in missed_topics_with_images:
    missed_topics_with_images[el] = missed_topics_with_images[el].dropna(subset=['caption'])
    missed_topics_with_images[el]['topics']  = [el] * len(missed_topics_with_images[el])
    print(el, len(missed_topics_with_images[el]))
    s = s + len(missed_topics_with_images[el])
print(s)

a 1226
b 1044
c 910
d 1020
e 402
f 638
g 45
h 259
i 249
j 175
k 202
l 51
m 497
n 190
o 881
7789


In [76]:
scol = ['topic2caption_sim', 'concept2caption_sim']
for el in missed_topics_with_images:
    print(el)
    display(missed_topics_with_images[el][scol].describe().round(2))

a


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,1226.0,1226.0
mean,0.43,0.33
std,0.04,0.04
min,0.39,0.29
25%,0.4,0.3
50%,0.42,0.32
75%,0.45,0.35
max,0.63,0.49


b


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,1044.0,1044.0
mean,0.43,0.34
std,0.03,0.04
min,0.39,0.29
25%,0.4,0.3
50%,0.41,0.32
75%,0.44,0.37
max,0.65,0.51


c


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,910.0,910.0
mean,0.42,0.32
std,0.03,0.03
min,0.39,0.29
25%,0.4,0.3
50%,0.41,0.31
75%,0.43,0.34
max,0.58,0.45


d


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,1020.0,1020.0
mean,0.45,0.34
std,0.05,0.04
min,0.39,0.29
25%,0.41,0.31
50%,0.45,0.33
75%,0.46,0.35
max,0.64,0.49


e


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,402.0,402.0
mean,0.42,0.32
std,0.03,0.03
min,0.39,0.29
25%,0.4,0.3
50%,0.41,0.31
75%,0.43,0.33
max,0.56,0.44


f


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,638.0,638.0
mean,0.44,0.33
std,0.04,0.05
min,0.39,0.29
25%,0.41,0.3
50%,0.43,0.32
75%,0.46,0.34
max,0.64,0.55


g


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,45.0,45.0
mean,0.41,0.43
std,0.02,0.14
min,0.39,0.29
25%,0.4,0.32
50%,0.41,0.36
75%,0.42,0.49
max,0.46,0.73


h


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,259.0,259.0
mean,0.44,0.33
std,0.04,0.05
min,0.39,0.29
25%,0.41,0.3
50%,0.43,0.31
75%,0.46,0.34
max,0.58,0.6


i


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,249.0,249.0
mean,0.49,0.34
std,0.08,0.05
min,0.39,0.29
25%,0.42,0.3
50%,0.47,0.33
75%,0.58,0.34
max,0.65,0.65


j


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,175.0,175.0
mean,0.43,0.41
std,0.04,0.09
min,0.39,0.29
25%,0.4,0.33
50%,0.42,0.39
75%,0.44,0.47
max,0.67,0.63


k


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,202.0,202.0
mean,0.43,0.38
std,0.04,0.08
min,0.39,0.29
25%,0.41,0.32
50%,0.42,0.35
75%,0.45,0.4
max,0.58,0.66


l


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,51.0,51.0
mean,0.42,0.44
std,0.02,0.12
min,0.39,0.29
25%,0.4,0.33
50%,0.41,0.39
75%,0.43,0.56
max,0.5,0.65


m


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,497.0,497.0
mean,0.43,0.36
std,0.04,0.06
min,0.39,0.29
25%,0.4,0.31
50%,0.42,0.34
75%,0.45,0.41
max,0.63,0.53


n


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,190.0,190.0
mean,0.42,0.44
std,0.03,0.07
min,0.39,0.29
25%,0.4,0.38
50%,0.41,0.44
75%,0.43,0.49
max,0.52,0.63


o


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,881.0,881.0
mean,0.42,0.33
std,0.03,0.03
min,0.39,0.29
25%,0.4,0.3
50%,0.42,0.32
75%,0.44,0.34
max,0.66,0.5


In [77]:
df_2 = pd.DataFrame()
for el in missed_topics_with_images:
    df_2 = pd.concat([df_2, missed_topics_with_images[el]], ignore_index=True)
# df_
display(df_2[scol].describe().round(2))

Unnamed: 0,topic2caption_sim,concept2caption_sim
count,7789.0,7789.0
mean,0.43,0.34
std,0.04,0.05
min,0.39,0.29
25%,0.4,0.3
50%,0.42,0.32
75%,0.45,0.35
max,0.67,0.73


## missed one for sim

In [121]:
missed_ones = ['a', 'f', 'g', 'i', 'j', 'k', 'l', 'm', 'n', 'o']

In [122]:
for el in missed_ones:
    print(el)

    display(missed_topics_with_images[el][['topic2caption_sim', 'concept2caption_sim']].describe().round(2))

a


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,1226.0,1226.0
mean,0.43,0.33
std,0.04,0.04
min,0.39,0.29
25%,0.4,0.3
50%,0.42,0.32
75%,0.45,0.35
max,0.63,0.49


f


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,638.0,638.0
mean,0.44,0.33
std,0.04,0.05
min,0.39,0.29
25%,0.41,0.3
50%,0.43,0.32
75%,0.46,0.34
max,0.64,0.55


g


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,45.0,45.0
mean,0.41,0.43
std,0.02,0.14
min,0.39,0.29
25%,0.4,0.32
50%,0.41,0.36
75%,0.42,0.49
max,0.46,0.73


i


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,249.0,249.0
mean,0.49,0.34
std,0.08,0.05
min,0.39,0.29
25%,0.42,0.3
50%,0.47,0.33
75%,0.58,0.34
max,0.65,0.65


j


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,175.0,175.0
mean,0.43,0.41
std,0.04,0.09
min,0.39,0.29
25%,0.4,0.33
50%,0.42,0.39
75%,0.44,0.47
max,0.67,0.63


k


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,202.0,202.0
mean,0.43,0.38
std,0.04,0.08
min,0.39,0.29
25%,0.41,0.32
50%,0.42,0.35
75%,0.45,0.4
max,0.58,0.66


l


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,51.0,51.0
mean,0.42,0.44
std,0.02,0.12
min,0.39,0.29
25%,0.4,0.33
50%,0.41,0.39
75%,0.43,0.56
max,0.5,0.65


m


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,497.0,497.0
mean,0.43,0.36
std,0.04,0.06
min,0.39,0.29
25%,0.4,0.31
50%,0.42,0.34
75%,0.45,0.41
max,0.63,0.53


n


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,190.0,190.0
mean,0.42,0.44
std,0.03,0.07
min,0.39,0.29
25%,0.4,0.38
50%,0.41,0.44
75%,0.43,0.49
max,0.52,0.63


o


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,881.0,881.0
mean,0.42,0.33
std,0.03,0.03
min,0.39,0.29
25%,0.4,0.3
50%,0.42,0.32
75%,0.44,0.34
max,0.66,0.5


In [123]:
df_l = pd.DataFrame()
for el in missed_ones:
    df_l = pd.concat([df_l, missed_topics_with_images[el]], ignore_index=True)
# df_
display(df_l[['topic2caption_sim', 'concept2caption_sim']].describe().round(2))

Unnamed: 0,topic2caption_sim,concept2caption_sim
count,4154.0,4154.0
mean,0.43,0.35
std,0.04,0.06
min,0.39,0.29
25%,0.4,0.3
50%,0.42,0.32
75%,0.45,0.36
max,0.67,0.73


### based on cider human

In [78]:
missed_ones = ['f', 'g', 'h', 'j', 'l','n']


In [79]:
for el in missed_ones:
    print(el)

    display(missed_topics_with_images[el][['topic2caption_sim', 'concept2caption_sim']].describe().round(2))

f


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,638.0,638.0
mean,0.44,0.33
std,0.04,0.05
min,0.39,0.29
25%,0.41,0.3
50%,0.43,0.32
75%,0.46,0.34
max,0.64,0.55


g


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,45.0,45.0
mean,0.41,0.43
std,0.02,0.14
min,0.39,0.29
25%,0.4,0.32
50%,0.41,0.36
75%,0.42,0.49
max,0.46,0.73


h


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,259.0,259.0
mean,0.44,0.33
std,0.04,0.05
min,0.39,0.29
25%,0.41,0.3
50%,0.43,0.31
75%,0.46,0.34
max,0.58,0.6


j


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,175.0,175.0
mean,0.43,0.41
std,0.04,0.09
min,0.39,0.29
25%,0.4,0.33
50%,0.42,0.39
75%,0.44,0.47
max,0.67,0.63


l


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,51.0,51.0
mean,0.42,0.44
std,0.02,0.12
min,0.39,0.29
25%,0.4,0.33
50%,0.41,0.39
75%,0.43,0.56
max,0.5,0.65


n


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,190.0,190.0
mean,0.42,0.44
std,0.03,0.07
min,0.39,0.29
25%,0.4,0.38
50%,0.41,0.44
75%,0.43,0.49
max,0.52,0.63


In [80]:
df_l = pd.DataFrame()
for el in missed_ones:
    df_l = pd.concat([df_l, missed_topics_with_images[el]], ignore_index=True)
# df_
display(df_l[['topic2caption_sim', 'concept2caption_sim']].describe().round(2))

Unnamed: 0,topic2caption_sim,concept2caption_sim
count,1358.0,1358.0
mean,0.43,0.36
std,0.04,0.08
min,0.39,0.29
25%,0.4,0.3
50%,0.42,0.33
75%,0.45,0.4
max,0.67,0.73


### CIDEr web

In [89]:
missed_ones = ['e', 'f', 'g', 'h', 'j', 'm','n']


In [90]:
for el in missed_ones:
    print(el)

    display(missed_topics_with_images[el][['topic2caption_sim', 'concept2caption_sim']].describe().round(2))

e


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,402.0,402.0
mean,0.42,0.32
std,0.03,0.03
min,0.39,0.29
25%,0.4,0.3
50%,0.41,0.31
75%,0.43,0.33
max,0.56,0.44


f


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,638.0,638.0
mean,0.44,0.33
std,0.04,0.05
min,0.39,0.29
25%,0.41,0.3
50%,0.43,0.32
75%,0.46,0.34
max,0.64,0.55


g


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,45.0,45.0
mean,0.41,0.43
std,0.02,0.14
min,0.39,0.29
25%,0.4,0.32
50%,0.41,0.36
75%,0.42,0.49
max,0.46,0.73


h


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,259.0,259.0
mean,0.44,0.33
std,0.04,0.05
min,0.39,0.29
25%,0.41,0.3
50%,0.43,0.31
75%,0.46,0.34
max,0.58,0.6


j


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,175.0,175.0
mean,0.43,0.41
std,0.04,0.09
min,0.39,0.29
25%,0.4,0.33
50%,0.42,0.39
75%,0.44,0.47
max,0.67,0.63


m


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,497.0,497.0
mean,0.43,0.36
std,0.04,0.06
min,0.39,0.29
25%,0.4,0.31
50%,0.42,0.34
75%,0.45,0.41
max,0.63,0.53


n


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,190.0,190.0
mean,0.42,0.44
std,0.03,0.07
min,0.39,0.29
25%,0.4,0.38
50%,0.41,0.44
75%,0.43,0.49
max,0.52,0.63


In [91]:
df_l = pd.DataFrame()
for el in missed_ones:
    df_l = pd.concat([df_l, missed_topics_with_images[el]], ignore_index=True)
# df_
display(df_l[['topic2caption_sim', 'concept2caption_sim']].describe().round(2))

Unnamed: 0,topic2caption_sim,concept2caption_sim
count,2206.0,2206.0
mean,0.43,0.35
std,0.04,0.07
min,0.39,0.29
25%,0.4,0.3
50%,0.42,0.32
75%,0.45,0.38
max,0.67,0.73


In [81]:
def get_portion(df, p1=72, p2=16, p3=12):
    s1 = df.sample(frac = p1/100)
    rest_part_1 = df.drop(s1.index)
    s2 = rest_part_1.sample(frac = p2/(100-p1))
    s3 = rest_part_1.drop(s2.index)
    return s1, s2, s3

In [92]:
name_lsit = ['stage1_train', 'stage2_train', 'val']
por_dict = dict()
data_dict = dict()
for n in name_lsit:
    data_dict[n] = pd.DataFrame() 
for el in missed_ones:
    s1, s2, v = get_portion(missed_topics_with_images[el])
    
    s1 = s1.reset_index(drop=True)
    data_dict['stage1_train'] = pd.concat([data_dict['stage1_train'], s1], ignore_index=True)
    
    s2 = s2.reset_index(drop=True)
    data_dict['stage2_train'] = pd.concat([data_dict['stage2_train'], s2], ignore_index=True)
    
    v = v.reset_index(drop=True)
    data_dict['val'] = pd.concat([data_dict['val'], v], ignore_index=True)
#     data_dict[el] = por_dict

In [93]:
len(data_dict['stage1_train'])

1587

In [94]:
len(data_dict['stage2_train'])

353

In [95]:
len(data_dict['val'])

266

In [96]:
names = ['uniq_id', 'image_id', 'caption', 'labels', 'image']

## SAve Sim

In [130]:
! mkdir -p /raid/AISSEL/htest/datasets/ped_data/wit/missed_q50_over_avg
saved_path = '/raid/AISSEL/htest/datasets/ped_data/wit/missed_q50_over_avg'
for el in data_dict:
    name = f'caption_{el}.tsv'
    df_s = data_dict[el][names]
    df_s.to_csv(f'{saved_path}/{name}', sep="\t", index=False, header=False)

    print(name)

caption_stage1_train.tsv
caption_stage2_train.tsv
caption_val.tsv


## SAve CIDEr human

In [87]:
! mkdir -p /raid/AISSEL/htest/datasets/ped_data/wit/missed_q50_over_avg_cider
saved_path = '/raid/AISSEL/htest/datasets/ped_data/wit/missed_q50_over_avg_cider'
for el in data_dict:
    name = f'caption_{el}.tsv'
    df_s = data_dict[el][names]
    df_s.to_csv(f'{saved_path}/{name}', sep="\t", index=False, header=False)

    print(name)

caption_stage1_train.tsv
caption_stage2_train.tsv
caption_val.tsv


## SAve CIDEr web

In [97]:
! mkdir -p /raid/AISSEL/htest/datasets/ped_data/wit/missed_q50_over_avg_cider_web
saved_path = '/raid/AISSEL/htest/datasets/ped_data/wit/missed_q50_over_avg_cider_web'
for el in data_dict:
    name = f'caption_{el}.tsv'
    df_s = data_dict[el][names]
    df_s.to_csv(f'{saved_path}/{name}', sep="\t", index=False, header=False)

    print(name)

caption_stage1_train.tsv
caption_stage2_train.tsv
caption_val.tsv


In [87]:
len(data_dict['stage2_train'])

611

### get portion for dataset 
```stage 1: 75%, stage 2: %15, test: %5, validation % 5```

In [84]:
name_lsit = ['stage1_train', 'stage2_train', 'val', 'test']
por_dict = dict()
data_dict = dict()
for n in name_lsit:
    data_dict[n] = pd.DataFrame() 
for el in missed_topics_with_images:
    s1, s2, v, t = get_portion(missed_topics_with_images[el])
    
    s1 = s1.reset_index(drop=True)
    data_dict['stage1_train'] = pd.concat([data_dict['stage1_train'], s1], ignore_index=True)
    
    s2 = s2.reset_index(drop=True)
    data_dict['stage2_train'] = pd.concat([data_dict['stage2_train'], s2], ignore_index=True)
    
    t = t.reset_index(drop=True)
    data_dict['test'] = pd.concat([data_dict['test'], t], ignore_index=True)
    
    v = v.reset_index(drop=True)
    data_dict['val'] = pd.concat([data_dict['val'], v], ignore_index=True)
#     data_dict[el] = por_dict

In [105]:
data_dict = dict()

In [106]:
def get_portion(df, p1=75, p2=15, p3=5, p4=5):
    s1 = df.sample(frac = p1/100)
    rest_part_1 = df.drop(s1.index)
    s2 = rest_part_1.sample(frac = p2/(100-p1))
    rest_part_2 = rest_part_1.drop(s2.index)
    s3 = rest_part_2.sample(frac = p3/(100 - p1 - p2))
    s4 = rest_part_2.drop(s3.index)
    return s1, s2, s3, s4

# Create Random Dataset

In [71]:
# df_p = csim_df[csim_df['concept_caption_score'] >= csim_df['concept_caption_score'].mean()]
df_p = csim_df[csim_df['concept_caption_score'] >= 0.134]
df_p = df_p.reset_index(drop=True)
df_p

Unnamed: 0,image_url,caption,context,concept_caption_score,concept_context_score,196_caption_score,412_caption_score,172_caption_score,363_caption_score,198_caption_score,...,137_context_score,441_context_score,192_context_score,20_context_score,296_context_score,144_context_score,12_context_score,224_context_score,250_context_score,376_context_score
0,https://upload.wikimedia.org/wikipedia/commons...,Menachem Mendel Schneerson English: Menachem M...,1994 was a common year starting on Saturday of...,0.166272,0.094395,0.078204,0.065567,0.080967,0.041644,0.075154,...,-0.012260,0.019606,0.024275,0.076669,-0.067856,0.096434,-0.048101,-0.041107,0.026869,0.011452
1,https://upload.wikimedia.org/wikipedia/commons...,English: Image cropped from a baseball card o...,"Donald Robert ""Duffy"" Dyer is an American form...",0.162247,-0.034380,0.052320,0.042743,-0.008497,0.038228,0.071760,...,-0.118144,-0.099278,-0.026665,-0.015405,-0.002073,-0.119949,-0.022791,-0.088388,0.176188,0.028713
2,https://upload.wikimedia.org/wikipedia/commons...,"The special theory of relativity, formulated i...","In relativistic physics, a velocity-addition f...",0.142336,0.106813,0.129890,0.102847,0.105520,0.096102,0.069470,...,-0.017552,0.020111,0.036183,0.007514,0.020004,0.120406,0.044195,0.014316,-0.009373,0.032084
3,https://upload.wikimedia.org/wikipedia/commons...,English: Randy Stonehill Signature,Randall Evan Stonehill is an American singer a...,0.223324,0.092803,0.123127,0.087765,0.120790,0.042908,0.068349,...,0.108193,-0.031020,-0.006661,0.128765,0.160655,-0.025057,0.193492,-0.012994,0.125541,0.050723
4,https://upload.wikimedia.org/wikipedia/commons...,English: United States Senator William Proxmi...,Edward William Proxmire was an American politi...,0.171389,0.067607,0.028057,0.090903,0.127647,0.020825,0.029832,...,0.066258,-0.010974,-0.011341,0.085126,-0.066214,-0.029877,0.043140,-0.093784,0.008349,-0.041261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1355833,https://upload.wikimedia.org/wikipedia/commons...,Massachusetts Bay Transportation Authority 170...,The Urban Transportation Development Corporati...,0.147268,0.091816,0.183336,0.155670,0.233472,0.025314,0.184065,...,0.122498,0.018380,0.130792,-0.011854,0.102683,0.041830,0.039997,0.298343,0.100381,0.069348
1355834,https://upload.wikimedia.org/wikipedia/commons...,A map showing the path of State Highway 22 in...,State Highway 22 is a state highway in Oklahom...,0.207656,0.078112,0.255292,0.206986,0.297474,0.104774,0.176442,...,0.217521,-0.047160,0.008157,0.072613,0.177827,-0.073539,0.180896,0.140562,-0.008423,0.056389
1355835,https://upload.wikimedia.org/wikipedia/commons...,English: CIPET,Central Institute of Petrochemical Engineering...,0.142362,-0.005783,0.031377,0.048821,0.112177,-0.011580,0.040081,...,0.087363,-0.050901,0.013602,0.041040,-0.003768,-0.039447,0.023529,0.135025,-0.000438,-0.107296
1355836,https://upload.wikimedia.org/wikipedia/commons...,"English: Jia Jingde, politician of China. 中文:...",This is a list of Vice Presidents of the Exami...,0.156600,0.093868,-0.021200,0.070536,0.045870,-0.022279,0.064005,...,0.064688,-0.003800,0.079544,0.097345,0.022783,0.035481,-0.009618,0.142356,0.097828,0.056724


In [81]:
df_dict = dict()
for col in caption_score_list:
    num, _ = col.split('_', 1)
    c_list = [col] + [f'{num}_context_score'] + im_c + cap_s + con_s
    df = df_p[c_list]
    
    df = df[df[col] > df[col].quantile(0.75)]
    
#     df = df[df[col] >= 0.39]
#     df = df[df['concept_caption_score'] >= 0.29]

    df = df.sort_values(by=[col], ascending=False)
    df_dict[col] = df

In [82]:
df_dict[col]

Unnamed: 0,376_caption_score,376_context_score,image_url,caption,context,concept_caption_score,concept_context_score
869924,0.567895,0.407775,https://upload.wikimedia.org/wikipedia/commons...,English: Recreation of Minnie Mouse's signatu...,This is a list of characters that have appeare...,0.189813,0.156888
788798,0.567709,0.407775,https://upload.wikimedia.org/wikipedia/commons...,English: Recreation of Mickey Mouse's signatu...,This is a list of characters that have appeare...,0.180582,0.156888
841795,0.543113,0.423695,https://upload.wikimedia.org/wikipedia/commons...,Cars 623 and 717 passing on the Red Car Troll...,"The Red Car Trolley is a 1,000 mm metre gauge ...",0.200099,0.132485
667637,0.537630,0.350208,https://upload.wikimedia.org/wikipedia/commons...,Disney villains at Disneyland's Mickey's Hallo...,Mickey's Halloween Party was an annual Hallowe...,0.199137,0.043375
184343,0.533986,0.539802,https://upload.wikimedia.org/wikipedia/commons...,Theatrical release poster English: Poster for ...,Trolley Troubles is a 1927 animated short subj...,0.172679,0.117095
...,...,...,...,...,...,...,...
97023,0.152664,0.078346,https://upload.wikimedia.org/wikipedia/commons...,English: Adam Walker,Adam Walker is a Scotland international rugby ...,0.305401,0.118617
70261,0.152664,0.014011,https://upload.wikimedia.org/wikipedia/commons...,English: Ishqi-Mari statue (front),"Ishqi-Mari or Ishgi-Mari, previously read Lamg...",0.242632,-0.032017
1154967,0.152663,0.061022,https://upload.wikimedia.org/wikipedia/commons...,Metropolitan Police Ford F450 Jankel Guardian ...,The Jankel group of companies has been continu...,0.225747,0.043164
97914,0.152663,0.023257,http://upload.wikimedia.org/wikipedia/commons/...,Entrance Deutsch: Otterzentrum English: Entran...,The Hankensbüttel Otter Centre is a nature exp...,0.173663,-0.005479


In [83]:
for el in df_dict:
    t,_ = el.split('_', 1)
#     print(t)
    df_dict[el].columns = [f'{t}_caption_sim', f'{t}_context_sim', 'image_url',
                          'caption', 'context', f'ped_caption_sim', f'ped_context_sim']
    display(df_dict[el].describe().round(2))
    print('\n')

Unnamed: 0,196_caption_sim,196_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.18,0.1,0.2,0.08
std,0.05,0.09,0.05,0.07
min,0.13,-0.21,0.13,-0.23
25%,0.14,0.03,0.16,0.04
50%,0.17,0.09,0.19,0.08
75%,0.2,0.16,0.22,0.12
max,0.76,0.75,0.73,0.69






Unnamed: 0,412_caption_sim,412_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.17,0.08,0.2,0.08
std,0.04,0.08,0.05,0.07
min,0.12,-0.24,0.13,-0.23
25%,0.14,0.03,0.17,0.04
50%,0.16,0.08,0.19,0.08
75%,0.18,0.13,0.23,0.13
max,0.61,0.65,0.73,0.69






Unnamed: 0,172_caption_sim,172_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.2,0.09,0.2,0.08
std,0.05,0.09,0.05,0.07
min,0.15,-0.24,0.13,-0.23
25%,0.16,0.03,0.16,0.03
50%,0.19,0.09,0.19,0.08
75%,0.22,0.15,0.22,0.12
max,0.61,0.56,0.73,0.69






Unnamed: 0,363_caption_sim,363_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.13,0.05,0.2,0.07
std,0.03,0.07,0.05,0.07
min,0.09,-0.23,0.13,-0.23
25%,0.1,0.0,0.16,0.03
50%,0.12,0.05,0.19,0.07
75%,0.14,0.09,0.22,0.12
max,0.51,0.5,0.73,0.69






Unnamed: 0,198_caption_sim,198_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.19,0.11,0.19,0.08
std,0.04,0.08,0.05,0.07
min,0.14,-0.24,0.13,-0.2
25%,0.15,0.05,0.16,0.03
50%,0.17,0.11,0.19,0.08
75%,0.21,0.16,0.22,0.12
max,0.61,0.61,0.73,0.69






Unnamed: 0,114_caption_sim,114_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.18,0.09,0.19,0.07
std,0.04,0.08,0.04,0.07
min,0.14,-0.26,0.13,-0.23
25%,0.15,0.04,0.16,0.02
50%,0.17,0.09,0.19,0.07
75%,0.2,0.14,0.22,0.12
max,0.71,0.64,0.73,0.69






Unnamed: 0,419_caption_sim,419_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.22,0.11,0.2,0.08
std,0.06,0.09,0.05,0.07
min,0.16,-0.25,0.13,-0.2
25%,0.18,0.04,0.16,0.04
50%,0.2,0.1,0.19,0.08
75%,0.25,0.17,0.22,0.12
max,0.61,0.61,0.73,0.69






Unnamed: 0,294_caption_sim,294_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.17,0.08,0.2,0.08
std,0.04,0.08,0.05,0.07
min,0.12,-0.22,0.13,-0.23
25%,0.14,0.02,0.17,0.04
50%,0.15,0.07,0.19,0.08
75%,0.18,0.12,0.23,0.13
max,0.61,0.61,0.73,0.69






Unnamed: 0,388_caption_sim,388_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.14,0.06,0.19,0.08
std,0.03,0.07,0.05,0.07
min,0.1,-0.26,0.13,-0.23
25%,0.12,0.02,0.16,0.03
50%,0.13,0.07,0.18,0.08
75%,0.16,0.11,0.22,0.12
max,0.54,0.5,0.73,0.69






Unnamed: 0,314_caption_sim,314_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.2,0.1,0.19,0.08
std,0.05,0.09,0.05,0.07
min,0.15,-0.23,0.13,-0.23
25%,0.17,0.04,0.16,0.03
50%,0.19,0.09,0.19,0.08
75%,0.22,0.15,0.22,0.12
max,0.68,0.67,0.73,0.69






Unnamed: 0,98_caption_sim,98_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.22,0.1,0.2,0.08
std,0.05,0.09,0.05,0.07
min,0.16,-0.19,0.13,-0.23
25%,0.18,0.03,0.16,0.03
50%,0.2,0.09,0.19,0.08
75%,0.24,0.15,0.22,0.12
max,0.73,0.64,0.73,0.69






Unnamed: 0,80_caption_sim,80_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.14,0.07,0.2,0.08
std,0.04,0.08,0.05,0.07
min,0.1,-0.22,0.13,-0.23
25%,0.11,0.02,0.16,0.03
50%,0.13,0.07,0.19,0.08
75%,0.16,0.12,0.22,0.12
max,0.51,0.63,0.73,0.69






Unnamed: 0,134_caption_sim,134_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.24,0.12,0.19,0.08
std,0.05,0.1,0.05,0.07
min,0.18,-0.31,0.13,-0.23
25%,0.2,0.05,0.16,0.03
50%,0.23,0.11,0.19,0.07
75%,0.27,0.17,0.22,0.12
max,0.66,0.65,0.73,0.69






Unnamed: 0,432_caption_sim,432_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.18,0.06,0.19,0.07
std,0.04,0.08,0.05,0.07
min,0.13,-0.25,0.13,-0.23
25%,0.15,0.0,0.16,0.03
50%,0.17,0.05,0.18,0.07
75%,0.2,0.1,0.22,0.11
max,0.53,0.61,0.73,0.69






Unnamed: 0,394_caption_sim,394_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.25,0.13,0.19,0.08
std,0.05,0.09,0.05,0.07
min,0.19,-0.23,0.13,-0.2
25%,0.21,0.07,0.16,0.03
50%,0.24,0.13,0.18,0.07
75%,0.27,0.19,0.22,0.12
max,0.68,0.62,0.73,0.69






Unnamed: 0,53_caption_sim,53_context_sim,ped_caption_sim,ped_context_sim
count,338959.0,338959.0,338959.0,338959.0
mean,0.16,0.07,0.2,0.08
std,0.04,0.08,0.05,0.07
min,0.11,-0.27,0.13,-0.22
25%,0.13,0.02,0.16,0.03
50%,0.15,0.07,0.19,0.07
75%,0.17,0.12,0.22,0.12
max,0.57,0.57,0.73,0.69






Unnamed: 0,56_caption_sim,56_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.22,0.12,0.19,0.08
std,0.06,0.1,0.05,0.07
min,0.15,-0.25,0.13,-0.22
25%,0.17,0.04,0.16,0.04
50%,0.2,0.11,0.19,0.08
75%,0.24,0.18,0.22,0.12
max,0.67,0.61,0.73,0.69






Unnamed: 0,150_caption_sim,150_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.19,0.09,0.2,0.08
std,0.05,0.09,0.05,0.07
min,0.14,-0.24,0.13,-0.23
25%,0.15,0.03,0.16,0.03
50%,0.18,0.09,0.19,0.08
75%,0.21,0.15,0.22,0.12
max,0.64,0.55,0.73,0.69






Unnamed: 0,278_caption_sim,278_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.26,0.12,0.19,0.08
std,0.05,0.09,0.05,0.07
min,0.2,-0.23,0.13,-0.23
25%,0.22,0.06,0.16,0.03
50%,0.25,0.12,0.19,0.08
75%,0.28,0.18,0.22,0.12
max,0.71,0.6,0.73,0.69






Unnamed: 0,-1_caption_sim,-1_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.24,0.09,0.2,0.07
std,0.04,0.08,0.05,0.07
min,0.19,-0.23,0.13,-0.2
25%,0.21,0.04,0.16,0.03
50%,0.23,0.09,0.19,0.07
75%,0.26,0.14,0.22,0.11
max,0.57,0.46,0.73,0.69






Unnamed: 0,43_caption_sim,43_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.18,0.06,0.2,0.07
std,0.04,0.08,0.05,0.07
min,0.14,-0.25,0.13,-0.23
25%,0.16,0.01,0.17,0.03
50%,0.17,0.06,0.19,0.07
75%,0.2,0.11,0.23,0.12
max,0.64,0.58,0.73,0.69






Unnamed: 0,265_caption_sim,265_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.16,0.09,0.19,0.08
std,0.04,0.07,0.05,0.07
min,0.12,-0.22,0.13,-0.23
25%,0.13,0.04,0.16,0.03
50%,0.15,0.09,0.19,0.08
75%,0.18,0.14,0.22,0.12
max,0.56,0.57,0.73,0.69






Unnamed: 0,345_caption_sim,345_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.16,0.09,0.19,0.08
std,0.04,0.08,0.05,0.07
min,0.11,-0.22,0.13,-0.2
25%,0.13,0.03,0.16,0.03
50%,0.15,0.09,0.19,0.08
75%,0.18,0.15,0.22,0.12
max,0.67,0.66,0.73,0.69






Unnamed: 0,378_caption_sim,378_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.23,0.13,0.19,0.07
std,0.05,0.1,0.04,0.07
min,0.17,-0.31,0.13,-0.2
25%,0.19,0.06,0.16,0.03
50%,0.22,0.13,0.18,0.07
75%,0.26,0.19,0.21,0.12
max,0.63,0.64,0.73,0.69






Unnamed: 0,11_caption_sim,11_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.2,0.11,0.19,0.08
std,0.06,0.1,0.05,0.07
min,0.14,-0.26,0.13,-0.23
25%,0.16,0.04,0.16,0.03
50%,0.19,0.1,0.18,0.08
75%,0.23,0.16,0.22,0.12
max,0.61,0.57,0.73,0.69






Unnamed: 0,331_caption_sim,331_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.24,0.13,0.19,0.07
std,0.05,0.09,0.04,0.07
min,0.18,-0.27,0.13,-0.23
25%,0.2,0.07,0.16,0.03
50%,0.23,0.13,0.18,0.07
75%,0.27,0.18,0.21,0.11
max,0.61,0.55,0.73,0.69






Unnamed: 0,202_caption_sim,202_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.15,0.08,0.2,0.08
std,0.05,0.08,0.05,0.07
min,0.1,-0.22,0.13,-0.23
25%,0.11,0.02,0.16,0.03
50%,0.13,0.08,0.19,0.08
75%,0.17,0.14,0.22,0.12
max,0.62,0.61,0.73,0.69






Unnamed: 0,169_caption_sim,169_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.19,0.09,0.19,0.07
std,0.05,0.08,0.05,0.07
min,0.14,-0.24,0.13,-0.22
25%,0.15,0.04,0.16,0.03
50%,0.18,0.09,0.18,0.07
75%,0.21,0.14,0.22,0.11
max,0.62,0.62,0.73,0.69






Unnamed: 0,327_caption_sim,327_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.25,0.11,0.19,0.07
std,0.06,0.1,0.04,0.07
min,0.18,-0.22,0.13,-0.23
25%,0.2,0.04,0.15,0.02
50%,0.23,0.1,0.18,0.06
75%,0.27,0.16,0.21,0.11
max,0.63,0.58,0.73,0.69






Unnamed: 0,309_caption_sim,309_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.2,0.12,0.19,0.08
std,0.06,0.1,0.05,0.07
min,0.13,-0.25,0.13,-0.23
25%,0.15,0.05,0.16,0.03
50%,0.18,0.11,0.18,0.08
75%,0.22,0.18,0.21,0.12
max,0.6,0.61,0.73,0.69






Unnamed: 0,68_caption_sim,68_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.18,0.09,0.19,0.07
std,0.05,0.09,0.04,0.07
min,0.13,-0.28,0.13,-0.23
25%,0.14,0.03,0.16,0.03
50%,0.17,0.08,0.18,0.07
75%,0.2,0.15,0.21,0.12
max,0.57,0.53,0.73,0.69






Unnamed: 0,216_caption_sim,216_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.22,0.12,0.19,0.07
std,0.05,0.09,0.05,0.07
min,0.16,-0.27,0.13,-0.23
25%,0.18,0.06,0.16,0.03
50%,0.21,0.12,0.18,0.07
75%,0.25,0.18,0.22,0.12
max,0.63,0.63,0.73,0.69






Unnamed: 0,40_caption_sim,40_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.2,0.09,0.2,0.08
std,0.04,0.08,0.05,0.07
min,0.16,-0.24,0.13,-0.23
25%,0.18,0.04,0.17,0.03
50%,0.19,0.09,0.19,0.07
75%,0.22,0.14,0.23,0.12
max,0.61,0.59,0.73,0.69






Unnamed: 0,99_caption_sim,99_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.16,0.07,0.19,0.07
std,0.04,0.08,0.04,0.07
min,0.12,-0.24,0.13,-0.22
25%,0.13,0.02,0.16,0.03
50%,0.15,0.07,0.18,0.07
75%,0.18,0.12,0.22,0.12
max,0.57,0.51,0.73,0.69






Unnamed: 0,426_caption_sim,426_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.15,0.07,0.19,0.08
std,0.04,0.07,0.05,0.07
min,0.11,-0.23,0.13,-0.23
25%,0.12,0.02,0.16,0.03
50%,0.14,0.07,0.18,0.07
75%,0.16,0.11,0.22,0.12
max,0.48,0.52,0.73,0.69






Unnamed: 0,344_caption_sim,344_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.22,0.1,0.19,0.07
std,0.04,0.08,0.05,0.07
min,0.17,-0.27,0.13,-0.23
25%,0.19,0.04,0.15,0.02
50%,0.21,0.1,0.18,0.07
75%,0.24,0.15,0.21,0.11
max,0.58,0.52,0.73,0.69






Unnamed: 0,47_caption_sim,47_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.26,0.14,0.19,0.08
std,0.06,0.11,0.04,0.07
min,0.19,-0.26,0.13,-0.2
25%,0.22,0.07,0.16,0.04
50%,0.25,0.14,0.18,0.08
75%,0.29,0.21,0.21,0.12
max,0.74,0.74,0.73,0.69






Unnamed: 0,8_caption_sim,8_context_sim,ped_caption_sim,ped_context_sim
count,338959.0,338959.0,338959.0,338959.0
mean,0.13,0.07,0.19,0.07
std,0.04,0.08,0.05,0.07
min,0.09,-0.25,0.13,-0.22
25%,0.1,0.02,0.16,0.03
50%,0.12,0.07,0.19,0.07
75%,0.15,0.12,0.22,0.12
max,0.58,0.59,0.73,0.69






Unnamed: 0,416_caption_sim,416_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.17,0.07,0.19,0.07
std,0.03,0.06,0.04,0.07
min,0.13,-0.21,0.13,-0.23
25%,0.14,0.02,0.16,0.02
50%,0.16,0.07,0.18,0.07
75%,0.18,0.11,0.22,0.11
max,0.66,0.55,0.73,0.69






Unnamed: 0,112_caption_sim,112_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.18,0.09,0.19,0.08
std,0.06,0.11,0.05,0.07
min,0.12,-0.23,0.13,-0.23
25%,0.14,0.01,0.16,0.04
50%,0.17,0.08,0.19,0.08
75%,0.21,0.16,0.22,0.12
max,0.62,0.57,0.73,0.69






Unnamed: 0,137_caption_sim,137_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.22,0.1,0.2,0.08
std,0.05,0.09,0.05,0.07
min,0.17,-0.26,0.13,-0.23
25%,0.19,0.04,0.16,0.03
50%,0.21,0.1,0.19,0.07
75%,0.24,0.15,0.22,0.12
max,0.61,0.55,0.73,0.69






Unnamed: 0,441_caption_sim,441_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.17,0.05,0.19,0.07
std,0.04,0.08,0.04,0.07
min,0.13,-0.33,0.13,-0.23
25%,0.14,-0.0,0.16,0.02
50%,0.16,0.05,0.18,0.07
75%,0.19,0.1,0.21,0.11
max,0.57,0.56,0.73,0.69






Unnamed: 0,192_caption_sim,192_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.13,0.05,0.19,0.07
std,0.03,0.06,0.04,0.07
min,0.09,-0.22,0.13,-0.23
25%,0.1,0.01,0.15,0.02
50%,0.12,0.05,0.18,0.07
75%,0.14,0.09,0.21,0.11
max,0.48,0.54,0.73,0.69






Unnamed: 0,20_caption_sim,20_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.16,0.06,0.19,0.07
std,0.04,0.07,0.05,0.07
min,0.12,-0.24,0.13,-0.23
25%,0.13,0.02,0.16,0.02
50%,0.15,0.06,0.19,0.07
75%,0.17,0.11,0.22,0.11
max,0.62,0.51,0.73,0.69






Unnamed: 0,296_caption_sim,296_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.23,0.12,0.19,0.07
std,0.06,0.09,0.04,0.07
min,0.16,-0.25,0.13,-0.23
25%,0.18,0.05,0.15,0.02
50%,0.21,0.11,0.18,0.06
75%,0.25,0.18,0.21,0.11
max,0.67,0.56,0.73,0.69






Unnamed: 0,144_caption_sim,144_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.18,0.04,0.2,0.07
std,0.03,0.06,0.05,0.07
min,0.14,-0.21,0.13,-0.23
25%,0.15,0.0,0.16,0.02
50%,0.17,0.04,0.19,0.06
75%,0.19,0.08,0.22,0.11
max,0.44,0.46,0.73,0.69






Unnamed: 0,12_caption_sim,12_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.16,0.09,0.19,0.07
std,0.04,0.08,0.05,0.07
min,0.11,-0.24,0.13,-0.23
25%,0.13,0.03,0.16,0.03
50%,0.15,0.09,0.18,0.07
75%,0.18,0.14,0.22,0.12
max,0.63,0.6,0.73,0.69






Unnamed: 0,224_caption_sim,224_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.22,0.12,0.19,0.07
std,0.05,0.09,0.04,0.07
min,0.16,-0.26,0.13,-0.23
25%,0.18,0.06,0.16,0.03
50%,0.2,0.12,0.18,0.07
75%,0.24,0.18,0.21,0.12
max,0.64,0.59,0.73,0.69






Unnamed: 0,250_caption_sim,250_context_sim,ped_caption_sim,ped_context_sim
count,338959.0,338959.0,338959.0,338959.0
mean,0.15,0.08,0.18,0.07
std,0.03,0.07,0.04,0.07
min,0.11,-0.22,0.13,-0.23
25%,0.12,0.03,0.15,0.02
50%,0.14,0.08,0.18,0.07
75%,0.17,0.12,0.21,0.11
max,0.45,0.42,0.73,0.69






Unnamed: 0,376_caption_sim,376_context_sim,ped_caption_sim,ped_context_sim
count,338960.0,338960.0,338960.0,338960.0
mean,0.2,0.07,0.19,0.07
std,0.04,0.08,0.04,0.07
min,0.15,-0.27,0.13,-0.23
25%,0.17,0.02,0.16,0.02
50%,0.19,0.07,0.18,0.07
75%,0.22,0.13,0.22,0.11
max,0.57,0.54,0.73,0.69






In [84]:
topics = get_topics(concept='pedestrian')

In [85]:
pg_dict = dict()
pg_dict['a'] = [68, 112, 56, 224]
pg_dict['b'] = [309, 344, 47, -1, 53]
pg_dict['c'] = [314, 331, 419, 394]
pg_dict['d'] = [98, 196, 202, 294]
pg_dict['e'] = [172, 137, 11, 134, 150, 216]
pg_dict['f'] = [363, 40, 43]
pg_dict['g'] = [441, 376]
pg_dict['h'] = [144, 416, 327]
pg_dict['i'] = [192, 20, 114]
pg_dict['j'] = [345]
pg_dict['k'] = [8, 265]
pg_dict['l'] = [250, 80, 426]
pg_dict['m'] = [169, 99, 378]
pg_dict['n'] = [432, 388, 412]
pg_dict['o'] = [12, 278, 198, 296] 

In [86]:
k_name = ['image_url', 'caption', 'context', 'concept2caption_sim', 
          'concept2context_sim']
# k_name = ['image_url', 'caption', 'context', 'concept2caption_sim', 
#           'concept2context_sim']
for el in pg_dict:
    pg_dict[el] = [f'{i}_caption_score' for i in pg_dict[el]]
    pg_dict[el] = [df_dict[i] for i in pg_dict[el]]
    pg_dict[el] = [df_.reset_index(drop=True) for df_ in pg_dict[el]]
    

In [87]:
for el in pg_dict:
    t_df = pd.DataFrame(columns = ['topic2caption_sim', 'topic2context_sim'] + k_name)
    for df_ in pg_dict[el]:
        df_.columns = ['topic2caption_sim', 'topic2context_sim'] + k_name
        t_df = pd.concat([t_df, df_], ignore_index=True)
    t_df = t_df.drop_duplicates()
    t_df = t_df.reset_index(drop=True)
    pg_dict[el] = t_df

In [94]:
for el in pg_dict:
    print(el)
    pg_dict[el].drop_duplicates(subset="image_url",
                     keep='first', inplace=True)
    display(pg_dict[el].describe().round(2))


a


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim
count,513249.0,513249.0,513249.0,513249.0
mean,0.18,0.08,0.19,0.07
std,0.04,0.09,0.04,0.07
min,0.12,-0.28,0.13,-0.23
25%,0.15,0.02,0.15,0.03
50%,0.17,0.08,0.18,0.07
75%,0.2,0.14,0.21,0.11
max,0.57,0.53,0.73,0.69


b


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim
count,695643.0,695643.0,695643.0,695643.0
mean,0.21,0.1,0.18,0.07
std,0.05,0.09,0.04,0.07
min,0.11,-0.27,0.13,-0.23
25%,0.17,0.04,0.15,0.03
50%,0.2,0.1,0.17,0.07
75%,0.24,0.16,0.2,0.11
max,0.6,0.72,0.73,0.69


c


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim
count,486016.0,486016.0,486016.0,486016.0
mean,0.21,0.1,0.19,0.07
std,0.04,0.08,0.04,0.07
min,0.15,-0.26,0.13,-0.23
25%,0.17,0.04,0.15,0.03
50%,0.2,0.1,0.18,0.07
75%,0.23,0.15,0.21,0.11
max,0.68,0.67,0.73,0.69


d


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim
count,486807.0,486807.0,486807.0,486807.0
mean,0.19,0.08,0.19,0.07
std,0.06,0.08,0.04,0.07
min,0.1,-0.21,0.13,-0.23
25%,0.14,0.03,0.16,0.03
50%,0.18,0.08,0.18,0.07
75%,0.21,0.14,0.21,0.12
max,0.73,0.64,0.73,0.69


e


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim
count,510572.0,510572.0,510572.0,510572.0
mean,0.19,0.09,0.19,0.07
std,0.04,0.08,0.04,0.07
min,0.14,-0.3,0.13,-0.23
25%,0.17,0.03,0.16,0.03
50%,0.19,0.09,0.18,0.07
75%,0.21,0.14,0.21,0.11
max,0.61,0.56,0.73,0.69


f


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim
count,517685.0,517685.0,517685.0,517685.0
mean,0.16,0.06,0.19,0.07
std,0.04,0.07,0.04,0.07
min,0.09,-0.25,0.13,-0.23
25%,0.12,0.01,0.16,0.03
50%,0.16,0.06,0.18,0.07
75%,0.18,0.11,0.22,0.11
max,0.53,0.57,0.73,0.69


g


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim
count,488750.0,488750.0,488750.0,488750.0
mean,0.18,0.06,0.19,0.07
std,0.04,0.08,0.04,0.07
min,0.13,-0.3,0.13,-0.23
25%,0.16,0.01,0.15,0.02
50%,0.17,0.06,0.18,0.07
75%,0.2,0.11,0.21,0.11
max,0.57,0.56,0.73,0.69


h


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim
count,525576.0,525576.0,525576.0,525576.0
mean,0.19,0.07,0.19,0.07
std,0.05,0.08,0.04,0.07
min,0.13,-0.21,0.13,-0.23
25%,0.15,0.02,0.15,0.02
50%,0.18,0.06,0.18,0.06
75%,0.21,0.11,0.21,0.11
max,0.53,0.57,0.73,0.69


i


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim
count,540471.0,540471.0,540471.0,540471.0
mean,0.14,0.06,0.19,0.07
std,0.03,0.07,0.04,0.07
min,0.09,-0.26,0.13,-0.23
25%,0.12,0.02,0.15,0.02
50%,0.14,0.06,0.18,0.07
75%,0.16,0.1,0.21,0.11
max,0.58,0.57,0.73,0.69


j


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim
count,274472.0,274472.0,274472.0,274472.0
mean,0.16,0.09,0.19,0.08
std,0.05,0.08,0.05,0.07
min,0.11,-0.22,0.13,-0.2
25%,0.13,0.04,0.16,0.03
50%,0.15,0.09,0.19,0.08
75%,0.18,0.15,0.22,0.12
max,0.67,0.66,0.73,0.69


k


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim
count,451090.0,451090.0,451090.0,451090.0
mean,0.14,0.08,0.19,0.07
std,0.04,0.08,0.04,0.07
min,0.09,-0.25,0.13,-0.23
25%,0.11,0.03,0.16,0.03
50%,0.13,0.08,0.18,0.07
75%,0.16,0.13,0.21,0.12
max,0.58,0.59,0.73,0.69


l


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim
count,540822.0,540822.0,540822.0,540822.0
mean,0.14,0.07,0.19,0.07
std,0.03,0.07,0.04,0.07
min,0.1,-0.23,0.13,-0.23
25%,0.12,0.02,0.15,0.03
50%,0.14,0.07,0.18,0.07
75%,0.16,0.12,0.21,0.11
max,0.49,0.59,0.73,0.69


m


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim
count,484329.0,484329.0,484329.0,484329.0
mean,0.18,0.09,0.19,0.07
std,0.05,0.08,0.04,0.07
min,0.12,-0.26,0.13,-0.22
25%,0.15,0.03,0.15,0.03
50%,0.17,0.08,0.18,0.07
75%,0.2,0.14,0.21,0.11
max,0.62,0.62,0.73,0.69


n


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim
count,549633.0,549633.0,549633.0,549633.0
mean,0.16,0.06,0.19,0.07
std,0.04,0.07,0.04,0.07
min,0.1,-0.25,0.13,-0.23
25%,0.14,0.01,0.16,0.03
50%,0.15,0.06,0.18,0.07
75%,0.18,0.11,0.21,0.12
max,0.53,0.55,0.73,0.69


o


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim
count,568776.0,568776.0,568776.0,568776.0
mean,0.19,0.1,0.19,0.07
std,0.05,0.08,0.04,0.07
min,0.11,-0.24,0.13,-0.23
25%,0.15,0.04,0.15,0.03
50%,0.18,0.1,0.18,0.07
75%,0.23,0.15,0.21,0.11
max,0.69,0.6,0.73,0.69


In [95]:
pg_dict[el]

Unnamed: 0,topic2caption_sim,topic2context_sim,image_url,caption,context,concept2caption_sim,concept2context_sim
0,0.628946,0.295938,https://upload.wikimedia.org/wikipedia/commons...,Pedestrian trail English: Pedestrian trail,The Blue Water River Walk is a nearly one mile...,0.704843,0.105616
1,0.591550,0.071280,https://upload.wikimedia.org/wikipedia/commons...,Walking Trail,"Bagh-e-Jinnah, formerly known as Lawrence Gard...",0.422199,0.072473
2,0.591550,0.071280,https://upload.wikimedia.org/wikipedia/commons...,Walking Trail,"Bagh-e-Jinnah, formerly known as Lawrence Gard...",0.422199,0.072473
3,0.591550,0.071280,https://upload.wikimedia.org/wikipedia/commons...,Walking Trail,"Bagh-e-Jinnah, formerly known as Lawrence Gard...",0.422199,0.072473
4,0.589132,0.471493,https://upload.wikimedia.org/wikipedia/commons...,Hikers awaiting southbound train English: Look...,The Appalachian Trail station is a commuter ra...,0.300308,0.143705
...,...,...,...,...,...,...,...
1354966,0.163975,0.088981,https://upload.wikimedia.org/wikipedia/commons...,English: Rhaetina gregaria from Ladinian of I...,Rhaetina is an extinct genus of brachiopods be...,0.173961,0.030292
1354971,0.163974,0.053684,https://upload.wikimedia.org/wikipedia/commons...,"Henry Savile, who owned and bred Cremorne Men ...",Cremorne was British Thoroughbred racehorse an...,0.170176,0.018248
1354975,0.163974,0.012743,https://upload.wikimedia.org/wikipedia/commons...,Reconstruction of a 2nd century AD pugio Pugio...,The pugio was a dagger used by Roman soldiers ...,0.247398,0.092458
1354984,0.163972,0.188481,https://upload.wikimedia.org/wikipedia/commons...,"German paper nativity scene, 1885 Deutsch: Pap...","In the Christian tradition, a nativity scene i...",0.142268,0.075186


In [96]:
df__ = pd.DataFrame()
for el in pg_dict:
    df__ = pd.concat([df_, pg_dict[el]], ignore_index=True)
    df__.drop_duplicates(subset="image_url",
                     keep='first', inplace=True)
df__.drop_duplicates(subset="image_url",
                     keep='first', inplace=True)
display(df__.describe().round(2))

Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,concept2context_sim
count,568776.0,568776.0,568776.0,568776.0
mean,0.21,0.11,0.19,0.07
std,0.06,0.09,0.04,0.07
min,0.11,-0.24,0.13,-0.23
25%,0.17,0.05,0.15,0.03
50%,0.2,0.1,0.18,0.07
75%,0.24,0.16,0.21,0.11
max,0.67,0.59,0.73,0.69


In [97]:
missed_topics_dict = dict()
for el in pg_dict:
    missed_topics_dict[el] = pg_dict[el]
    missed_topics_dict[el]['image_path'] = missed_topics_dict[el]['image_url'].apply(link_to_b64)
    missed_topics_dict[el] = missed_topics_dict[el][missed_topics_dict[el].image_path.notnull()]
    missed_topics_dict[el] = missed_topics_dict[el].reset_index(drop=True)

In [98]:
missed_topics_dict[el]

Unnamed: 0,topic2caption_sim,topic2context_sim,image_url,caption,context,concept2caption_sim,concept2context_sim,image_path
0,0.628946,0.295938,https://upload.wikimedia.org/wikipedia/commons...,Pedestrian trail English: Pedestrian trail,The Blue Water River Walk is a nearly one mile...,0.704843,0.105616,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
1,0.591550,0.071280,https://upload.wikimedia.org/wikipedia/commons...,Walking Trail,"Bagh-e-Jinnah, formerly known as Lawrence Gard...",0.422199,0.072473,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
2,0.591550,0.071280,https://upload.wikimedia.org/wikipedia/commons...,Walking Trail,"Bagh-e-Jinnah, formerly known as Lawrence Gard...",0.422199,0.072473,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
3,0.591550,0.071280,https://upload.wikimedia.org/wikipedia/commons...,Walking Trail,"Bagh-e-Jinnah, formerly known as Lawrence Gard...",0.422199,0.072473,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
4,0.589132,0.471493,https://upload.wikimedia.org/wikipedia/commons...,Hikers awaiting southbound train English: Look...,The Appalachian Trail station is a commuter ra...,0.300308,0.143705,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
...,...,...,...,...,...,...,...,...
551711,0.163975,0.088981,https://upload.wikimedia.org/wikipedia/commons...,English: Rhaetina gregaria from Ladinian of I...,Rhaetina is an extinct genus of brachiopods be...,0.173961,0.030292,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
551712,0.163974,0.053684,https://upload.wikimedia.org/wikipedia/commons...,"Henry Savile, who owned and bred Cremorne Men ...",Cremorne was British Thoroughbred racehorse an...,0.170176,0.018248,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
551713,0.163974,0.012743,https://upload.wikimedia.org/wikipedia/commons...,Reconstruction of a 2nd century AD pugio Pugio...,The pugio was a dagger used by Roman soldiers ...,0.247398,0.092458,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
551714,0.163972,0.188481,https://upload.wikimedia.org/wikipedia/commons...,"German paper nativity scene, 1885 Deutsch: Pap...","In the Christian tradition, a nativity scene i...",0.142268,0.075186,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...


In [99]:
selected_col = ['image_path','topic2caption_sim', 'concept2caption_sim', 'topic2context_sim', 'caption']
for el in missed_topics_dict:
    missed_topics_dict[el] = missed_topics_dict[el][selected_col]
    missed_topics_dict[el] = missed_topics_dict[el].drop_duplicates(subset=selected_col, keep=False)
    missed_topics_dict[el] = missed_topics_dict[el].reset_index(drop=True)

In [100]:
for el in missed_topics_dict:
#     missed_topics_dict[el] = missed_topics_dict[el][selected_col]
    missed_topics_dict[el] = missed_topics_dict[el].drop_duplicates(keep=False)
    missed_topics_dict[el] = missed_topics_dict[el].reset_index(drop=True)

In [101]:
missed_topics_dict['a']

Unnamed: 0,image_path,topic2caption_sim,concept2caption_sim,topic2context_sim,caption
0,aHR0cDovL3VwbG9hZC53aWtpbWVkaWEub3JnL3dpa2lwZW...,0.572484,0.208802,0.389450,English: Chariot Wheel
1,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.536091,0.212263,0.355876,"Solid wheels on a heavy temple car, contrasted..."
2,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.532109,0.195246,0.476177,English: Mitsubishi Chariot
3,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.526225,0.161056,0.473637,1992–1994 Mitsubishi Chariot English: Mitsubis...
4,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.520849,0.194514,0.296062,"Replica of a south-pointing chariot, 2005"
...,...,...,...,...,...
498377,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.157879,0.159651,0.074182,"Vyasa, at middle of the picture English: Vyasa..."
498378,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.157877,0.159958,0.148378,View of the J. H. Dodson Residence in San Pedr...
498379,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.157876,0.182478,0.030891,Matt Ford Headshot of Matt Ford Lighting Designer
498380,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.157876,0.165309,0.119214,English: Waltham Fire Station. Manned by reta...


In [102]:
for el in missed_topics_dict:
    print(el, len(missed_topics_dict[el]))

a 498382
b 677042
c 471389
d 470832
e 494671
f 497406
g 479709
h 508003
i 526649
j 266079
k 438599
l 524994
m 467491
n 535998
o 551716


In [103]:
sorted_key = []
for k in sorted(missed_topics_dict, key=lambda k: len(missed_topics_dict[k])):
    print(k, len(missed_topics_dict[k]))
    sorted_key.append(k)

j 266079
k 438599
m 467491
d 470832
c 471389
g 479709
e 494671
f 497406
a 498382
h 508003
l 524994
i 526649
n 535998
o 551716
b 677042


In [104]:
def df2_df1(df1, df2):
    cond = df2['image_path'].isin(df1['image_path'])
    df2.drop(df2[cond].index, inplace = True)
    return df2

In [105]:
for idx, k in enumerate(sorted_key):
    print(sorted_key[idx:], k)
    for el in sorted_key[idx:]:
        if el == k:
            continue
        print(f'we are going to calc: {el} - {k}')
        missed_topics_dict[el] = df2_df1(missed_topics_dict[k], missed_topics_dict[el])
#     break

['j', 'k', 'm', 'd', 'c', 'g', 'e', 'f', 'a', 'h', 'l', 'i', 'n', 'o', 'b'] j
we are going to calc: k - j
we are going to calc: m - j
we are going to calc: d - j
we are going to calc: c - j
we are going to calc: g - j
we are going to calc: e - j
we are going to calc: f - j
we are going to calc: a - j
we are going to calc: h - j
we are going to calc: l - j
we are going to calc: i - j
we are going to calc: n - j
we are going to calc: o - j
we are going to calc: b - j
['k', 'm', 'd', 'c', 'g', 'e', 'f', 'a', 'h', 'l', 'i', 'n', 'o', 'b'] k
we are going to calc: m - k
we are going to calc: d - k
we are going to calc: c - k
we are going to calc: g - k
we are going to calc: e - k
we are going to calc: f - k
we are going to calc: a - k
we are going to calc: h - k
we are going to calc: l - k
we are going to calc: i - k
we are going to calc: n - k
we are going to calc: o - k
we are going to calc: b - k
['m', 'd', 'c', 'g', 'e', 'f', 'a', 'h', 'l', 'i', 'n', 'o', 'b'] m
we are going to calc: d -

In [106]:
sorted_key = []
for k in sorted(missed_topics_dict, key=lambda k: len(missed_topics_dict[k])):
    print(k, len(missed_topics_dict[k]))
    sorted_key.append(k)

o 1825
b 3728
n 4904
i 9772
e 10146
a 11106
l 14489
c 27015
f 29601
h 31606
d 66386
g 99068
m 189160
k 258164
j 266079


In [107]:
missed_topics_dict[el]

Unnamed: 0,image_path,topic2caption_sim,concept2caption_sim,topic2context_sim,caption
16645,aHR0cDovL3VwbG9hZC53aWtpbWVkaWEub3JnL3dpa2lwZW...,0.297875,0.144801,0.039931,Anchor from H.M.S. Investigator. Dropped by M...
19605,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.290006,0.135893,0.235146,Newman playing for Melbourne in March 2017 Eng...
21511,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.285580,0.136557,0.099045,Collingwood plays down the ground during the f...
22570,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.283370,0.157820,0.274062,Victoria in Australia
23687,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.280998,0.136919,-0.000578,English: Melbourne with the Yarra river looki...
...,...,...,...,...,...
676814,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.113594,0.145010,0.086696,English: Hedyotis purpurea var montana syn Ho...
676841,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.113571,0.163208,0.084670,English: Yun Hu-myeong
676845,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.113568,0.174736,0.110685,English: Hashem Yekezareh
676988,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.113454,0.143493,0.070206,English: Rudy Rupak Founder of PlanetHospital


In [108]:
random_count = {'a': 1426, 'f': 838, 'g': 145, 'i':349, 'j': 275, 'k':302, 'l':151, 'm':797, 'n':290, 'o':1081}

In [109]:
for el in random_count:
    missed_topics_dict[el] = missed_topics_dict[el].sample(n =random_count[el])
    missed_topics_dict[el] = missed_topics_dict[el].reset_index(drop=True)

In [110]:
sorted_key = []
for k in sorted(missed_topics_dict, key=lambda k: len(missed_topics_dict[k])):
    print(k, len(missed_topics_dict[k]))
    sorted_key.append(k)

g 145
l 151
j 275
n 290
k 302
i 349
m 797
f 838
o 1081
a 1426
b 3728
e 10146
c 27015
h 31606
d 66386


In [111]:
missed_topics_with_images = dict()
count = 0
for el in random_count:
    missed_topics_with_images[el] = create_data(missed_topics_dict[el], count)
    count = len(missed_topics_with_images[el])

/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy8zLzNkL0EtN0VfQ29yc2Fpcl9JSV9WQS0xMi5qcGc=.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy9kL2RkL0phZmFyYWJhZGlfYnVmZmFsby5qcGc=.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy8zLzNlL0hvbmR1cmFzLlJpY2FyZG9NYWR1cm8uMDEuanBn.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy8wLzA3LzIwMjAwNjAzX2tpbWJvcmEuanBn.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy8zLzNkL0NPX0FUTC5qcGc=.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy84Lzg2LzIwMXN0X0FpcmxpZnRfU3F1YWRyb25fZW1ibGVtLmpwZw==.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy8xLzFmL1RydDF5ZW5pbG9nby5wb

/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvZW4vMS8xOC9CdXJqX0lzbWFpbF9NYWxsX1NhcmdvZGhhLmpwZw==.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy9lL2VhL1NDX1dpZW5lcl9OZXVzdGFkdF92cy5fU0tOX1N0Ll9QJUMzJUI2bHRlbl8yMDE4LTA1LTMxXyUyODEyOSUyOS5qcGc=.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy9lL2VhL0ZvbGxvd190aGVfY2hpbGRyZW4uanBn.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy84LzhiL05DU0hQXzIwMThfRG9kZ2VfQ2hhcmdlcl9QdXJzaXV0LmpwZw==.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy82LzYzL0hlcml0YWdlX2luZGlhLkpQRw==.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy81LzVjL1BvcnRzbW91dGhfQWZyaWNhbl9CdXJ5aW5nX0dyb3VuZF9NZW1vcmlhbF9QYXJrXzA1LmpwZw==.jpg
/raid/AISSEL

In [112]:
missed_topics_with_images[el]

Unnamed: 0,uniq_id,image_id,caption,labels,image,topic2caption_sim,concept2caption_sim,topic2context_sim
0,285,285,Español Thelma Rodriguez Miss Nicaragua 2008,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.170526,0.151025,0.118578
1,286,286,The Sackhorn from the Gasterental west side En...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.192444,0.145502,0.223443
2,287,287,English In the Café dHarcourt in Paris oil on...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.166845,0.153415,-0.056385
3,288,288,English Old Tom Morris on St Andrews Links oi...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.174976,0.143859,0.044253
4,289,289,González training with Las Palmas in 2009 Espa...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.115614,0.154553,0.028286
...,...,...,...,...,...,...,...,...
1065,1361,1361,A pine stand almost 200 years old on the north...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.208542,0.140356,0.075292
1066,1362,1362,English Blank physical map of the department ...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.137954,0.143225,-0.161839
1067,1363,1363,English Bandstand Magdalen Green looking south,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.128150,0.189176,0.038640
1068,1364,1364,The Zschirnsteine from the side of the Großer ...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.128722,0.135879,0.106423


In [113]:
s = 0
for el in missed_topics_with_images:
    missed_topics_with_images[el] = missed_topics_with_images[el].dropna(subset=['caption'])
    missed_topics_with_images[el]['topics']  = [el] * len(missed_topics_with_images[el])
    print(el, len(missed_topics_with_images[el]))
    s = s + len(missed_topics_with_images[el])
print(s)

a 1397
f 824
g 140
i 345
j 266
k 299
l 147
m 787
n 285
o 1070
5560


In [114]:
scol = ['topic2caption_sim', 'concept2caption_sim']
for el in missed_topics_with_images:
    print(el)
    display(missed_topics_with_images[el][scol].describe().round(2))

a


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,1397.0,1397.0
mean,0.16,0.15
std,0.03,0.02
min,0.12,0.13
25%,0.14,0.14
50%,0.16,0.15
75%,0.18,0.16
max,0.29,0.23


f


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,824.0,824.0
mean,0.15,0.16
std,0.04,0.02
min,0.09,0.13
25%,0.12,0.14
50%,0.15,0.16
75%,0.18,0.18
max,0.36,0.27


g


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,140.0,140.0
mean,0.18,0.16
std,0.03,0.02
min,0.13,0.13
25%,0.15,0.15
50%,0.17,0.16
75%,0.2,0.17
max,0.28,0.25


i


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,345.0,345.0
mean,0.14,0.15
std,0.03,0.02
min,0.09,0.13
25%,0.12,0.14
50%,0.14,0.15
75%,0.16,0.16
max,0.3,0.22


j


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,266.0,266.0
mean,0.17,0.19
std,0.05,0.05
min,0.11,0.13
25%,0.13,0.16
50%,0.15,0.18
75%,0.19,0.22
max,0.34,0.35


k


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,299.0,299.0
mean,0.13,0.18
std,0.03,0.04
min,0.09,0.13
25%,0.11,0.15
50%,0.13,0.17
75%,0.15,0.2
max,0.37,0.29


l


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,147.0,147.0
mean,0.14,0.16
std,0.03,0.02
min,0.1,0.13
25%,0.12,0.14
50%,0.13,0.15
75%,0.16,0.16
max,0.25,0.24


m


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,787.0,787.0
mean,0.18,0.17
std,0.04,0.03
min,0.12,0.13
25%,0.15,0.15
50%,0.17,0.16
75%,0.21,0.18
max,0.4,0.32


n


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,285.0,285.0
mean,0.15,0.16
std,0.04,0.02
min,0.1,0.13
25%,0.12,0.14
50%,0.14,0.15
75%,0.16,0.16
max,0.32,0.23


o


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,1070.0,1070.0
mean,0.17,0.15
std,0.04,0.01
min,0.11,0.13
25%,0.14,0.14
50%,0.17,0.15
75%,0.2,0.16
max,0.31,0.23


In [115]:
df_2 = pd.DataFrame()
for el in missed_topics_with_images:
    df_2 = pd.concat([df_2, missed_topics_with_images[el]], ignore_index=True)
# df_
display(df_2[scol].describe().round(2))

Unnamed: 0,topic2caption_sim,concept2caption_sim
count,5560.0,5560.0
mean,0.16,0.16
std,0.04,0.03
min,0.09,0.13
25%,0.13,0.14
50%,0.16,0.15
75%,0.18,0.17
max,0.4,0.35


In [116]:
random_count_wit = {'a': 1226, 'f': 638, 'g': 45, 'i':249, 'j': 175, 'k':202, 'l':51, 'm':497, 'n':190, 'o':881}

In [117]:
for el in random_count_wit:
    missed_topics_with_images[el] = missed_topics_with_images[el].drop_duplicates(keep=False)
    missed_topics_with_images[el] = missed_topics_with_images[el].sample(n =random_count_wit[el])
    missed_topics_with_images[el] = missed_topics_with_images[el].reset_index(drop=True)

In [118]:
for el in random_count_wit:
    print(el)

    display(missed_topics_with_images[el][['topic2caption_sim', 'concept2caption_sim']].describe().round(2))

a


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,1226.0,1226.0
mean,0.16,0.15
std,0.03,0.02
min,0.12,0.13
25%,0.14,0.14
50%,0.16,0.15
75%,0.18,0.16
max,0.29,0.23


f


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,638.0,638.0
mean,0.15,0.16
std,0.04,0.02
min,0.09,0.13
25%,0.12,0.14
50%,0.15,0.16
75%,0.18,0.18
max,0.32,0.25


g


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,45.0,45.0
mean,0.18,0.16
std,0.03,0.02
min,0.13,0.14
25%,0.15,0.15
50%,0.17,0.16
75%,0.2,0.18
max,0.28,0.23


i


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,249.0,249.0
mean,0.14,0.15
std,0.03,0.02
min,0.09,0.13
25%,0.12,0.14
50%,0.14,0.15
75%,0.16,0.16
max,0.3,0.21


j


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,175.0,175.0
mean,0.17,0.19
std,0.04,0.05
min,0.11,0.13
25%,0.13,0.16
50%,0.15,0.18
75%,0.19,0.22
max,0.33,0.35


k


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,202.0,202.0
mean,0.13,0.18
std,0.04,0.04
min,0.09,0.14
25%,0.11,0.15
50%,0.12,0.17
75%,0.14,0.2
max,0.37,0.29


l


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,51.0,51.0
mean,0.14,0.16
std,0.03,0.02
min,0.1,0.13
25%,0.12,0.14
50%,0.13,0.15
75%,0.16,0.17
max,0.25,0.24


m


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,497.0,497.0
mean,0.18,0.17
std,0.05,0.03
min,0.12,0.13
25%,0.14,0.15
50%,0.17,0.16
75%,0.21,0.18
max,0.4,0.32


n


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,190.0,190.0
mean,0.15,0.16
std,0.04,0.02
min,0.1,0.13
25%,0.12,0.14
50%,0.14,0.15
75%,0.16,0.16
max,0.31,0.22


o


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,881.0,881.0
mean,0.17,0.15
std,0.04,0.01
min,0.11,0.13
25%,0.14,0.14
50%,0.17,0.14
75%,0.2,0.16
max,0.31,0.23


In [119]:
df_l = pd.DataFrame()
for el in random_count_wit:
    df_l = pd.concat([df_l, missed_topics_with_images[el]], ignore_index=True)
# df_
display(df_l[['topic2caption_sim', 'concept2caption_sim']].describe().round(2))

Unnamed: 0,topic2caption_sim,concept2caption_sim
count,4154.0,4154.0
mean,0.16,0.16
std,0.04,0.02
min,0.09,0.13
25%,0.13,0.14
50%,0.16,0.15
75%,0.18,0.17
max,0.4,0.35


In [120]:
def get_portion(df, p1=72, p2=16, p3=12):
    s1 = df.sample(frac = p1/100)
    rest_part_1 = df.drop(s1.index)
    s2 = rest_part_1.sample(frac = p2/(100-p1))
    s3 = rest_part_1.drop(s2.index)
    return s1, s2, s3

In [121]:
name_lsit = ['stage1_train', 'stage2_train', 'val']
por_dict = dict()
data_dict = dict()
for n in name_lsit:
    data_dict[n] = pd.DataFrame() 
for el in random_count_wit:
    s1, s2, v = get_portion(missed_topics_with_images[el])
    
    s1 = s1.reset_index(drop=True)
    data_dict['stage1_train'] = pd.concat([data_dict['stage1_train'], s1], ignore_index=True)
    
    s2 = s2.reset_index(drop=True)
    data_dict['stage2_train'] = pd.concat([data_dict['stage2_train'], s2], ignore_index=True)
    
    v = v.reset_index(drop=True)
    data_dict['val'] = pd.concat([data_dict['val'], v], ignore_index=True)
#     data_dict[el] = por_dict

In [122]:
print(len(data_dict['stage1_train']))
print(len(data_dict['stage2_train']))
print(len(data_dict['val']))

2990
664
500


In [124]:
names = ['uniq_id', 'image_id', 'caption', 'labels', 'image']

In [125]:
! mkdir -p /raid/AISSEL/htest/datasets/ped_data/wit/random_missed
saved_path = '/raid/AISSEL/htest/datasets/ped_data/wit/random_missed'
for el in data_dict:
    name = f'caption_{el}.tsv'
    df_s = data_dict[el][names]
    df_s.to_csv(f'{saved_path}/{name}', sep="\t", index=False, header=False)

    print(name)

caption_stage1_train.tsv
caption_stage2_train.tsv
caption_val.tsv


In [1]:
(31 / 88) * 100

35.22727272727273

In [2]:
(12 / 88) * 100

13.636363636363635

In [3]:
(14 /64) * 100

21.875