In [1]:
from tqdm.notebook import tqdm, trange

In [2]:
tqdm.pandas()

In [3]:
import sys

In [4]:
import glob
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [5]:
import os

# General packages
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.express as px
from PIL import Image

from IPython.display import Image as IImage
from IPython.display import display
import warnings
warnings.filterwarnings("ignore")

In [6]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/test/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/test/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
def find_gpus(nums=6):
    os.system('nvidia-smi -q -d Memory |grep -A4 GPU|grep Free >tmp_free_gpus')
    with open('tmp_free_gpus', 'r') as lines_txt:
        frees = lines_txt.readlines()
        idx_freeMemory_pair = [ (idx,int(x.split()[2]))
                              for idx,x in enumerate(frees) ]
    idx_freeMemory_pair.sort(key=lambda my_tuple:my_tuple[1],reverse=True)
    usingGPUs = [str(idx_memory_pair[0])
                    for idx_memory_pair in idx_freeMemory_pair[:nums] ]
    usingGPUs =  ','.join(usingGPUs)
    print('using GPU idx: #', usingGPUs)
    return usingGPUs

In [8]:
os.environ['CUDA_VISIBLE_DEVICES'] = find_gpus(nums=2)

using GPU idx: # 0,1


In [9]:
from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

## get topics

In [10]:
from bertopic import BERTopic

In [11]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [12]:
lemmatizer = WordNetLemmatizer()

In [13]:
def get_topics(concept='pedestrian'):
    if concept == 'pedestrian':
        model_path = '../ped_auto_model'
    elif concept == 'aircraft':
        model_path = '../aircraft_auto_model'
    elif concept == 'car':
        model_path = '../car_auto_model'
    auto_model = BERTopic(nr_topics="auto")
    auto_model = BERTopic.load(model_path)
    similar_topics, similarity = auto_model.find_topics(concept, top_n=50)

    topics = dict()
    for t in similar_topics:
        l1 = auto_model.get_topic(t)
        el_list = []
        el_list.append(concept)
        for el in l1:
            w = lemmatizer.lemmatize(el[0], get_wordnet_pos(el[0]))
            el_list.append(w)
        topics[t]= set(el_list)
    return topics

# Load files

In [14]:
os.listdir('/raid/AISSEL/htest/datasets/wit/')

['wit_v1.train.all-00007-of-00010_context_caption_en_sbert_c.tsv',
 'wit_v1.train.all-00009-of-00010_context_caption_en_sbert_c.tsv',
 '__MACOSX',
 'wit_v1.train.all-1percent_sample.tsv',
 'wit_v1.train.all-00000-of-00010_en_csim.tsv',
 'wit_v1.train.all-00006-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00008-of-00010_context_caption_en_sbert_cpa.tsv',
 'aircraft_q75_en_sbert.tsv',
 'images',
 'wit_v1.train.all-00000-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00001-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00002-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00003-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00004-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00005-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00007-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00009-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00000-of-00010

In [15]:
root_path = '/raid/AISSEL/htest/datasets/wit'

In [16]:
f_names =[el for el in os.listdir('/raid/AISSEL/htest/datasets/wit/') if el.endswith('_en_sbert_cpa.tsv')]
f_names

['wit_v1.train.all-00006-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00008-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00000-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00001-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00002-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00003-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00004-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00005-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00007-of-00010_context_caption_en_sbert_cpa.tsv',
 'wit_v1.train.all-00009-of-00010_context_caption_en_sbert_cpa.tsv']

In [17]:
df = pd.read_csv(f'{root_path}/{f_names[0]}', sep='\t')
df = df.drop('Unnamed: 0', 1)
df = df[df["language"]=='en']
df

Unnamed: 0,index,language,page_url,image_url,page_title,section_title,hierarchical_section_title,caption_reference_description,caption_attribution_description,caption_alt_text_description,...,296_context_score,144_context_score,12_context_score,224_context_score,250_context_score,376_context_score,concept_caption_score,concept_context_score,car_caption_score,aircraft_caption_score
0,14,en,https://en.wikipedia.org/wiki/LibreOffice,https://upload.wikimedia.org/wikipedia/commons...,LibreOffice,Included applications,LibreOffice / Features / Included applications,LibreOffice Math running on Ubuntu,English: Screenshots of LibreOffice Math 6.4 r...,,...,-0.037750,-0.025831,-0.045994,-0.067079,0.128117,-0.067173,-0.040416,-0.020415,-0.032282,-0.006119
1,28,en,https://en.wikipedia.org/wiki/Dalian,https://upload.wikimedia.org/wikipedia/commons...,Dalian,Research centres,Dalian / Education / Research centres,"Dalian Institute of Chemical Physics, of the C...",,,...,0.070782,0.014916,0.060768,0.137923,0.051047,0.125953,0.004426,0.083636,0.018341,0.077601
2,41,en,https://en.wikipedia.org/wiki/FMW_7th_Annivers...,https://upload.wikimedia.org/wikipedia/commons...,FMW 7th Anniversary Show,,FMW 7th Anniversary Show,Kawasaki Stadium,English: kawasaki_fujimi Stadium 日本語: 川崎富士見球技場...,,...,0.048340,0.031444,-0.178475,0.042817,0.119373,0.045854,0.090404,-0.012327,0.029681,0.181104
3,50,en,https://en.wikipedia.org/wiki/List_of_Bermuda_...,https://upload.wikimedia.org/wikipedia/commons...,List of Bermuda hurricanes,1960s,List of Bermuda hurricanes / List of storms / ...,"Hurricane Inga, one of the longest-lived Atlan...",Saffir-Simpson Hurricane ScaleTDTS12345 Englis...,Map showing the path and intensity of Hurrican...,...,-0.104979,-0.117618,-0.052297,-0.031839,-0.167693,-0.054938,0.005365,-0.095421,0.068992,0.103214
4,52,en,https://en.wikipedia.org/wiki/Parimelalhagar,https://upload.wikimedia.org/wikipedia/commons...,Parimelalhagar,Early life,Parimelalhagar / Early life,A page from the Parimelalhagar's commentary on...,English: A page from Arumuka Navalar's 1861 ed...,,...,0.013991,-0.050420,-0.043610,-0.030726,0.036060,-0.098819,0.025255,-0.061983,-0.008257,0.057020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540382,3704673,en,https://en.wikipedia.org/wiki/Standard_Electri...,https://upload.wikimedia.org/wikipedia/commons...,Standard Electric Time Company,,Standard Electric Time Company,A Standard 200177 fire alarm pull station,English: A Standard 200177 pull station in Har...,,...,-0.033670,-0.089871,-0.052723,-0.037692,-0.037566,0.038821,0.130745,-0.079322,0.099379,0.133040
540383,3704674,en,https://en.wikipedia.org/wiki/Malaysia_Airline...,https://upload.wikimedia.org/wikipedia/commons...,Malaysia Airlines Flight 370,Analysis,Malaysia Airlines Flight 370 / Investigation /...,A heat map indicating the probable location of...,English: Probability of the location where Mal...,,...,-0.060926,-0.013728,-0.057422,0.144767,-0.017854,-0.034770,-0.007270,0.046857,0.060193,0.218703
540384,3704675,en,https://en.wikipedia.org/wiki/Jackson_Plan,https://upload.wikimedia.org/wikipedia/commons...,Jackson Plan,Overall layout,Jackson Plan / Layout and effect of the plan /...,"Map of Singapore in 1914, the layout is now mo...","English: Map of the city of Singapore, ca 1914...",,...,0.071714,0.023376,0.013262,0.339486,0.056978,0.012006,0.018857,0.090633,-0.003516,-0.021940
540385,3704677,en,https://en.wikipedia.org/wiki/Candiacervus,https://upload.wikimedia.org/wikipedia/commons...,Candiacervus,Taxonomy,Candiacervus / Taxonomy,Hippopotamus creutzburgi and C. cretensis,English: My drawings of the two subspecies of ...,,...,0.145484,-0.006134,0.115790,-0.039335,0.091558,0.093865,0.052330,0.046950,0.023961,-0.041651


In [18]:
col_list = ['language', 'page_url', 'image_url', 'page_title', 'section_title', 
            'hierarchical_section_title', 'caption_reference_description', 
            'caption_attribution_description', 'caption_alt_text_description', 
            'mime_type', 'original_height', 'original_width', 'is_main_image', 
            'attribution_passes_lang_id', 'page_changed_recently', 
            'context_page_description', 'context_section_description', 'caption', 
            'context', 'aircraft_caption_score']

In [19]:
df = df[col_list]
df

Unnamed: 0,language,page_url,image_url,page_title,section_title,hierarchical_section_title,caption_reference_description,caption_attribution_description,caption_alt_text_description,mime_type,original_height,original_width,is_main_image,attribution_passes_lang_id,page_changed_recently,context_page_description,context_section_description,caption,context,aircraft_caption_score
0,en,https://en.wikipedia.org/wiki/LibreOffice,https://upload.wikimedia.org/wikipedia/commons...,LibreOffice,Included applications,LibreOffice / Features / Included applications,LibreOffice Math running on Ubuntu,English: Screenshots of LibreOffice Math 6.4 r...,,image/png,801,1400,False,True,True,LibreOffice is a free and open-source office s...,,LibreOffice Math running on Ubuntu English: Sc...,LibreOffice is a free and open-source office s...,-0.006119
1,en,https://en.wikipedia.org/wiki/Dalian,https://upload.wikimedia.org/wikipedia/commons...,Dalian,Research centres,Dalian / Education / Research centres,"Dalian Institute of Chemical Physics, of the C...",,,image/jpeg,2136,3216,False,False,True,Dalian is a major sub-provincial port city in ...,Dalian Institute of Chemical Physics of the Ch...,"Dalian Institute of Chemical Physics, of the C...",Dalian is a major sub-provincial port city in ...,0.077601
2,en,https://en.wikipedia.org/wiki/FMW_7th_Annivers...,https://upload.wikimedia.org/wikipedia/commons...,FMW 7th Anniversary Show,,FMW 7th Anniversary Show,Kawasaki Stadium,English: kawasaki_fujimi Stadium 日本語: 川崎富士見球技場...,,image/jpeg,3000,4000,True,False,False,FMW 7th Anniversary Show was a professional wr...,FMW 7th Anniversary Show was a professional wr...,Kawasaki Stadium English: kawasaki_fujimi Stad...,FMW 7th Anniversary Show was a professional wr...,0.181104
3,en,https://en.wikipedia.org/wiki/List_of_Bermuda_...,https://upload.wikimedia.org/wikipedia/commons...,List of Bermuda hurricanes,1960s,List of Bermuda hurricanes / List of storms / ...,"Hurricane Inga, one of the longest-lived Atlan...",Saffir-Simpson Hurricane ScaleTDTS12345 Englis...,Map showing the path and intensity of Hurrican...,image/png,1900,3000,False,True,True,The British Overseas Territory of Bermuda has ...,"October 7, 1961 – Category 3 Hurricane Frances...","Hurricane Inga, one of the longest-lived Atlan...",The British Overseas Territory of Bermuda has ...,0.103214
4,en,https://en.wikipedia.org/wiki/Parimelalhagar,https://upload.wikimedia.org/wikipedia/commons...,Parimelalhagar,Early life,Parimelalhagar / Early life,A page from the Parimelalhagar's commentary on...,English: A page from Arumuka Navalar's 1861 ed...,,image/png,4112,2338,False,True,True,"Parimelalhagar, also known as Vanthuvarai Peru...",Parimelalhagar was born in Kancheepuram in the...,A page from the Parimelalhagar's commentary on...,"Parimelalhagar, also known as Vanthuvarai Peru...",0.057020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540382,en,https://en.wikipedia.org/wiki/Standard_Electri...,https://upload.wikimedia.org/wikipedia/commons...,Standard Electric Time Company,,Standard Electric Time Company,A Standard 200177 fire alarm pull station,English: A Standard 200177 pull station in Har...,,image/jpeg,1704,2272,False,True,False,The Standard Electric Time Company was a Sprin...,The Standard Electric Time Company was a Sprin...,A Standard 200177 fire alarm pull station Engl...,The Standard Electric Time Company was a Sprin...,0.133040
540383,en,https://en.wikipedia.org/wiki/Malaysia_Airline...,https://upload.wikimedia.org/wikipedia/commons...,Malaysia Airlines Flight 370,Analysis,Malaysia Airlines Flight 370 / Investigation /...,A heat map indicating the probable location of...,English: Probability of the location where Mal...,,image/jpeg,1061,1500,False,True,True,Malaysia Airlines Flight 370 was a scheduled i...,Two parameters associated with these transmiss...,A heat map indicating the probable location of...,Malaysia Airlines Flight 370 was a scheduled i...,0.218703
540384,en,https://en.wikipedia.org/wiki/Jackson_Plan,https://upload.wikimedia.org/wikipedia/commons...,Jackson Plan,Overall layout,Jackson Plan / Layout and effect of the plan /...,"Map of Singapore in 1914, the layout is now mo...","English: Map of the city of Singapore, ca 1914...",,image/jpeg,700,602,False,True,True,"The Jackson Plan or Raffles Town Plan, an urba...",The plan is an idealised scheme of how Singapo...,"Map of Singapore in 1914, the layout is now mo...","The Jackson Plan or Raffles Town Plan, an urba...",-0.021940
540385,en,https://en.wikipedia.org/wiki/Candiacervus,https://upload.wikimedia.org/wikipedia/commons...,Candiacervus,Taxonomy,Candiacervus / Taxonomy,Hippopotamus creutzburgi and C. cretensis,English: My drawings of the two subspecies of ...,,image/jpeg,643,600,False,True,False,Candiacervus is an extinct genus of deer nativ...,The Cretan deer is a typical example of taxono...,Hippopotamus creutzburgi and C. cretensis Engl...,Candiacervus is an extinct genus of deer nativ...,-0.041651


In [20]:
csim_df = pd.DataFrame()
df = df.reset_index(drop=True)
csim_df = pd.concat([csim_df, df], ignore_index=True)
csim_df

Unnamed: 0,language,page_url,image_url,page_title,section_title,hierarchical_section_title,caption_reference_description,caption_attribution_description,caption_alt_text_description,mime_type,original_height,original_width,is_main_image,attribution_passes_lang_id,page_changed_recently,context_page_description,context_section_description,caption,context,aircraft_caption_score
0,en,https://en.wikipedia.org/wiki/LibreOffice,https://upload.wikimedia.org/wikipedia/commons...,LibreOffice,Included applications,LibreOffice / Features / Included applications,LibreOffice Math running on Ubuntu,English: Screenshots of LibreOffice Math 6.4 r...,,image/png,801,1400,False,True,True,LibreOffice is a free and open-source office s...,,LibreOffice Math running on Ubuntu English: Sc...,LibreOffice is a free and open-source office s...,-0.006119
1,en,https://en.wikipedia.org/wiki/Dalian,https://upload.wikimedia.org/wikipedia/commons...,Dalian,Research centres,Dalian / Education / Research centres,"Dalian Institute of Chemical Physics, of the C...",,,image/jpeg,2136,3216,False,False,True,Dalian is a major sub-provincial port city in ...,Dalian Institute of Chemical Physics of the Ch...,"Dalian Institute of Chemical Physics, of the C...",Dalian is a major sub-provincial port city in ...,0.077601
2,en,https://en.wikipedia.org/wiki/FMW_7th_Annivers...,https://upload.wikimedia.org/wikipedia/commons...,FMW 7th Anniversary Show,,FMW 7th Anniversary Show,Kawasaki Stadium,English: kawasaki_fujimi Stadium 日本語: 川崎富士見球技場...,,image/jpeg,3000,4000,True,False,False,FMW 7th Anniversary Show was a professional wr...,FMW 7th Anniversary Show was a professional wr...,Kawasaki Stadium English: kawasaki_fujimi Stad...,FMW 7th Anniversary Show was a professional wr...,0.181104
3,en,https://en.wikipedia.org/wiki/List_of_Bermuda_...,https://upload.wikimedia.org/wikipedia/commons...,List of Bermuda hurricanes,1960s,List of Bermuda hurricanes / List of storms / ...,"Hurricane Inga, one of the longest-lived Atlan...",Saffir-Simpson Hurricane ScaleTDTS12345 Englis...,Map showing the path and intensity of Hurrican...,image/png,1900,3000,False,True,True,The British Overseas Territory of Bermuda has ...,"October 7, 1961 – Category 3 Hurricane Frances...","Hurricane Inga, one of the longest-lived Atlan...",The British Overseas Territory of Bermuda has ...,0.103214
4,en,https://en.wikipedia.org/wiki/Parimelalhagar,https://upload.wikimedia.org/wikipedia/commons...,Parimelalhagar,Early life,Parimelalhagar / Early life,A page from the Parimelalhagar's commentary on...,English: A page from Arumuka Navalar's 1861 ed...,,image/png,4112,2338,False,True,True,"Parimelalhagar, also known as Vanthuvarai Peru...",Parimelalhagar was born in Kancheepuram in the...,A page from the Parimelalhagar's commentary on...,"Parimelalhagar, also known as Vanthuvarai Peru...",0.057020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540382,en,https://en.wikipedia.org/wiki/Standard_Electri...,https://upload.wikimedia.org/wikipedia/commons...,Standard Electric Time Company,,Standard Electric Time Company,A Standard 200177 fire alarm pull station,English: A Standard 200177 pull station in Har...,,image/jpeg,1704,2272,False,True,False,The Standard Electric Time Company was a Sprin...,The Standard Electric Time Company was a Sprin...,A Standard 200177 fire alarm pull station Engl...,The Standard Electric Time Company was a Sprin...,0.133040
540383,en,https://en.wikipedia.org/wiki/Malaysia_Airline...,https://upload.wikimedia.org/wikipedia/commons...,Malaysia Airlines Flight 370,Analysis,Malaysia Airlines Flight 370 / Investigation /...,A heat map indicating the probable location of...,English: Probability of the location where Mal...,,image/jpeg,1061,1500,False,True,True,Malaysia Airlines Flight 370 was a scheduled i...,Two parameters associated with these transmiss...,A heat map indicating the probable location of...,Malaysia Airlines Flight 370 was a scheduled i...,0.218703
540384,en,https://en.wikipedia.org/wiki/Jackson_Plan,https://upload.wikimedia.org/wikipedia/commons...,Jackson Plan,Overall layout,Jackson Plan / Layout and effect of the plan /...,"Map of Singapore in 1914, the layout is now mo...","English: Map of the city of Singapore, ca 1914...",,image/jpeg,700,602,False,True,True,"The Jackson Plan or Raffles Town Plan, an urba...",The plan is an idealised scheme of how Singapo...,"Map of Singapore in 1914, the layout is now mo...","The Jackson Plan or Raffles Town Plan, an urba...",-0.021940
540385,en,https://en.wikipedia.org/wiki/Candiacervus,https://upload.wikimedia.org/wikipedia/commons...,Candiacervus,Taxonomy,Candiacervus / Taxonomy,Hippopotamus creutzburgi and C. cretensis,English: My drawings of the two subspecies of ...,,image/jpeg,643,600,False,True,False,Candiacervus is an extinct genus of deer nativ...,The Cretan deer is a typical example of taxono...,Hippopotamus creutzburgi and C. cretensis Engl...,Candiacervus is an extinct genus of deer nativ...,-0.041651


In [21]:
csim_df = pd.DataFrame()
for idx, fn in tqdm(enumerate(f_names)):
#     if idx == 2:
#         break
    df = pd.read_csv(f'{root_path}/{fn}', sep='\t')
    df = df.drop('Unnamed: 0', 1)
    df = df[df["language"]=='en']
    df = df[col_list]
    df = df.reset_index(drop=True)
    csim_df = pd.concat([csim_df, df], ignore_index=True)
    

0it [00:00, ?it/s]

In [34]:
# csim_df.sort_values(by=['196_score'], ascending=False)
csim_df

Unnamed: 0,language,page_url,image_url,page_title,section_title,hierarchical_section_title,caption_reference_description,caption_attribution_description,caption_alt_text_description,mime_type,original_height,original_width,is_main_image,attribution_passes_lang_id,page_changed_recently,context_page_description,context_section_description,caption,context,aircraft_caption_score
0,en,https://en.wikipedia.org/wiki/LibreOffice,https://upload.wikimedia.org/wikipedia/commons...,LibreOffice,Included applications,LibreOffice / Features / Included applications,LibreOffice Math running on Ubuntu,English: Screenshots of LibreOffice Math 6.4 r...,,image/png,801,1400,False,True,True,LibreOffice is a free and open-source office s...,,LibreOffice Math running on Ubuntu English: Sc...,LibreOffice is a free and open-source office s...,-0.006119
1,en,https://en.wikipedia.org/wiki/Dalian,https://upload.wikimedia.org/wikipedia/commons...,Dalian,Research centres,Dalian / Education / Research centres,"Dalian Institute of Chemical Physics, of the C...",,,image/jpeg,2136,3216,False,False,True,Dalian is a major sub-provincial port city in ...,Dalian Institute of Chemical Physics of the Ch...,"Dalian Institute of Chemical Physics, of the C...",Dalian is a major sub-provincial port city in ...,0.077601
2,en,https://en.wikipedia.org/wiki/FMW_7th_Annivers...,https://upload.wikimedia.org/wikipedia/commons...,FMW 7th Anniversary Show,,FMW 7th Anniversary Show,Kawasaki Stadium,English: kawasaki_fujimi Stadium 日本語: 川崎富士見球技場...,,image/jpeg,3000,4000,True,False,False,FMW 7th Anniversary Show was a professional wr...,FMW 7th Anniversary Show was a professional wr...,Kawasaki Stadium English: kawasaki_fujimi Stad...,FMW 7th Anniversary Show was a professional wr...,0.181104
3,en,https://en.wikipedia.org/wiki/List_of_Bermuda_...,https://upload.wikimedia.org/wikipedia/commons...,List of Bermuda hurricanes,1960s,List of Bermuda hurricanes / List of storms / ...,"Hurricane Inga, one of the longest-lived Atlan...",Saffir-Simpson Hurricane ScaleTDTS12345 Englis...,Map showing the path and intensity of Hurrican...,image/png,1900,3000,False,True,True,The British Overseas Territory of Bermuda has ...,"October 7, 1961 – Category 3 Hurricane Frances...","Hurricane Inga, one of the longest-lived Atlan...",The British Overseas Territory of Bermuda has ...,0.103214
4,en,https://en.wikipedia.org/wiki/Parimelalhagar,https://upload.wikimedia.org/wikipedia/commons...,Parimelalhagar,Early life,Parimelalhagar / Early life,A page from the Parimelalhagar's commentary on...,English: A page from Arumuka Navalar's 1861 ed...,,image/png,4112,2338,False,True,True,"Parimelalhagar, also known as Vanthuvarai Peru...",Parimelalhagar was born in Kancheepuram in the...,A page from the Parimelalhagar's commentary on...,"Parimelalhagar, also known as Vanthuvarai Peru...",0.057020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5411973,en,https://en.wikipedia.org/wiki/List_of_vice_pre...,https://upload.wikimedia.org/wikipedia/commons...,List of vice presidents of the Examination Yuan,List,List of vice presidents of the Examination Yua...,,"English: Jia Jingde, politician of China. 中文: ...",,image/jpeg,1311,983,False,False,True,This is a list of Vice Presidents of the Exami...,Non-partisan Kuomintang (Nationalist) People ...,"English: Jia Jingde, politician of China. 中文:...",This is a list of Vice Presidents of the Exami...,0.074501
5411974,en,https://en.wikipedia.org/wiki/Brennen_Beyer,https://upload.wikimedia.org/wikipedia/commons...,Brennen Beyer,,Brennen Beyer,Beyer in 2013,DSC_9909.jpg,,image/jpeg,1561,1216,True,True,False,Brennen Beyer is an American football outside ...,"Brennen Beyer (born November 25, 1992) is an A...",Beyer in 2013 DSC_9909.jpg,Brennen Beyer is an American football outside ...,0.189365
5411975,en,https://en.wikipedia.org/wiki/Kannagi,https://upload.wikimedia.org/wikipedia/commons...,Kannagi,,Kannagi,Kannagi in Tamil Nadu.,"English: Idol of Kannaki Amman at Sanctum, Tha...",,image/jpeg,720,540,False,False,True,"Kannagi, sometimes spelled Kannaki, is a legen...","Kannagi, sometimes spelled Kannaki, is a legen...",Kannagi in Tamil Nadu. English: Idol of Kannak...,"Kannagi, sometimes spelled Kannaki, is a legen...",0.074853
5411976,en,https://en.wikipedia.org/wiki/Don_Det,https://upload.wikimedia.org/wikipedia/commons...,Don Det,Climate,Don Det / Climate,,English: Landscape with stormy clouds and a pi...,,image/jpeg,3720,6613,False,False,True,"Don Det, is an island in the Mekong River in t...",Don Det features a tropical wet and dry climat...,English: Landscape with stormy clouds and a p...,"Don Det, is an island in the Mekong River in t...",0.096638


In [35]:
csim_df.describe().round(3)

Unnamed: 0,original_height,original_width,aircraft_caption_score
count,5411978.0,5411978.0,5411978.0
mean,1525.582,1823.668,0.085
std,1191.184,1508.342,0.085
min,100.0,100.0,-0.256
25%,576.0,640.0,0.029
50%,1165.0,1299.0,0.079
75%,2281.0,2640.0,0.131
max,29820.0,44250.0,0.723


In [36]:
# df_p = csim_df[csim_df['aircraft_caption_score'] >= csim_df['aircraft_caption_score'].mean()]
df_p = csim_df[csim_df['aircraft_caption_score'] >= 0.13145]
df_p = df_p.reset_index(drop=True)
df_p

Unnamed: 0,language,page_url,image_url,page_title,section_title,hierarchical_section_title,caption_reference_description,caption_attribution_description,caption_alt_text_description,mime_type,original_height,original_width,is_main_image,attribution_passes_lang_id,page_changed_recently,context_page_description,context_section_description,caption,context,aircraft_caption_score
0,en,https://en.wikipedia.org/wiki/FMW_7th_Annivers...,https://upload.wikimedia.org/wikipedia/commons...,FMW 7th Anniversary Show,,FMW 7th Anniversary Show,Kawasaki Stadium,English: kawasaki_fujimi Stadium 日本語: 川崎富士見球技場...,,image/jpeg,3000,4000,True,False,False,FMW 7th Anniversary Show was a professional wr...,FMW 7th Anniversary Show was a professional wr...,Kawasaki Stadium English: kawasaki_fujimi Stad...,FMW 7th Anniversary Show was a professional wr...,0.181104
1,en,https://en.wikipedia.org/wiki/1994,https://upload.wikimedia.org/wikipedia/commons...,1994,June,1994 / Deaths / June,Menachem Mendel Schneerson,English: Menachem Mendel Schneerson - the Luba...,,image/jpeg,2105,1579,False,False,True,1994 was a common year starting on Saturday of...,"June 4\nRoberto Burle Marx, Brazilian landscap...",Menachem Mendel Schneerson English: Menachem M...,1994 was a common year starting on Saturday of...,0.158601
2,en,https://en.wikipedia.org/wiki/Duffy_Dyer,https://upload.wikimedia.org/wikipedia/commons...,Duffy Dyer,,Duffy Dyer,,English: Image cropped from a baseball card of...,,image/jpeg,904,736,True,True,True,"Donald Robert ""Duffy"" Dyer is an American form...","Donald Robert ""Duffy"" Dyer (born August 15, 19...",English: Image cropped from a baseball card o...,"Donald Robert ""Duffy"" Dyer is an American form...",0.146282
3,en,https://en.wikipedia.org/wiki/Anguera,http://upload.wikimedia.org/wikipedia/commons/...,Anguera,,Anguera,,Português: Anguera,Official seal of Anguera,image/jpeg,160,135,True,False,False,Anguera is a municipality in the Brazilian Sta...,Anguera is a municipality in the Brazilian Sta...,Português: Anguera,Anguera is a municipality in the Brazilian Sta...,0.160984
4,en,https://en.wikipedia.org/wiki/Comparison_of_HT...,https://upload.wikimedia.org/wikipedia/commons...,Comparison of HTC devices,S Series (Windows Mobile),Comparison of HTC devices / S Series (Windows ...,,HTC S710,,image/jpeg,1704,2272,False,True,True,HTC is the original design manufacturer for ma...,,HTC S710,HTC is the original design manufacturer for ma...,0.141916
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1352996,en,https://en.wikipedia.org/wiki/Urban_Transporta...,https://upload.wikimedia.org/wikipedia/commons...,Urban Transportation Development Corporation,UTDC products,Urban Transportation Development Corporation /...,Massachusetts Bay Transportation Authority 170...,English: UTDC/Bombardier 1700 series Red Line ...,,image/jpeg,1200,1600,False,True,True,The Urban Transportation Development Corporati...,,Massachusetts Bay Transportation Authority 170...,The Urban Transportation Development Corporati...,0.156601
1352997,en,https://en.wikipedia.org/wiki/Central_Institut...,https://upload.wikimedia.org/wikipedia/commons...,Central Institute of Plastics Engineering & Te...,,Central Institute of Plastics Engineering & Te...,,English: CIPET,,image/jpeg,301,530,True,True,False,Central Institute of Petrochemical Engineering...,Central Institute of Petrochemical Engineering...,English: CIPET,Central Institute of Petrochemical Engineering...,0.141036
1352998,en,https://en.wikipedia.org/wiki/SUN_workstation,https://upload.wikimedia.org/wikipedia/commons...,SUN workstation,History,SUN workstation / History,The three boards (plus memory extension) as la...,Sun 100Y Cardcage and Powersupply Photo taken ...,,image/jpeg,825,1027,False,True,True,The SUN workstation was a modular computer sys...,"In 1979 Xerox donated some Alto computers, dev...",The three boards (plus memory extension) as la...,The SUN workstation was a modular computer sys...,0.160523
1352999,en,https://en.wikipedia.org/wiki/Pyatigorsk,https://upload.wikimedia.org/wikipedia/commons...,Pyatigorsk,,Pyatigorsk,,Русский: снимок июля 2009 г.,,image/jpeg,2448,3264,True,False,True,Pyatigorsk is a city in Stavropol Krai located...,Pyatigorsk (Russian: Пятиго́рск; Circassian: П...,Русский: снимок июля 2009 г.,Pyatigorsk is a city in Stavropol Krai located...,0.161499


In [37]:
df_p.describe().round(3)

Unnamed: 0,original_height,original_width,aircraft_caption_score
count,1353001.0,1353001.0,1353001.0
mean,1432.105,1759.064,0.194
std,1156.547,1476.363,0.065
min,100.0,100.0,0.131
25%,536.0,640.0,0.149
50%,1024.0,1243.0,0.174
75%,2085.0,2580.0,0.215
max,21851.0,38148.0,0.723


# SBERT Semantic Search

In [24]:
from sentence_transformers import SentenceTransformer, util
embedder = SentenceTransformer('all-MiniLM-L6-v2')

In [25]:
def calc_all_sim(queries, corpus, doc_len):
    query_embeddings = embedder.encode(queries, convert_to_tensor=True)
    corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)
    
    corpus_embeddings = corpus_embeddings.to('cuda')
    corpus_embeddings = util.normalize_embeddings(corpus_embeddings)

    query_embeddings = query_embeddings.to('cuda')
    query_embeddings = util.normalize_embeddings(query_embeddings)
    
    hits = util.semantic_search(query_embeddings, corpus_embeddings, score_function=util.dot_score, top_k=doc_len)
    return hits

In [26]:
auto_model = BERTopic(nr_topics="auto")
auto_model = BERTopic.load('../aircraft_auto_model')

In [27]:
av_similar_topics, av_similarity =auto_model.find_topics("aircraft", top_n=50)
print(av_similar_topics)

[63, 3, 271, 145, 298, 11, 45, 201, 35, 108, 58, 144, 157, 186, 230, 159, 220, 77, 256, 104, 86, 2, 34, 247, 23, 80, 165, 32, 319, 126, 66, 225, 25, 8, 235, 295, 227, 44, 234, 50, 251, 31, 10, 258, 72, 93, 118, 19, 139, 237]


In [28]:
topics = get_topics(concept='aircraft')

In [29]:
el_list = [list(topics[el]) for el in topics]
q_id = [el for el in topics]
queries = [' '.join(el) for el in el_list]
queries

['fuselage wing aviation aerodynamic plane airplane aircraft fly',
 'flew aviation flight fighter squadron lindbergh aircraft pilot fly',
 'pilot aviation aerial flight icao licensing fly aerodrome aircraft airport airspace',
 'airport volgadnepr aerosvit superjet flight aeroflot aeroflots aircraft boeing airline',
 'stakeholder shareholders13 remuneration aviation openairplane corporate aircraft pilot airline',
 'lufthansa airport flight airbus southwest passenger aircraft boeing airline airliner',
 'fuselage armament wing navy bomber corsair fighter squadron aircraft',
 'aeronautics aerobraking glide aerodynamic spacecraft balloon rocket spaceflight aerobot aircraft fly',
 'beech 1900d gear wing beechcraft cabin engine cessna aircraft skyhawk',
 'fuselage monoplane wing cantilever sesquiplane design biplane aircraft strut',
 '747400 747 freighter 747400s 737 boeings 7478 aircraft boeing airline airliner',
 '1943 armament raf v72 bomber 1944 squadron vengeance aircraft',
 'aerial ball

In [30]:
for el in topics:
    print(el, topics[el])

63 {'fuselage', 'wing', 'aviation', 'aerodynamic', 'plane', 'airplane', 'aircraft', 'fly'}
3 {'flew', 'aviation', 'flight', 'fighter', 'squadron', 'lindbergh', 'aircraft', 'pilot', 'fly'}
271 {'pilot', 'aviation', 'aerial', 'flight', 'icao', 'licensing', 'fly', 'aerodrome', 'aircraft', 'airport', 'airspace'}
145 {'airport', 'volgadnepr', 'aerosvit', 'superjet', 'flight', 'aeroflot', 'aeroflots', 'aircraft', 'boeing', 'airline'}
298 {'stakeholder', 'shareholders13', 'remuneration', 'aviation', 'openairplane', 'corporate', 'aircraft', 'pilot', 'airline'}
11 {'lufthansa', 'airport', 'flight', 'airbus', 'southwest', 'passenger', 'aircraft', 'boeing', 'airline', 'airliner'}
45 {'fuselage', 'armament', 'wing', 'navy', 'bomber', 'corsair', 'fighter', 'squadron', 'aircraft'}
201 {'aeronautics', 'aerobraking', 'glide', 'aerodynamic', 'spacecraft', 'balloon', 'rocket', 'spaceflight', 'aerobot', 'aircraft', 'fly'}
35 {'beech', '1900d', 'gear', 'wing', 'beechcraft', 'cabin', 'engine', 'cessna', 'a

In [31]:
print(q_id)

[63, 3, 271, 145, 298, 11, 45, 201, 35, 108, 58, 144, 157, 186, 230, 159, 220, 77, 256, 104, 86, 2, 34, 247, 23, 80, 165, 32, 319, 126, 66, 225, 25, 8, 235, 295, 227, 44, 234, 50, 251, 31, 10, 258, 72, 93, 118, 19, 139, 237]


In [32]:
df = df_p

NameError: name 'df_p' is not defined

In [None]:
corpus = df['caption'].tolist()
doc_len = len(corpus)
doc_len

In [40]:
caption_hits = calc_all_sim(queries, corpus, doc_len)

In [41]:
context_hits = calc_all_sim(queries, df['context'].tolist(), doc_len)

In [None]:
def extend_df(df, hits, q_id, test='caption'):
    for idx, hit in enumerate(hits):
        t1 = sorted(hit, key=lambda x: x['corpus_id'])
        df_ = pd.DataFrame(t1) 
        df_ = df_.drop(['corpus_id'], axis = 1)
        df_.columns = [f'{q_id[idx]}_{test}_score']
        df = pd.concat([df, df_], axis=1)
    return df

In [43]:
df_cap = extend_df(df, caption_hits, q_id, test='caption')
df_cap

Unnamed: 0,language,page_url,image_url,page_title,section_title,hierarchical_section_title,caption_reference_description,caption_attribution_description,caption_alt_text_description,mime_type,...,251_caption_score,31_caption_score,10_caption_score,258_caption_score,72_caption_score,93_caption_score,118_caption_score,19_caption_score,139_caption_score,237_caption_score
0,en,https://en.wikipedia.org/wiki/FMW_7th_Annivers...,https://upload.wikimedia.org/wikipedia/commons...,FMW 7th Anniversary Show,,FMW 7th Anniversary Show,Kawasaki Stadium,English: kawasaki_fujimi Stadium 日本語: 川崎富士見球技場...,,image/jpeg,...,0.108077,0.092764,0.139887,0.140107,0.179678,0.077330,0.142969,0.124351,0.111633,-0.021907
1,en,https://en.wikipedia.org/wiki/1994,https://upload.wikimedia.org/wikipedia/commons...,1994,June,1994 / Deaths / June,Menachem Mendel Schneerson,English: Menachem Mendel Schneerson - the Luba...,,image/jpeg,...,0.056441,-0.026531,-0.005277,0.054293,0.215754,0.059717,0.034773,0.148453,0.015285,0.091922
2,en,https://en.wikipedia.org/wiki/Duffy_Dyer,https://upload.wikimedia.org/wikipedia/commons...,Duffy Dyer,,Duffy Dyer,,English: Image cropped from a baseball card of...,,image/jpeg,...,0.086839,0.094456,0.069409,0.132228,0.186534,0.035307,0.002116,0.143167,0.125465,0.068858
3,en,https://en.wikipedia.org/wiki/Anguera,http://upload.wikimedia.org/wikipedia/commons/...,Anguera,,Anguera,,Português: Anguera,Official seal of Anguera,image/jpeg,...,0.170173,0.107306,0.070983,-0.003365,-0.005805,-0.041658,0.000240,0.159975,0.128010,0.013330
4,en,https://en.wikipedia.org/wiki/Comparison_of_HT...,https://upload.wikimedia.org/wikipedia/commons...,Comparison of HTC devices,S Series (Windows Mobile),Comparison of HTC devices / S Series (Windows ...,,HTC S710,,image/jpeg,...,0.125734,0.021245,0.005219,0.128913,0.022409,-0.066821,0.029201,0.030505,0.109818,-0.046520
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1352996,en,https://en.wikipedia.org/wiki/Urban_Transporta...,https://upload.wikimedia.org/wikipedia/commons...,Urban Transportation Development Corporation,UTDC products,Urban Transportation Development Corporation /...,Massachusetts Bay Transportation Authority 170...,English: UTDC/Bombardier 1700 series Red Line ...,,image/jpeg,...,0.249414,0.029005,0.106842,0.189523,0.163123,0.075096,0.267071,0.022319,0.173130,0.166965
1352997,en,https://en.wikipedia.org/wiki/Central_Institut...,https://upload.wikimedia.org/wikipedia/commons...,Central Institute of Plastics Engineering & Te...,,Central Institute of Plastics Engineering & Te...,,English: CIPET,,image/jpeg,...,0.025620,-0.076409,-0.066370,-0.127198,-0.051657,-0.007898,0.023288,0.001848,0.042834,0.054752
1352998,en,https://en.wikipedia.org/wiki/SUN_workstation,https://upload.wikimedia.org/wikipedia/commons...,SUN workstation,History,SUN workstation / History,The three boards (plus memory extension) as la...,Sun 100Y Cardcage and Powersupply Photo taken ...,,image/jpeg,...,0.128697,0.110979,-0.026966,0.066499,0.210703,0.063461,0.011475,0.211570,0.031957,0.031842
1352999,en,https://en.wikipedia.org/wiki/Pyatigorsk,https://upload.wikimedia.org/wikipedia/commons...,Pyatigorsk,,Pyatigorsk,,Русский: снимок июля 2009 г.,,image/jpeg,...,-0.001803,-0.120245,0.030910,-0.027620,0.149079,-0.058150,-0.036154,-0.018769,-0.059776,-0.139725


In [44]:
df_cap = extend_df(df_cap, context_hits, q_id, test='context')
df_cap

Unnamed: 0,language,page_url,image_url,page_title,section_title,hierarchical_section_title,caption_reference_description,caption_attribution_description,caption_alt_text_description,mime_type,...,251_context_score,31_context_score,10_context_score,258_context_score,72_context_score,93_context_score,118_context_score,19_context_score,139_context_score,237_context_score
0,en,https://en.wikipedia.org/wiki/FMW_7th_Annivers...,https://upload.wikimedia.org/wikipedia/commons...,FMW 7th Anniversary Show,,FMW 7th Anniversary Show,Kawasaki Stadium,English: kawasaki_fujimi Stadium 日本語: 川崎富士見球技場...,,image/jpeg,...,0.031788,0.048239,0.098701,0.069938,0.160925,0.029526,0.020314,0.034604,0.078298,0.036085
1,en,https://en.wikipedia.org/wiki/1994,https://upload.wikimedia.org/wikipedia/commons...,1994,June,1994 / Deaths / June,Menachem Mendel Schneerson,English: Menachem Mendel Schneerson - the Luba...,,image/jpeg,...,0.064421,-0.034564,-0.044926,-0.050824,0.254387,0.040912,-0.003759,0.043950,-0.059116,0.055109
2,en,https://en.wikipedia.org/wiki/Duffy_Dyer,https://upload.wikimedia.org/wikipedia/commons...,Duffy Dyer,,Duffy Dyer,,English: Image cropped from a baseball card of...,,image/jpeg,...,-0.045711,0.015057,0.043360,0.081332,0.007223,-0.018978,-0.007571,0.166518,0.090759,0.093823
3,en,https://en.wikipedia.org/wiki/Anguera,http://upload.wikimedia.org/wikipedia/commons/...,Anguera,,Anguera,,Português: Anguera,Official seal of Anguera,image/jpeg,...,0.061626,0.117883,0.080256,-0.053732,-0.011020,-0.000014,-0.027682,0.068539,0.050138,-0.079525
4,en,https://en.wikipedia.org/wiki/Comparison_of_HT...,https://upload.wikimedia.org/wikipedia/commons...,Comparison of HTC devices,S Series (Windows Mobile),Comparison of HTC devices / S Series (Windows ...,,HTC S710,,image/jpeg,...,0.049964,0.030099,-0.111654,0.093280,-0.079855,-0.049397,0.020085,0.046659,-0.038166,-0.073559
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1352996,en,https://en.wikipedia.org/wiki/Urban_Transporta...,https://upload.wikimedia.org/wikipedia/commons...,Urban Transportation Development Corporation,UTDC products,Urban Transportation Development Corporation /...,Massachusetts Bay Transportation Authority 170...,English: UTDC/Bombardier 1700 series Red Line ...,,image/jpeg,...,0.201863,-0.010046,0.135240,0.089641,0.061012,0.080667,0.138056,-0.041145,-0.009785,0.058994
1352997,en,https://en.wikipedia.org/wiki/Central_Institut...,https://upload.wikimedia.org/wikipedia/commons...,Central Institute of Plastics Engineering & Te...,,Central Institute of Plastics Engineering & Te...,,English: CIPET,,image/jpeg,...,0.102313,0.134876,0.010525,-0.166722,-0.102252,0.125517,0.003007,-0.055076,0.041394,-0.050013
1352998,en,https://en.wikipedia.org/wiki/SUN_workstation,https://upload.wikimedia.org/wikipedia/commons...,SUN workstation,History,SUN workstation / History,The three boards (plus memory extension) as la...,Sun 100Y Cardcage and Powersupply Photo taken ...,,image/jpeg,...,0.094403,0.062767,0.055319,0.125043,0.097522,0.195547,-0.010939,0.046301,0.098331,0.037743
1352999,en,https://en.wikipedia.org/wiki/Pyatigorsk,https://upload.wikimedia.org/wikipedia/commons...,Pyatigorsk,,Pyatigorsk,,Русский: снимок июля 2009 г.,,image/jpeg,...,-0.030889,0.012876,0.272131,-0.055729,0.012502,-0.004225,0.004322,0.044155,0.002797,-0.078014


# Read data here

In [23]:
df = pd.read_csv(f'{root_path}/aircraft_q75_en_sbert.tsv', sep="\t")
df = df.drop('Unnamed: 0', 1)
df

Unnamed: 0,language,page_url,image_url,page_title,section_title,hierarchical_section_title,caption_reference_description,caption_attribution_description,caption_alt_text_description,mime_type,...,251_context_score,31_context_score,10_context_score,258_context_score,72_context_score,93_context_score,118_context_score,19_context_score,139_context_score,237_context_score
0,en,https://en.wikipedia.org/wiki/FMW_7th_Annivers...,https://upload.wikimedia.org/wikipedia/commons...,FMW 7th Anniversary Show,,FMW 7th Anniversary Show,Kawasaki Stadium,English: kawasaki_fujimi Stadium 日本語: 川崎富士見球技場...,,image/jpeg,...,0.020924,0.033392,0.091493,0.065447,0.147460,0.013493,0.008091,0.039506,0.079395,0.018479
1,en,https://en.wikipedia.org/wiki/1994,https://upload.wikimedia.org/wikipedia/commons...,1994,June,1994 / Deaths / June,Menachem Mendel Schneerson,English: Menachem Mendel Schneerson - the Luba...,,image/jpeg,...,0.044889,-0.039786,-0.068970,-0.041991,0.244107,0.039615,0.005427,0.041240,-0.072191,0.058443
2,en,https://en.wikipedia.org/wiki/Duffy_Dyer,https://upload.wikimedia.org/wikipedia/commons...,Duffy Dyer,,Duffy Dyer,,English: Image cropped from a baseball card of...,,image/jpeg,...,-0.056391,0.009440,0.066075,0.074756,0.015237,-0.005682,-0.019572,0.151901,0.103661,0.111322
3,en,https://en.wikipedia.org/wiki/Anguera,http://upload.wikimedia.org/wikipedia/commons/...,Anguera,,Anguera,,Português: Anguera,Official seal of Anguera,image/jpeg,...,0.043653,0.124242,0.076659,-0.061763,-0.013700,-0.013047,-0.066803,0.054065,0.030850,-0.076256
4,en,https://en.wikipedia.org/wiki/Comparison_of_HT...,https://upload.wikimedia.org/wikipedia/commons...,Comparison of HTC devices,S Series (Windows Mobile),Comparison of HTC devices / S Series (Windows ...,,HTC S710,,image/jpeg,...,0.019863,0.030068,-0.109320,0.052087,-0.068640,-0.038589,-0.016587,0.056286,-0.053078,-0.071031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1352996,en,https://en.wikipedia.org/wiki/Urban_Transporta...,https://upload.wikimedia.org/wikipedia/commons...,Urban Transportation Development Corporation,UTDC products,Urban Transportation Development Corporation /...,Massachusetts Bay Transportation Authority 170...,English: UTDC/Bombardier 1700 series Red Line ...,,image/jpeg,...,0.185282,-0.023973,0.133213,0.088106,0.064406,0.066175,0.129016,-0.035138,-0.027193,0.038367
1352997,en,https://en.wikipedia.org/wiki/Central_Institut...,https://upload.wikimedia.org/wikipedia/commons...,Central Institute of Plastics Engineering & Te...,,Central Institute of Plastics Engineering & Te...,,English: CIPET,,image/jpeg,...,0.091530,0.142491,-0.009801,-0.159589,-0.089324,0.106132,-0.015100,-0.045212,0.013892,-0.063472
1352998,en,https://en.wikipedia.org/wiki/SUN_workstation,https://upload.wikimedia.org/wikipedia/commons...,SUN workstation,History,SUN workstation / History,The three boards (plus memory extension) as la...,Sun 100Y Cardcage and Powersupply Photo taken ...,,image/jpeg,...,0.081194,0.053022,0.059844,0.125492,0.102487,0.197474,0.002423,0.060117,0.070297,0.034687
1352999,en,https://en.wikipedia.org/wiki/Pyatigorsk,https://upload.wikimedia.org/wikipedia/commons...,Pyatigorsk,,Pyatigorsk,,Русский: снимок июля 2009 г.,,image/jpeg,...,-0.053566,0.017492,0.253884,-0.063466,0.014604,-0.015967,-0.007643,0.060969,-0.028550,-0.083007


In [24]:
df_cap = df

In [25]:
df_cap

Unnamed: 0,language,page_url,image_url,page_title,section_title,hierarchical_section_title,caption_reference_description,caption_attribution_description,caption_alt_text_description,mime_type,...,251_context_score,31_context_score,10_context_score,258_context_score,72_context_score,93_context_score,118_context_score,19_context_score,139_context_score,237_context_score
0,en,https://en.wikipedia.org/wiki/FMW_7th_Annivers...,https://upload.wikimedia.org/wikipedia/commons...,FMW 7th Anniversary Show,,FMW 7th Anniversary Show,Kawasaki Stadium,English: kawasaki_fujimi Stadium 日本語: 川崎富士見球技場...,,image/jpeg,...,0.020924,0.033392,0.091493,0.065447,0.147460,0.013493,0.008091,0.039506,0.079395,0.018479
1,en,https://en.wikipedia.org/wiki/1994,https://upload.wikimedia.org/wikipedia/commons...,1994,June,1994 / Deaths / June,Menachem Mendel Schneerson,English: Menachem Mendel Schneerson - the Luba...,,image/jpeg,...,0.044889,-0.039786,-0.068970,-0.041991,0.244107,0.039615,0.005427,0.041240,-0.072191,0.058443
2,en,https://en.wikipedia.org/wiki/Duffy_Dyer,https://upload.wikimedia.org/wikipedia/commons...,Duffy Dyer,,Duffy Dyer,,English: Image cropped from a baseball card of...,,image/jpeg,...,-0.056391,0.009440,0.066075,0.074756,0.015237,-0.005682,-0.019572,0.151901,0.103661,0.111322
3,en,https://en.wikipedia.org/wiki/Anguera,http://upload.wikimedia.org/wikipedia/commons/...,Anguera,,Anguera,,Português: Anguera,Official seal of Anguera,image/jpeg,...,0.043653,0.124242,0.076659,-0.061763,-0.013700,-0.013047,-0.066803,0.054065,0.030850,-0.076256
4,en,https://en.wikipedia.org/wiki/Comparison_of_HT...,https://upload.wikimedia.org/wikipedia/commons...,Comparison of HTC devices,S Series (Windows Mobile),Comparison of HTC devices / S Series (Windows ...,,HTC S710,,image/jpeg,...,0.019863,0.030068,-0.109320,0.052087,-0.068640,-0.038589,-0.016587,0.056286,-0.053078,-0.071031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1352996,en,https://en.wikipedia.org/wiki/Urban_Transporta...,https://upload.wikimedia.org/wikipedia/commons...,Urban Transportation Development Corporation,UTDC products,Urban Transportation Development Corporation /...,Massachusetts Bay Transportation Authority 170...,English: UTDC/Bombardier 1700 series Red Line ...,,image/jpeg,...,0.185282,-0.023973,0.133213,0.088106,0.064406,0.066175,0.129016,-0.035138,-0.027193,0.038367
1352997,en,https://en.wikipedia.org/wiki/Central_Institut...,https://upload.wikimedia.org/wikipedia/commons...,Central Institute of Plastics Engineering & Te...,,Central Institute of Plastics Engineering & Te...,,English: CIPET,,image/jpeg,...,0.091530,0.142491,-0.009801,-0.159589,-0.089324,0.106132,-0.015100,-0.045212,0.013892,-0.063472
1352998,en,https://en.wikipedia.org/wiki/SUN_workstation,https://upload.wikimedia.org/wikipedia/commons...,SUN workstation,History,SUN workstation / History,The three boards (plus memory extension) as la...,Sun 100Y Cardcage and Powersupply Photo taken ...,,image/jpeg,...,0.081194,0.053022,0.059844,0.125492,0.102487,0.197474,0.002423,0.060117,0.070297,0.034687
1352999,en,https://en.wikipedia.org/wiki/Pyatigorsk,https://upload.wikimedia.org/wikipedia/commons...,Pyatigorsk,,Pyatigorsk,,Русский: снимок июля 2009 г.,,image/jpeg,...,-0.053566,0.017492,0.253884,-0.063466,0.014604,-0.015967,-0.007643,0.060969,-0.028550,-0.083007


In [26]:
# df.to_csv(f'{root_path}/aircraft_q75_en_sbert.tsv', sep="\t")

In [27]:
caption_score_list = [el for el in df.columns if 'caption_score' in el]
caption_score_list = caption_score_list[1:]
print(caption_score_list)
print(len(caption_score_list))

['63_caption_score', '3_caption_score', '271_caption_score', '145_caption_score', '298_caption_score', '11_caption_score', '45_caption_score', '201_caption_score', '35_caption_score', '108_caption_score', '58_caption_score', '144_caption_score', '157_caption_score', '186_caption_score', '230_caption_score', '159_caption_score', '220_caption_score', '77_caption_score', '256_caption_score', '104_caption_score', '86_caption_score', '2_caption_score', '34_caption_score', '247_caption_score', '23_caption_score', '80_caption_score', '165_caption_score', '32_caption_score', '319_caption_score', '126_caption_score', '66_caption_score', '225_caption_score', '25_caption_score', '8_caption_score', '235_caption_score', '295_caption_score', '227_caption_score', '44_caption_score', '234_caption_score', '50_caption_score', '251_caption_score', '31_caption_score', '10_caption_score', '258_caption_score', '72_caption_score', '93_caption_score', '118_caption_score', '19_caption_score', '139_caption_scor

In [28]:
context_score_list = [el for el in df.columns if 'context_score' in el]
# context_score_list = context_score_list[:-1]
print(context_score_list)
print(len(context_score_list))

['63_context_score', '3_context_score', '271_context_score', '145_context_score', '298_context_score', '11_context_score', '45_context_score', '201_context_score', '35_context_score', '108_context_score', '58_context_score', '144_context_score', '157_context_score', '186_context_score', '230_context_score', '159_context_score', '220_context_score', '77_context_score', '256_context_score', '104_context_score', '86_context_score', '2_context_score', '34_context_score', '247_context_score', '23_context_score', '80_context_score', '165_context_score', '32_context_score', '319_context_score', '126_context_score', '66_context_score', '225_context_score', '25_context_score', '8_context_score', '235_context_score', '295_context_score', '227_context_score', '44_context_score', '234_context_score', '50_context_score', '251_context_score', '31_context_score', '10_context_score', '258_context_score', '72_context_score', '93_context_score', '118_context_score', '19_context_score', '139_context_scor

In [29]:
other_list = [el for el in df.columns if el not in context_score_list and el not in caption_score_list]
other_list

['language',
 'page_url',
 'image_url',
 'page_title',
 'section_title',
 'hierarchical_section_title',
 'caption_reference_description',
 'caption_attribution_description',
 'caption_alt_text_description',
 'mime_type',
 'original_height',
 'original_width',
 'is_main_image',
 'attribution_passes_lang_id',
 'page_changed_recently',
 'context_page_description',
 'context_section_description',
 'caption',
 'context',
 'aircraft_caption_score']

In [30]:
im_c = ['image_url', 'caption', 'context']
cap_s = ['aircraft_caption_score']
col_list = im_c + cap_s + caption_score_list + context_score_list

### select above avrage images !

In [31]:
df_dict = dict()
for col_id in q_id:
#     num, _ = col.split('_', 1)
    col = f'{col_id}_caption_score'
    c_list = [col] + [f'{col_id}_context_score'] + im_c + cap_s
    df_n = df_cap[c_list]
    df_n = df_n[df_n[col] >= 0.40]
    df_n = df_n[df_n['aircraft_caption_score'] >= 0.48]
#     df = df[df[col] > 0.55]
    df_n = df_n.sort_values(by=[col], ascending=False)
    df_dict[col] = df_n

In [32]:
df_dict[col]

Unnamed: 0,237_caption_score,237_context_score,image_url,caption,context,aircraft_caption_score
882473,0.473679,0.276954,https://upload.wikimedia.org/wikipedia/commons...,The pilot's seat and controls,Gliders are aircrafts which do not have a moto...,0.563156
214365,0.470062,0.529734,https://upload.wikimedia.org/wikipedia/commons...,A pilot preparing to board a vessel by helicop...,"A maritime pilot, marine pilot, harbor pilot, ...",0.509813
631817,0.451976,0.030265,https://upload.wikimedia.org/wikipedia/commons...,A powered paraglider pilot preparing his equi...,Kites are tethered flying objects which fly by...,0.487279
290168,0.449389,0.356556,http://upload.wikimedia.org/wikipedia/commons/...,United States Air Force Aero Commander U-4B.,Aero Commander was an aircraft manufacturer fo...,0.526371
855302,0.444628,0.365209,https://upload.wikimedia.org/wikipedia/commons...,"English: The new ""pilot wings"" for operators ...",Badges of the United States Air Force are spec...,0.685061
875016,0.444628,0.298189,https://upload.wikimedia.org/wikipedia/commons...,"English: The new ""pilot wings"" for operators ...",U.S. Air Force aeronautical ratings are milita...,0.685061
898472,0.444628,0.423746,https://upload.wikimedia.org/wikipedia/commons...,"English: The new ""pilot wings"" for operators ...",The Air Force Specialty Code is an alphanumeri...,0.685061
251015,0.436343,0.42826,https://upload.wikimedia.org/wikipedia/commons...,65th Airlift Squadron Special Air Mission airc...,The 15th Operations Group is the flying compon...,0.555047
530388,0.436139,0.364064,https://upload.wikimedia.org/wikipedia/commons...,English: An airliner pilot.,Captain Beverly Lynn Burns is the first woman ...,0.594817
575245,0.432836,0.103714,https://upload.wikimedia.org/wikipedia/commons...,Aircraft and personnel required for a WWII Air...,RAF Harrowbeer is former Royal Air Force airfi...,0.486628


In [33]:
from IPython.display import display

In [34]:
for el in df_dict:
    t,_ = el.split('_', 1)
#     print(t)
    df_dict[el].columns = [f'{t}_caption_sim', f'{t}_context_sim', 'image_url',
                          'caption', 'context', f'aircraft_caption_sim']
    display(df_dict[el].describe().round(2))
    print('\n')

Unnamed: 0,63_caption_sim,63_context_sim,aircraft_caption_sim
count,825.0,825.0,825.0
mean,0.44,0.27,0.54
std,0.04,0.13,0.05
min,0.4,-0.13,0.48
25%,0.41,0.2,0.5
50%,0.43,0.28,0.53
75%,0.46,0.35,0.57
max,0.67,0.63,0.72






Unnamed: 0,3_caption_sim,3_context_sim,aircraft_caption_sim
count,1654.0,1654.0,1654.0
mean,0.45,0.32,0.52
std,0.04,0.11,0.04
min,0.4,-0.14,0.48
25%,0.42,0.27,0.49
50%,0.44,0.34,0.51
75%,0.47,0.39,0.54
max,0.67,0.58,0.72






Unnamed: 0,271_caption_sim,271_context_sim,aircraft_caption_sim
count,187.0,187.0,187.0
mean,0.42,0.3,0.53
std,0.02,0.11,0.04
min,0.4,-0.09,0.48
25%,0.41,0.24,0.49
50%,0.42,0.32,0.51
75%,0.43,0.38,0.54
max,0.49,0.5,0.69






Unnamed: 0,145_caption_sim,145_context_sim,aircraft_caption_sim
count,1173.0,1173.0,1173.0
mean,0.44,0.28,0.53
std,0.03,0.1,0.04
min,0.4,-0.16,0.48
25%,0.41,0.23,0.5
50%,0.43,0.3,0.52
75%,0.45,0.35,0.55
max,0.7,0.64,0.71






Unnamed: 0,298_caption_sim,298_context_sim,aircraft_caption_sim
count,26.0,26.0,26.0
mean,0.42,0.31,0.54
std,0.01,0.08,0.04
min,0.4,0.16,0.48
25%,0.41,0.27,0.5
50%,0.42,0.3,0.54
75%,0.43,0.33,0.56
max,0.46,0.54,0.62






Unnamed: 0,11_caption_sim,11_context_sim,aircraft_caption_sim
count,490.0,490.0,490.0
mean,0.45,0.3,0.52
std,0.04,0.1,0.04
min,0.4,-0.12,0.48
25%,0.41,0.25,0.49
50%,0.44,0.32,0.51
75%,0.47,0.36,0.54
max,0.61,0.53,0.72






Unnamed: 0,45_caption_sim,45_context_sim,aircraft_caption_sim
count,1296.0,1296.0,1296.0
mean,0.45,0.3,0.52
std,0.04,0.11,0.04
min,0.4,-0.15,0.48
25%,0.42,0.23,0.49
50%,0.44,0.32,0.51
75%,0.47,0.38,0.54
max,0.62,0.54,0.72






Unnamed: 0,201_caption_sim,201_context_sim,aircraft_caption_sim
count,311.0,311.0,311.0
mean,0.43,0.28,0.54
std,0.03,0.11,0.04
min,0.4,-0.06,0.48
25%,0.41,0.21,0.5
50%,0.42,0.29,0.53
75%,0.45,0.35,0.56
max,0.55,0.56,0.69






Unnamed: 0,35_caption_sim,35_context_sim,aircraft_caption_sim
count,599.0,599.0,599.0
mean,0.44,0.27,0.53
std,0.04,0.11,0.04
min,0.4,-0.08,0.48
25%,0.41,0.19,0.5
50%,0.43,0.28,0.52
75%,0.46,0.36,0.55
max,0.68,0.59,0.72






Unnamed: 0,108_caption_sim,108_context_sim,aircraft_caption_sim
count,243.0,243.0,243.0
mean,0.43,0.27,0.53
std,0.04,0.11,0.04
min,0.4,-0.09,0.48
25%,0.41,0.2,0.5
50%,0.42,0.28,0.53
75%,0.45,0.35,0.56
max,0.63,0.49,0.68






Unnamed: 0,58_caption_sim,58_context_sim,aircraft_caption_sim
count,276.0,276.0,276.0
mean,0.45,0.26,0.53
std,0.04,0.11,0.05
min,0.4,-0.04,0.48
25%,0.42,0.19,0.5
50%,0.44,0.25,0.52
75%,0.46,0.33,0.55
max,0.63,0.61,0.72






Unnamed: 0,144_caption_sim,144_context_sim,aircraft_caption_sim
count,1452.0,1452.0,1452.0
mean,0.46,0.32,0.52
std,0.05,0.11,0.04
min,0.4,-0.1,0.48
25%,0.42,0.26,0.49
50%,0.45,0.33,0.51
75%,0.49,0.4,0.53
max,0.68,0.64,0.71






Unnamed: 0,157_caption_sim,157_context_sim,aircraft_caption_sim
count,1101.0,1101.0,1101.0
mean,0.44,0.27,0.53
std,0.04,0.1,0.04
min,0.4,-0.09,0.48
25%,0.42,0.22,0.5
50%,0.43,0.28,0.52
75%,0.46,0.34,0.55
max,0.6,0.59,0.71






Unnamed: 0,186_caption_sim,186_context_sim,aircraft_caption_sim
count,1029.0,1029.0,1029.0
mean,0.44,0.27,0.53
std,0.03,0.11,0.04
min,0.4,-0.15,0.48
25%,0.41,0.2,0.5
50%,0.43,0.27,0.52
75%,0.45,0.34,0.55
max,0.62,0.53,0.72






Unnamed: 0,230_caption_sim,230_context_sim,aircraft_caption_sim
count,193.0,193.0,193.0
mean,0.43,0.27,0.53
std,0.03,0.1,0.04
min,0.4,-0.11,0.48
25%,0.41,0.21,0.5
50%,0.42,0.27,0.52
75%,0.44,0.33,0.55
max,0.57,0.58,0.66






Unnamed: 0,159_caption_sim,159_context_sim,aircraft_caption_sim
count,1189.0,1189.0,1189.0
mean,0.44,0.3,0.52
std,0.04,0.1,0.04
min,0.4,-0.09,0.48
25%,0.42,0.25,0.49
50%,0.44,0.32,0.51
75%,0.46,0.38,0.54
max,0.66,0.58,0.72






Unnamed: 0,220_caption_sim,220_context_sim,aircraft_caption_sim
count,673.0,673.0,673.0
mean,0.44,0.3,0.53
std,0.04,0.11,0.04
min,0.4,-0.15,0.48
25%,0.41,0.25,0.5
50%,0.43,0.32,0.52
75%,0.46,0.38,0.55
max,0.59,0.64,0.72






Unnamed: 0,77_caption_sim,77_context_sim,aircraft_caption_sim
count,294.0,294.0,294.0
mean,0.44,0.31,0.52
std,0.05,0.11,0.04
min,0.4,-0.01,0.48
25%,0.41,0.25,0.49
50%,0.43,0.31,0.51
75%,0.46,0.37,0.54
max,0.67,0.64,0.7






Unnamed: 0,256_caption_sim,256_context_sim,aircraft_caption_sim
count,244.0,244.0,244.0
mean,0.43,0.28,0.52
std,0.03,0.08,0.03
min,0.4,-0.02,0.48
25%,0.41,0.24,0.49
50%,0.43,0.29,0.51
75%,0.45,0.35,0.53
max,0.53,0.46,0.68






Unnamed: 0,104_caption_sim,104_context_sim,aircraft_caption_sim
count,600.0,600.0,600.0
mean,0.44,0.28,0.52
std,0.03,0.1,0.04
min,0.4,-0.04,0.48
25%,0.41,0.22,0.49
50%,0.43,0.3,0.51
75%,0.46,0.36,0.54
max,0.6,0.51,0.72






Unnamed: 0,86_caption_sim,86_context_sim,aircraft_caption_sim
count,258.0,258.0,258.0
mean,0.45,0.29,0.52
std,0.05,0.11,0.04
min,0.4,-0.11,0.48
25%,0.42,0.24,0.5
50%,0.44,0.28,0.51
75%,0.47,0.34,0.54
max,0.72,0.69,0.72






Unnamed: 0,2_caption_sim,2_context_sim,aircraft_caption_sim
count,124.0,124.0,124.0
mean,0.44,0.37,0.52
std,0.04,0.11,0.04
min,0.4,0.04,0.48
25%,0.41,0.31,0.5
50%,0.43,0.39,0.51
75%,0.46,0.45,0.55
max,0.63,0.53,0.64






Unnamed: 0,34_caption_sim,34_context_sim,aircraft_caption_sim
count,127.0,127.0,127.0
mean,0.43,0.32,0.52
std,0.02,0.09,0.04
min,0.4,0.03,0.48
25%,0.41,0.28,0.49
50%,0.42,0.34,0.51
75%,0.44,0.39,0.54
max,0.48,0.53,0.72






Unnamed: 0,247_caption_sim,247_context_sim,aircraft_caption_sim
count,142.0,142.0,142.0
mean,0.44,0.26,0.53
std,0.04,0.11,0.06
min,0.4,-0.0,0.48
25%,0.41,0.17,0.49
50%,0.42,0.26,0.51
75%,0.45,0.33,0.55
max,0.57,0.45,0.71






Unnamed: 0,23_caption_sim,23_context_sim,aircraft_caption_sim
count,216.0,216.0,216.0
mean,0.43,0.25,0.56
std,0.02,0.11,0.06
min,0.4,-0.09,0.48
25%,0.41,0.19,0.51
50%,0.42,0.26,0.55
75%,0.43,0.31,0.59
max,0.51,0.59,0.72






Unnamed: 0,80_caption_sim,80_context_sim,aircraft_caption_sim
count,67.0,67.0,67.0
mean,0.43,0.24,0.54
std,0.03,0.1,0.04
min,0.4,-0.0,0.48
25%,0.41,0.17,0.51
50%,0.42,0.27,0.53
75%,0.45,0.32,0.56
max,0.53,0.42,0.7






Unnamed: 0,165_caption_sim,165_context_sim,aircraft_caption_sim
count,894.0,894.0,894.0
mean,0.44,0.31,0.53
std,0.03,0.12,0.04
min,0.4,-0.08,0.48
25%,0.41,0.23,0.5
50%,0.43,0.32,0.52
75%,0.46,0.4,0.55
max,0.61,0.59,0.72






Unnamed: 0,32_caption_sim,32_context_sim,aircraft_caption_sim
count,580.0,580.0,580.0
mean,0.45,0.28,0.54
std,0.05,0.13,0.05
min,0.4,-0.14,0.48
25%,0.41,0.2,0.5
50%,0.44,0.28,0.52
75%,0.47,0.36,0.56
max,0.7,0.72,0.72






Unnamed: 0,319_caption_sim,319_context_sim,aircraft_caption_sim
count,133.0,133.0,133.0
mean,0.42,0.27,0.54
std,0.02,0.09,0.04
min,0.4,-0.0,0.48
25%,0.41,0.22,0.51
50%,0.42,0.29,0.53
75%,0.44,0.34,0.56
max,0.51,0.44,0.68






Unnamed: 0,126_caption_sim,126_context_sim,aircraft_caption_sim
count,8.0,8.0,8.0
mean,0.43,0.19,0.61
std,0.02,0.07,0.08
min,0.4,0.1,0.48
25%,0.42,0.16,0.54
50%,0.43,0.18,0.62
75%,0.43,0.21,0.69
max,0.47,0.31,0.69






Unnamed: 0,66_caption_sim,66_context_sim,aircraft_caption_sim
count,16.0,16.0,16.0
mean,0.42,0.26,0.53
std,0.02,0.12,0.04
min,0.4,0.02,0.49
25%,0.41,0.23,0.5
50%,0.42,0.3,0.52
75%,0.44,0.34,0.53
max,0.47,0.48,0.68






Unnamed: 0,225_caption_sim,225_context_sim,aircraft_caption_sim
count,65.0,65.0,65.0
mean,0.43,0.28,0.52
std,0.02,0.09,0.03
min,0.4,0.05,0.48
25%,0.41,0.21,0.5
50%,0.42,0.29,0.51
75%,0.44,0.34,0.54
max,0.52,0.46,0.6






Unnamed: 0,25_caption_sim,25_context_sim,aircraft_caption_sim
count,906.0,906.0,906.0
mean,0.46,0.32,0.52
std,0.05,0.12,0.04
min,0.4,-0.18,0.48
25%,0.42,0.25,0.49
50%,0.44,0.33,0.51
75%,0.48,0.4,0.54
max,0.68,0.6,0.71






Unnamed: 0,8_caption_sim,8_context_sim,aircraft_caption_sim
count,1411.0,1411.0,1411.0
mean,0.45,0.34,0.52
std,0.05,0.12,0.04
min,0.4,-0.08,0.48
25%,0.42,0.27,0.49
50%,0.44,0.35,0.51
75%,0.47,0.42,0.54
max,0.74,0.65,0.7






Unnamed: 0,235_caption_sim,235_context_sim,aircraft_caption_sim
count,467.0,467.0,467.0
mean,0.46,0.26,0.53
std,0.05,0.1,0.04
min,0.4,-0.12,0.48
25%,0.42,0.2,0.5
50%,0.44,0.27,0.52
75%,0.49,0.33,0.55
max,0.69,0.61,0.72






Unnamed: 0,295_caption_sim,295_context_sim,aircraft_caption_sim
count,482.0,482.0,482.0
mean,0.45,0.32,0.53
std,0.04,0.11,0.05
min,0.4,-0.13,0.48
25%,0.42,0.27,0.5
50%,0.44,0.33,0.51
75%,0.48,0.39,0.54
max,0.7,0.66,0.72






Unnamed: 0,227_caption_sim,227_context_sim,aircraft_caption_sim
count,344.0,344.0,344.0
mean,0.44,0.29,0.52
std,0.04,0.09,0.03
min,0.4,-0.02,0.48
25%,0.41,0.24,0.49
50%,0.43,0.3,0.51
75%,0.46,0.34,0.54
max,0.61,0.51,0.72






Unnamed: 0,44_caption_sim,44_context_sim,aircraft_caption_sim
count,788.0,788.0,788.0
mean,0.44,0.29,0.52
std,0.03,0.12,0.04
min,0.4,-0.06,0.48
25%,0.41,0.21,0.5
50%,0.43,0.3,0.51
75%,0.46,0.38,0.54
max,0.62,0.56,0.71






Unnamed: 0,234_caption_sim,234_context_sim,aircraft_caption_sim
count,2488.0,2488.0,2488.0
mean,0.47,0.32,0.52
std,0.05,0.12,0.04
min,0.4,-0.2,0.48
25%,0.43,0.26,0.49
50%,0.46,0.34,0.51
75%,0.5,0.4,0.54
max,0.8,0.66,0.72






Unnamed: 0,50_caption_sim,50_context_sim,aircraft_caption_sim
count,175.0,175.0,175.0
mean,0.44,0.26,0.52
std,0.05,0.11,0.04
min,0.4,-0.08,0.48
25%,0.41,0.18,0.49
50%,0.42,0.25,0.51
75%,0.44,0.33,0.54
max,0.67,0.62,0.72






Unnamed: 0,251_caption_sim,251_context_sim,aircraft_caption_sim
count,936.0,936.0,936.0
mean,0.44,0.26,0.52
std,0.03,0.09,0.04
min,0.4,-0.13,0.48
25%,0.41,0.22,0.49
50%,0.43,0.27,0.52
75%,0.45,0.32,0.54
max,0.58,0.49,0.72






Unnamed: 0,31_caption_sim,31_context_sim,aircraft_caption_sim
count,54.0,54.0,54.0
mean,0.45,0.32,0.53
std,0.05,0.14,0.05
min,0.4,0.08,0.48
25%,0.41,0.22,0.5
50%,0.43,0.31,0.52
75%,0.49,0.4,0.55
max,0.57,0.59,0.69






Unnamed: 0,10_caption_sim,10_context_sim,aircraft_caption_sim
count,352.0,352.0,352.0
mean,0.44,0.3,0.52
std,0.03,0.12,0.04
min,0.4,-0.05,0.48
25%,0.41,0.23,0.49
50%,0.43,0.3,0.51
75%,0.46,0.37,0.54
max,0.56,0.56,0.72






Unnamed: 0,258_caption_sim,258_context_sim,aircraft_caption_sim
count,402.0,402.0,402.0
mean,0.43,0.26,0.53
std,0.03,0.09,0.05
min,0.4,-0.07,0.48
25%,0.41,0.21,0.5
50%,0.43,0.26,0.52
75%,0.44,0.32,0.55
max,0.56,0.54,0.71






Unnamed: 0,72_caption_sim,72_context_sim,aircraft_caption_sim
count,2288.0,2288.0,2288.0
mean,0.46,0.3,0.52
std,0.04,0.1,0.04
min,0.4,-0.16,0.48
25%,0.42,0.25,0.49
50%,0.45,0.32,0.51
75%,0.48,0.37,0.54
max,0.63,0.69,0.72






Unnamed: 0,93_caption_sim,93_context_sim,aircraft_caption_sim
count,87.0,87.0,87.0
mean,0.42,0.26,0.55
std,0.02,0.1,0.05
min,0.4,0.0,0.48
25%,0.41,0.2,0.51
50%,0.42,0.26,0.54
75%,0.43,0.33,0.59
max,0.52,0.54,0.7






Unnamed: 0,118_caption_sim,118_context_sim,aircraft_caption_sim
count,143.0,143.0,143.0
mean,0.44,0.24,0.52
std,0.03,0.12,0.03
min,0.4,-0.11,0.48
25%,0.41,0.18,0.49
50%,0.43,0.26,0.51
75%,0.45,0.3,0.53
max,0.56,0.53,0.62






Unnamed: 0,19_caption_sim,19_context_sim,aircraft_caption_sim
count,577.0,577.0,577.0
mean,0.44,0.27,0.54
std,0.04,0.11,0.05
min,0.4,-0.1,0.48
25%,0.41,0.2,0.5
50%,0.43,0.28,0.53
75%,0.45,0.35,0.56
max,0.66,0.63,0.72






Unnamed: 0,139_caption_sim,139_context_sim,aircraft_caption_sim
count,51.0,51.0,51.0
mean,0.43,0.27,0.52
std,0.03,0.1,0.04
min,0.4,-0.1,0.48
25%,0.41,0.2,0.49
50%,0.43,0.29,0.51
75%,0.45,0.32,0.53
max,0.52,0.48,0.61






Unnamed: 0,237_caption_sim,237_context_sim,aircraft_caption_sim
count,37.0,37.0,37.0
mean,0.42,0.25,0.55
std,0.02,0.16,0.06
min,0.4,-0.07,0.48
25%,0.41,0.1,0.49
50%,0.42,0.32,0.54
75%,0.43,0.36,0.59
max,0.47,0.53,0.69






In [35]:
def simple_search(context, t):
    percentage = 0
    words = list(topics[t])
    for w in words:
        if w in context:
            percentage = percentage + 1
    return float(percentage/len(words))

In [36]:
list(topics[3])

['fighter',
 'pilot',
 'aircraft',
 'fly',
 'flew',
 'lindbergh',
 'squadron',
 'aviation',
 'flight']

In [37]:
df_dict[el]['context'].tolist()[0]

"Gliders are aircrafts which do not have a motor. Gliders are controlled by their pilots by using control-sticks. Some gliders can only carry one person; others can carry two. In gliders with two seats, each pilot has a control-stick. Gliders always have seats for the pilots.\n'Sailplanes' are gliders with long wings so that they will only lose height slowly. In some places a vertical draft makes the air go up faster than the glider is going down. The pilot of a glider can make it climb by flying to these places. This is called soaring. Good pilots can travel long distances by always finding rising air. Some pilots race each other over hundreds of kilometres each day. Other pilots just fly for fun.\nGliders cannot get into the air by themselves. They are pulled into the air by an aircraft with a motor or they are pulled up by motor on the ground.\nThere are two other types of gliders. Hang-gliders have frames to give the wings their shape, but do not have seats for their pilots and do 

In [38]:
print(el)
simple_search(df_dict[el]['context'].tolist()[0], 3)

237_caption_score


0.3333333333333333

In [39]:
for t in topics:
    df_key = str(t) + '_caption_score'
    df_dict[df_key]['context_gt'] = df_dict[df_key]['context'].apply(simple_search, t=t)
    df_dict[df_key]['topic_id'] = t


In [40]:
df_dict['3_caption_score']

Unnamed: 0,3_caption_sim,3_context_sim,image_url,caption,context,aircraft_caption_sim,context_gt,topic_id
416539,0.666607,-0.025179,https://upload.wikimedia.org/wikipedia/commons...,Flight Squadron,"Legoland California Resort is a theme park, mi...",0.675919,0.000000,3
1297757,0.600752,0.050619,https://upload.wikimedia.org/wikipedia/commons...,"English: Palmach Pilots at Nir Am, 1948",Nir Am is a kibbutz in southern Israel. Locate...,0.489747,0.000000,3
1126601,0.598147,0.211940,https://upload.wikimedia.org/wikipedia/commons...,English: A U.S. Navy Curtiss SC-1 Seahawk sco...,Operation Beleaguer was a major United States ...,0.485429,0.000000,3
575245,0.595281,0.370130,https://upload.wikimedia.org/wikipedia/commons...,Aircraft and personnel required for a WWII Air...,RAF Harrowbeer is former Royal Air Force airfi...,0.486628,0.000000,3
9256,0.589440,0.491482,https://upload.wikimedia.org/wikipedia/commons...,"A VF-172 F2H-2 on USS Essex off Korea, 1951 En...",The McDonnell F2H Banshee was a single-seat ca...,0.509946,0.666667,3
...,...,...,...,...,...,...,...,...
1191934,0.400102,0.410751,https://upload.wikimedia.org/wikipedia/commons...,"English: A Short Calcutta or S.8, a civilian ...",The Short Calcutta or S.8 was a civilian bipla...,0.565612,0.111111,3
823831,0.400085,0.305435,https://upload.wikimedia.org/wikipedia/commons...,Four Fairchild Republic A-10C Thunderbolt IIs ...,The 355th Wing is a United States Air Force un...,0.516216,0.000000,3
533593,0.400061,0.345619,http://upload.wikimedia.org/wikipedia/commons/...,English: Image title: Pilot and crew members ...,"The Boeing L-15 Scout or YL-15 was a small, pi...",0.506776,0.222222,3
39742,0.400046,0.409461,https://upload.wikimedia.org/wikipedia/commons...,Merlin-powered prototype K7208 (converted Whit...,The Armstrong Whitworth A.W.38 Whitley was one...,0.483269,0.666667,3


In [41]:
df_dict['3_caption_score'].describe().round(2)

Unnamed: 0,3_caption_sim,3_context_sim,aircraft_caption_sim,context_gt,topic_id
count,1654.0,1654.0,1654.0,1654.0,1654.0
mean,0.45,0.32,0.52,0.24,3.0
std,0.04,0.11,0.04,0.19,0.0
min,0.4,-0.14,0.48,0.0,3.0
25%,0.42,0.27,0.49,0.11,3.0
50%,0.44,0.34,0.51,0.22,3.0
75%,0.47,0.39,0.54,0.33,3.0
max,0.67,0.58,0.72,0.89,3.0


In [42]:
print(q_id)

[63, 3, 271, 145, 298, 11, 45, 201, 35, 108, 58, 144, 157, 186, 230, 159, 220, 77, 256, 104, 86, 2, 34, 247, 23, 80, 165, 32, 319, 126, 66, 225, 25, 8, 235, 295, 227, 44, 234, 50, 251, 31, 10, 258, 72, 93, 118, 19, 139, 237]


In [43]:
pg_dict = dict()
pg_dict['a'] = [31]
pg_dict['b'] = [19, 32]
pg_dict['c'] = [157, 230, 63, 319, 80, 201]
pg_dict['d'] = [144, 45, 104, 35, 44, 165, 186]
pg_dict['e'] = [8, 25, 159, 108, 50, 234]

pg_dict['f'] = [235, 225, 58, 227]
pg_dict['g'] = [93, 258, 139, 251]
pg_dict['h'] = [145, 256, 34, 11, 77, 86, 10, 118, 295]
pg_dict['i'] = [3, 271, 298]
pg_dict['j'] = [247, 2, 237, 72, 23, 126, 66, 220]

In [44]:
k_name = ['image_url', 'caption', 'context', 'concept2caption_sim', 
          'context_gt', 'topic_id']
# k_name = ['image_url', 'caption', 'context', 'concept2caption_sim', 
#           'concept2context_sim']
for el in pg_dict:
    pg_dict[el] = [f'{i}_caption_score' for i in pg_dict[el]]
    pg_dict[el] = [df_dict[i] for i in pg_dict[el]]
    pg_dict[el] = [df_.reset_index(drop=True) for df_ in pg_dict[el]]
    

In [45]:
for el in pg_dict:
    t_df = pd.DataFrame(columns = ['topic2caption_sim', 'topic2context_sim'] + k_name)
    for df_ in pg_dict[el]:
        df_.columns = ['topic2caption_sim', 'topic2context_sim'] + k_name
        t_df = pd.concat([t_df, df_], ignore_index=True)
    pg_dict[el] = t_df

In [46]:
for el in pg_dict:
    print(el)
    display(pg_dict[el].describe().round(2))


a


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,context_gt
count,54.0,54.0,54.0,54.0
mean,0.45,0.32,0.53,0.18
std,0.05,0.14,0.05,0.16
min,0.4,0.08,0.48,0.0
25%,0.41,0.22,0.5,0.0
50%,0.43,0.31,0.52,0.1
75%,0.49,0.4,0.55,0.3
max,0.57,0.59,0.69,0.5


b


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,context_gt
count,1157.0,1157.0,1157.0,1157.0
mean,0.45,0.27,0.54,0.16
std,0.05,0.12,0.05,0.15
min,0.4,-0.14,0.48,0.0
25%,0.41,0.2,0.5,0.0
50%,0.43,0.28,0.53,0.14
75%,0.46,0.35,0.56,0.29
max,0.7,0.72,0.72,0.86


c


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,context_gt
count,2630.0,2630.0,2630.0,2630.0
mean,0.44,0.27,0.53,0.17
std,0.04,0.11,0.04,0.16
min,0.4,-0.13,0.48,0.0
25%,0.41,0.21,0.5,0.09
50%,0.43,0.28,0.52,0.12
75%,0.45,0.34,0.55,0.25
max,0.67,0.63,0.72,0.88


d


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,context_gt
count,6658.0,6658.0,6658.0,6658.0
mean,0.45,0.29,0.52,0.21
std,0.04,0.11,0.04,0.16
min,0.4,-0.15,0.48,0.0
25%,0.42,0.23,0.49,0.1
50%,0.43,0.31,0.51,0.2
75%,0.46,0.38,0.54,0.33
max,0.68,0.64,0.72,0.9


e


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,context_gt
count,6412.0,6412.0,6412.0,6412.0
mean,0.46,0.32,0.52,0.24
std,0.05,0.12,0.04,0.17
min,0.4,-0.2,0.48,0.0
25%,0.42,0.25,0.49,0.11
50%,0.44,0.33,0.51,0.25
75%,0.48,0.4,0.54,0.33
max,0.8,0.66,0.72,1.0


f


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,context_gt
count,1152.0,1152.0,1152.0,1152.0
mean,0.45,0.27,0.53,0.16
std,0.05,0.1,0.04,0.13
min,0.4,-0.12,0.48,0.0
25%,0.42,0.21,0.5,0.09
50%,0.44,0.28,0.51,0.11
75%,0.47,0.33,0.54,0.22
max,0.69,0.61,0.72,0.67


g


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,context_gt
count,1476.0,1476.0,1476.0,1476.0
mean,0.43,0.26,0.53,0.09
std,0.03,0.09,0.04,0.08
min,0.4,-0.13,0.48,0.0
25%,0.41,0.22,0.5,0.0
50%,0.43,0.27,0.52,0.09
75%,0.45,0.32,0.55,0.1
max,0.58,0.54,0.72,0.5


h


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,context_gt
count,3563.0,3563.0,3563.0,3563.0
mean,0.44,0.29,0.52,0.16
std,0.04,0.11,0.04,0.12
min,0.4,-0.16,0.48,0.0
25%,0.41,0.24,0.49,0.1
50%,0.43,0.3,0.51,0.1
75%,0.46,0.36,0.54,0.2
max,0.72,0.69,0.72,0.88


i


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,context_gt
count,1867.0,1867.0,1867.0,1867.0
mean,0.45,0.32,0.52,0.23
std,0.04,0.11,0.04,0.19
min,0.4,-0.14,0.48,0.0
25%,0.42,0.27,0.49,0.11
50%,0.44,0.34,0.51,0.22
75%,0.47,0.39,0.54,0.33
max,0.67,0.58,0.72,0.89


j


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,context_gt
count,3504.0,3504.0,3504.0,3504.0
mean,0.45,0.3,0.53,0.14
std,0.04,0.11,0.04,0.13
min,0.4,-0.16,0.48,0.0
25%,0.42,0.24,0.49,0.09
50%,0.44,0.31,0.51,0.09
75%,0.47,0.37,0.55,0.18
max,0.63,0.69,0.72,0.88


In [47]:
pg_dict[el]

Unnamed: 0,topic2caption_sim,topic2context_sim,image_url,caption,context,concept2caption_sim,context_gt,topic_id
0,0.574294,0.182000,https://upload.wikimedia.org/wikipedia/commons...,Cockpit,The Antonov An-24 is a 44-seat twin turboprop ...,0.594450,0.125,247
1,0.574294,0.158678,https://upload.wikimedia.org/wikipedia/commons...,Cockpit,"The Sukhoi Su-25 Grach is a single-seat, twin-...",0.594450,0.500,247
2,0.574294,0.277860,https://upload.wikimedia.org/wikipedia/commons...,Cockpit,The Audi Q2 is a subcompact luxury crossover S...,0.594450,0.250,247
3,0.558939,0.132581,https://upload.wikimedia.org/wikipedia/commons...,Cockpit Cockpit. Mig-27,The Mikoyan MiG-27 is a variable-geometry grou...,0.527617,0.375,247
4,0.549168,0.257903,https://upload.wikimedia.org/wikipedia/commons...,B-24 cockpit,The Consolidated B-24 Liberator is an American...,0.509501,0.250,247
...,...,...,...,...,...,...,...,...
3499,0.400389,0.319766,https://upload.wikimedia.org/wikipedia/commons...,Sikorsky VS-44A Excambian Excambian is the onl...,The Sikorsky VS-44 was a large four-engined fl...,0.519452,0.100,220
3500,0.400362,0.304889,https://upload.wikimedia.org/wikipedia/commons...,Scandinavian Airlines Boeing B-17. SAS Boeing ...,This list of Boeing B-17 Flying Fortress opera...,0.560772,0.100,220
3501,0.400106,0.042777,https://upload.wikimedia.org/wikipedia/commons...,"Air Force 3701, the presidential aircraft of t...",The President of the Republic of China commonl...,0.551530,0.000,220
3502,0.400070,0.318413,http://upload.wikimedia.org/wikipedia/commons/...,"A Fokker F.VIII, similar to the accident aircr...",The 1927 KLM Fokker F.VIII crash happened on 2...,0.533410,0.200,220


In [48]:
df_ = pd.DataFrame()
for el in pg_dict:
    df_ = pd.concat([df_, pg_dict[el]], ignore_index=True)
df_

Unnamed: 0,topic2caption_sim,topic2context_sim,image_url,caption,context,concept2caption_sim,context_gt,topic_id
0,0.567402,0.157547,https://upload.wikimedia.org/wikipedia/commons...,Quadcopter camera drone in flight English: A q...,Human bycatch is a term for people who are uni...,0.499038,0.0,31
1,0.541941,0.313474,https://upload.wikimedia.org/wikipedia/commons...,"The RQ-2 Pioneer, an unmanned reconnaissance a...",AAI Corporation is an aerospace and defense de...,0.485552,0.2,31
2,0.541577,0.209477,https://upload.wikimedia.org/wikipedia/commons...,Early Air Drone,This list of aircraft at the Imperial War Muse...,0.528770,0.1,31
3,0.535151,0.452562,https://upload.wikimedia.org/wikipedia/commons...,Flying prototype of the Parrot AR.Drone Parrot...,A quadcopter or quadrotor is a type of helicop...,0.507844,0.1,31
4,0.526887,0.508497,http://upload.wikimedia.org/wikipedia/commons/...,English: Wasp IIII small unmanned aircraft sy...,The AeroVironment Wasp III Small Unmanned Airc...,0.574095,0.1,31
...,...,...,...,...,...,...,...,...
28468,0.400389,0.319766,https://upload.wikimedia.org/wikipedia/commons...,Sikorsky VS-44A Excambian Excambian is the onl...,The Sikorsky VS-44 was a large four-engined fl...,0.519452,0.1,220
28469,0.400362,0.304889,https://upload.wikimedia.org/wikipedia/commons...,Scandinavian Airlines Boeing B-17. SAS Boeing ...,This list of Boeing B-17 Flying Fortress opera...,0.560772,0.1,220
28470,0.400106,0.042777,https://upload.wikimedia.org/wikipedia/commons...,"Air Force 3701, the presidential aircraft of t...",The President of the Republic of China commonl...,0.551530,0.0,220
28471,0.400070,0.318413,http://upload.wikimedia.org/wikipedia/commons/...,"A Fokker F.VIII, similar to the accident aircr...",The 1927 KLM Fokker F.VIII crash happened on 2...,0.533410,0.2,220


In [49]:
display(df_.describe().round(2))

Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,context_gt
count,28473.0,28473.0,28473.0,28473.0
mean,0.45,0.3,0.53,0.19
std,0.04,0.11,0.04,0.16
min,0.4,-0.2,0.48,0.0
25%,0.42,0.23,0.49,0.09
50%,0.43,0.31,0.51,0.18
75%,0.47,0.37,0.54,0.27
max,0.8,0.72,0.72,1.0


## Download AV images

In [202]:
import urllib
import base64
import copy
from os.path import exists
from time import sleep
import requests

In [52]:
session = requests.session()

In [135]:
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 '
                         'Safari/537.11',
           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
           'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
           'Accept-Encoding': 'none',
           'Accept-Language': 'en-US,en;q=0.8',
           'Connection': 'keep-alive'
           }

In [136]:
img_root = '/raid/AISSEL/htest/datasets/wit/images/'

In [137]:
def get_image(link):
    URL = copy.copy(link)
    link = link.encode("utf-8")
    b64 = base64.b64encode(link)
    s_64 = str(b64)
    s_64 = s_64[2:-1]
    im_path = img_root + s_64 + '.jpg'
    if not exists(im_path):
        try:
            r = session.get(URL, headers=headers)
            with open(im_path, 'wb') as f:
                f.write(r.content)
#             print(f'we download {str(base64.b64decode(s_64))[2:-1]}')
        except OSError as e:
#             print(e)
            pass
        except Exception as e:
            print(e)
#             print(f'I could not download {str(base64.b64decode(s_64))[2:-1]}')
            sleep(5)

In [132]:
df_url = df_[['image_url']]
df_url

Unnamed: 0,image_url
0,https://upload.wikimedia.org/wikipedia/commons...
1,https://upload.wikimedia.org/wikipedia/commons...
2,https://upload.wikimedia.org/wikipedia/commons...
3,https://upload.wikimedia.org/wikipedia/commons...
4,https://upload.wikimedia.org/wikipedia/commons...
...,...
6786,https://upload.wikimedia.org/wikipedia/commons...
6787,https://upload.wikimedia.org/wikipedia/commons...
6788,https://upload.wikimedia.org/wikipedia/commons...
6789,https://upload.wikimedia.org/wikipedia/commons...


In [138]:
for index, row in tqdm(df_url.iterrows(), total=df_url.shape[0]):
    get_image(row["image_url"])

  0%|          | 0/6791 [00:00<?, ?it/s]

In [56]:
display(df_.describe().round(2))

Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,context_gt
count,6791.0,6791.0,6791.0,6791.0
mean,0.54,0.34,0.41,0.22
std,0.07,0.15,0.09,0.19
min,0.36,-0.18,0.13,0.0
25%,0.49,0.25,0.35,0.1
50%,0.54,0.35,0.41,0.2
75%,0.58,0.44,0.47,0.33
max,0.8,0.73,0.71,0.9


In [52]:
for el in pg_dict:
    print(el)
    display(pg_dict[el].describe().round(2))

a


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,context_gt
count,136.0,136.0,136.0,136.0
mean,0.51,0.24,0.41,0.12
std,0.03,0.19,0.08,0.14
min,0.47,-0.13,0.17,0.0
25%,0.48,0.09,0.36,0.0
50%,0.5,0.23,0.42,0.1
75%,0.52,0.4,0.46,0.2
max,0.68,0.59,0.62,0.5


b


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,context_gt
count,272.0,272.0,272.0,272.0
mean,0.57,0.38,0.41,0.3
std,0.03,0.15,0.08,0.2
min,0.53,-0.12,0.22,0.0
25%,0.55,0.32,0.36,0.14
50%,0.57,0.41,0.41,0.29
75%,0.59,0.48,0.46,0.43
max,0.7,0.7,0.64,0.86


c


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,context_gt
count,814.0,814.0,814.0,814.0
mean,0.52,0.31,0.44,0.21
std,0.06,0.16,0.1,0.19
min,0.42,-0.11,0.18,0.0
25%,0.47,0.2,0.37,0.09
50%,0.51,0.33,0.43,0.18
75%,0.58,0.43,0.51,0.29
max,0.74,0.66,0.71,0.89


d


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,context_gt
count,951.0,951.0,951.0,951.0
mean,0.55,0.35,0.43,0.25
std,0.04,0.12,0.08,0.19
min,0.5,-0.14,0.19,0.0
25%,0.52,0.28,0.37,0.1
50%,0.54,0.37,0.43,0.22
75%,0.57,0.44,0.48,0.4
max,0.71,0.64,0.68,0.8


e


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,context_gt
count,813.0,813.0,813.0,813.0
mean,0.6,0.4,0.41,0.3
std,0.06,0.16,0.07,0.23
min,0.46,-0.07,0.15,0.0
25%,0.57,0.3,0.36,0.11
50%,0.6,0.42,0.41,0.33
75%,0.63,0.51,0.46,0.44
max,0.8,0.73,0.68,0.89


f


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,context_gt
count,544.0,544.0,544.0,544.0
mean,0.53,0.32,0.42,0.2
std,0.06,0.11,0.07,0.14
min,0.42,-0.01,0.24,0.0
25%,0.5,0.25,0.37,0.09
50%,0.54,0.32,0.42,0.18
75%,0.57,0.39,0.47,0.27
max,0.78,0.61,0.64,0.67


g


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,context_gt
count,544.0,544.0,544.0,544.0
mean,0.49,0.29,0.43,0.11
std,0.05,0.11,0.1,0.11
min,0.41,-0.05,0.15,0.0
25%,0.46,0.22,0.37,0.0
50%,0.5,0.28,0.44,0.09
75%,0.53,0.36,0.49,0.18
max,0.64,0.55,0.7,0.5


h


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,context_gt
count,1223.0,1223.0,1223.0,1223.0
mean,0.56,0.38,0.39,0.21
std,0.05,0.15,0.09,0.14
min,0.45,-0.12,0.13,0.0
25%,0.52,0.29,0.33,0.1
50%,0.56,0.38,0.39,0.2
75%,0.59,0.49,0.44,0.3
max,0.75,0.72,0.64,0.88


k


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,context_gt
count,408.0,408.0,408.0,408.0
mean,0.48,0.33,0.42,0.21
std,0.07,0.12,0.09,0.19
min,0.39,-0.05,0.13,0.0
25%,0.42,0.27,0.37,0.09
50%,0.46,0.35,0.42,0.11
75%,0.57,0.42,0.47,0.33
max,0.67,0.59,0.68,0.78


i


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim,context_gt
count,1086.0,1086.0,1086.0,1086.0
mean,0.5,0.32,0.38,0.24
std,0.07,0.14,0.11,0.21
min,0.36,-0.18,0.13,0.0
25%,0.47,0.24,0.31,0.09
50%,0.51,0.32,0.39,0.2
75%,0.55,0.42,0.45,0.38
max,0.69,0.69,0.7,0.9


## select image for retrain from wit 


In [62]:
from io import BytesIO
import pillow_avif
from svglib.svglib import svg2rlg
from reportlab.graphics import renderPM
im_test = '/raid/AISSEL/htest/datasets/wit/images'

In [63]:
from os.path import exists
import base64
im_root_path = '/raid/AISSEL/htest/datasets/wit/images'

In [64]:
# def b64(link):
#     link = link.encode("utf-8")
#     im_path = f'{im_root_path}/{str(base64.b64encode(link))[2:-1]}.jpg'
#     if exists(im_path):
#         return im_path
#     return 
    

In [65]:
def link_to_b64(link):
    im_root_path = '/raid/AISSEL/htest/datasets/wit/images'
    link = link.encode("utf-8")
    _, pos = str(link)[:-1].rsplit('.', 1)
    if pos == 'svg':
        return
    im_path = f'{str(base64.b64encode(link))[2:-1]}'
    if exists(f'{im_root_path}/{im_path}.jpg'):
        return f'{im_path}.jpg'
    return 
    

In [66]:
def b64(fn):
    im_test = '/raid/AISSEL/htest/datasets/wit/images'
    fn = f'{im_test}/{fn}'
    base64_str = None
    if exists(fn):
        try:
            img = Image.open(fn)
            img_buffer = BytesIO()
            img.save(img_buffer, format=img.format)
            byte_data = img_buffer.getvalue()
            base64_str = str(base64.b64encode(byte_data))[2:-1]
        except:
            base64_str = None
            print(fn)

    return base64_str


def remove_special(input_string):
    final_string = ""
    for character in input_string:
        if  character == " ":
            final_string = final_string + character
        else:
            if(character.isalnum()):
                final_string = final_string + character
    return final_string


def remove_special(input_string):
    final_string = ""
    for character in input_string:
        if  character == " ":
            final_string = final_string + character
        else:
            if(character.isalnum()):
                final_string = final_string + character
    return final_string



def create_data(df, count):
#     col = ['uniq_id', 'image_id', 'caption', 'context', 'topic_id', 'labels', 'image']
    col = ['uniq_id', 'image_id', 'caption', 'labels', 'image', 'topic2caption_sim', 'concept2caption_sim']
    t_df = pd.DataFrame(columns=col)
    t_df['caption'] = df['caption'].apply(remove_special)
#     t_df['context'] = df['context'].apply(remove_special)
#     t_df['topic_id'] = df['topic_id']
    t_df['topic2caption_sim'] = df['topic2caption_sim']
    t_df['concept2caption_sim'] = df['concept2caption_sim']
    t_df['topic2context_sim'] = df['topic2context_sim']
    t_df['uniq_id'] = df.index + count
    t_df['image_id'] = df.index + count
    t_df['image'] = df['image_path'].apply(b64)
    t_df['labels'] = ' '
    t_df = t_df[t_df.image.notnull()]
    t_df = t_df.reset_index(drop=True)
    
    return t_df

In [67]:
def df2_df1(df1, df2):
    df = df1.merge(df2, how = 'outer' ,indicator=True).loc[lambda x : x['_merge']=='right_only']
    df = df[selected_col].reset_index(drop=True)
    return df

In [68]:
def df2_df1(df1, df2):
    cond = df2['image_path'].isin(df1['image_path'])
    df2.drop(df2[cond].index, inplace = True)
    return df2

In [69]:
def get_portion(df, p1=75, p2=15, p3=5, p4=5):
    s1 = df.sample(frac = p1/100)
    rest_part_1 = df.drop(s1.index)
    s2 = rest_part_1.sample(frac = p2/(100-p1))
    rest_part_2 = rest_part_1.drop(s2.index)
    s3 = rest_part_2.sample(frac = p3/(100 - p1 - p2))
    s4 = rest_part_2.drop(s3.index)
    return s1, s2, s3, s4

### Consider All topics

In [58]:
missed_topics_dict = dict()
for el in pg_dict:
    missed_topics_dict[el] = pg_dict[el]
    missed_topics_dict[el]['image_path'] = missed_topics_dict[el]['image_url'].apply(link_to_b64)
    missed_topics_dict[el] = missed_topics_dict[el][missed_topics_dict[el].image_path.notnull()]
    missed_topics_dict[el] = missed_topics_dict[el].reset_index(drop=True)

In [59]:
missed_topics_dict['a']

Unnamed: 0,topic2caption_sim,topic2context_sim,image_url,caption,context,concept2caption_sim,context_gt,topic_id,image_path
0,0.567402,0.157547,https://upload.wikimedia.org/wikipedia/commons...,Quadcopter camera drone in flight English: A q...,Human bycatch is a term for people who are uni...,0.499038,0.0,31,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
1,0.541941,0.313474,https://upload.wikimedia.org/wikipedia/commons...,"The RQ-2 Pioneer, an unmanned reconnaissance a...",AAI Corporation is an aerospace and defense de...,0.485552,0.2,31,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
2,0.541577,0.209477,https://upload.wikimedia.org/wikipedia/commons...,Early Air Drone,This list of aircraft at the Imperial War Muse...,0.52877,0.1,31,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
3,0.535151,0.452562,https://upload.wikimedia.org/wikipedia/commons...,Flying prototype of the Parrot AR.Drone Parrot...,A quadcopter or quadrotor is a type of helicop...,0.507844,0.1,31,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
4,0.526887,0.508497,http://upload.wikimedia.org/wikipedia/commons/...,English: Wasp IIII small unmanned aircraft sy...,The AeroVironment Wasp III Small Unmanned Airc...,0.574095,0.1,31,aHR0cDovL3VwbG9hZC53aWtpbWVkaWEub3JnL3dpa2lwZW...
5,0.526887,0.096267,http://upload.wikimedia.org/wikipedia/commons/...,English: Wasp IIII small unmanned aircraft sy...,This is a list of military equipment of the Cz...,0.574095,0.0,31,aHR0cDovL3VwbG9hZC53aWtpbWVkaWEub3JnL3dpa2lwZW...
6,0.526887,0.124755,http://upload.wikimedia.org/wikipedia/commons/...,English: Wasp IIII small unmanned aircraft sy...,This is a list of the equipment of the Royal N...,0.574095,0.0,31,aHR0cDovL3VwbG9hZC53aWtpbWVkaWEub3JnL3dpa2lwZW...
7,0.520849,0.568642,https://upload.wikimedia.org/wikipedia/commons...,English: This remote controlled toy aircraft ...,An unmanned aerial vehicle is an aircraft that...,0.482129,0.5,31,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
8,0.504371,0.569343,https://upload.wikimedia.org/wikipedia/commons...,English: U.S. Army Hunter (RQ-5) unmanned air...,The IAI RQ-5 Hunter unmanned aerial vehicle wa...,0.619249,0.3,31,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
9,0.502673,0.376664,http://upload.wikimedia.org/wikipedia/commons/...,English: Naval Aircraft Factory TDN-1 assault...,The Naval Aircraft Factory TDN was an early un...,0.55226,0.3,31,aHR0cDovL3VwbG9hZC53aWtpbWVkaWEub3JnL3dpa2lwZW...


In [60]:
col = ['topic2caption_sim', 'concept2caption_sim', 'topic2context_sim', 'context_gt', 'caption']


In [61]:
# selected_col = ['caption', 'context', 'topic_id', 'image_path']
selected_col = ['image_path','topic2caption_sim', 'concept2caption_sim', 'topic2context_sim', 'context_gt', 'caption']
for el in missed_topics_dict:
    missed_topics_dict[el] = missed_topics_dict[el][selected_col]
    missed_topics_dict[el] = missed_topics_dict[el].drop_duplicates(subset=selected_col, keep=False)
    missed_topics_dict[el] = missed_topics_dict[el].reset_index(drop=True)

In [62]:
for el in missed_topics_dict:
#     missed_topics_dict[el] = missed_topics_dict[el][selected_col]
    missed_topics_dict[el] = missed_topics_dict[el].drop_duplicates(keep=False)
    missed_topics_dict[el] = missed_topics_dict[el].reset_index(drop=True)

In [63]:
missed_topics_dict['b']

Unnamed: 0,image_path,topic2caption_sim,concept2caption_sim,topic2context_sim,context_gt,caption
0,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.657047,0.488641,0.375870,0.285714,"The Glidersport LightHawk, with its complex wi..."
1,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.614542,0.487890,0.466881,0.428571,Glidersport LightHawk. Source: Danny Howell I...
2,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.614162,0.523776,0.418855,0.428571,"PIK-20E, finnish glider. Used by NASA Dryden ..."
3,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.614162,0.523776,0.465150,0.571429,"PIK-20E, finnish glider. Used by NASA Dryden ..."
4,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.614162,0.523776,0.332000,0.142857,"PIK-20E, finnish glider. Used by NASA Dryden ..."
...,...,...,...,...,...,...
671,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.400778,0.596466,0.293323,0.142857,English: Aviation in Britain Before the First...
672,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.400698,0.572550,0.032985,0.142857,"A Farman-Goliath aircraft, similar to the one ..."
673,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.400264,0.496464,0.347239,0.285714,"The Flying Squadron, 1898. English: US Flying ..."
674,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.400120,0.508751,0.676892,0.714286,Noorduyn Norseman bushplane English: CF-FQI Fl...


In [64]:
for el in missed_topics_dict:
    print(el, len(missed_topics_dict[el]))

a 36
b 676
c 1551
d 3548
e 3231
f 758
g 850
h 2151
i 969
j 1970


In [65]:
sorted_key = []
for k in sorted(missed_topics_dict, key=lambda k: len(missed_topics_dict[k])):
    print(k, len(missed_topics_dict[k]))
    sorted_key.append(k)

a 36
b 676
f 758
g 850
i 969
c 1551
j 1970
h 2151
e 3231
d 3548


In [66]:
sorted_key

['a', 'b', 'f', 'g', 'i', 'c', 'j', 'h', 'e', 'd']

In [67]:
def df2_df1(df1, df2):
    cond = df2['image_path'].isin(df1['image_path'])
    df2.drop(df2[cond].index, inplace = True)
    return df2

In [68]:
for idx, k in enumerate(sorted_key):
    print(sorted_key[idx:], k)
    for el in sorted_key[idx:]:
        if el == k:
            continue
        print(f'we are going to calc: {el} - {k}')
        missed_topics_dict[el] = df2_df1(missed_topics_dict[k], missed_topics_dict[el])
#     break

['a', 'b', 'f', 'g', 'i', 'c', 'j', 'h', 'e', 'd'] a
we are going to calc: b - a
we are going to calc: f - a
we are going to calc: g - a
we are going to calc: i - a
we are going to calc: c - a
we are going to calc: j - a
we are going to calc: h - a
we are going to calc: e - a
we are going to calc: d - a
['b', 'f', 'g', 'i', 'c', 'j', 'h', 'e', 'd'] b
we are going to calc: f - b
we are going to calc: g - b
we are going to calc: i - b
we are going to calc: c - b
we are going to calc: j - b
we are going to calc: h - b
we are going to calc: e - b
we are going to calc: d - b
['f', 'g', 'i', 'c', 'j', 'h', 'e', 'd'] f
we are going to calc: g - f
we are going to calc: i - f
we are going to calc: c - f
we are going to calc: j - f
we are going to calc: h - f
we are going to calc: e - f
we are going to calc: d - f
['g', 'i', 'c', 'j', 'h', 'e', 'd'] g
we are going to calc: i - g
we are going to calc: c - g
we are going to calc: j - g
we are going to calc: h - g
we are going to calc: e - g
we are

In [69]:
sorted_key = []
for k in sorted(missed_topics_dict, key=lambda k: len(missed_topics_dict[k])):
    print(k, len(missed_topics_dict[k]))
    sorted_key.append(k)

a 36
d 42
h 72
e 85
c 252
j 252
g 343
i 432
f 595
b 666


In [70]:
missed_topics_dict['i']

Unnamed: 0,image_path,topic2caption_sim,concept2caption_sim,topic2context_sim,context_gt,caption
1,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.600752,0.489747,0.050619,0.000000,"English: Palmach Pilots at Nir Am, 1948"
3,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.595281,0.486628,0.370130,0.000000,Aircraft and personnel required for a WWII Air...
4,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.589440,0.509946,0.491482,0.666667,"A VF-172 F2H-2 on USS Essex off Korea, 1951 En..."
13,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.571025,0.528487,0.368774,0.222222,SAAF Lockheed PV1 Ventura. This was the only ...
16,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.569951,0.510757,0.417252,0.222222,A Royal Air Force Armstrong Whitworth Whitley...
...,...,...,...,...,...,...
941,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.400339,0.493250,0.397041,0.000000,A V Bird Airbus A320 English: Airbus A320-214 ...
942,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.400151,0.539210,0.180117,0.000000,"Depart of Commerce, Bureau of Aeronautics pla..."
949,aHR0cDovL3VwbG9hZC53aWtpbWVkaWEub3JnL3dpa2lwZW...,0.430870,0.556423,0.377635,0.111111,World Aircraft Company Vision English: World A...
955,aHR0cDovL3VwbG9hZC53aWtpbWVkaWEub3JnL3dpa2lwZW...,0.417771,0.559165,0.250192,0.111111,English: World Aircraft Company Vision


In [71]:
# def create_data(df, count):
#     col = ['topic2caption_sim', 'concept2caption_sim', 'topic2context_sim', 'context_gt', 'caption']
#     t_df = pd.DataFrame(columns=col)
#     t_df['caption'] = df['caption'].apply(remove_special)
# #     t_df['context'] = df['context'].apply(remove_special)
# #     t_df['topic_id'] = df['topic_id']
#     t_df['topic2caption_sim'] = df['topic2caption_sim']
#     t_df['concept2caption_sim'] = df['concept2caption_sim']
#     t_df['topic2context_sim'] = df['topic2context_sim']
#     t_df['context_gt'] = df['context_gt']
# #     t_df['uniq_id'] = df.index + count
# #     t_df['image_id'] = df.index + count
# #     t_df['image'] = df['image_path'].apply(b64)
# #     t_df['labels'] = ' '
# #     t_df = t_df[t_df.image.notnull()]
#     t_df = t_df.reset_index(drop=True)
    
#     return t_df

In [72]:
missed_topics_dict[el]

Unnamed: 0,image_path,topic2caption_sim,concept2caption_sim,topic2context_sim,context_gt,caption
599,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.41292,0.499738,0.363781,0.222222,A Soviet Yak-38 Forger with its landing gear d...
605,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.412575,0.531537,0.161412,0.222222,Missile
692,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.400595,0.51842,0.447617,0.333333,A two-seater TAV-8B Harrier II from VMAT-203 i...
980,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.450818,0.495318,0.257555,0.0,A Python 4 missile under the wing F-15D Baz '9...
989,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.447981,0.51842,0.377651,0.333333,A two-seater TAV-8B Harrier II from VMAT-203 i...
999,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.445865,0.521429,0.218003,0.222222,English: LVG E.I prototype serial E.600/15. T...
1013,aHR0cDovL3VwbG9hZC53aWtpbWVkaWEub3JnL3dpa2lwZW...,0.442833,0.509075,0.326662,0.222222,Soviet heavy bomber Petlyakov Pe-8. English: P...
1065,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.433928,0.529284,0.336553,0.111111,Type 4 anti-aircraft gun. Japanese Type 4 75 m...
1196,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.41577,0.526971,0.398127,0.111111,English: Italian Breda Ba.88 ground-attack ai...
1298,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.405826,0.503801,0.213427,0.111111,A NAMC YS-11 similar to the accident aircraft ...


In [73]:
missed_topics_with_images = dict()
count = 0
for el in missed_topics_dict:
    missed_topics_with_images[el] = create_data(missed_topics_dict[el], count)
    count = len(missed_topics_with_images[el])

/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy9iL2IyLzNyZF9VU19NYXJpbmVfQWlyY3JhZnRfV2luZy5wbmc=.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy9mL2Y5L0ZseWJlJTJDX0FUUl83Mi01MDAlMkNfRUktUkVNXyUyODI3MzIxNjE1OTUwJTI5LmpwZw==.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy83LzcxLzFzdF9VU19NYXJpbmVfQWlyY3JhZnRfV2luZy5wbmc=.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy81LzU5LzR0aF9VU19NYXJpbmVfQWlyY3JhZnRfV2luZy5wbmc=.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy9mL2Y5L0ZseWJlJTJDX0FUUl83Mi01MDAlMkNfRUktUkVNXyUyODI3MzIxNjE1OTUwJTI5LmpwZw==.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy8xLzE1L0dvc3NhbWVyX2NhYmluLmpwZw==.jpg
/raid/AISSEL/htest/datas

In [74]:
missed_topics_with_images[el]

Unnamed: 0,uniq_id,image_id,caption,labels,image,topic2caption_sim,concept2caption_sim,topic2context_sim
0,428,428,A17A cockpit English Picture take at the US Ai...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.533386,0.489819,0.199344
1,485,485,A powered paraglider pilot preparing his equi...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.403713,0.487279,-0.003739
2,495,495,Wreckage of aircraft after impact English Avia...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.527216,0.481262,0.435577
3,502,502,N73711 the aircraft involved in the accident,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.490395,0.545516,0.407958
4,511,511,TCJES the aircraft involved the accident,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.473275,0.525275,0.472720
...,...,...,...,...,...,...,...,...
245,2357,2357,Control tower of the Airport parked planes and...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.405332,0.492046,0.142041
246,2359,2359,A US Marshal on a Con Air flight English US Ma...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.404742,0.501877,0.257710
247,2362,2362,An Atlantique plane belonging to the Italian N...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.403967,0.530053,0.294949
248,2377,2377,An Aeroflot Tupolev Tu154B2 similar to the acc...,,iVBORw0KGgoAAAANSUhEUgAAA/8AAAKYCAIAAACq5t9XAA...,0.402549,0.523945,0.316884


In [75]:
s = 0
for el in missed_topics_with_images:
    missed_topics_with_images[el] = missed_topics_with_images[el].dropna(subset=['caption'])
    missed_topics_with_images[el]['topics']  = [el] * len(missed_topics_with_images[el])
    print(el, len(missed_topics_with_images[el]))
    s = s + len(missed_topics_with_images[el])
print(s)

a 36
b 649
c 248
d 42
e 85
f 591
g 333
h 72
i 421
j 250
2727


In [76]:
missed_topics_with_images['a']

Unnamed: 0,uniq_id,image_id,caption,labels,image,topic2caption_sim,concept2caption_sim,topic2context_sim,topics
0,0,0,Quadcopter camera drone in flight English A qu...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.567402,0.499038,0.157547,a
1,1,1,The RQ2 Pioneer an unmanned reconnaissance air...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.541941,0.485552,0.313474,a
2,2,2,Early Air Drone,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.541577,0.52877,0.209477,a
3,3,3,Flying prototype of the Parrot ARDrone Parrot ...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.535151,0.507844,0.452562,a
4,4,4,English Wasp IIII small unmanned aircraft system,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.526887,0.574095,0.508497,a
5,5,5,English Wasp IIII small unmanned aircraft system,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.526887,0.574095,0.096267,a
6,6,6,English Wasp IIII small unmanned aircraft system,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.526887,0.574095,0.124755,a
7,7,7,English This remote controlled toy aircraft h...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.520849,0.482129,0.568642,a
8,8,8,English US Army Hunter RQ5 unmanned aircraft,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.504371,0.619249,0.569343,a
9,9,9,English Naval Aircraft Factory TDN1 assault d...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.502673,0.55226,0.376664,a


In [77]:
scol = ['topic2caption_sim', 'concept2caption_sim']
for el in missed_topics_with_images:
    print(el)
    display(missed_topics_with_images[el][scol].describe().round(2))

a


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,36.0,36.0
mean,0.47,0.54
std,0.05,0.06
min,0.4,0.48
25%,0.42,0.51
50%,0.48,0.52
75%,0.5,0.57
max,0.57,0.69


b


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,649.0,649.0
mean,0.45,0.55
std,0.06,0.05
min,0.4,0.48
25%,0.41,0.5
50%,0.43,0.53
75%,0.47,0.57
max,0.7,0.72


c


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,248.0,248.0
mean,0.43,0.52
std,0.03,0.03
min,0.4,0.48
25%,0.41,0.5
50%,0.43,0.52
75%,0.44,0.54
max,0.57,0.64


d


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,42.0,42.0
mean,0.43,0.52
std,0.02,0.03
min,0.4,0.48
25%,0.41,0.5
50%,0.42,0.52
75%,0.44,0.53
max,0.5,0.62


e


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,85.0,85.0
mean,0.43,0.5
std,0.03,0.02
min,0.4,0.48
25%,0.41,0.48
50%,0.42,0.49
75%,0.45,0.51
max,0.56,0.58


f


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,591.0,591.0
mean,0.46,0.52
std,0.05,0.03
min,0.4,0.48
25%,0.42,0.49
50%,0.45,0.51
75%,0.49,0.53
max,0.69,0.62


g


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,333.0,333.0
mean,0.44,0.52
std,0.03,0.04
min,0.4,0.48
25%,0.41,0.49
50%,0.43,0.51
75%,0.45,0.54
max,0.57,0.69


h


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,72.0,72.0
mean,0.44,0.51
std,0.05,0.02
min,0.4,0.48
25%,0.41,0.49
50%,0.42,0.5
75%,0.45,0.52
max,0.61,0.58


i


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,421.0,421.0
mean,0.45,0.52
std,0.04,0.03
min,0.4,0.48
25%,0.42,0.49
50%,0.44,0.51
75%,0.47,0.53
max,0.6,0.7


j


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,250.0,250.0
mean,0.45,0.52
std,0.04,0.03
min,0.4,0.48
25%,0.41,0.49
50%,0.44,0.51
75%,0.47,0.53
max,0.58,0.65


In [78]:
df_ = pd.DataFrame()
for el in missed_topics_with_images:
    df_ = pd.concat([df_, missed_topics_with_images[el]], ignore_index=True)
# df_
display(df_[scol].describe().round(2))

Unnamed: 0,topic2caption_sim,concept2caption_sim
count,2727.0,2727.0
mean,0.45,0.52
std,0.05,0.04
min,0.4,0.48
25%,0.41,0.5
50%,0.43,0.51
75%,0.47,0.54
max,0.7,0.72


In [79]:
def get_portion(df, p1=72, p2=16, p3=12):
    s1 = df.sample(frac = p1/100)
    rest_part_1 = df.drop(s1.index)
    s2 = rest_part_1.sample(frac = p2/(100-p1))
    s3 = rest_part_1.drop(s2.index)
    return s1, s2, s3

In [80]:
names = ['uniq_id', 'image_id', 'caption', 'labels', 'image']

## add missing concept:


In [259]:
missed_ones = ['a', 'e', 'g', 'i', 'j']

In [261]:
for el in missed_ones:
    print(el)

    display(missed_topics_with_images[el][['topic2caption_sim', 'concept2caption_sim']].describe().round(2))

a


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,36.0,36.0
mean,0.47,0.54
std,0.05,0.06
min,0.4,0.48
25%,0.42,0.51
50%,0.48,0.52
75%,0.5,0.57
max,0.57,0.69


e


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,85.0,85.0
mean,0.43,0.5
std,0.03,0.02
min,0.4,0.48
25%,0.41,0.48
50%,0.42,0.49
75%,0.45,0.51
max,0.56,0.58


g


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,333.0,333.0
mean,0.44,0.52
std,0.03,0.04
min,0.4,0.48
25%,0.41,0.49
50%,0.43,0.51
75%,0.45,0.54
max,0.57,0.69


i


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,421.0,421.0
mean,0.45,0.52
std,0.04,0.03
min,0.4,0.48
25%,0.42,0.49
50%,0.44,0.51
75%,0.47,0.53
max,0.6,0.7


j


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,250.0,250.0
mean,0.45,0.52
std,0.04,0.03
min,0.4,0.48
25%,0.41,0.49
50%,0.44,0.51
75%,0.47,0.53
max,0.58,0.65


In [264]:
df_l = pd.DataFrame()
for el in missed_ones:
    df_l = pd.concat([df_l, missed_topics_with_images[el]], ignore_index=True)
# df_
display(df_l[['topic2caption_sim', 'concept2caption_sim']].describe().round(2))

Unnamed: 0,topic2caption_sim,concept2caption_sim
count,1125.0,1125.0
mean,0.44,0.52
std,0.04,0.03
min,0.4,0.48
25%,0.41,0.49
50%,0.43,0.51
75%,0.47,0.53
max,0.6,0.7


In [266]:
name_lsit = ['stage1_train', 'stage2_train', 'val']
por_dict = dict()
data_dict = dict()
for n in name_lsit:
    data_dict[n] = pd.DataFrame() 
for el in missed_ones:
    s1, s2, v = get_portion(missed_topics_with_images[el])
    
    s1 = s1.reset_index(drop=True)
    data_dict['stage1_train'] = pd.concat([data_dict['stage1_train'], s1], ignore_index=True)
    
    s2 = s2.reset_index(drop=True)
    data_dict['stage2_train'] = pd.concat([data_dict['stage2_train'], s2], ignore_index=True)
    
    v = v.reset_index(drop=True)
    data_dict['val'] = pd.concat([data_dict['val'], v], ignore_index=True)
#     data_dict[el] = por_dict

In [267]:
len(data_dict['stage1_train'])

810

In [268]:
len(data_dict['stage2_train'])

180

In [269]:
len(data_dict['val'])

135

In [273]:
! mkdir -p /raid/AISSEL/htest/datasets/av_data/wit/missed_q50_over_avg
saved_path = '/raid/AISSEL/htest/datasets/av_data/wit/missed_q50_over_avg'
for el in data_dict:
    name = f'caption_{el}.tsv'
    df_s = data_dict[el][names]
    df_s.to_csv(f'{saved_path}/{name}', sep="\t", index=False, header=False)

    print(name)

caption_stage1_train.tsv
caption_stage2_train.tsv
caption_val.tsv


### based on CIDEr and SPICE


In [88]:
missed_ones = ['c', 'd', 'e', 'h', 'i']

In [89]:
for el in missed_ones:
    print(el)
    display(missed_topics_with_images[el][['topic2caption_sim', 'concept2caption_sim']].describe().round(2))

c


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,248.0,248.0
mean,0.43,0.52
std,0.03,0.03
min,0.4,0.48
25%,0.41,0.5
50%,0.43,0.52
75%,0.44,0.54
max,0.57,0.64


d


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,42.0,42.0
mean,0.43,0.52
std,0.02,0.03
min,0.4,0.48
25%,0.41,0.5
50%,0.42,0.52
75%,0.44,0.53
max,0.5,0.62


e


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,85.0,85.0
mean,0.43,0.5
std,0.03,0.02
min,0.4,0.48
25%,0.41,0.48
50%,0.42,0.49
75%,0.45,0.51
max,0.56,0.58


h


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,72.0,72.0
mean,0.44,0.51
std,0.05,0.02
min,0.4,0.48
25%,0.41,0.49
50%,0.42,0.5
75%,0.45,0.52
max,0.61,0.58


i


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,421.0,421.0
mean,0.45,0.52
std,0.04,0.03
min,0.4,0.48
25%,0.42,0.49
50%,0.44,0.51
75%,0.47,0.53
max,0.6,0.7


In [90]:
df_l = pd.DataFrame()
for el in missed_ones:
    df_l = pd.concat([df_l, missed_topics_with_images[el]], ignore_index=True)
# df_
display(df_l[['topic2caption_sim', 'concept2caption_sim']].describe().round(2))

Unnamed: 0,topic2caption_sim,concept2caption_sim
count,868.0,868.0
mean,0.44,0.52
std,0.04,0.03
min,0.4,0.48
25%,0.41,0.49
50%,0.43,0.51
75%,0.46,0.53
max,0.61,0.7


In [91]:
name_lsit = ['stage1_train', 'stage2_train', 'val']
por_dict = dict()
data_dict = dict()
for n in name_lsit:
    data_dict[n] = pd.DataFrame() 
for el in missed_ones:
    s1, s2, v = get_portion(missed_topics_with_images[el])
    
    s1 = s1.reset_index(drop=True)
    data_dict['stage1_train'] = pd.concat([data_dict['stage1_train'], s1], ignore_index=True)
    
    s2 = s2.reset_index(drop=True)
    data_dict['stage2_train'] = pd.concat([data_dict['stage2_train'], s2], ignore_index=True)
    
    v = v.reset_index(drop=True)
    data_dict['val'] = pd.concat([data_dict['val'], v], ignore_index=True)
#     data_dict[el] = por_dict

In [92]:
print(len(data_dict['stage1_train']))
print(len(data_dict['stage2_train']))
print(len(data_dict['val']))

625
138
105


In [93]:
#Cider
! mkdir -p /raid/AISSEL/htest/datasets/av_data/wit/missed_q50_over_avg_cider
saved_path = '/raid/AISSEL/htest/datasets/av_data/wit/missed_q50_over_avg_cider'
for el in data_dict:
    name = f'caption_{el}.tsv'
    df_s = data_dict[el][names]
    df_s.to_csv(f'{saved_path}/{name}', sep="\t", index=False, header=False)

    print(name)

caption_stage1_train.tsv
caption_stage2_train.tsv
caption_val.tsv


### CIDEr web

In [91]:
missed_ones = ['c', 'd', 'e', 'f']

In [92]:
for el in missed_ones:
    print(el)
    display(missed_topics_with_images[el][['topic2caption_sim', 'concept2caption_sim']].describe().round(2))

c


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,248.0,248.0
mean,0.43,0.52
std,0.03,0.03
min,0.4,0.48
25%,0.41,0.5
50%,0.43,0.52
75%,0.44,0.54
max,0.57,0.64


d


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,42.0,42.0
mean,0.43,0.52
std,0.02,0.03
min,0.4,0.48
25%,0.41,0.5
50%,0.42,0.52
75%,0.44,0.53
max,0.5,0.62


e


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,85.0,85.0
mean,0.43,0.5
std,0.03,0.02
min,0.4,0.48
25%,0.41,0.48
50%,0.42,0.49
75%,0.45,0.51
max,0.56,0.58


f


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,591.0,591.0
mean,0.46,0.52
std,0.05,0.03
min,0.4,0.48
25%,0.42,0.49
50%,0.45,0.51
75%,0.49,0.53
max,0.69,0.62


In [93]:
df_l = pd.DataFrame()
for el in missed_ones:
    df_l = pd.concat([df_l, missed_topics_with_images[el]], ignore_index=True)
# df_
display(df_l[['topic2caption_sim', 'concept2caption_sim']].describe().round(2))

Unnamed: 0,topic2caption_sim,concept2caption_sim
count,966.0,966.0
mean,0.45,0.52
std,0.05,0.03
min,0.4,0.48
25%,0.42,0.49
50%,0.43,0.51
75%,0.47,0.53
max,0.69,0.64


In [94]:
name_lsit = ['stage1_train', 'stage2_train', 'val']
por_dict = dict()
data_dict = dict()
for n in name_lsit:
    data_dict[n] = pd.DataFrame() 
for el in missed_ones:
    s1, s2, v = get_portion(missed_topics_with_images[el])
    
    s1 = s1.reset_index(drop=True)
    data_dict['stage1_train'] = pd.concat([data_dict['stage1_train'], s1], ignore_index=True)
    
    s2 = s2.reset_index(drop=True)
    data_dict['stage2_train'] = pd.concat([data_dict['stage2_train'], s2], ignore_index=True)
    
    v = v.reset_index(drop=True)
    data_dict['val'] = pd.concat([data_dict['val'], v], ignore_index=True)
#     data_dict[el] = por_dict

In [95]:
print(len(data_dict['stage1_train']))
print(len(data_dict['stage2_train']))
print(len(data_dict['val']))

696
154
116


In [96]:
#Cider
! mkdir -p /raid/AISSEL/htest/datasets/av_data/wit/missed_q50_over_avg_cider_web
saved_path = '/raid/AISSEL/htest/datasets/av_data/wit/missed_q50_over_avg_cider_web'
for el in data_dict:
    name = f'caption_{el}.tsv'
    df_s = data_dict[el][names]
    df_s.to_csv(f'{saved_path}/{name}', sep="\t", index=False, header=False)

    print(name)

caption_stage1_train.tsv
caption_stage2_train.tsv
caption_val.tsv


In [161]:
name_lsit = ['stage1_train', 'stage2_train', 'val', 'test']
por_dict = dict()
data_dict = dict()
for n in name_lsit:
    data_dict[n] = pd.DataFrame() 
for el in missed_topics_with_images:
    s1, s2, v, t = get_portion(missed_topics_with_images[el])
    
    s1 = s1.reset_index(drop=True)
    data_dict['stage1_train'] = pd.concat([data_dict['stage1_train'], s1], ignore_index=True)
    
    s2 = s2.reset_index(drop=True)
    data_dict['stage2_train'] = pd.concat([data_dict['stage2_train'], s2], ignore_index=True)
    
    t = t.reset_index(drop=True)
    data_dict['test'] = pd.concat([data_dict['test'], t], ignore_index=True)
    
    v = v.reset_index(drop=True)
    data_dict['val'] = pd.concat([data_dict['val'], v], ignore_index=True)
#     data_dict[el] = por_dict

In [162]:
len(data_dict['stage1_train'])

4121

In [163]:
len(data_dict['stage2_train'])

825

In [164]:
len(data_dict['test'])

273

In [165]:
len(data_dict['val'])

274

In [295]:
! mkdir -p /raid/AISSEL/htest/datasets/av_data

In [171]:
data_dict['test']

Unnamed: 0,uniq_id,image_id,caption,context,topic_id,labels,image,topics
0,12,12,This drone is an example of UAV that could be ...,Humanrobot interaction is the study of interac...,31,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,a
1,25,25,Ryan BQM34 Firebee jetpropelled drone used as ...,A target drone is an unmanned aerial vehicle g...,31,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,a
2,28,28,ARDrone preproduction prototype Parrot ARDrone...,Parrot ARDrone is a discontinued remote contro...,31,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,a
3,63,63,English Naval Aircraft Factory TDN1 assault d...,The Naval Aircraft Factory TDN was an early un...,31,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,a
4,76,76,Unmanned Aerial Vehicle trainer at Holloman Ai...,The 429th Attack Squadron is a classic associa...,31,,iVBORw0KGgoAAAANSUhEUgAABAAAAAKpCAIAAACtvMnIAA...,a
...,...,...,...,...,...,...,...,...
268,1312,1312,A Pakistan International Airlines Boeing 720 s...,In March 1981 AlZulfiqar led by Murtaza Bhutto...,220,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,i
269,1321,1321,A photo of the monument shortly before it open...,The Pentagon Memorial located just southwest o...,220,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,i
270,1339,1339,English Satam Muhammed Abdel Rahman alSuqami ...,Satam Muhammed Abdel Rahman alSuqami was a Sau...,220,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,i
271,1368,1368,Overview of the accident site Empire Airlines ...,Empire Airlines Flight 8284 was a cargo flight...,220,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,i


In [172]:
names = ['uniq_id', 'image_id', 'caption', 'labels', 'image']

In [173]:
data_dict['test'][names]

Unnamed: 0,uniq_id,image_id,caption,labels,image
0,12,12,This drone is an example of UAV that could be ...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1,25,25,Ryan BQM34 Firebee jetpropelled drone used as ...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
2,28,28,ARDrone preproduction prototype Parrot ARDrone...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
3,63,63,English Naval Aircraft Factory TDN1 assault d...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
4,76,76,Unmanned Aerial Vehicle trainer at Holloman Ai...,,iVBORw0KGgoAAAANSUhEUgAABAAAAAKpCAIAAACtvMnIAA...
...,...,...,...,...,...
268,1312,1312,A Pakistan International Airlines Boeing 720 s...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
269,1321,1321,A photo of the monument shortly before it open...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
270,1339,1339,English Satam Muhammed Abdel Rahman alSuqami ...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
271,1368,1368,Overview of the accident site Empire Airlines ...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


In [175]:
saved_path = '/raid/AISSEL/htest/datasets/av_data/all'
for el in data_dict:
    name = f'caption_{el}_all_topics_av.tsv'
    df_s = data_dict[el][names]
    df_s.to_csv(f'{saved_path}/{name}', sep="\t", index=False, header=False)

    print(name)

caption_stage1_train_all_topics_av.tsv
caption_stage2_train_all_topics_av.tsv
caption_val_all_topics_av.tsv
caption_test_all_topics_av.tsv


In [176]:
fn = '/raid/AISSEL/htest/datasets/av_data/all/caption_test_all_topics_av.tsv'
df_test = pd.read_csv(fn, sep='\t', names=names)
df_test

Unnamed: 0,uniq_id,image_id,caption,labels,image
0,12,12,This drone is an example of UAV that could be ...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1,25,25,Ryan BQM34 Firebee jetpropelled drone used as ...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
2,28,28,ARDrone preproduction prototype Parrot ARDrone...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
3,63,63,English Naval Aircraft Factory TDN1 assault d...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
4,76,76,Unmanned Aerial Vehicle trainer at Holloman Ai...,,iVBORw0KGgoAAAANSUhEUgAABAAAAAKpCAIAAACtvMnIAA...
...,...,...,...,...,...
268,1312,1312,A Pakistan International Airlines Boeing 720 s...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
269,1321,1321,A photo of the monument shortly before it open...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
270,1339,1339,English Satam Muhammed Abdel Rahman alSuqami ...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
271,1368,1368,Overview of the accident site Empire Airlines ...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


In [177]:
data_dict['test'].to_csv(f'{saved_path}/test_all_google_style.tsv', sep="\t", index=False)

In [208]:
df_t = pd.read_csv(f'{saved_path}/test_all_google_style.tsv', sep="\t")
df_t

Unnamed: 0,uniq_id,image_id,caption,context,topic_id,labels,image,topics
0,12,12,This drone is an example of UAV that could be ...,Humanrobot interaction is the study of interac...,31,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,a
1,25,25,Ryan BQM34 Firebee jetpropelled drone used as ...,A target drone is an unmanned aerial vehicle g...,31,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,a
2,28,28,ARDrone preproduction prototype Parrot ARDrone...,Parrot ARDrone is a discontinued remote contro...,31,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,a
3,63,63,English Naval Aircraft Factory TDN1 assault d...,The Naval Aircraft Factory TDN was an early un...,31,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,a
4,76,76,Unmanned Aerial Vehicle trainer at Holloman Ai...,The 429th Attack Squadron is a classic associa...,31,,iVBORw0KGgoAAAANSUhEUgAABAAAAAKpCAIAAACtvMnIAA...,a
...,...,...,...,...,...,...,...,...
268,1312,1312,A Pakistan International Airlines Boeing 720 s...,In March 1981 AlZulfiqar led by Murtaza Bhutto...,220,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,i
269,1321,1321,A photo of the monument shortly before it open...,The Pentagon Memorial located just southwest o...,220,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,i
270,1339,1339,English Satam Muhammed Abdel Rahman alSuqami ...,Satam Muhammed Abdel Rahman alSuqami was a Sau...,220,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,i
271,1368,1368,Overview of the accident site Empire Airlines ...,Empire Airlines Flight 8284 was a cargo flight...,220,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,i


### save just missing one !

h, i

In [187]:
sel_list = ['h', 'i']
miss_list = ['h', 'i']

In [188]:
sel_topics_dict = dict()
for el in miss_list:
    sel_topics_dict[el] = pg_dict[el]
    sel_topics_dict[el]['image_path'] = sel_topics_dict[el]['image_url'].apply(link_to_b64)
    sel_topics_dict[el] = sel_topics_dict[el][sel_topics_dict[el].image_path.notnull()]
    sel_topics_dict[el] = sel_topics_dict[el].reset_index(drop=True)

In [189]:
selected_col = ['caption', 'context', 'topic_id', 'image_path']
for el in sel_topics_dict:
    sel_topics_dict[el] = sel_topics_dict[el][selected_col]
    sel_topics_dict[el] = sel_topics_dict[el].drop_duplicates(subset=selected_col, keep=False)
    sel_topics_dict[el] = sel_topics_dict[el].reset_index(drop=True)

In [190]:
for el in sel_topics_dict:
    print(el, len(sel_topics_dict[el]))

h 1197
i 1044


In [191]:
sorted_key = []
for k in sorted(sel_topics_dict, key=lambda k: len(sel_topics_dict[k])):
    print(k, len(sel_topics_dict[k]))
    sorted_key.append(k)

i 1044
h 1197


In [192]:
sorted_key

['i', 'h']

In [193]:
for idx, k in enumerate(sorted_key):
    print(sorted_key[idx:], k)
    for el in sorted_key[idx:]:
        if el == k:
            continue
        print(f'we are going to calc: {el} - {k}')
        sel_topics_dict[el] = df2_df1(sel_topics_dict[k], sel_topics_dict[el])
#     break

['i', 'h'] i
we are going to calc: h - i
['h'] h


In [194]:
sorted_key = []
for k in sorted(sel_topics_dict, key=lambda k: len(sel_topics_dict[k])):
    print(k, len(sel_topics_dict[k]))
    sorted_key.append(k)

i 1044
h 1129


In [195]:
sel_topics_dict[el]

Unnamed: 0,caption,context,topic_id,image_path
0,An Aeroflot Sukhoi Superjet 100 at Sheremetyev...,The Aeroflot passenger fleet consists of narro...,145,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
1,Aeroflot is the main user. English: An Aeroflo...,The Sukhoi Superjet 100 or SSJ100 is a regiona...,145,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
2,"The aircraft involved seen in 2017 Aeroflot, R...",Aeroflot Flight 1492 was a regular passenger f...,145,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
3,An Airbus A319 of Aeroflot English: Aeroflot -...,Aeroflot is the national airline of the Russia...,145,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
4,Ilyushin Il-96 and Sukhoi Superjet 100 of Aero...,The PJSC United Aircraft Corporation is a Russ...,145,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
...,...,...,...,...
1192,The Airbus A380 is currently the world's large...,"A four-engined jet, sometimes called a quadjet...",295,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
1193,EC-LZD Evelop Airlines Airbus A320-214 2,This is a list of airlines of Spain. The list ...,295,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
1194,A Singapore Airlines Boeing 777-300ER Singapor...,Singapore Airlines is the flag carrier airline...,295,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
1195,Arkia Israel Airlines plane Arkia Israel Airli...,United Nations Security Council resolution 145...,295,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...


In [196]:
sel_topics_with_images = dict()
count = 0
for el in sel_topics_dict:
    sel_topics_with_images[el] = create_data(sel_topics_dict[el], count)
    count = len(sel_topics_with_images[el])

/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy9lL2UxL1ZRLUJRR19MTEJHMjUtMDctMjAxNGEuanBn.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy85LzlkLzEwMV9JT0YtU3R5bGl6ZWQuanBn.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy9jL2NiL0pha2FydGFfS2VtYXlvcmFuX0FpcnBvcnRfaW5fMTk2Mi5qcGc=.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy80LzQwLzY4MFcuanBn.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy9mL2Y4L0VHTExfLV9BaXJidXNfQTMyMF8tX0NvYmFsdF9BaXJfLV81Qi1EQ1JfJTI4NDMwNzk5MTU0NjUlMjkuanBn.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy9jL2M0L1Nhc19iNzM3LTgwMF9sbi1yY3lfYXJwLmpwZw==.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZ

#### Drop val from miss train set

In [197]:
dff = data_dict['test'].copy()
dff_h = dff[dff['topics'] == 'h']
dff_i = dff[dff['topics'] == 'i']

In [198]:
df_f = sel_topics_with_images['h']
h_idx = []
for index, row in tqdm(dff_h.iterrows(), total=dff_h.shape[0]):
#     res = df_f.loc[df_f['caption'].str.contains(row['caption'], case=False)]
    res = df_f[df_f['caption'] == row['caption']]
    for el in res.index.tolist():
        h_idx.append(el)
h_idx = list(set(h_idx))
print(h_idx)

  0%|          | 0/47 [00:00<?, ?it/s]

[387, 515, 771, 1031, 14, 144, 401, 785, 1040, 20, 665, 669, 417, 674, 675, 803, 681, 557, 813, 688, 442, 699, 316, 320, 194, 836, 969, 587, 78, 1103, 208, 209, 1104, 341, 214, 87, 597, 857, 346, 853, 604, 605, 737, 610, 999, 619, 236, 493, 878, 623, 1008, 508]


In [199]:
len(h_idx)

52

In [200]:
df_f = sel_topics_with_images['i']
i_idx = []
for index, row in tqdm(dff_i.iterrows(), total=dff_i.shape[0]):
#     res = df_f.loc[df_f['caption'].str.contains(row['caption'], case=False)]
    res = df_f[df_f['caption'] == row['caption']]
    for el in res.index.tolist():
        i_idx.append(el)
i_idx = list(set(i_idx))
print(i_idx)

  0%|          | 0/46 [00:00<?, ?it/s]

[640, 513, 514, 261, 265, 393, 397, 639, 912, 273, 401, 921, 410, 794, 28, 29, 922, 927, 160, 546, 163, 42, 939, 44, 432, 560, 816, 567, 824, 570, 571, 189, 64, 838, 966, 74, 718, 722, 725, 986, 219, 987, 869, 105, 237, 623, 240, 887, 372, 886, 631, 122, 379, 126, 383]


In [201]:
len(i_idx)

55

In [202]:
for el in sel_topics_with_images:
    print(el, len(sel_topics_with_images[el]))

h 1122
i 1033


In [203]:
sel_topics_with_images['h'] = sel_topics_with_images['h'].drop(sel_topics_with_images['h'].index[h_idx])
sel_topics_with_images['i'] = sel_topics_with_images['i'].drop(sel_topics_with_images['i'].index[i_idx])

In [205]:
data_dict['test']

Unnamed: 0,uniq_id,image_id,caption,context,topic_id,labels,image,topics
0,12,12,This drone is an example of UAV that could be ...,Humanrobot interaction is the study of interac...,31,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,a
1,25,25,Ryan BQM34 Firebee jetpropelled drone used as ...,A target drone is an unmanned aerial vehicle g...,31,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,a
2,28,28,ARDrone preproduction prototype Parrot ARDrone...,Parrot ARDrone is a discontinued remote contro...,31,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,a
3,63,63,English Naval Aircraft Factory TDN1 assault d...,The Naval Aircraft Factory TDN was an early un...,31,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,a
4,76,76,Unmanned Aerial Vehicle trainer at Holloman Ai...,The 429th Attack Squadron is a classic associa...,31,,iVBORw0KGgoAAAANSUhEUgAABAAAAAKpCAIAAACtvMnIAA...,a
...,...,...,...,...,...,...,...,...
268,1312,1312,A Pakistan International Airlines Boeing 720 s...,In March 1981 AlZulfiqar led by Murtaza Bhutto...,220,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,i
269,1321,1321,A photo of the monument shortly before it open...,The Pentagon Memorial located just southwest o...,220,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,i
270,1339,1339,English Satam Muhammed Abdel Rahman alSuqami ...,Satam Muhammed Abdel Rahman alSuqami was a Sau...,220,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,i
271,1368,1368,Overview of the accident site Empire Airlines ...,Empire Airlines Flight 8284 was a cargo flight...,220,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,i


In [204]:
sel_topics_with_images['h']

Unnamed: 0,uniq_id,image_id,caption,context,topic_id,labels,image
0,0,0,An Aeroflot Sukhoi Superjet 100 at Sheremetyev...,The Aeroflot passenger fleet consists of narro...,145,,iVBORw0KGgoAAAANSUhEUgAABj8AAAQpCAIAAAARSIheAA...
1,1,1,Aeroflot is the main user English An Aeroflot ...,The Sukhoi Superjet 100 or SSJ100 is a regiona...,145,,iVBORw0KGgoAAAANSUhEUgAABj8AAAQpCAIAAAARSIheAA...
2,2,2,The aircraft involved seen in 2017 Aeroflot RA...,Aeroflot Flight 1492 was a regular passenger f...,145,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
3,3,3,An Airbus A319 of Aeroflot English Aeroflot R...,Aeroflot is the national airline of the Russia...,145,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
4,4,4,Ilyushin Il96 and Sukhoi Superjet 100 of Aerof...,The PJSC United Aircraft Corporation is a Russ...,145,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
...,...,...,...,...,...,...,...
1117,1192,1192,The Airbus A380 is currently the worlds larges...,A fourengined jet sometimes called a quadjet i...,295,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1118,1193,1193,ECLZD Evelop Airlines Airbus A320214 2,This is a list of airlines of Spain The list i...,295,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1119,1194,1194,A Singapore Airlines Boeing 777300ER Singapore...,Singapore Airlines is the flag carrier airline...,295,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1120,1195,1195,Arkia Israel Airlines plane Arkia Israel Airli...,United Nations Security Council resolution 145...,295,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


In [206]:
for el in sel_topics_with_images:
    sel_topics_with_images[el] = sel_topics_with_images[el].dropna(subset=['caption'])
    sel_topics_with_images[el]['topics']  = [el] * len(sel_topics_with_images[el])
    print(el, len(sel_topics_with_images[el]))

h 1070
i 978


In [209]:
name_lsit = ['stage1_train', 'stage2_train', 'val', 'test']
por_dict = dict()
data_dict = dict()
for n in name_lsit:
    data_dict[n] = pd.DataFrame() 
for el in miss_list:
    s1, s2, v, t = get_portion(sel_topics_with_images[el])
    
    s1 = s1.reset_index(drop=True)
    data_dict['stage1_train'] = pd.concat([data_dict['stage1_train'], s1], ignore_index=True)
    
    s2 = s2.reset_index(drop=True)
    data_dict['stage2_train'] = pd.concat([data_dict['stage2_train'], s2], ignore_index=True)
    
    t = t.reset_index(drop=True)
    data_dict['test'] = pd.concat([data_dict['test'], t], ignore_index=True)
    
    v = v.reset_index(drop=True)
    data_dict['val'] = pd.concat([data_dict['val'], v], ignore_index=True)
#     data_dict[el] = por_dict

In [210]:
len(data_dict['stage1_train'])

1536

In [211]:
len(data_dict['stage2_train'])

307

In [212]:
len(data_dict['val'])

103

In [213]:
len(data_dict['test'])

102

In [216]:
names = ['uniq_id', 'image_id', 'caption', 'labels', 'image']
df_s = data_dict['stage1_train'][names]
df_s

Unnamed: 0,uniq_id,image_id,caption,labels,image
0,1167,1167,English Lineup of China Airlines jets includi...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1,515,515,Southwest Airlines headquarters in Dallas Engl...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
2,347,347,English Fly Alaska Seaplanes terminal Gustavu...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
3,637,637,Bombardier Global Express Global 6000 landing ...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
4,1084,1084,English Royal Malaysian Air Force Airbus A400M,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
...,...,...,...,...,...
1531,1602,1602,Breguet 941 prototype in 1963 English Flight e...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1532,1234,1234,The cockpit starboard console of an A6M2 which...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1533,2125,2125,A United DC6 at Stapleton Airport similar to t...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
1534,1317,1317,The salvaged cockpit of XW666 XW666crashed int...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


In [217]:
saved_path = '/raid/AISSEL/htest/datasets/av_data/missed'
for el in data_dict:
    name = f'caption_{el}_miss_topics_av.tsv'
    df_s = data_dict[el][names]
    df_s.to_csv(f'{saved_path}/{name}', sep="\t", index=False, header=False)

    print(name)

caption_stage1_train_miss_topics_av.tsv
caption_stage2_train_miss_topics_av.tsv
caption_val_miss_topics_av.tsv
caption_test_miss_topics_av.tsv


In [218]:
df_s = pd.read_csv(f'{saved_path}/caption_{el}_miss_topics_av.tsv', sep="\t", names=names)
df_s

Unnamed: 0,uniq_id,image_id,caption,labels,image
0,10,10,English AeroflotNord Boeing 737 Norsk bokmål ...,,iVBORw0KGgoAAAANSUhEUgAAAxoAAAHPCAIAAABbTjuwAA...
1,29,29,Flight Airfield ShevlinoRussian glider AC6 3926K,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
2,35,35,A Piper Aerostar general aviation aircraft at ...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
3,37,37,Aeroflot Tupolev Tu104B at Arlanda Airport in ...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
4,38,38,Yamal Airlines Sukhoi Superjet 100 in Roshchin...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
...,...,...,...,...,...
97,2115,2115,The aircraft involved in the accident English ...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
98,2116,2116,United Airlines DC811 powered by Pratt Whitne...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
99,2149,2149,A UH72A at the Pentagon 2010 English The US Ar...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...
100,2156,2156,The Pentagon headquarters of the Department of...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...


# Random dataset

In [39]:
df = pd.read_csv(f'{root_path}/aircraft_q75_en_sbert.tsv', sep="\t")
df = df.drop('Unnamed: 0', 1)
df

Unnamed: 0,language,page_url,image_url,page_title,section_title,hierarchical_section_title,caption_reference_description,caption_attribution_description,caption_alt_text_description,mime_type,...,251_context_score,31_context_score,10_context_score,258_context_score,72_context_score,93_context_score,118_context_score,19_context_score,139_context_score,237_context_score
0,en,https://en.wikipedia.org/wiki/FMW_7th_Annivers...,https://upload.wikimedia.org/wikipedia/commons...,FMW 7th Anniversary Show,,FMW 7th Anniversary Show,Kawasaki Stadium,English: kawasaki_fujimi Stadium 日本語: 川崎富士見球技場...,,image/jpeg,...,0.020924,0.033392,0.091493,0.065447,0.147460,0.013493,0.008091,0.039506,0.079395,0.018479
1,en,https://en.wikipedia.org/wiki/1994,https://upload.wikimedia.org/wikipedia/commons...,1994,June,1994 / Deaths / June,Menachem Mendel Schneerson,English: Menachem Mendel Schneerson - the Luba...,,image/jpeg,...,0.044889,-0.039786,-0.068970,-0.041991,0.244107,0.039615,0.005427,0.041240,-0.072191,0.058443
2,en,https://en.wikipedia.org/wiki/Duffy_Dyer,https://upload.wikimedia.org/wikipedia/commons...,Duffy Dyer,,Duffy Dyer,,English: Image cropped from a baseball card of...,,image/jpeg,...,-0.056391,0.009440,0.066075,0.074756,0.015237,-0.005682,-0.019572,0.151901,0.103661,0.111322
3,en,https://en.wikipedia.org/wiki/Anguera,http://upload.wikimedia.org/wikipedia/commons/...,Anguera,,Anguera,,Português: Anguera,Official seal of Anguera,image/jpeg,...,0.043653,0.124242,0.076659,-0.061763,-0.013700,-0.013047,-0.066803,0.054065,0.030850,-0.076256
4,en,https://en.wikipedia.org/wiki/Comparison_of_HT...,https://upload.wikimedia.org/wikipedia/commons...,Comparison of HTC devices,S Series (Windows Mobile),Comparison of HTC devices / S Series (Windows ...,,HTC S710,,image/jpeg,...,0.019863,0.030068,-0.109320,0.052087,-0.068640,-0.038589,-0.016587,0.056286,-0.053078,-0.071031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1352996,en,https://en.wikipedia.org/wiki/Urban_Transporta...,https://upload.wikimedia.org/wikipedia/commons...,Urban Transportation Development Corporation,UTDC products,Urban Transportation Development Corporation /...,Massachusetts Bay Transportation Authority 170...,English: UTDC/Bombardier 1700 series Red Line ...,,image/jpeg,...,0.185282,-0.023973,0.133213,0.088106,0.064406,0.066175,0.129016,-0.035138,-0.027193,0.038367
1352997,en,https://en.wikipedia.org/wiki/Central_Institut...,https://upload.wikimedia.org/wikipedia/commons...,Central Institute of Plastics Engineering & Te...,,Central Institute of Plastics Engineering & Te...,,English: CIPET,,image/jpeg,...,0.091530,0.142491,-0.009801,-0.159589,-0.089324,0.106132,-0.015100,-0.045212,0.013892,-0.063472
1352998,en,https://en.wikipedia.org/wiki/SUN_workstation,https://upload.wikimedia.org/wikipedia/commons...,SUN workstation,History,SUN workstation / History,The three boards (plus memory extension) as la...,Sun 100Y Cardcage and Powersupply Photo taken ...,,image/jpeg,...,0.081194,0.053022,0.059844,0.125492,0.102487,0.197474,0.002423,0.060117,0.070297,0.034687
1352999,en,https://en.wikipedia.org/wiki/Pyatigorsk,https://upload.wikimedia.org/wikipedia/commons...,Pyatigorsk,,Pyatigorsk,,Русский: снимок июля 2009 г.,,image/jpeg,...,-0.053566,0.017492,0.253884,-0.063466,0.014604,-0.015967,-0.007643,0.060969,-0.028550,-0.083007


In [40]:
df_cap = df
df_cap

Unnamed: 0,language,page_url,image_url,page_title,section_title,hierarchical_section_title,caption_reference_description,caption_attribution_description,caption_alt_text_description,mime_type,...,251_context_score,31_context_score,10_context_score,258_context_score,72_context_score,93_context_score,118_context_score,19_context_score,139_context_score,237_context_score
0,en,https://en.wikipedia.org/wiki/FMW_7th_Annivers...,https://upload.wikimedia.org/wikipedia/commons...,FMW 7th Anniversary Show,,FMW 7th Anniversary Show,Kawasaki Stadium,English: kawasaki_fujimi Stadium 日本語: 川崎富士見球技場...,,image/jpeg,...,0.020924,0.033392,0.091493,0.065447,0.147460,0.013493,0.008091,0.039506,0.079395,0.018479
1,en,https://en.wikipedia.org/wiki/1994,https://upload.wikimedia.org/wikipedia/commons...,1994,June,1994 / Deaths / June,Menachem Mendel Schneerson,English: Menachem Mendel Schneerson - the Luba...,,image/jpeg,...,0.044889,-0.039786,-0.068970,-0.041991,0.244107,0.039615,0.005427,0.041240,-0.072191,0.058443
2,en,https://en.wikipedia.org/wiki/Duffy_Dyer,https://upload.wikimedia.org/wikipedia/commons...,Duffy Dyer,,Duffy Dyer,,English: Image cropped from a baseball card of...,,image/jpeg,...,-0.056391,0.009440,0.066075,0.074756,0.015237,-0.005682,-0.019572,0.151901,0.103661,0.111322
3,en,https://en.wikipedia.org/wiki/Anguera,http://upload.wikimedia.org/wikipedia/commons/...,Anguera,,Anguera,,Português: Anguera,Official seal of Anguera,image/jpeg,...,0.043653,0.124242,0.076659,-0.061763,-0.013700,-0.013047,-0.066803,0.054065,0.030850,-0.076256
4,en,https://en.wikipedia.org/wiki/Comparison_of_HT...,https://upload.wikimedia.org/wikipedia/commons...,Comparison of HTC devices,S Series (Windows Mobile),Comparison of HTC devices / S Series (Windows ...,,HTC S710,,image/jpeg,...,0.019863,0.030068,-0.109320,0.052087,-0.068640,-0.038589,-0.016587,0.056286,-0.053078,-0.071031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1352996,en,https://en.wikipedia.org/wiki/Urban_Transporta...,https://upload.wikimedia.org/wikipedia/commons...,Urban Transportation Development Corporation,UTDC products,Urban Transportation Development Corporation /...,Massachusetts Bay Transportation Authority 170...,English: UTDC/Bombardier 1700 series Red Line ...,,image/jpeg,...,0.185282,-0.023973,0.133213,0.088106,0.064406,0.066175,0.129016,-0.035138,-0.027193,0.038367
1352997,en,https://en.wikipedia.org/wiki/Central_Institut...,https://upload.wikimedia.org/wikipedia/commons...,Central Institute of Plastics Engineering & Te...,,Central Institute of Plastics Engineering & Te...,,English: CIPET,,image/jpeg,...,0.091530,0.142491,-0.009801,-0.159589,-0.089324,0.106132,-0.015100,-0.045212,0.013892,-0.063472
1352998,en,https://en.wikipedia.org/wiki/SUN_workstation,https://upload.wikimedia.org/wikipedia/commons...,SUN workstation,History,SUN workstation / History,The three boards (plus memory extension) as la...,Sun 100Y Cardcage and Powersupply Photo taken ...,,image/jpeg,...,0.081194,0.053022,0.059844,0.125492,0.102487,0.197474,0.002423,0.060117,0.070297,0.034687
1352999,en,https://en.wikipedia.org/wiki/Pyatigorsk,https://upload.wikimedia.org/wikipedia/commons...,Pyatigorsk,,Pyatigorsk,,Русский: снимок июля 2009 г.,,image/jpeg,...,-0.053566,0.017492,0.253884,-0.063466,0.014604,-0.015967,-0.007643,0.060969,-0.028550,-0.083007


In [41]:
caption_score_list = [el for el in df.columns if 'caption_score' in el]
caption_score_list = caption_score_list[1:]
print(caption_score_list)
print(len(caption_score_list))

['63_caption_score', '3_caption_score', '271_caption_score', '145_caption_score', '298_caption_score', '11_caption_score', '45_caption_score', '201_caption_score', '35_caption_score', '108_caption_score', '58_caption_score', '144_caption_score', '157_caption_score', '186_caption_score', '230_caption_score', '159_caption_score', '220_caption_score', '77_caption_score', '256_caption_score', '104_caption_score', '86_caption_score', '2_caption_score', '34_caption_score', '247_caption_score', '23_caption_score', '80_caption_score', '165_caption_score', '32_caption_score', '319_caption_score', '126_caption_score', '66_caption_score', '225_caption_score', '25_caption_score', '8_caption_score', '235_caption_score', '295_caption_score', '227_caption_score', '44_caption_score', '234_caption_score', '50_caption_score', '251_caption_score', '31_caption_score', '10_caption_score', '258_caption_score', '72_caption_score', '93_caption_score', '118_caption_score', '19_caption_score', '139_caption_scor

In [42]:
context_score_list = [el for el in df.columns if 'context_score' in el]
# context_score_list = context_score_list[:-1]
print(context_score_list)
print(len(context_score_list))

['63_context_score', '3_context_score', '271_context_score', '145_context_score', '298_context_score', '11_context_score', '45_context_score', '201_context_score', '35_context_score', '108_context_score', '58_context_score', '144_context_score', '157_context_score', '186_context_score', '230_context_score', '159_context_score', '220_context_score', '77_context_score', '256_context_score', '104_context_score', '86_context_score', '2_context_score', '34_context_score', '247_context_score', '23_context_score', '80_context_score', '165_context_score', '32_context_score', '319_context_score', '126_context_score', '66_context_score', '225_context_score', '25_context_score', '8_context_score', '235_context_score', '295_context_score', '227_context_score', '44_context_score', '234_context_score', '50_context_score', '251_context_score', '31_context_score', '10_context_score', '258_context_score', '72_context_score', '93_context_score', '118_context_score', '19_context_score', '139_context_scor

In [43]:
other_list = [el for el in df.columns if el not in context_score_list and el not in caption_score_list]
other_list

['language',
 'page_url',
 'image_url',
 'page_title',
 'section_title',
 'hierarchical_section_title',
 'caption_reference_description',
 'caption_attribution_description',
 'caption_alt_text_description',
 'mime_type',
 'original_height',
 'original_width',
 'is_main_image',
 'attribution_passes_lang_id',
 'page_changed_recently',
 'context_page_description',
 'context_section_description',
 'caption',
 'context',
 'aircraft_caption_score']

In [44]:
im_c = ['image_url', 'caption', 'context']
cap_s = ['aircraft_caption_score']
col_list = im_c + cap_s + caption_score_list + context_score_list

In [50]:
df_dict = dict()
for col_id in q_id:
#     num, _ = col.split('_', 1)
    col = f'{col_id}_caption_score'
    c_list = [col] + [f'{col_id}_context_score'] + im_c + cap_s
    df_n = df_cap[c_list]
    df_n = df_n[df_n[col] > df_n[col].quantile(0.75)]
#     df_n = df_n[df_n[col] >= 0.40]
#     df_n = df_n[df_n['aircraft_caption_score'] >= 0.48]
#     df = df[df[col] > 0.55]
    df_n = df_n.sort_values(by=[col], ascending=False)
    df_dict[col] = df_n

In [51]:
df_dict[col]

Unnamed: 0,237_caption_score,237_context_score,image_url,caption,context,aircraft_caption_score
1065260,0.602327,0.558602,https://upload.wikimedia.org/wikipedia/commons...,The Pilot in Command must hold the rank of Cap...,The pilot in command of an aircraft is the per...,0.286548
520840,0.561625,0.267597,https://upload.wikimedia.org/wikipedia/commons...,Nimbin crew members \n Captain William James B...,The Nimbin was a steel screw steamer built in ...,0.264203
1031741,0.550900,0.466005,https://upload.wikimedia.org/wikipedia/commons...,"Group Captain McCauley, 1943 English: AWM Capt...","Air Marshal Sir John Patrick Joseph McCauley, ...",0.228081
906004,0.545596,0.494362,https://upload.wikimedia.org/wikipedia/commons...,Crew management - captain at sea English: Crew...,"Crew management for ships, otherwise known as ...",0.214876
1086096,0.543684,0.277029,https://upload.wikimedia.org/wikipedia/commons...,Eric Kaniut (in yellow) works out with other o...,Captain Eric G. Kaniut of the United States Na...,0.186574
...,...,...,...,...,...,...
717340,0.103997,-0.102237,http://upload.wikimedia.org/wikipedia/commons/...,Irène Curie,The Nobel Prize in Chemistry is awarded annual...,0.200706
1199542,0.103997,0.111687,https://upload.wikimedia.org/wikipedia/commons...,Pérez de Cuéllar and Iranian President Ali Kha...,Javier Felipe Ricardo Pérez de Cuéllar y de la...,0.173361
1246611,0.103996,-0.140092,https://upload.wikimedia.org/wikipedia/commons...,"English: Tall Ships Kathleen and May, and Lea...","There are over 9,000 Grade I listed buildings ...",0.194163
709623,0.103995,0.044810,https://upload.wikimedia.org/wikipedia/commons...,English: Caption on image: Seattle. Indian ca...,Past and present structures on Elliott Bay in ...,0.183036


In [52]:
for el in df_dict:
    t,_ = el.split('_', 1)
#     print(t)
    df_dict[el].columns = [f'{t}_caption_sim', f'{t}_context_sim', 'image_url',
                          'caption', 'context', f'aircraft_caption_sim']
    display(df_dict[el].describe().round(2))
    print('\n')

Unnamed: 0,63_caption_sim,63_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.18,0.1,0.25
std,0.06,0.11,0.09
min,0.12,-0.24,0.13
25%,0.13,0.03,0.18
50%,0.16,0.09,0.23
75%,0.2,0.17,0.31
max,0.67,0.64,0.72






Unnamed: 0,3_caption_sim,3_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.23,0.16,0.25
std,0.07,0.13,0.09
min,0.15,-0.28,0.13
25%,0.18,0.06,0.18
50%,0.21,0.14,0.23
75%,0.27,0.24,0.31
max,0.67,0.63,0.72






Unnamed: 0,271_caption_sim,271_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.18,0.12,0.25
std,0.06,0.1,0.09
min,0.12,-0.23,0.13
25%,0.14,0.05,0.17
50%,0.16,0.1,0.22
75%,0.21,0.18,0.3
max,0.53,0.65,0.72






Unnamed: 0,145_caption_sim,145_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.24,0.14,0.25
std,0.05,0.1,0.09
min,0.18,-0.24,0.13
25%,0.2,0.07,0.17
50%,0.22,0.14,0.23
75%,0.27,0.21,0.31
max,0.71,0.64,0.72






Unnamed: 0,298_caption_sim,298_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.13,0.09,0.24
std,0.05,0.09,0.09
min,0.08,-0.26,0.13
25%,0.1,0.02,0.17
50%,0.12,0.08,0.21
75%,0.15,0.14,0.3
max,0.52,0.57,0.72






Unnamed: 0,11_caption_sim,11_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.19,0.11,0.24
std,0.05,0.09,0.09
min,0.14,-0.26,0.13
25%,0.16,0.05,0.17
50%,0.18,0.1,0.21
75%,0.21,0.16,0.29
max,0.66,0.64,0.72






Unnamed: 0,45_caption_sim,45_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.23,0.15,0.25
std,0.07,0.12,0.09
min,0.16,-0.25,0.13
25%,0.18,0.06,0.18
50%,0.21,0.14,0.24
75%,0.27,0.22,0.31
max,0.64,0.63,0.72






Unnamed: 0,201_caption_sim,201_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.17,0.12,0.25
std,0.06,0.1,0.09
min,0.1,-0.21,0.13
25%,0.12,0.05,0.18
50%,0.15,0.11,0.23
75%,0.2,0.18,0.31
max,0.55,0.61,0.72






Unnamed: 0,35_caption_sim,35_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.22,0.13,0.24
std,0.05,0.1,0.09
min,0.16,-0.23,0.13
25%,0.18,0.07,0.17
50%,0.21,0.13,0.22
75%,0.25,0.19,0.3
max,0.68,0.63,0.72






Unnamed: 0,108_caption_sim,108_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.19,0.11,0.24
std,0.05,0.09,0.09
min,0.14,-0.22,0.13
25%,0.16,0.05,0.17
50%,0.18,0.11,0.22
75%,0.21,0.17,0.3
max,0.66,0.61,0.72






Unnamed: 0,58_caption_sim,58_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.18,0.11,0.24
std,0.05,0.09,0.09
min,0.12,-0.22,0.13
25%,0.14,0.05,0.17
50%,0.17,0.11,0.22
75%,0.2,0.17,0.29
max,0.7,0.65,0.72






Unnamed: 0,144_caption_sim,144_context_sim,aircraft_caption_sim
count,338248.0,338248.0,338248.0
mean,0.27,0.17,0.25
std,0.08,0.12,0.09
min,0.18,-0.24,0.13
25%,0.21,0.08,0.18
50%,0.24,0.15,0.23
75%,0.3,0.25,0.3
max,0.71,0.64,0.72






Unnamed: 0,157_caption_sim,157_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.24,0.13,0.25
std,0.06,0.11,0.09
min,0.18,-0.22,0.13
25%,0.2,0.04,0.17
50%,0.23,0.12,0.23
75%,0.28,0.2,0.3
max,0.69,0.66,0.72






Unnamed: 0,186_caption_sim,186_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.24,0.14,0.25
std,0.05,0.1,0.09
min,0.18,-0.26,0.13
25%,0.2,0.07,0.18
50%,0.23,0.13,0.23
75%,0.27,0.2,0.3
max,0.62,0.61,0.72






Unnamed: 0,230_caption_sim,230_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.19,0.14,0.26
std,0.06,0.1,0.09
min,0.12,-0.24,0.13
25%,0.14,0.07,0.19
50%,0.17,0.14,0.24
75%,0.22,0.21,0.31
max,0.59,0.61,0.72






Unnamed: 0,159_caption_sim,159_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.25,0.16,0.25
std,0.06,0.11,0.09
min,0.18,-0.29,0.13
25%,0.2,0.08,0.18
50%,0.23,0.15,0.23
75%,0.28,0.23,0.3
max,0.69,0.62,0.72






Unnamed: 0,220_caption_sim,220_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.19,0.11,0.25
std,0.06,0.11,0.09
min,0.13,-0.28,0.13
25%,0.15,0.04,0.17
50%,0.17,0.1,0.23
75%,0.22,0.18,0.31
max,0.69,0.69,0.72






Unnamed: 0,77_caption_sim,77_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.19,0.13,0.24
std,0.06,0.11,0.09
min,0.13,-0.24,0.13
25%,0.15,0.05,0.17
50%,0.18,0.12,0.22
75%,0.22,0.19,0.3
max,0.73,0.72,0.72






Unnamed: 0,256_caption_sim,256_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.2,0.11,0.24
std,0.06,0.09,0.09
min,0.13,-0.21,0.13
25%,0.15,0.04,0.17
50%,0.18,0.1,0.22
75%,0.23,0.17,0.3
max,0.58,0.65,0.72






Unnamed: 0,104_caption_sim,104_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.22,0.14,0.25
std,0.05,0.1,0.09
min,0.16,-0.24,0.13
25%,0.18,0.07,0.17
50%,0.21,0.13,0.23
75%,0.25,0.2,0.3
max,0.6,0.56,0.72






Unnamed: 0,86_caption_sim,86_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.2,0.12,0.24
std,0.06,0.1,0.09
min,0.14,-0.3,0.13
25%,0.16,0.05,0.17
50%,0.19,0.12,0.21
75%,0.23,0.18,0.29
max,0.72,0.71,0.72






Unnamed: 0,2_caption_sim,2_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.16,0.1,0.25
std,0.05,0.1,0.09
min,0.1,-0.23,0.13
25%,0.12,0.03,0.18
50%,0.14,0.09,0.23
75%,0.19,0.16,0.3
max,0.64,0.59,0.72






Unnamed: 0,34_caption_sim,34_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.15,0.1,0.25
std,0.06,0.1,0.09
min,0.09,-0.25,0.13
25%,0.1,0.03,0.17
50%,0.13,0.09,0.23
75%,0.18,0.17,0.3
max,0.57,0.6,0.72






Unnamed: 0,247_caption_sim,247_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.21,0.13,0.25
std,0.06,0.1,0.09
min,0.14,-0.26,0.13
25%,0.16,0.06,0.18
50%,0.19,0.13,0.23
75%,0.24,0.2,0.3
max,0.64,0.58,0.72






Unnamed: 0,23_caption_sim,23_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.19,0.11,0.25
std,0.05,0.1,0.09
min,0.13,-0.23,0.13
25%,0.15,0.04,0.17
50%,0.18,0.1,0.22
75%,0.22,0.18,0.3
max,0.68,0.68,0.72






Unnamed: 0,80_caption_sim,80_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.15,0.1,0.25
std,0.05,0.09,0.09
min,0.09,-0.23,0.13
25%,0.11,0.04,0.18
50%,0.14,0.09,0.23
75%,0.18,0.16,0.31
max,0.6,0.58,0.72






Unnamed: 0,165_caption_sim,165_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.23,0.15,0.25
std,0.05,0.11,0.09
min,0.17,-0.23,0.13
25%,0.19,0.08,0.18
50%,0.22,0.15,0.23
75%,0.26,0.22,0.3
max,0.61,0.63,0.72






Unnamed: 0,32_caption_sim,32_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.24,0.15,0.25
std,0.07,0.12,0.09
min,0.16,-0.22,0.13
25%,0.19,0.06,0.18
50%,0.22,0.14,0.23
75%,0.28,0.24,0.31
max,0.7,0.72,0.72






Unnamed: 0,319_caption_sim,319_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.19,0.12,0.25
std,0.05,0.09,0.09
min,0.13,-0.23,0.13
25%,0.15,0.06,0.18
50%,0.18,0.12,0.23
75%,0.21,0.18,0.3
max,0.51,0.49,0.72






Unnamed: 0,126_caption_sim,126_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.13,0.07,0.25
std,0.05,0.08,0.09
min,0.08,-0.2,0.13
25%,0.09,0.01,0.18
50%,0.11,0.07,0.23
75%,0.15,0.13,0.31
max,0.55,0.74,0.72






Unnamed: 0,66_caption_sim,66_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.16,0.11,0.25
std,0.06,0.1,0.09
min,0.09,-0.24,0.13
25%,0.11,0.04,0.17
50%,0.14,0.1,0.22
75%,0.19,0.17,0.3
max,0.57,0.65,0.72






Unnamed: 0,225_caption_sim,225_context_sim,aircraft_caption_sim
count,338249.0,338249.0,338249.0
mean,0.14,0.1,0.24
std,0.05,0.09,0.09
min,0.09,-0.27,0.13
25%,0.1,0.03,0.17
50%,0.13,0.09,0.22
75%,0.16,0.16,0.3
max,0.54,0.65,0.72






Unnamed: 0,25_caption_sim,25_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.23,0.14,0.25
std,0.08,0.13,0.09
min,0.15,-0.25,0.13
25%,0.17,0.05,0.17
50%,0.2,0.12,0.22
75%,0.26,0.21,0.3
max,0.68,0.7,0.72






Unnamed: 0,8_caption_sim,8_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.25,0.17,0.25
std,0.08,0.13,0.09
min,0.16,-0.24,0.13
25%,0.19,0.08,0.18
50%,0.23,0.16,0.23
75%,0.29,0.26,0.31
max,0.74,0.67,0.72






Unnamed: 0,235_caption_sim,235_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.19,0.09,0.24
std,0.05,0.09,0.09
min,0.14,-0.26,0.13
25%,0.16,0.03,0.17
50%,0.18,0.09,0.22
75%,0.21,0.15,0.29
max,0.78,0.61,0.72






Unnamed: 0,295_caption_sim,295_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.19,0.12,0.24
std,0.06,0.1,0.09
min,0.13,-0.3,0.13
25%,0.15,0.06,0.17
50%,0.17,0.12,0.22
75%,0.21,0.18,0.29
max,0.75,0.66,0.72






Unnamed: 0,227_caption_sim,227_context_sim,aircraft_caption_sim
count,338240.0,338240.0,338240.0
mean,0.19,0.12,0.25
std,0.05,0.09,0.09
min,0.13,-0.27,0.13
25%,0.15,0.05,0.18
50%,0.18,0.11,0.23
75%,0.22,0.17,0.3
max,0.64,0.62,0.72






Unnamed: 0,44_caption_sim,44_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.25,0.15,0.24
std,0.06,0.11,0.09
min,0.18,-0.24,0.13
25%,0.2,0.08,0.17
50%,0.23,0.15,0.22
75%,0.27,0.22,0.29
max,0.62,0.62,0.72






Unnamed: 0,234_caption_sim,234_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.27,0.17,0.25
std,0.08,0.13,0.09
min,0.19,-0.27,0.13
25%,0.21,0.08,0.18
50%,0.24,0.16,0.24
75%,0.3,0.25,0.31
max,0.8,0.74,0.72






Unnamed: 0,50_caption_sim,50_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.2,0.11,0.25
std,0.05,0.09,0.09
min,0.14,-0.28,0.13
25%,0.16,0.05,0.17
50%,0.18,0.11,0.22
75%,0.22,0.18,0.3
max,0.71,0.69,0.72






Unnamed: 0,251_caption_sim,251_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.24,0.15,0.25
std,0.05,0.1,0.09
min,0.18,-0.22,0.13
25%,0.2,0.08,0.19
50%,0.23,0.15,0.24
75%,0.27,0.22,0.31
max,0.59,0.52,0.72






Unnamed: 0,31_caption_sim,31_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.16,0.1,0.24
std,0.05,0.08,0.09
min,0.1,-0.23,0.13
25%,0.12,0.04,0.17
50%,0.15,0.09,0.22
75%,0.18,0.15,0.29
max,0.68,0.61,0.72






Unnamed: 0,10_caption_sim,10_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.19,0.12,0.24
std,0.07,0.11,0.09
min,0.12,-0.21,0.13
25%,0.14,0.05,0.17
50%,0.17,0.11,0.21
75%,0.22,0.19,0.3
max,0.65,0.64,0.72






Unnamed: 0,258_caption_sim,258_context_sim,aircraft_caption_sim
count,338249.0,338249.0,338249.0
mean,0.2,0.11,0.25
std,0.05,0.1,0.09
min,0.14,-0.26,0.13
25%,0.16,0.05,0.18
50%,0.19,0.11,0.23
75%,0.23,0.18,0.3
max,0.57,0.62,0.72






Unnamed: 0,72_caption_sim,72_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.28,0.16,0.25
std,0.06,0.11,0.09
min,0.21,-0.26,0.13
25%,0.23,0.08,0.17
50%,0.26,0.15,0.22
75%,0.31,0.23,0.3
max,0.63,0.69,0.72






Unnamed: 0,93_caption_sim,93_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.16,0.11,0.26
std,0.05,0.09,0.09
min,0.1,-0.22,0.13
25%,0.12,0.05,0.19
50%,0.14,0.11,0.24
75%,0.18,0.17,0.31
max,0.54,0.56,0.72






Unnamed: 0,118_caption_sim,118_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.2,0.1,0.23
std,0.05,0.09,0.09
min,0.14,-0.28,0.13
25%,0.16,0.04,0.16
50%,0.19,0.1,0.2
75%,0.22,0.16,0.27
max,0.66,0.6,0.72






Unnamed: 0,19_caption_sim,19_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.22,0.13,0.25
std,0.05,0.1,0.09
min,0.16,-0.23,0.13
25%,0.18,0.06,0.17
50%,0.2,0.12,0.22
75%,0.24,0.19,0.3
max,0.66,0.65,0.72






Unnamed: 0,139_caption_sim,139_context_sim,aircraft_caption_sim
count,338250.0,338250.0,338250.0
mean,0.18,0.12,0.25
std,0.05,0.09,0.09
min,0.12,-0.26,0.13
25%,0.14,0.06,0.18
50%,0.17,0.12,0.23
75%,0.21,0.18,0.3
max,0.64,0.59,0.72






Unnamed: 0,237_caption_sim,237_context_sim,aircraft_caption_sim
count,338249.0,338249.0,338249.0
mean,0.18,0.13,0.24
std,0.07,0.11,0.09
min,0.1,-0.24,0.13
25%,0.13,0.05,0.17
50%,0.16,0.12,0.22
75%,0.21,0.21,0.29
max,0.6,0.61,0.72






In [53]:
for t in topics:
    df_key = str(t) + '_caption_score'
#     df_dict[df_key]['context_gt'] = df_dict[df_key]['context'].apply(simple_search, t=t)
    df_dict[df_key]['topic_id'] = t

In [54]:
df_dict['3_caption_score'].describe().round(2)

Unnamed: 0,3_caption_sim,3_context_sim,aircraft_caption_sim,topic_id
count,338250.0,338250.0,338250.0,338250.0
mean,0.23,0.16,0.25,3.0
std,0.07,0.13,0.09,0.0
min,0.15,-0.28,0.13,3.0
25%,0.18,0.06,0.18,3.0
50%,0.21,0.14,0.23,3.0
75%,0.27,0.24,0.31,3.0
max,0.67,0.63,0.72,3.0


In [55]:
pg_dict = dict()
pg_dict['a'] = [31]
pg_dict['b'] = [19, 32]
pg_dict['c'] = [157, 230, 63, 319, 80, 201]
pg_dict['d'] = [144, 45, 104, 35, 44, 165, 186]
pg_dict['e'] = [8, 25, 159, 108, 50, 234]

pg_dict['f'] = [235, 225, 58, 227]
pg_dict['g'] = [93, 258, 139, 251]
pg_dict['h'] = [145, 256, 34, 11, 77, 86, 10, 118, 295]
pg_dict['i'] = [3, 271, 298]
pg_dict['j'] = [247, 2, 237, 72, 23, 126, 66, 220]

In [56]:
k_name = ['image_url', 'caption', 'context', 'concept2caption_sim', 
          'topic_id']
# k_name = ['image_url', 'caption', 'context', 'concept2caption_sim', 
#           'concept2context_sim']
for el in pg_dict:
    pg_dict[el] = [f'{i}_caption_score' for i in pg_dict[el]]
    pg_dict[el] = [df_dict[i] for i in pg_dict[el]]
    pg_dict[el] = [df_.reset_index(drop=True) for df_ in pg_dict[el]]
    

In [58]:
for el in pg_dict:
    t_df = pd.DataFrame(columns = ['topic2caption_sim', 'topic2context_sim'] + k_name)
    for df_ in pg_dict[el]:
        df_.columns = ['topic2caption_sim', 'topic2context_sim'] + k_name
        t_df = pd.concat([t_df, df_], ignore_index=True)
    pg_dict[el] = t_df

In [59]:
for el in pg_dict:
    print(el)
    display(pg_dict[el].describe().round(2))


a


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim
count,338250.0,338250.0,338250.0
mean,0.16,0.1,0.24
std,0.05,0.08,0.09
min,0.1,-0.23,0.13
25%,0.12,0.04,0.17
50%,0.15,0.09,0.22
75%,0.18,0.15,0.29
max,0.68,0.61,0.72


b


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim
count,676500.0,676500.0,676500.0
mean,0.23,0.14,0.25
std,0.06,0.11,0.09
min,0.16,-0.23,0.13
25%,0.18,0.06,0.18
50%,0.21,0.13,0.23
75%,0.26,0.21,0.3
max,0.7,0.72,0.72


c


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim
count,2029500.0,2029500.0,2029500.0
mean,0.19,0.12,0.25
std,0.06,0.1,0.09
min,0.09,-0.24,0.13
25%,0.14,0.05,0.18
50%,0.17,0.11,0.23
75%,0.22,0.18,0.31
max,0.69,0.66,0.72


d


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim
count,2367748.0,2367748.0,2367748.0
mean,0.24,0.15,0.25
std,0.06,0.11,0.09
min,0.16,-0.26,0.13
25%,0.19,0.07,0.18
50%,0.22,0.14,0.23
75%,0.27,0.21,0.3
max,0.71,0.64,0.72


e


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim
count,2029500.0,2029500.0,2029500.0
mean,0.23,0.14,0.25
std,0.07,0.12,0.09
min,0.14,-0.29,0.13
25%,0.18,0.06,0.18
50%,0.21,0.13,0.23
75%,0.26,0.21,0.3
max,0.8,0.74,0.72


f


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim
count,1352989.0,1352989.0,1352989.0
mean,0.18,0.11,0.24
std,0.06,0.09,0.09
min,0.09,-0.27,0.13
25%,0.14,0.04,0.17
50%,0.17,0.1,0.22
75%,0.2,0.16,0.29
max,0.78,0.65,0.72


g


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim
count,1352999.0,1352999.0,1352999.0
mean,0.19,0.12,0.25
std,0.06,0.1,0.09
min,0.1,-0.26,0.13
25%,0.15,0.06,0.18
50%,0.19,0.12,0.23
75%,0.23,0.19,0.31
max,0.64,0.62,0.72


h


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim
count,3044250.0,3044250.0,3044250.0
mean,0.19,0.12,0.24
std,0.06,0.1,0.09
min,0.09,-0.3,0.13
25%,0.15,0.05,0.17
50%,0.18,0.11,0.22
75%,0.22,0.18,0.29
max,0.75,0.72,0.72


i


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim
count,1014750.0,1014750.0,1014750.0
mean,0.18,0.12,0.25
std,0.07,0.11,0.09
min,0.08,-0.28,0.13
25%,0.13,0.04,0.17
50%,0.17,0.11,0.22
75%,0.22,0.19,0.3
max,0.67,0.65,0.72


j


Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim
count,2705999.0,2705999.0,2705999.0
mean,0.19,0.12,0.25
std,0.07,0.11,0.09
min,0.08,-0.28,0.13
25%,0.13,0.04,0.17
50%,0.17,0.11,0.22
75%,0.23,0.19,0.3
max,0.69,0.74,0.72


In [60]:
df__ = pd.DataFrame()
for el in pg_dict:
    df__ = pd.concat([df_, pg_dict[el]], ignore_index=True)
    df__.drop_duplicates(subset="image_url",
                     keep='first', inplace=True)
df__.drop_duplicates(subset="image_url",
                     keep='first', inplace=True)
display(df__.describe().round(2))

Unnamed: 0,topic2caption_sim,topic2context_sim,concept2caption_sim
count,690206.0,690206.0,690206.0
mean,0.18,0.09,0.21
std,0.06,0.1,0.07
min,0.08,-0.28,0.13
25%,0.14,0.02,0.16
50%,0.16,0.08,0.19
75%,0.21,0.15,0.24
max,0.69,0.67,0.72


In [70]:
missed_topics_dict = dict()
for el in pg_dict:
    missed_topics_dict[el] = pg_dict[el]
    missed_topics_dict[el]['image_path'] = missed_topics_dict[el]['image_url'].apply(link_to_b64)
    missed_topics_dict[el] = missed_topics_dict[el][missed_topics_dict[el].image_path.notnull()]
    missed_topics_dict[el] = missed_topics_dict[el].reset_index(drop=True)

In [71]:
missed_topics_dict[el]

Unnamed: 0,topic2caption_sim,topic2context_sim,image_url,caption,context,concept2caption_sim,topic_id,image_path
0,0.637767,0.378934,http://upload.wikimedia.org/wikipedia/commons/...,"Cockpit of the F-22, showing instruments, head...",The Lockheed Martin F-22 Raptor is a single-se...,0.347256,247,aHR0cDovL3VwbG9hZC53aWtpbWVkaWEub3JnL3dpa2lwZW...
1,0.618332,0.265842,https://upload.wikimedia.org/wikipedia/commons...,Cockpit view from an FC-2W2 English: Fairchild...,The Fairchild FC-1 and its derivatives were a ...,0.369612,247,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
2,0.606998,0.065710,http://upload.wikimedia.org/wikipedia/commons/...,Flight instruments provide pilots with the too...,Electrical engineering is an engineering disci...,0.473126,247,aHR0cDovL3VwbG9hZC53aWtpbWVkaWEub3JnL3dpa2lwZW...
3,0.603853,0.260137,https://upload.wikimedia.org/wikipedia/commons...,F-CK-1C/D cockpit view F-CK-1C/D cockpit view,"The AIDC F-CK-1 Ching-Kuo, commonly known as t...",0.293287,247,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
4,0.594347,0.224800,https://upload.wikimedia.org/wikipedia/commons...,A mock-up of the prototype cockpit with two mu...,The Northrop F-20 Tigershark was a light fight...,0.394431,247,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
...,...,...,...,...,...,...,...,...
943909,0.126350,0.187087,https://upload.wikimedia.org/wikipedia/commons...,Critical Care RRV at Coventry Airport English:...,The Air Ambulance Service is a registered char...,0.238859,220,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
943910,0.126349,0.057010,https://upload.wikimedia.org/wikipedia/commons...,English: Charang Darbar Deutsch: Palast in Ts...,Mustang is one of district in the Dhaulagiri Z...,0.158898,220,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
943911,0.126348,0.065776,https://upload.wikimedia.org/wikipedia/commons...,English: Abdullah Koni exults with the AFC CH...,Abdulla Obaid Koni is a retired Qatari footbal...,0.243446,220,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...
943912,0.126347,0.152888,https://upload.wikimedia.org/wikipedia/commons...,The interior of a Circle Line S7 Stock English...,"The London Underground S7 and S8 Stock, common...",0.260340,220,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...


In [72]:
selected_col = ['image_path','topic2caption_sim', 'concept2caption_sim', 'topic2context_sim', 'caption']
for el in missed_topics_dict:
    missed_topics_dict[el] = missed_topics_dict[el][selected_col]
    missed_topics_dict[el] = missed_topics_dict[el].drop_duplicates(subset=selected_col, keep=False)
    missed_topics_dict[el] = missed_topics_dict[el].reset_index(drop=True)

In [73]:
for el in missed_topics_dict:
#     missed_topics_dict[el] = missed_topics_dict[el][selected_col]
    missed_topics_dict[el] = missed_topics_dict[el].drop_duplicates(keep=False)
    missed_topics_dict[el] = missed_topics_dict[el].reset_index(drop=True)

In [74]:
missed_topics_dict['a']

Unnamed: 0,image_path,topic2caption_sim,concept2caption_sim,topic2context_sim,caption
0,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.680314,0.475999,0.477511,A DJI Phantom quadcopter drone in flight Engli...
1,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.628817,0.287495,0.063607,DJI releases the Phantom UAV in January which ...
2,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.605127,0.337273,0.530003,DJI Mavic Pro drone on a wooden floor.
3,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.604035,0.389452,0.015686,English: DJI Vision Plus - Drone Shot - 350 F...
4,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.602371,0.400906,0.113258,"A DJI M600 drone, similar to the drone used du..."
...,...,...,...,...,...
116750,aHR0cDovL3VwbG9hZC53aWtpbWVkaWEub3JnL3dpa2lwZW...,0.104888,0.149450,0.053595,Photographed in 1994 (age 65). Photo of James ...
116751,aHR0cDovL3VwbG9hZC53aWtpbWVkaWEub3JnL3dpa2lwZW...,0.104886,0.368552,-0.014831,English: Bristol 4-ton truck on display at th...
116752,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.104886,0.162808,0.140625,MTV EXIT concert at Mỹ Đình National Stadium E...
116753,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.104886,0.183375,0.097563,"The Mahon Falls, where the river falls from th..."


In [75]:
for el in missed_topics_dict:
    print(el, len(missed_topics_dict[el]))

a 116755
b 211691
c 669567
d 791652
e 669350
f 472305
g 481500
h 1160586
i 341152
j 942328


In [76]:
sorted_key = []
for k in sorted(missed_topics_dict, key=lambda k: len(missed_topics_dict[k])):
    print(k, len(missed_topics_dict[k]))
    sorted_key.append(k)

a 116755
b 211691
i 341152
f 472305
g 481500
e 669350
c 669567
d 791652
j 942328
h 1160586


In [77]:
def df2_df1(df1, df2):
    cond = df2['image_path'].isin(df1['image_path'])
    df2.drop(df2[cond].index, inplace = True)
    return df2

In [78]:
for idx, k in enumerate(sorted_key):
    print(sorted_key[idx:], k)
    for el in sorted_key[idx:]:
        if el == k:
            continue
        print(f'we are going to calc: {el} - {k}')
        missed_topics_dict[el] = df2_df1(missed_topics_dict[k], missed_topics_dict[el])
#     break

['a', 'b', 'i', 'f', 'g', 'e', 'c', 'd', 'j', 'h'] a
we are going to calc: b - a
we are going to calc: i - a
we are going to calc: f - a
we are going to calc: g - a
we are going to calc: e - a
we are going to calc: c - a
we are going to calc: d - a
we are going to calc: j - a
we are going to calc: h - a
['b', 'i', 'f', 'g', 'e', 'c', 'd', 'j', 'h'] b
we are going to calc: i - b
we are going to calc: f - b
we are going to calc: g - b
we are going to calc: e - b
we are going to calc: c - b
we are going to calc: d - b
we are going to calc: j - b
we are going to calc: h - b
['i', 'f', 'g', 'e', 'c', 'd', 'j', 'h'] i
we are going to calc: f - i
we are going to calc: g - i
we are going to calc: e - i
we are going to calc: c - i
we are going to calc: d - i
we are going to calc: j - i
we are going to calc: h - i
['f', 'g', 'e', 'c', 'd', 'j', 'h'] f
we are going to calc: g - f
we are going to calc: e - f
we are going to calc: c - f
we are going to calc: d - f
we are going to calc: j - f
we are

In [79]:
sorted_key = []
for k in sorted(missed_topics_dict, key=lambda k: len(missed_topics_dict[k])):
    print(k, len(missed_topics_dict[k]))
    sorted_key.append(k)

d 13197
c 18626
h 25981
j 29977
g 30915
e 36105
f 98973
i 108324
a 116755
b 116977


In [80]:
missed_topics_dict['d']

Unnamed: 0,image_path,topic2caption_sim,concept2caption_sim,topic2context_sim,caption
33559,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.272379,0.190188,0.225401,Coat of Arms of the 7° Infantry Regiment; Ita...
33560,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.272379,0.190188,0.176789,Coat of Arms of the 7° Infantry Regiment; Ita...
33561,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.272379,0.190188,0.211528,Coat of Arms of the 7° Infantry Regiment; Ita...
36918,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.265540,0.148981,0.121909,"Iron: Man (1993), in Victoria Square, Birmingh..."
44318,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.252166,0.179095,0.215124,English: Coat of Arms of the 46° Infantry Reg...
...,...,...,...,...,...
791577,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.181363,0.133359,-0.040236,English: Coat of arms of Bockholt
791597,aHR0cDovL3VwbG9hZC53aWtpbWVkaWEub3JnL3dpa2lwZW...,0.181350,0.142486,0.075345,Roth's car after practicing for the 2007 Indy ...
791616,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.181339,0.155946,0.121076,"In December 2014 English: Sophie Hunter, Londo..."
791639,aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcG...,0.181326,0.135704,0.048873,Wolfenstein's Los Angeles photography studio o...


In [81]:
random_count = {'a': 136, 'e': 185, 'g': 433, 'i':521, 'j': 350}

In [82]:
for el in random_count:
    missed_topics_dict[el] = missed_topics_dict[el].sample(n =random_count[el])
    missed_topics_dict[el] = missed_topics_dict[el].reset_index(drop=True)

In [83]:
sorted_key = []
for k in sorted(missed_topics_dict, key=lambda k: len(missed_topics_dict[k])):
    print(k, len(missed_topics_dict[k]))
    sorted_key.append(k)

a 136
e 185
j 350
g 433
i 521
d 13197
c 18626
h 25981
f 98973
b 116977


In [84]:
missed_topics_with_images = dict()
count = 0
for el in random_count:
    missed_topics_with_images[el] = create_data(missed_topics_dict[el], count)
    count = len(missed_topics_with_images[el])

/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvZW4vNC80OC9BYnlhbl9jb250cm9sJTJDX01hcmNoXzE1XzIwMTgucG5n.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy82LzY4L1ZpcGVySmV0LmpwZw==.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy9hL2E5L0xhX0ZvcnR1bmFfV2F0ZXJmYWxsX1Bvb2wuanBn.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy81LzU0L1poYWxlX0Rvcm9zdGthci5qcGc=.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy9kL2QzL0ZsYWdfb2ZfdGhlX1V0dGFyYWRpdF9Qcm92aW5jZS5wbmc=.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy82LzY5L0tfTWljaGVsbGVfcGljdHVyZS5wbmc=.jpg
/raid/AISSEL/htest/datasets/wit/images/aHR0cHM6Ly91cGxvYWQud2lraW1lZGlhLm9yZy93aWtpcGVkaWEvY29tbW9ucy83Lzc2L1N1cmVzaF9Nb

In [85]:
missed_topics_with_images[el]

Unnamed: 0,uniq_id,image_id,caption,labels,image,topic2caption_sim,concept2caption_sim,topic2context_sim
0,506,506,Heather DeweyHagborg photo from her 2013 proje...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.225997,0.140048,0.207579
1,507,507,Arbol de la Vida handcraft Español Árbol de la...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.081707,0.162514,-0.060282
2,508,508,Kudlow in 1981 Larry Kudlow in 1981,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.130975,0.160235,-0.133789
3,509,509,Injection with a syringe Nurse gives injection...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.169448,0.145969,-0.002835
4,510,510,English Argentine deputy,,iVBORw0KGgoAAAANSUhEUgAAAPAAAADwCAIAAACxN37FAA...,0.188680,0.164544,0.050294
...,...,...,...,...,...,...,...,...
338,851,851,Illustration for door zone,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.094197,0.166952,-0.029252
339,852,852,20th director of DARPA English Official US gov...,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.106685,0.136647,0.054722
340,853,853,Kept In,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.109961,0.197423,0.009064
341,854,854,English A Kikuyu woman,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.283894,0.137556,0.074353


In [86]:
s = 0
for el in missed_topics_with_images:
    missed_topics_with_images[el] = missed_topics_with_images[el].dropna(subset=['caption'])
    missed_topics_with_images[el]['topics']  = [el] * len(missed_topics_with_images[el])
    print(el, len(missed_topics_with_images[el]))
    s = s + len(missed_topics_with_images[el])
print(s)

a 133
e 181
g 426
i 506
j 343
1589


In [87]:
scol = ['topic2caption_sim', 'concept2caption_sim']
for el in missed_topics_with_images:
    print(el)
    display(missed_topics_with_images[el][scol].describe().round(2))

a


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,133.0,133.0
mean,0.15,0.25
std,0.04,0.09
min,0.11,0.13
25%,0.12,0.17
50%,0.14,0.23
75%,0.17,0.29
max,0.31,0.5


e


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,181.0,181.0
mean,0.18,0.17
std,0.03,0.03
min,0.14,0.13
25%,0.16,0.15
50%,0.17,0.16
75%,0.2,0.19
max,0.32,0.27


g


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,426.0,426.0
mean,0.16,0.17
std,0.04,0.03
min,0.1,0.13
25%,0.13,0.15
50%,0.15,0.17
75%,0.19,0.19
max,0.34,0.3


i


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,506.0,506.0
mean,0.15,0.18
std,0.05,0.04
min,0.08,0.13
25%,0.11,0.15
50%,0.14,0.18
75%,0.18,0.2
max,0.48,0.36


j


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,343.0,343.0
mean,0.15,0.16
std,0.05,0.02
min,0.08,0.13
25%,0.12,0.14
50%,0.14,0.16
75%,0.18,0.18
max,0.33,0.28


In [88]:
df_2 = pd.DataFrame()
for el in missed_topics_with_images:
    df_2 = pd.concat([df_2, missed_topics_with_images[el]], ignore_index=True)
# df_
display(df_2[scol].describe().round(2))

Unnamed: 0,topic2caption_sim,concept2caption_sim
count,1589.0,1589.0
mean,0.16,0.18
std,0.05,0.05
min,0.08,0.13
25%,0.12,0.15
50%,0.15,0.17
75%,0.18,0.2
max,0.48,0.5


In [89]:
random_count_wit = {'a': 36, 'e': 85, 'g': 333, 'i':421, 'j': 250}

In [90]:
for el in random_count_wit:
    missed_topics_with_images[el] = missed_topics_with_images[el].drop_duplicates(keep=False)
    missed_topics_with_images[el] = missed_topics_with_images[el].sample(n =random_count_wit[el])
    missed_topics_with_images[el] = missed_topics_with_images[el].reset_index(drop=True)

In [91]:
for el in random_count_wit:
    print(el)

    display(missed_topics_with_images[el][['topic2caption_sim', 'concept2caption_sim']].describe().round(2))

a


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,36.0,36.0
mean,0.15,0.23
std,0.05,0.08
min,0.11,0.14
25%,0.12,0.16
50%,0.14,0.22
75%,0.16,0.26
max,0.31,0.48


e


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,85.0,85.0
mean,0.18,0.17
std,0.03,0.03
min,0.14,0.13
25%,0.16,0.15
50%,0.17,0.16
75%,0.2,0.19
max,0.29,0.26


g


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,333.0,333.0
mean,0.16,0.17
std,0.04,0.03
min,0.1,0.13
25%,0.13,0.15
50%,0.16,0.17
75%,0.19,0.19
max,0.34,0.3


i


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,421.0,421.0
mean,0.15,0.18
std,0.05,0.04
min,0.08,0.13
25%,0.11,0.15
50%,0.14,0.18
75%,0.18,0.21
max,0.48,0.36


j


Unnamed: 0,topic2caption_sim,concept2caption_sim
count,250.0,250.0
mean,0.15,0.16
std,0.05,0.03
min,0.08,0.13
25%,0.11,0.14
50%,0.14,0.16
75%,0.18,0.18
max,0.32,0.28


In [92]:
df_l = pd.DataFrame()
for el in random_count_wit:
    df_l = pd.concat([df_l, missed_topics_with_images[el]], ignore_index=True)
# df_
display(df_l[['topic2caption_sim', 'concept2caption_sim']].describe().round(2))

Unnamed: 0,topic2caption_sim,concept2caption_sim
count,1125.0,1125.0
mean,0.16,0.18
std,0.05,0.04
min,0.08,0.13
25%,0.12,0.15
50%,0.15,0.17
75%,0.18,0.19
max,0.48,0.48


In [93]:
def get_portion(df, p1=72, p2=16, p3=12):
    s1 = df.sample(frac = p1/100)
    rest_part_1 = df.drop(s1.index)
    s2 = rest_part_1.sample(frac = p2/(100-p1))
    s3 = rest_part_1.drop(s2.index)
    return s1, s2, s3

In [94]:
name_lsit = ['stage1_train', 'stage2_train', 'val']
por_dict = dict()
data_dict = dict()
for n in name_lsit:
    data_dict[n] = pd.DataFrame() 
for el in random_count_wit:
    s1, s2, v = get_portion(missed_topics_with_images[el])
    
    s1 = s1.reset_index(drop=True)
    data_dict['stage1_train'] = pd.concat([data_dict['stage1_train'], s1], ignore_index=True)
    
    s2 = s2.reset_index(drop=True)
    data_dict['stage2_train'] = pd.concat([data_dict['stage2_train'], s2], ignore_index=True)
    
    v = v.reset_index(drop=True)
    data_dict['val'] = pd.concat([data_dict['val'], v], ignore_index=True)
#     data_dict[el] = por_dict

In [95]:
print(len(data_dict['stage1_train']))
print(len(data_dict['stage2_train']))
print(len(data_dict['val']))

810
180
135


In [96]:
names = ['uniq_id', 'image_id', 'caption', 'labels', 'image']

In [98]:
! mkdir -p /raid/AISSEL/htest/datasets/av_data/wit/random_missed
saved_path = '/raid/AISSEL/htest/datasets/av_data/wit/random_missed'
for el in data_dict:
    name = f'caption_{el}.tsv'
    df_s = data_dict[el][names]
    df_s.to_csv(f'{saved_path}/{name}', sep="\t", index=False, header=False)

    print(name)

caption_stage1_train.tsv
caption_stage2_train.tsv
caption_val.tsv
