In [1]:
import os

# General packages
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.express as px
import PIL.Image
from tqdm.notebook import tqdm, trange
import glob
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

from IPython.display import Image as IImage
from IPython.display import display
import warnings
warnings.filterwarnings("ignore")
tqdm.pandas()

In [2]:
import torch
import clip
from PIL import Image
import base64

import os

In [3]:
from sentence_transformers import SentenceTransformer, util
embedder = SentenceTransformer('all-MiniLM-L6-v2')


In [4]:
os.listdir('/raid/AISSEL/htest/datasets/av_data/ftest')

['all',
 'df_av_test.pk',
 'df_av_test_tags.pk',
 'df_av_test_before_tags.pk',
 'df_av_test_after_tags.pk',
 'all_q50',
 'missed_q50',
 'all_aviation.zip',
 'missed_splited_aircraft.zip',
 'missed_q50_over_avg',
 'missed_q50_over_avg_cider',
 'missed_q50_over_avg_cider_web']

In [5]:
def find_gpus(nums=6):
    os.system('nvidia-smi -q -d Memory |grep -A4 GPU|grep Free >tmp_free_gpus')
    with open('tmp_free_gpus', 'r') as lines_txt:
        frees = lines_txt.readlines()
        idx_freeMemory_pair = [ (idx,int(x.split()[2]))
                              for idx,x in enumerate(frees) ]
    idx_freeMemory_pair.sort(key=lambda my_tuple:my_tuple[1],reverse=True)
    usingGPUs = [str(idx_memory_pair[0])
                    for idx_memory_pair in idx_freeMemory_pair[:nums] ]
    usingGPUs =  ','.join(usingGPUs)
    print('using GPU idx: #', usingGPUs)
    return usingGPUs


In [6]:
os.environ['CUDA_VISIBLE_DEVICES'] = find_gpus(nums=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

using GPU idx: # 0,1
Using device: cuda



## get topics

In [7]:
from bertopic import BERTopic
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [8]:
lemmatizer = WordNetLemmatizer()

In [9]:
topics = {63: {'aerodynamic',
  'aircraft',
  'airplane',
  'aviation',
  'fly',
  'fuselage',
  'plane',
  'wing'},
 3: {'aircraft',
  'aviation',
  'fighter',
  'flew',
  'flight',
  'fly',
  'lindbergh',
  'pilot',
  'squadron'},
 271: {'aerial',
  'aerodrome',
  'aircraft',
  'airport',
  'airspace',
  'aviation',
  'flight',
  'fly',
  'icao',
  'licensing',
  'pilot'},
 145: {'aeroflot',
  'aeroflots',
  'aerosvit',
  'aircraft',
  'airline',
  'airport',
  'boeing',
  'flight',
  'superjet',
  'volgadnepr'},
 298: {'aircraft',
  'airline',
  'aviation',
  'corporate',
  'openairplane',
  'pilot',
  'remuneration',
  'shareholders13',
  'stakeholder'},
 11: {'airbus',
  'aircraft',
  'airline',
  'airliner',
  'airport',
  'boeing',
  'flight',
  'lufthansa',
  'passenger',
  'southwest'},
 45: {'aircraft',
  'armament',
  'bomber',
  'corsair',
  'fighter',
  'fuselage',
  'navy',
  'squadron',
  'wing'},
 201: {'aerobot',
  'aerobraking',
  'aerodynamic',
  'aeronautics',
  'aircraft',
  'balloon',
  'fly',
  'glide',
  'rocket',
  'spacecraft',
  'spaceflight'},
 35: {'1900d',
  'aircraft',
  'beech',
  'beechcraft',
  'cabin',
  'cessna',
  'engine',
  'gear',
  'skyhawk',
  'wing'},
 108: {'aircraft',
  'biplane',
  'cantilever',
  'design',
  'fuselage',
  'monoplane',
  'sesquiplane',
  'strut',
  'wing'},
 58: {'737',
  '747',
  '747400',
  '747400s',
  '7478',
  'aircraft',
  'airline',
  'airliner',
  'boeing',
  'boeings',
  'freighter'},
 144: {'1943',
  '1944',
  'aircraft',
  'armament',
  'bomber',
  'raf',
  'squadron',
  'v72',
  'vengeance'},
 157: {'aerial',
  'aircraft',
  'balloon',
  'bomber',
  'camera',
  'photographic',
  'photography',
  'reconnaissance'},
 186: {'aircraft',
  'albatros',
  'armament',
  'benz',
  'biplane',
  'diii',
  'fuselage',
  'mercedes',
  'strut',
  'wing'},
 230: {'aircraft',
  'airlift',
  'airlifters',
  'cargo',
  'carry',
  'equipment',
  'helicopter',
  'payload',
  'pylon',
  'transport'},
 159: {'aircraft',
  'beaufort',
  'beauforts',
  'fuselage',
  'pilot',
  'raf',
  'spitfire',
  'squadron',
  'supermarine'},
 220: {'911',
  'aircraft',
  'airline',
  'faa',
  'flight',
  'hijack',
  'hijacker',
  'pentagon',
  'plane',
  'tower'},
 77: {'aircraft',
  'airline',
  'airport',
  'boeing',
  'calgary',
  'fleet',
  'flight',
  'toronto',
  'vancouver',
  'westjet'},
 256: {'aircraft', 'airline', 'airport', 'code', 'digit', 'flight', 'number'},
 104: {'1916',
  'aircraft',
  'argosy',
  'biplane',
  'engine',
  'fighter',
  'fit',
  'fly',
  'raf',
  'wing'},
 86: {'aircraft',
  'airline',
  'airport',
  'airway',
  'boeing',
  'fiji',
  'flight',
  'pacific',
  'qantas',
  'sydney'},
 2: {'aircraft',
  'airline',
  'cockpit',
  'crash',
  'crashed',
  'crew',
  'investigator',
  'passenger',
  'pilot',
  'wreckage'},
 34: {'administration',
  'aircraft',
  'airline',
  'airport',
  'amend',
  'aviation',
  'federal',
  'legislation',
  'tsa'},
 247: {'aircraft',
  'cockpit',
  'control',
  'display',
  'gunner',
  'instrument',
  'panel',
  'pilot'},
 23: {'aircraft',
  'amelia',
  'aviation',
  'flew',
  'flight',
  'fly',
  'pilot',
  'woman'},
 80: {'aerodynamic',
  'aircraft',
  'flight',
  'hypersonic',
  'land',
  'lift',
  'rocket',
  'shuttle',
  'spacecraft',
  'spaceplane'},
 165: {'aircraft',
  'bluebird',
  'bomber',
  'built',
  'design',
  'fuselage',
  'hampdens',
  'patent',
  'prototype',
  'raf',
  'wing'},
 32: {'aircraft',
  'boat',
  'floatplane',
  'flyingboat',
  'naval',
  'seaplane',
  'ship'},
 319: {'aeronautical',
  'aeronautics',
  'aircraft',
  'design',
  'engineer',
  'engineering',
  'hypersonic',
  'ice',
  'langley',
  'pyrodyne',
  'rodert'},
 126: {'aircraft',
  'anxiety',
  'aviophobia',
  'circadian',
  'fatigue',
  'fear',
  'flight',
  'fly',
  'phobia',
  'pilot'},
 66: {'aircraft',
  'airport',
  'airspace',
  'atc',
  'clearance',
  'controller',
  'faa',
  'flight',
  'pilot',
  'radar'},
 225: {'aerospace',
  'airbus',
  'aircraft',
  'boeing',
  'bombardier',
  'company',
  'industry',
  'merger',
  'supplier'},
 25: {'1944',
  'aircraft',
  'bomber',
  'bombing',
  'combat',
  'fighter',
  'luftwaffe',
  'pilot',
  'raf'},
 8: {'aircraft',
  'airfield',
  'airlift',
  'deployed',
  'fly',
  'mission',
  'operational',
  'raf',
  'squadron'},
 235: {'737',
  '737700',
  '737800',
  '737ng',
  '737s',
  'aircraft',
  'airline',
  'boeing',
  'boeings',
  'fuselage',
  'southwest'},
 295: {'airasia',
  'airbus',
  'aircraft',
  'airline',
  'boeing',
  'brunei',
  'kuala',
  'lumpur',
  'malaysia',
  'malaysian'},
 227: {'aircraft',
  'airline',
  'amazon',
  'ati',
  'boeing',
  'cargo',
  'cargolux',
  'charter',
  'fleet'},
 44: {'aircraft',
  'armament',
  'bomber',
  'design',
  'engine',
  'fuselage',
  'ju',
  'luftwaffe',
  'messerschmitt',
  'wing'},
 234: {'1944',
  'aircraft',
  'bomber',
  'havilland',
  'hornet',
  'mosquito',
  'raf',
  'squadron'},
 50: {'aircraft',
  'canadian',
  'dh60',
  'dhc',
  'fly',
  'fuselage',
  'havilland',
  'moth',
  'otter'},
 251: {'aircraft',
  'airliner',
  'company',
  'il214',
  'jet',
  'mig35',
  'su35s',
  'superjet',
  'tu204',
  'uac',
  'uacs'},
 31: {'aerial',
  'aircraft',
  'dji',
  'drone',
  'reconnaissance',
  'sensor',
  'uas',
  'uav',
  'uavs',
  'unmanned'},
 10: {'aircraft',
  'airfield',
  'airline',
  'airport',
  'county',
  'facility',
  'flight',
  'runway'},
 258: {'aircraft',
  'airway',
  'cabin',
  'fairchild',
  'fuselage',
  'h45',
  'h47',
  'husky',
  'monoplane',
  'rcaf'},
 72: {'1910',
  '1942',
  '1942nd',
  'aerospace4',
  'aircraft',
  'aviation',
  'aviationrelated',
  'birmingham',
  'event',
  'huntsville',
  'year'},
 93: {'aeronautical',
  'aeronautics',
  'aerospace',
  'aircraft',
  'aviation',
  'engineer',
  'engineering',
  'flight',
  'mechanical',
  'optimization'},
 118: {'aircraft',
  'airline',
  'airway',
  'bcal',
  'bcals',
  'bua',
  'flight',
  'gatwick',
  'heathrow',
  'london'},
 19: {'aircraft', 'glide', 'glider', 'kite', 'sailplane', 'soar', 'wing'},
 139: {'a37',
  'a37b',
  'aircraft',
  'cessna',
  'navy',
  'refuel',
  't33',
  't37',
  'trainer',
  'training',
  'usaf'},
 237: {'aircraft',
  'captain',
  'copilot',
  'crew',
  'duty',
  'faa',
  'officer',
  'operate',
  'operation',
  'pilot',
  'pilotincommand'}}

In [10]:
# topics = get_topics(concept='aircraft')
q_id = [el for el in topics]

In [11]:
for el in topics:
    print(el, topics[el])

63 {'aviation', 'wing', 'aircraft', 'fuselage', 'plane', 'airplane', 'aerodynamic', 'fly'}
3 {'aviation', 'flight', 'fighter', 'aircraft', 'flew', 'lindbergh', 'squadron', 'pilot', 'fly'}
271 {'aviation', 'flight', 'aircraft', 'aerodrome', 'pilot', 'licensing', 'airspace', 'icao', 'aerial', 'airport', 'fly'}
145 {'flight', 'volgadnepr', 'aircraft', 'airport', 'aeroflot', 'airline', 'aerosvit', 'aeroflots', 'superjet', 'boeing'}
298 {'aviation', 'openairplane', 'aircraft', 'remuneration', 'stakeholder', 'shareholders13', 'airline', 'pilot', 'corporate'}
11 {'flight', 'aircraft', 'passenger', 'southwest', 'airline', 'airport', 'airbus', 'lufthansa', 'airliner', 'boeing'}
45 {'fighter', 'aircraft', 'squadron', 'fuselage', 'armament', 'bomber', 'navy', 'corsair', 'wing'}
201 {'aeronautics', 'aircraft', 'balloon', 'aerobot', 'aerodynamic', 'spacecraft', 'glide', 'fly', 'rocket', 'spaceflight', 'aerobraking'}
35 {'engine', 'cessna', 'aircraft', 'gear', 'cabin', 'beechcraft', '1900d', 'skyhaw

In [12]:
topics[63]

{'aerodynamic',
 'aircraft',
 'airplane',
 'aviation',
 'fly',
 'fuselage',
 'plane',
 'wing'}

In [13]:
pg_t = dict()
pg_t['a'] = [31]
pg_t['b'] = [19, 32]
pg_t['c'] = [157, 230, 63, 319, 8, 201]
pg_t['d'] = [144, 45, 104, 35, 44, 165, 186]
pg_t['e'] = [8, 25, 159, 108, 50, 234]

pg_t['f'] = [235, 225, 58, 227]
pg_t['g'] = [93, 258, 139, 251]
pg_t['h'] = [145, 256, 34, 11, 77, 86, 10, 118, 295]
pg_t['i'] = [3, 271, 298]
pg_t['j'] = [247, 2, 237, 72, 23, 126, 66, 220]

In [14]:
for el in pg_t:
    group_words = []
    for t in pg_t[el]:
        group_words.extend(list(topics[t]))
#         break
    pg_t[el]= list(set(group_words))
    

In [15]:
print(pg_t[el])

['aviation', 'flight', 'aircraft', 'officer', '911', 'plane', 'crash', 'tower', 'atc', 'airspace', 'pentagon', 'gunner', 'passenger', 'cockpit', 'crashed', 'investigator', 'flew', 'crew', 'wreckage', 'pilotincommand', 'captain', 'pilot', 'display', 'fear', 'year', 'circadian', 'birmingham', 'instrument', 'operate', 'airport', '1910', 'hijack', 'control', 'phobia', 'amelia', 'duty', 'clearance', 'radar', 'operation', '1942', 'fly', 'fatigue', 'woman', 'hijacker', 'event', 'aerospace4', 'copilot', 'airline', 'anxiety', 'aviationrelated', 'controller', 'huntsville', 'faa', '1942nd', 'panel', 'aviophobia']


In [16]:
for g in pg_t:
    pg_t[g] = ' '.join(pg_t[g])

In [17]:
pg_t[g]

'aviation flight aircraft officer 911 plane crash tower atc airspace pentagon gunner passenger cockpit crashed investigator flew crew wreckage pilotincommand captain pilot display fear year circadian birmingham instrument operate airport 1910 hijack control phobia amelia duty clearance radar operation 1942 fly fatigue woman hijacker event aerospace4 copilot airline anxiety aviationrelated controller huntsville faa 1942nd panel aviophobia'

In [18]:
toopic_dict = dict()
for el in topics:
    toopic_dict[el] = ' '.join(topics[el])
toopic_dict[el]

'aircraft crew officer pilotincommand copilot pilot duty operation faa captain operate'

## Load Predictions

In [19]:
pwd

'/home/test/text_processing/ICSE'

In [35]:
# pd.set_option('display.max_rows', None)

#before
base_path = '.'
df_b = pd.read_pickle(f'{base_path}/av_base.pk')


# after q50_over_avg
d_path = '/raid/AISSEL/htest/datasets/av_data/ftest/missed_q50_over_avg'
df_f = pd.read_pickle(f'{d_path}/av_mq50_over_avg.pk')

# wit
d_path = '/raid/AISSEL/htest/datasets/av_data/wit/missed_q50_over_avg'
df_w = pd.read_pickle(f'{d_path}/av_mq50_over_avg.pk')

#random
d_path = '/raid/AISSEL/htest/datasets/av_data/wit/random_missed'
df_r = pd.read_pickle(f'{d_path}/av_mq50_over_avg.pk')
# df

In [36]:
pedestrian_list = ['aircraft', 'airplane', 'plane', 'fighter', 
                 'boeing', 'superjet', 'airbus', 'bomber', 'glide', 
                 'skyhawk', 'balloon', 'helicopter', 'seaplane', 'jet',
                 'mig35', 'su35s', 'dji', 'drone', 'uas', 'uav', 'glide', 'kite']

In [37]:
def create_gt(tag):
    context = ""
    for el in tag:
        if tag[el]:
            context = context + " " + el
    if not context:
        context = None
    return context

In [38]:
# pedestrian_list
def flag_search_gt(tag, t):
    context = ""
    for el in tag:
        if tag[el]:
            context = context + " " + el
        
    for w in t:
        if w in context:
            return float(1)
    return float(0)

In [39]:
def get_percentage(tag):
    sum_val = 0
    for el in tag:
        sum_val = sum_val + tag[el]
    return sum_val / len(tag)

In [40]:
def get_sim(u,v):

    emb1 = embedder.encode(u)
    if isinstance(v, list):
        v = ' '.join(v)
    emb2 = embedder.encode(v)
    
    return float(util.cos_sim(emb1, emb2)[0][0])

In [41]:
def simple_search(context, t):
    percentage = 0
    for w in t:
        if w in context:
            percentage = percentage + 1
    return float(percentage/len(t))

In [42]:
def calc_sim(df):
    df = df[df['tag'].notna()]
    df = df.reset_index(drop=True)
    df = df[df['gt'].notna()]
    df = df.reset_index(drop=True)
    
    df['gt_coverage'] = df.apply(lambda x: get_percentage(x.tag), axis=1)
    df['org_top_cov'] = df.apply(lambda x: simple_search(x.caption, list(topics[x.topic_id])), axis=1)
    df['ofa_top_cov'] = df.apply(lambda x: simple_search(x.ofa_caption, list(topics[x.topic_id])), axis=1)

    df['gt_ped_flag'] = df.apply(lambda x: flag_search_gt(x.tag, pedestrian_list), axis=1)
    df['org_top_sim'] = df.apply(lambda x: get_sim(x.caption, list(topics[x.topic_id])), axis=1)
    df['ofa_top_sim'] = df.apply(lambda x: get_sim(x.ofa_caption, list(topics[x.topic_id])), axis=1)
    df['ofa_con_sim'] = df.apply(lambda x: get_sim(x.ofa_caption, ['aircraft']), axis=1)
    df['org_con_sim'] = df.apply(lambda x: get_sim(x.caption, ['aircraft']), axis=1)
    # df['ofa_gt_sim'] = df.apply(lambda x: get_sim(x.ofa_caption, x.gt), axis=1)
    df['ofa_gt_sim'] = df.apply(lambda x: get_sim(x.ofa_caption, create_gt(x.tag)), axis=1)
    df['org_gt_sim'] = df.apply(lambda x: get_sim(x.caption, create_gt(x.tag)), axis=1)
    df['human_con_sim'] = df.apply(lambda x: get_sim(x.h_caption, ['aircraft']), axis=1)
    df['web_ofa_sim'] = df.apply(lambda x: get_sim(x.caption, x.ofa_caption), axis=1)
    gk = df.groupby('topics')
    pg_dict = dict()
    for el in gk.groups.keys():
        pg_dict[el] = gk.get_group(el)
        pg_dict[el] = pg_dict[el].reset_index(drop=True)
    return df, pg_dict

In [43]:
df_b, pg_dict_b = calc_sim(df_b)

In [44]:
df_f, pg_dict_f = calc_sim(df_f)

In [45]:
df_w, pg_dict_w = calc_sim(df_w)

In [46]:
df_r, pg_dict_r = calc_sim(df_r)

In [47]:
data = []
for el in pg_dict_b:
    data.append([el, pg_dict_w[el].describe().round(2).at['mean','web_ofa_sim'], pg_dict_f[el].describe().round(2).at['mean','web_ofa_sim'], pg_dict_b[el].describe().round(2).at['mean','web_ofa_sim'], pg_dict_r[el].describe().round(2).at['mean','web_ofa_sim']])

In [48]:
df_cap2cap = pd.DataFrame(data, columns = ['Topic', 'App_1', 'App_2', 'Base', 'Random'])
df_cap2cap

Unnamed: 0,Topic,App_1,App_2,Base,Random
0,a,0.43,0.49,0.46,0.45
1,b,0.42,0.43,0.42,0.43
2,c,0.36,0.36,0.36,0.31
3,d,0.32,0.32,0.31,0.29
4,e,0.33,0.29,0.27,0.28
5,f,0.35,0.36,0.36,0.29
6,g,0.33,0.28,0.29,0.24
7,h,0.35,0.37,0.34,0.36
8,i,0.35,0.34,0.29,0.33
9,j,0.38,0.44,0.38,0.38


In [49]:
data = []
for el in pg_dict_b:
    data.append([el, pg_dict_w[el].describe().round(2).at['mean','ofa_con_sim'], pg_dict_f[el].describe().round(2).at['mean','ofa_con_sim'], pg_dict_b[el].describe().round(2).at['mean','ofa_con_sim'], pg_dict_r[el].describe().round(2).at['mean','ofa_con_sim']])

In [50]:
df_cap2con = pd.DataFrame(data, columns = ['Topic', 'App_1', 'App_2', 'Base', 'Random'])
df_cap2con

Unnamed: 0,Topic,App_1,App_2,Base,Random
0,a,0.44,0.38,0.4,0.4
1,b,0.55,0.49,0.5,0.51
2,c,0.61,0.5,0.5,0.47
3,d,0.62,0.49,0.51,0.49
4,e,0.5,0.45,0.42,0.4
5,f,0.53,0.54,0.53,0.44
6,g,0.55,0.44,0.45,0.4
7,h,0.54,0.52,0.49,0.46
8,i,0.51,0.43,0.43,0.42
9,j,0.49,0.48,0.46,0.42


In [51]:
data = []
for el in pg_dict_b:
    data.append([el, pg_dict_w[el].describe().round(2).at['mean','ofa_gt_sim'], pg_dict_f[el].describe().round(2).at['mean','ofa_gt_sim'], pg_dict_b[el].describe().round(2).at['mean','ofa_gt_sim'], pg_dict_r[el].describe().round(2).at['mean','ofa_gt_sim']])

In [52]:
df_cap2top = pd.DataFrame(data, columns = ['Topic', 'App_1', 'App_2', 'Base', 'Random'])
df_cap2top

Unnamed: 0,Topic,App_1,App_2,Base,Random
0,a,0.47,0.54,0.47,0.47
1,b,0.4,0.4,0.38,0.38
2,c,0.41,0.4,0.4,0.33
3,d,0.4,0.38,0.37,0.37
4,e,0.4,0.37,0.34,0.33
5,f,0.36,0.36,0.36,0.29
6,g,0.34,0.29,0.3,0.26
7,h,0.42,0.44,0.43,0.4
8,i,0.47,0.44,0.42,0.43
9,j,0.42,0.48,0.43,0.41


In [60]:
df_cap2cap.to_csv(f'air_cap2cap.csv', index=False)
df_cap2con.to_csv(f'air_cap2con.csv', index=False)
df_cap2top.to_csv(f'air_cap2top.csv', index=False)

In [59]:
! ls

 air_cap2cap.csv			    pd_base.pk
 air_cap2con.csv			    ped_cap2cap.csv
 air_cap2top.csv			    ped_cap2con.csv
 aircraft_data.ipynb			    ped_cap2top.csv
 aircraft_sample.tsv			    ped_caption_tag_sam.ipynb
 av_base-Copy1.pk			    pedesrtrian_data_edited.ipynb
 av_base.pk				    pedestrian_sample.tsv
 av_caption_tag_sam.ipynb		    ped_image_analysis.ipynb
 av_image_analys_test.ipynb		    ped_image_analysis_test.ipynb
 captions_val2014.json			    tag_av_test_images.ipynb
 data					    tag_ped_test_images.ipynb
 deer_decode.jpg			    test.jpg
'GitHub data collection for CrossVul.csv'   tmp_free_gpus
 metrics-av.ipynb			    tsv2json.ipynb
 metrics-ped.ipynb			    WIT_test_av.ipynb
 pd_base-Copy1.pk			    WIT_test_ped.ipynb


In [54]:
# df['org_top_cov'] = df.apply(lambda x: simple_search(x.caption, list(topics[x.topic_id])), axis=1)
# df['ofa_top_cov'] = df.apply(lambda x: simple_search(x.ofa_caption, list(topics[x.topic_id])), axis=1)

# df['gt_ped_flag'] = df.apply(lambda x: flag_search_gt(x.tag, pedestrian_list), axis=1)
# df['org_top_sim'] = df.apply(lambda x: get_sim(x.caption, list(topics[x.topic_id])), axis=1)
# df['ofa_top_sim'] = df.apply(lambda x: get_sim(x.ofa_caption, list(topics[x.topic_id])), axis=1)


In [55]:
# df['ofa_con_sim'] = df.apply(lambda x: get_sim(x.ofa_caption, ['Aircraft']), axis=1)
# df['org_con_sim'] = df.apply(lambda x: get_sim(x.caption, ['Aircraft']), axis=1)
# # df['ofa_gt_sim'] = df.apply(lambda x: get_sim(x.ofa_caption, x.gt), axis=1)
# df['ofa_gt_sim'] = df.apply(lambda x: get_sim(x.ofa_caption, create_gt(x.tag)), axis=1)
# df['org_gt_sim'] = df.apply(lambda x: get_sim(x.caption, create_gt(x.tag)), axis=1)

In [56]:
# df['human_con_sim'] = df.apply(lambda x: get_sim(x.h_caption, ['Aircraft']), axis=1)
# df['web_ofa_sim'] = df.apply(lambda x: get_sim(x.caption, x.ofa_caption), axis=1)

# Random

In [57]:
# random
df_r[['gt_ped_flag', 'gt_coverage', 'org_top_sim', 'ofa_top_sim', 'ofa_gt_sim', 'ofa_con_sim', 'web_ofa_sim']].describe().round(2)

Unnamed: 0,gt_ped_flag,gt_coverage,org_top_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,517.0,517.0,517.0,517.0,517.0,517.0,517.0
mean,0.96,0.55,0.49,0.33,0.38,0.45,0.35
std,0.2,0.21,0.11,0.13,0.14,0.13,0.15
min,0.0,0.09,0.23,-0.02,-0.08,-0.07,-0.09
25%,1.0,0.4,0.4,0.25,0.29,0.39,0.24
50%,1.0,0.56,0.48,0.33,0.38,0.47,0.34
75%,1.0,0.73,0.57,0.42,0.47,0.53,0.44
max,1.0,1.0,0.84,0.69,0.76,0.7,0.91


In [58]:
# random
for el in pg_dict_r:
    print(el)
    display(pg_dict_r[el][['gt_ped_flag', 'gt_coverage', 'org_top_sim', 'ofa_top_sim', 'ofa_gt_sim', 'ofa_con_sim', 'web_ofa_sim']].describe().round(2))

a


Unnamed: 0,gt_ped_flag,gt_coverage,org_top_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,12.0,12.0,12.0,12.0,12.0,12.0,12.0
mean,1.0,0.49,0.51,0.45,0.47,0.4,0.45
std,0.0,0.03,0.05,0.14,0.14,0.11,0.12
min,1.0,0.4,0.39,0.17,0.22,0.2,0.22
25%,1.0,0.5,0.49,0.43,0.4,0.33,0.39
50%,1.0,0.5,0.52,0.48,0.51,0.43,0.44
75%,1.0,0.5,0.54,0.53,0.54,0.47,0.48
max,1.0,0.5,0.58,0.61,0.66,0.54,0.72


b


Unnamed: 0,gt_ped_flag,gt_coverage,org_top_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,57.0,57.0,57.0,57.0,57.0,57.0,57.0
mean,1.0,0.61,0.48,0.36,0.38,0.51,0.43
std,0.0,0.23,0.09,0.1,0.11,0.12,0.17
min,1.0,0.14,0.29,0.04,0.14,0.21,0.08
25%,1.0,0.43,0.43,0.31,0.31,0.45,0.34
50%,1.0,0.57,0.47,0.35,0.37,0.5,0.42
75%,1.0,0.86,0.55,0.43,0.43,0.59,0.52
max,1.0,0.86,0.65,0.61,0.66,0.7,0.88


c


Unnamed: 0,gt_ped_flag,gt_coverage,org_top_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,33.0,33.0,33.0,33.0,33.0,33.0,33.0
mean,0.97,0.58,0.42,0.29,0.33,0.47,0.31
std,0.17,0.28,0.1,0.13,0.13,0.12,0.11
min,0.0,0.1,0.23,-0.01,0.02,0.11,0.09
25%,1.0,0.36,0.38,0.21,0.27,0.42,0.23
50%,1.0,0.5,0.39,0.27,0.34,0.5,0.33
75%,1.0,0.88,0.46,0.37,0.4,0.55,0.39
max,1.0,1.0,0.68,0.59,0.52,0.64,0.45


d


Unnamed: 0,gt_ped_flag,gt_coverage,org_top_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,36.0,36.0,36.0,36.0,36.0,36.0,36.0
mean,1.0,0.54,0.43,0.31,0.37,0.49,0.29
std,0.0,0.18,0.09,0.1,0.1,0.14,0.13
min,1.0,0.1,0.3,0.08,0.09,0.07,0.02
25%,1.0,0.4,0.36,0.24,0.3,0.46,0.2
50%,1.0,0.5,0.4,0.27,0.36,0.5,0.29
75%,1.0,0.7,0.52,0.36,0.43,0.57,0.35
max,1.0,0.8,0.63,0.57,0.58,0.66,0.61


e


Unnamed: 0,gt_ped_flag,gt_coverage,org_top_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,64.0,64.0,64.0,64.0,64.0,64.0,64.0
mean,0.91,0.47,0.49,0.31,0.33,0.4,0.28
std,0.29,0.22,0.11,0.15,0.14,0.16,0.15
min,0.0,0.11,0.31,-0.02,-0.0,0.06,-0.09
25%,1.0,0.33,0.4,0.22,0.26,0.34,0.2
50%,1.0,0.44,0.48,0.34,0.37,0.44,0.3
75%,1.0,0.67,0.54,0.42,0.43,0.51,0.35
max,1.0,0.89,0.76,0.54,0.58,0.69,0.73


f


Unnamed: 0,gt_ped_flag,gt_coverage,org_top_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,25.0,25.0,25.0,25.0,25.0,25.0,25.0
mean,1.0,0.65,0.49,0.26,0.29,0.44,0.29
std,0.0,0.23,0.1,0.1,0.13,0.13,0.14
min,1.0,0.09,0.32,0.02,0.1,0.16,0.06
25%,1.0,0.56,0.42,0.21,0.21,0.4,0.19
50%,1.0,0.82,0.49,0.26,0.31,0.46,0.28
75%,1.0,0.82,0.57,0.31,0.33,0.52,0.39
max,1.0,0.91,0.68,0.46,0.55,0.69,0.53


g


Unnamed: 0,gt_ped_flag,gt_coverage,org_top_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,17.0,17.0,17.0,17.0,17.0,17.0,17.0
mean,0.88,0.66,0.38,0.21,0.26,0.4,0.24
std,0.33,0.1,0.06,0.08,0.12,0.17,0.13
min,0.0,0.55,0.31,0.07,0.08,0.05,0.04
25%,1.0,0.6,0.34,0.17,0.19,0.33,0.13
50%,1.0,0.64,0.35,0.2,0.25,0.42,0.25
75%,1.0,0.8,0.4,0.28,0.36,0.59,0.36
max,1.0,0.8,0.52,0.38,0.49,0.59,0.51


h


Unnamed: 0,gt_ped_flag,gt_coverage,org_top_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,168.0,168.0,168.0,168.0,168.0,168.0,168.0
mean,0.96,0.52,0.53,0.33,0.4,0.46,0.36
std,0.19,0.17,0.11,0.11,0.14,0.12,0.13
min,0.0,0.12,0.29,0.0,0.09,0.12,0.03
25%,1.0,0.4,0.45,0.26,0.31,0.38,0.27
50%,1.0,0.5,0.54,0.33,0.4,0.46,0.35
75%,1.0,0.62,0.6,0.4,0.48,0.54,0.44
max,1.0,1.0,0.84,0.69,0.76,0.69,0.73


i


Unnamed: 0,gt_ped_flag,gt_coverage,org_top_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,24.0,24.0,24.0,24.0,24.0,24.0,24.0
mean,0.88,0.55,0.41,0.36,0.43,0.42,0.33
std,0.34,0.23,0.08,0.13,0.14,0.13,0.17
min,0.0,0.11,0.32,0.04,0.05,0.12,0.05
25%,1.0,0.44,0.37,0.32,0.37,0.38,0.21
50%,1.0,0.55,0.39,0.36,0.46,0.47,0.3
75%,1.0,0.78,0.46,0.44,0.5,0.5,0.49
max,1.0,0.89,0.62,0.59,0.73,0.58,0.64


j


Unnamed: 0,gt_ped_flag,gt_coverage,org_top_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,81.0,81.0,81.0,81.0,81.0,81.0,81.0
mean,0.96,0.61,0.51,0.35,0.41,0.42,0.38
std,0.19,0.2,0.1,0.14,0.15,0.12,0.17
min,0.0,0.1,0.31,-0.02,-0.08,-0.07,-0.0
25%,1.0,0.4,0.44,0.29,0.3,0.38,0.25
50%,1.0,0.62,0.5,0.37,0.45,0.47,0.35
75%,1.0,0.75,0.59,0.45,0.51,0.5,0.48
max,1.0,0.88,0.73,0.65,0.74,0.66,0.91


# base

### keep all

In [37]:
# before
df_b[['gt_ped_flag', 'gt_coverage', 'org_top_sim', 'ofa_top_sim', 'ofa_gt_sim', 'ofa_con_sim', 'web_ofa_sim']].describe().round(2)

Unnamed: 0,gt_ped_flag,gt_coverage,org_top_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,517.0,517.0,517.0,517.0,517.0,517.0,517.0
mean,0.96,0.55,0.49,0.35,0.4,0.48,0.35
std,0.2,0.21,0.11,0.12,0.14,0.12,0.15
min,0.0,0.09,0.22,-0.04,-0.03,0.05,-0.06
25%,1.0,0.4,0.4,0.28,0.32,0.43,0.24
50%,1.0,0.56,0.48,0.35,0.39,0.5,0.33
75%,1.0,0.73,0.58,0.43,0.48,0.55,0.44
max,1.0,1.0,0.84,0.7,0.84,0.71,0.89


In [99]:
# before
for el in pg_dict:
    print(el)
    display(pg_dict[el][['gt_ped_flag', 'gt_coverage', 'org_top_sim', 'ofa_top_sim', 'ofa_gt_sim', 'ofa_con_sim', 'web_ofa_sim']].describe().round(2))

a


Unnamed: 0,gt_ped_flag,gt_coverage,org_top_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,12.0,12.0,12.0,12.0,12.0,12.0,12.0
mean,1.0,0.49,0.49,0.46,0.47,0.4,0.46
std,0.0,0.03,0.05,0.08,0.09,0.11,0.08
min,1.0,0.4,0.38,0.25,0.25,0.21,0.37
25%,1.0,0.5,0.46,0.47,0.45,0.33,0.39
50%,1.0,0.5,0.49,0.49,0.5,0.43,0.43
75%,1.0,0.5,0.52,0.5,0.52,0.45,0.52
max,1.0,0.5,0.56,0.52,0.58,0.62,0.61


b


Unnamed: 0,gt_ped_flag,gt_coverage,org_top_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,57.0,57.0,57.0,57.0,57.0,57.0,57.0
mean,1.0,0.61,0.5,0.38,0.38,0.5,0.42
std,0.0,0.23,0.09,0.13,0.15,0.13,0.16
min,1.0,0.14,0.29,-0.04,-0.03,0.07,0.09
25%,1.0,0.43,0.45,0.32,0.31,0.44,0.31
50%,1.0,0.57,0.49,0.38,0.37,0.53,0.41
75%,1.0,0.86,0.57,0.46,0.44,0.58,0.53
max,1.0,0.86,0.67,0.69,0.71,0.69,0.89


c


Unnamed: 0,gt_ped_flag,gt_coverage,org_top_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,33.0,33.0,33.0,33.0,33.0,33.0,33.0
mean,0.97,0.58,0.42,0.33,0.4,0.5,0.36
std,0.17,0.28,0.1,0.17,0.14,0.12,0.14
min,0.0,0.1,0.23,0.01,0.07,0.15,0.12
25%,1.0,0.36,0.37,0.21,0.3,0.44,0.27
50%,1.0,0.5,0.39,0.31,0.41,0.53,0.35
75%,1.0,0.88,0.45,0.43,0.48,0.57,0.44
max,1.0,1.0,0.67,0.69,0.73,0.71,0.83


d


Unnamed: 0,gt_ped_flag,gt_coverage,org_top_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,36.0,36.0,36.0,36.0,36.0,36.0,36.0
mean,1.0,0.54,0.45,0.33,0.37,0.51,0.31
std,0.0,0.18,0.09,0.08,0.06,0.06,0.15
min,1.0,0.1,0.31,0.16,0.24,0.35,0.07
25%,1.0,0.4,0.38,0.29,0.3,0.49,0.2
50%,1.0,0.5,0.42,0.33,0.38,0.52,0.28
75%,1.0,0.7,0.51,0.39,0.4,0.55,0.42
max,1.0,0.8,0.69,0.49,0.52,0.65,0.59


e


Unnamed: 0,gt_ped_flag,gt_coverage,org_top_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,64.0,64.0,64.0,64.0,64.0,64.0,64.0
mean,0.91,0.47,0.48,0.31,0.34,0.42,0.27
std,0.29,0.22,0.11,0.14,0.14,0.16,0.14
min,0.0,0.11,0.31,0.01,0.03,0.05,-0.06
25%,1.0,0.33,0.39,0.23,0.29,0.38,0.18
50%,1.0,0.44,0.48,0.34,0.36,0.47,0.27
75%,1.0,0.67,0.53,0.42,0.44,0.52,0.33
max,1.0,0.89,0.76,0.53,0.56,0.67,0.68


f


Unnamed: 0,gt_ped_flag,gt_coverage,org_top_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,25.0,25.0,25.0,25.0,25.0,25.0,25.0
mean,1.0,0.65,0.5,0.32,0.36,0.53,0.36
std,0.0,0.23,0.1,0.06,0.09,0.08,0.12
min,1.0,0.09,0.33,0.18,0.25,0.39,0.19
25%,1.0,0.56,0.44,0.3,0.29,0.48,0.29
50%,1.0,0.82,0.51,0.32,0.32,0.53,0.31
75%,1.0,0.82,0.58,0.34,0.41,0.6,0.46
max,1.0,0.91,0.7,0.44,0.54,0.65,0.58


g


Unnamed: 0,gt_ped_flag,gt_coverage,org_top_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,17.0,17.0,17.0,17.0,17.0,17.0,17.0
mean,0.88,0.66,0.37,0.24,0.3,0.45,0.29
std,0.33,0.1,0.06,0.05,0.06,0.12,0.12
min,0.0,0.55,0.3,0.12,0.21,0.16,0.01
25%,1.0,0.6,0.32,0.21,0.26,0.42,0.23
50%,1.0,0.64,0.36,0.24,0.29,0.47,0.32
75%,1.0,0.8,0.39,0.27,0.32,0.52,0.34
max,1.0,0.8,0.53,0.33,0.43,0.61,0.51


h


Unnamed: 0,gt_ped_flag,gt_coverage,org_top_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,168.0,168.0,168.0,168.0,168.0,168.0,168.0
mean,0.96,0.52,0.53,0.35,0.43,0.49,0.34
std,0.19,0.17,0.11,0.11,0.13,0.1,0.11
min,0.0,0.12,0.3,-0.02,0.05,0.1,-0.0
25%,1.0,0.4,0.45,0.29,0.34,0.43,0.27
50%,1.0,0.5,0.55,0.33,0.42,0.49,0.33
75%,1.0,0.62,0.6,0.38,0.5,0.55,0.42
max,1.0,1.0,0.83,0.71,0.8,0.69,0.65


i


Unnamed: 0,gt_ped_flag,gt_coverage,org_top_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,24.0,24.0,24.0,24.0,24.0,24.0,24.0
mean,0.88,0.55,0.39,0.33,0.42,0.43,0.29
std,0.34,0.23,0.08,0.12,0.14,0.15,0.16
min,0.0,0.11,0.29,0.06,0.08,0.11,0.04
25%,1.0,0.44,0.32,0.27,0.36,0.38,0.17
50%,1.0,0.55,0.36,0.35,0.42,0.46,0.26
75%,1.0,0.78,0.43,0.39,0.52,0.52,0.38
max,1.0,0.89,0.63,0.64,0.71,0.66,0.72


j


Unnamed: 0,gt_ped_flag,gt_coverage,org_top_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,81.0,81.0,81.0,81.0,81.0,81.0,81.0
mean,0.96,0.61,0.5,0.36,0.43,0.46,0.38
std,0.19,0.2,0.11,0.15,0.17,0.12,0.19
min,0.0,0.1,0.29,-0.03,0.02,0.1,0.02
25%,1.0,0.4,0.42,0.27,0.32,0.44,0.24
50%,1.0,0.62,0.5,0.33,0.47,0.5,0.36
75%,1.0,0.75,0.57,0.47,0.55,0.54,0.49
max,1.0,0.88,0.72,0.57,0.84,0.65,0.84


# Fatemeh missing on q50 over avg

### keep all

In [38]:
df_f[['gt_ped_flag', 'gt_coverage', 'org_gt_sim', 'ofa_top_sim', 'ofa_gt_sim', 'ofa_con_sim', 'web_ofa_sim']].describe().round(2)

Unnamed: 0,gt_ped_flag,gt_coverage,org_gt_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,517.0,517.0,517.0,517.0,517.0,517.0,517.0
mean,0.96,0.55,0.51,0.38,0.42,0.49,0.37
std,0.2,0.21,0.13,0.13,0.14,0.12,0.16
min,0.0,0.09,0.18,-0.08,-0.02,0.07,-0.07
25%,1.0,0.4,0.42,0.3,0.33,0.43,0.26
50%,1.0,0.56,0.5,0.37,0.42,0.51,0.35
75%,1.0,0.73,0.6,0.45,0.53,0.56,0.46
max,1.0,1.0,0.9,0.73,0.84,0.7,0.94


In [140]:
for el in pg_dict:
    print(el)
    display(pg_dict[el][['gt_ped_flag', 'gt_coverage', 'org_gt_sim', 'ofa_top_sim', 'ofa_gt_sim', 'ofa_con_sim', 'web_ofa_sim']].describe().round(2))

a


Unnamed: 0,gt_ped_flag,gt_coverage,org_gt_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,12.0,12.0,12.0,12.0,12.0,12.0,12.0
mean,1.0,0.49,0.5,0.52,0.54,0.38,0.49
std,0.0,0.03,0.09,0.05,0.08,0.09,0.11
min,1.0,0.4,0.34,0.42,0.38,0.2,0.34
25%,1.0,0.5,0.48,0.5,0.53,0.35,0.42
50%,1.0,0.5,0.52,0.53,0.55,0.41,0.47
75%,1.0,0.5,0.53,0.56,0.59,0.43,0.56
max,1.0,0.5,0.62,0.58,0.65,0.54,0.76


b


Unnamed: 0,gt_ped_flag,gt_coverage,org_gt_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,57.0,57.0,57.0,57.0,57.0,57.0,57.0
mean,1.0,0.61,0.49,0.39,0.4,0.49,0.43
std,0.0,0.23,0.1,0.14,0.14,0.15,0.16
min,1.0,0.14,0.26,0.04,0.11,0.1,0.06
25%,1.0,0.43,0.42,0.31,0.31,0.45,0.34
50%,1.0,0.57,0.47,0.41,0.38,0.52,0.42
75%,1.0,0.86,0.55,0.46,0.5,0.59,0.51
max,1.0,0.86,0.69,0.69,0.71,0.7,0.88


c


Unnamed: 0,gt_ped_flag,gt_coverage,org_gt_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,33.0,33.0,33.0,33.0,33.0,33.0,33.0
mean,0.97,0.58,0.47,0.34,0.4,0.5,0.36
std,0.17,0.28,0.09,0.15,0.13,0.1,0.14
min,0.0,0.1,0.24,0.07,0.14,0.31,0.11
25%,1.0,0.36,0.43,0.24,0.32,0.43,0.28
50%,1.0,0.5,0.47,0.31,0.41,0.54,0.36
75%,1.0,0.88,0.52,0.4,0.48,0.57,0.44
max,1.0,1.0,0.72,0.69,0.73,0.7,0.77


d


Unnamed: 0,gt_ped_flag,gt_coverage,org_gt_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,36.0,36.0,36.0,36.0,36.0,36.0,36.0
mean,1.0,0.54,0.41,0.35,0.38,0.49,0.32
std,0.0,0.18,0.12,0.11,0.1,0.1,0.14
min,1.0,0.1,0.18,0.1,0.2,0.14,0.06
25%,1.0,0.4,0.34,0.26,0.29,0.45,0.23
50%,1.0,0.5,0.39,0.35,0.39,0.52,0.31
75%,1.0,0.7,0.5,0.45,0.45,0.56,0.41
max,1.0,0.8,0.63,0.59,0.58,0.65,0.62


e


Unnamed: 0,gt_ped_flag,gt_coverage,org_gt_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,64.0,64.0,64.0,64.0,64.0,64.0,64.0
mean,0.91,0.47,0.48,0.34,0.37,0.45,0.29
std,0.29,0.22,0.14,0.14,0.14,0.15,0.16
min,0.0,0.11,0.24,0.01,-0.01,0.07,-0.07
25%,1.0,0.33,0.39,0.28,0.3,0.39,0.2
50%,1.0,0.44,0.45,0.37,0.4,0.46,0.27
75%,1.0,0.67,0.53,0.45,0.46,0.54,0.34
max,1.0,0.89,0.86,0.55,0.6,0.7,0.94


f


Unnamed: 0,gt_ped_flag,gt_coverage,org_gt_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,25.0,25.0,25.0,25.0,25.0,25.0,25.0
mean,1.0,0.65,0.51,0.33,0.36,0.54,0.36
std,0.0,0.23,0.11,0.05,0.07,0.12,0.13
min,1.0,0.09,0.33,0.22,0.27,0.18,0.19
25%,1.0,0.56,0.42,0.3,0.31,0.48,0.25
50%,1.0,0.82,0.49,0.34,0.35,0.57,0.35
75%,1.0,0.82,0.61,0.36,0.37,0.61,0.42
max,1.0,0.91,0.71,0.4,0.55,0.7,0.67


g


Unnamed: 0,gt_ped_flag,gt_coverage,org_gt_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,17.0,17.0,17.0,17.0,17.0,17.0,17.0
mean,0.88,0.66,0.48,0.24,0.29,0.44,0.28
std,0.33,0.1,0.11,0.09,0.1,0.15,0.12
min,0.0,0.55,0.35,0.13,0.18,0.14,0.05
25%,1.0,0.6,0.37,0.19,0.21,0.38,0.21
50%,1.0,0.64,0.45,0.21,0.23,0.45,0.26
75%,1.0,0.8,0.55,0.28,0.35,0.54,0.34
max,1.0,0.8,0.72,0.45,0.46,0.69,0.63


h


Unnamed: 0,gt_ped_flag,gt_coverage,org_gt_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,168.0,168.0,168.0,168.0,168.0,168.0,168.0
mean,0.96,0.52,0.56,0.36,0.44,0.52,0.37
std,0.19,0.17,0.12,0.11,0.13,0.1,0.13
min,0.0,0.12,0.22,0.03,0.11,0.12,0.07
25%,1.0,0.4,0.48,0.31,0.35,0.45,0.28
50%,1.0,0.5,0.57,0.35,0.43,0.54,0.37
75%,1.0,0.62,0.65,0.41,0.53,0.58,0.45
max,1.0,1.0,0.9,0.76,0.84,0.7,0.74


i


Unnamed: 0,gt_ped_flag,gt_coverage,org_gt_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,24.0,24.0,24.0,24.0,24.0,24.0,24.0
mean,0.88,0.55,0.44,0.36,0.44,0.43,0.34
std,0.34,0.23,0.09,0.12,0.15,0.13,0.17
min,0.0,0.11,0.19,0.05,0.01,0.08,0.04
25%,1.0,0.44,0.37,0.25,0.38,0.39,0.23
50%,1.0,0.55,0.47,0.38,0.47,0.46,0.32
75%,1.0,0.78,0.51,0.43,0.51,0.53,0.44
max,1.0,0.89,0.55,0.61,0.74,0.64,0.69


j


Unnamed: 0,gt_ped_flag,gt_coverage,org_gt_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,81.0,81.0,81.0,81.0,81.0,81.0,81.0
mean,0.96,0.61,0.54,0.41,0.48,0.48,0.44
std,0.19,0.2,0.11,0.15,0.17,0.1,0.18
min,0.0,0.1,0.34,-0.08,-0.02,0.11,0.07
25%,1.0,0.4,0.44,0.31,0.38,0.45,0.28
50%,1.0,0.62,0.54,0.46,0.53,0.5,0.41
75%,1.0,0.75,0.63,0.56,0.62,0.54,0.58
max,1.0,0.88,0.77,0.69,0.84,0.66,0.9


# WIT

In [39]:
df_w[['gt_ped_flag', 'gt_coverage', 'org_top_sim', 'ofa_top_sim', 'ofa_gt_sim', 'ofa_con_sim', 'web_ofa_sim']].describe().round(2)

Unnamed: 0,gt_ped_flag,gt_coverage,org_top_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,517.0,517.0,517.0,517.0,517.0,517.0,517.0
mean,0.96,0.55,0.49,0.37,0.41,0.54,0.36
std,0.2,0.21,0.11,0.11,0.12,0.13,0.13
min,0.0,0.09,0.22,-0.01,-0.02,0.05,-0.01
25%,1.0,0.4,0.4,0.3,0.33,0.48,0.27
50%,1.0,0.56,0.48,0.37,0.41,0.55,0.34
75%,1.0,0.73,0.58,0.44,0.48,0.62,0.43
max,1.0,1.0,0.84,0.72,0.83,0.85,0.93


In [58]:
for el in pg_dict:
    print(el)
    display(pg_dict[el][['gt_ped_flag', 'gt_coverage', 'org_top_sim', 'ofa_top_sim', 'ofa_gt_sim', 'ofa_con_sim', 'web_ofa_sim']].describe().round(2))

a


Unnamed: 0,gt_ped_flag,gt_coverage,org_top_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,12.0,12.0,12.0,12.0,12.0,12.0,12.0
mean,1.0,0.49,0.49,0.44,0.47,0.44,0.43
std,0.0,0.03,0.05,0.12,0.14,0.09,0.14
min,1.0,0.4,0.38,0.2,0.22,0.3,0.18
25%,1.0,0.5,0.46,0.39,0.37,0.39,0.37
50%,1.0,0.5,0.49,0.47,0.52,0.45,0.44
75%,1.0,0.5,0.52,0.53,0.57,0.51,0.46
max,1.0,0.5,0.56,0.61,0.71,0.58,0.75


b


Unnamed: 0,gt_ped_flag,gt_coverage,org_top_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,57.0,57.0,57.0,57.0,57.0,57.0,57.0
mean,1.0,0.61,0.5,0.39,0.4,0.55,0.42
std,0.0,0.23,0.09,0.12,0.13,0.13,0.15
min,1.0,0.14,0.29,0.04,0.06,0.12,0.02
25%,1.0,0.43,0.45,0.34,0.32,0.48,0.34
50%,1.0,0.57,0.49,0.4,0.39,0.56,0.43
75%,1.0,0.86,0.57,0.46,0.47,0.65,0.51
max,1.0,0.86,0.67,0.64,0.74,0.76,0.75


c


Unnamed: 0,gt_ped_flag,gt_coverage,org_top_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,33.0,33.0,33.0,33.0,33.0,33.0,33.0
mean,0.97,0.58,0.42,0.35,0.41,0.61,0.36
std,0.17,0.28,0.1,0.1,0.1,0.11,0.1
min,0.0,0.1,0.23,0.17,0.23,0.36,0.17
25%,1.0,0.36,0.37,0.27,0.35,0.55,0.31
50%,1.0,0.5,0.39,0.35,0.41,0.63,0.38
75%,1.0,0.88,0.45,0.42,0.48,0.7,0.42
max,1.0,1.0,0.67,0.55,0.66,0.85,0.54


d


Unnamed: 0,gt_ped_flag,gt_coverage,org_top_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,36.0,36.0,36.0,36.0,36.0,36.0,36.0
mean,1.0,0.54,0.45,0.36,0.4,0.62,0.32
std,0.0,0.18,0.09,0.1,0.1,0.13,0.12
min,1.0,0.1,0.31,0.01,0.1,0.1,0.12
25%,1.0,0.4,0.38,0.35,0.35,0.58,0.22
50%,1.0,0.5,0.42,0.38,0.39,0.64,0.29
75%,1.0,0.7,0.51,0.42,0.45,0.7,0.41
max,1.0,0.8,0.69,0.58,0.59,0.73,0.61


e


Unnamed: 0,gt_ped_flag,gt_coverage,org_top_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,64.0,64.0,64.0,64.0,64.0,64.0,64.0
mean,0.91,0.47,0.48,0.37,0.4,0.5,0.33
std,0.29,0.22,0.11,0.14,0.13,0.17,0.13
min,0.0,0.11,0.31,0.01,0.04,0.05,-0.01
25%,1.0,0.33,0.39,0.33,0.33,0.42,0.27
50%,1.0,0.44,0.48,0.41,0.41,0.5,0.33
75%,1.0,0.67,0.53,0.47,0.48,0.61,0.39
max,1.0,0.89,0.76,0.57,0.71,0.85,0.67


f


Unnamed: 0,gt_ped_flag,gt_coverage,org_top_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,25.0,25.0,25.0,25.0,25.0,25.0,25.0
mean,1.0,0.65,0.5,0.34,0.36,0.53,0.35
std,0.0,0.23,0.1,0.07,0.08,0.07,0.1
min,1.0,0.09,0.33,0.17,0.27,0.32,0.23
25%,1.0,0.56,0.44,0.32,0.31,0.5,0.29
50%,1.0,0.82,0.51,0.33,0.32,0.54,0.32
75%,1.0,0.82,0.58,0.36,0.4,0.57,0.4
max,1.0,0.91,0.7,0.52,0.55,0.65,0.69


g


Unnamed: 0,gt_ped_flag,gt_coverage,org_top_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,17.0,17.0,17.0,17.0,17.0,17.0,17.0
mean,0.88,0.66,0.37,0.27,0.34,0.55,0.33
std,0.33,0.1,0.06,0.07,0.08,0.15,0.11
min,0.0,0.55,0.3,0.11,0.21,0.32,0.16
25%,1.0,0.6,0.32,0.22,0.29,0.45,0.23
50%,1.0,0.64,0.36,0.24,0.35,0.51,0.33
75%,1.0,0.8,0.39,0.33,0.4,0.63,0.35
max,1.0,0.8,0.53,0.41,0.48,0.85,0.57


h


Unnamed: 0,gt_ped_flag,gt_coverage,org_top_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,168.0,168.0,168.0,168.0,168.0,168.0,168.0
mean,0.96,0.52,0.53,0.35,0.42,0.54,0.35
std,0.19,0.17,0.11,0.09,0.1,0.09,0.1
min,0.0,0.12,0.3,0.08,0.19,0.22,0.13
25%,1.0,0.4,0.45,0.3,0.34,0.48,0.28
50%,1.0,0.5,0.55,0.34,0.42,0.57,0.33
75%,1.0,0.62,0.6,0.4,0.47,0.58,0.4
max,1.0,1.0,0.83,0.65,0.76,0.76,0.67


i


Unnamed: 0,gt_ped_flag,gt_coverage,org_top_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,24.0,24.0,24.0,24.0,24.0,24.0,24.0
mean,0.88,0.55,0.39,0.37,0.47,0.51,0.35
std,0.34,0.23,0.08,0.15,0.17,0.15,0.18
min,0.0,0.11,0.29,-0.02,-0.02,0.08,0.09
25%,1.0,0.44,0.32,0.33,0.43,0.44,0.21
50%,1.0,0.55,0.36,0.39,0.5,0.53,0.32
75%,1.0,0.78,0.43,0.47,0.53,0.6,0.53
max,1.0,0.89,0.63,0.54,0.72,0.76,0.62


j


Unnamed: 0,gt_ped_flag,gt_coverage,org_top_sim,ofa_top_sim,ofa_gt_sim,ofa_con_sim,web_ofa_sim
count,81.0,81.0,81.0,81.0,81.0,81.0,81.0
mean,0.96,0.61,0.5,0.36,0.42,0.49,0.38
std,0.19,0.2,0.11,0.14,0.15,0.11,0.17
min,0.0,0.1,0.29,0.04,-0.0,0.1,0.09
25%,1.0,0.4,0.42,0.26,0.33,0.46,0.26
50%,1.0,0.62,0.5,0.35,0.42,0.49,0.34
75%,1.0,0.75,0.57,0.46,0.51,0.53,0.47
max,1.0,0.88,0.72,0.75,0.83,0.72,0.93


In [260]:
for el in pg_dict:
    print(el)
    print(f"{pg_dict[el].describe().round(2).at['mean','web_ofa_sim']}")
    print()

a
0.43

b
0.42

c
0.36

d
0.32

e
0.33

f
0.35

g
0.33

h
0.35

i
0.35

j
0.38



In [261]:
for el in pg_dict:
    print(el)
    print(f"{pg_dict[el].describe().round(2).at['mean','ofa_con_sim']}")
    print()

a
0.44

b
0.55

c
0.61

d
0.62

e
0.5

f
0.53

g
0.55

h
0.54

i
0.51

j
0.49



In [262]:
for el in pg_dict:
    print(el)
    print(f"{pg_dict[el].describe().round(2).at['mean','ofa_gt_sim']}")
    print()

a
0.47

b
0.4

c
0.41

d
0.4

e
0.4

f
0.36

g
0.34

h
0.42

i
0.47

j
0.42



# CLIP

In [19]:
model, preprocess = clip.load("ViT-B/32", device=device, jit=False)

In [20]:
d_path = '.'
#before
df_b = pd.read_pickle(f'{d_path}/av_base.pk')


# after fat
d_path = '/raid/AISSEL/htest/datasets/av_data/ftest/missed_q50_over_avg'
df_f = pd.read_pickle(f'{d_path}/av_mq50_over_avg.pk')

# wit
d_path = '/raid/AISSEL/htest/datasets/av_data/wit/missed_q50_over_avg'
df_w = pd.read_pickle(f'{d_path}/av_mq50_over_avg.pk')

In [21]:
df_b = df_b.dropna()
df_b = df_b.reset_index(drop=True)
df_f = df_f.dropna()
df_f = df_f.reset_index(drop=True)
df_w = df_w.dropna()
df_w = df_w.reset_index(drop=True)

In [22]:
df_b

Unnamed: 0,uniq_id,image_id,caption,topic_id,labels,image,topic2caption_sim,concept2caption_sim,topics,tag,gt,h_caption,ofa_caption
0,4,4,Firefighting Drones,31,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.512088,0.489315,a,"{'aircraft': 0, 'drone': 0, 'sensor': 0, 'uav'...",uav uavs unmanned uas,Two person operating a drone,two men are standing in front of a drone
1,27,27,DJI mavic 2 pro in flight,31,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.506123,0.303162,a,"{'aircraft': 0, 'drone': 1, 'sensor': 0, 'uav'...",drone uav uavs unmanned uas,A drone flying in the air,a small drone flying in the sky
2,102,102,Ruko F11 GIM2 Drone with Camera for Adults 4K ...,31,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.425304,0.273581,a,"{'aircraft': 0, 'drone': 1, 'sensor': 0, 'uav'...",drone uav uavs unmanned uas,Drone with some of the parts,a drone next to a speaker and aepercode5664�� U�
3,141,141,Police Drone Infographic,31,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.583138,0.356735,a,"{'aircraft': 0, 'drone': 1, 'sensor': 0, 'uav'...",drone uav uavs unmanned uas,Police drone advertisement,a drone in a room with the words demon flypoli...
4,156,156,Teal drones swarm,31,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.503839,0.290723,a,"{'aircraft': 0, 'drone': 1, 'sensor': 0, 'uav'...",drone uav uavs unmanned uas,A drone flying in the air,a military helicopter flying in the desert
...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,1792,1792,The various air traffic control facilities enc...,66,,iVBORw0KGgoAAAANSUhEUgAAASIAAAEhCAIAAABUSDpPAA...,0.488674,0.576398,j,"{'clearance': 1, 'aircraft': 1, 'airspace': 1,...",clearance aircraft airspace airport flight radar,Air traffic control system design,a diagram of an air traffic control system
513,1795,1795,Its Our Passion Airspace Aviation,66,,iVBORw0KGgoAAAANSUhEUgAAAhcAAAGiCAYAAABUNuQTAA...,0.419291,0.538884,j,"{'clearance': 0, 'aircraft': 1, 'airspace': 1,...",aircraft airspace airport flight,An aircraft flying in the air,an airplane is flying in the sky
514,1947,1947,Lufthansa Business Class on longhaul aircraft,220,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.333263,0.526307,j,"{'aircraft': 1, 'airline': 1, 'tower': 0, '911...",aircraft airline flight,Passengers inside an airplane,a person sleeping on an airplane seat
515,1979,1979,911 Attacks Pentagon,220,,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,0.655778,0.354867,j,"{'aircraft': 1, 'airline': 0, 'tower': 0, '911...",aircraft pentagon plane flight,A pilot in the driving seat of an aircraft,a painting of two people in an airplane


In [23]:
print(len(df_b))
print(len(df_f))
print(len(df_w))

517
517
517


In [24]:
tmp_df = pd.DataFrame()
tmp_df['image'] = df_b['image'].to_list()
# tmp_df['or_caption'] = df_b['caption'].to_list()
tmp_df['bf_caption'] = df_b['ofa_caption'].to_list()
tmp_df['aw_caption'] = df_w['ofa_caption'].to_list()
tmp_df['af_caption'] = df_f['ofa_caption'].to_list()
# tmp_df['hu_caption'] = df_b['h_caption'].to_list()
tmp_df['topics'] = df_b['topics'].to_list()

In [25]:
tmp_df

Unnamed: 0,image,bf_caption,aw_caption,af_caption,topics
0,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,two men are standing in front of a drone,two men are flying a drone in front of a gas s...,two men standing in front of a drone,a
1,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,a small drone flying in the sky,English A drone flying in the air,a drone flying in the air,a
2,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,a drone next to a speaker and aepercode5664�� U�,English A unmanned aerial vehicle with remote ...,a drone with a remote control and a dvd player,a
3,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,a drone in a room with the words demon flypoli...,A drone drones in the field police and law enf...,a drone in the field police and law enforcemen...,a
4,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,a military helicopter flying in the desert,A B212 in flight,a drone flying over the desert,a
...,...,...,...,...,...
512,iVBORw0KGgoAAAANSUhEUgAAASIAAAEhCAIAAABUSDpPAA...,a diagram of an air traffic control system,A aircraft in the air traffic control system,a diagram of an air traffic control system,j
513,iVBORw0KGgoAAAANSUhEUgAAAhcAAAGiCAYAAABUNuQTAA...,an airplane is flying in the sky,A airplane taking off from the runway,an airplane taking off from an airport runway,j
514,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,a person sleeping on an airplane seat,A flight attendant and passenger asleep in the...,a baby sleeping in the aisle of an airplane,j
515,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,a painting of two people in an airplane,A pilots in the cockpit of a fighter aircraft ...,two pilots in an airplane with a fire in the b...,j


In [26]:
def clip_3(row):
    image_64_decode = base64.b64decode(row['image']) 
    image_result = open('test.jpg', 'wb')
    image_result.write(image_64_decode)
    try:
        image = Image.open('test.jpg')
        image = preprocess(image).unsqueeze(0).to(device)

        with torch.no_grad():
            image_features = model.encode_image(image)

        text_snippets = [row['bf_caption'], row['aw_caption'], row['af_caption']]
#         text_snippets = [row['bf_caption'], row['aw_caption'], row['af_caption'], row['hu_caption']]
        text = clip.tokenize(text_snippets).to(device)

        with torch.no_grad():
            text_features = model.encode_text(text)
        
        with torch.no_grad():
            logits_per_image, logits_per_text = model(image, text)
            probs = list(logits_per_image.softmax(dim=-1).cpu().numpy()[0])
        
        return probs[0], probs[1], probs[2]#, probs[3]
    except:
        return None

In [27]:
clip_scores = tmp_df.apply(clip_3, axis=1)

In [28]:
bf_score = []
aw_score = []
af_score = []
hu_score = []
for el in clip_scores:
#     print(el)
    if el:
        bf_score.append(el[0])
        aw_score.append(el[1])
        af_score.append(el[2])
#         hu_score.append(el[3])
    else:
        bf_score.append(None)
        aw_score.append(None)
        af_score.append(None)
#         hu_score.append(None)

In [29]:
tmp_df['bf_score'] = bf_score
tmp_df['aw_score'] = aw_score
tmp_df['af_score'] = af_score
# tmp_df['hu_score'] = hu_score

In [30]:
tmp_df

Unnamed: 0,image,bf_caption,aw_caption,af_caption,topics,bf_score,aw_score,af_score
0,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,two men are standing in front of a drone,two men are flying a drone in front of a gas s...,two men standing in front of a drone,a,0.168091,0.729980,0.101929
1,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,a small drone flying in the sky,English A drone flying in the air,a drone flying in the air,a,0.109863,0.524414,0.365967
2,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,a drone next to a speaker and aepercode5664�� U�,English A unmanned aerial vehicle with remote ...,a drone with a remote control and a dvd player,a,0.766602,0.042572,0.190796
3,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,a drone in a room with the words demon flypoli...,A drone drones in the field police and law enf...,a drone in the field police and law enforcemen...,a,0.546387,0.364014,0.089233
4,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,a military helicopter flying in the desert,A B212 in flight,a drone flying over the desert,a,0.103699,0.000296,0.895996
...,...,...,...,...,...,...,...,...
512,iVBORw0KGgoAAAANSUhEUgAAASIAAAEhCAIAAABUSDpPAA...,a diagram of an air traffic control system,A aircraft in the air traffic control system,a diagram of an air traffic control system,j,0.497803,0.004307,0.497803
513,iVBORw0KGgoAAAANSUhEUgAAAhcAAAGiCAYAAABUNuQTAA...,an airplane is flying in the sky,A airplane taking off from the runway,an airplane taking off from an airport runway,j,0.049286,0.497559,0.453125
514,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,a person sleeping on an airplane seat,A flight attendant and passenger asleep in the...,a baby sleeping in the aisle of an airplane,j,0.812988,0.045868,0.141235
515,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,a painting of two people in an airplane,A pilots in the cockpit of a fighter aircraft ...,two pilots in an airplane with a fire in the b...,j,0.004925,0.675781,0.319336


In [31]:
tmp_df = tmp_df.dropna()
tmp_df = tmp_df.reset_index(drop=True)
tmp_df

Unnamed: 0,image,bf_caption,aw_caption,af_caption,topics,bf_score,aw_score,af_score
0,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,two men are standing in front of a drone,two men are flying a drone in front of a gas s...,two men standing in front of a drone,a,0.168091,0.729980,0.101929
1,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,a small drone flying in the sky,English A drone flying in the air,a drone flying in the air,a,0.109863,0.524414,0.365967
2,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,a drone next to a speaker and aepercode5664�� U�,English A unmanned aerial vehicle with remote ...,a drone with a remote control and a dvd player,a,0.766602,0.042572,0.190796
3,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,a drone in a room with the words demon flypoli...,A drone drones in the field police and law enf...,a drone in the field police and law enforcemen...,a,0.546387,0.364014,0.089233
4,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,a military helicopter flying in the desert,A B212 in flight,a drone flying over the desert,a,0.103699,0.000296,0.895996
...,...,...,...,...,...,...,...,...
507,iVBORw0KGgoAAAANSUhEUgAAASIAAAEhCAIAAABUSDpPAA...,a diagram of an air traffic control system,A aircraft in the air traffic control system,a diagram of an air traffic control system,j,0.497803,0.004307,0.497803
508,iVBORw0KGgoAAAANSUhEUgAAAhcAAAGiCAYAAABUNuQTAA...,an airplane is flying in the sky,A airplane taking off from the runway,an airplane taking off from an airport runway,j,0.049286,0.497559,0.453125
509,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,a person sleeping on an airplane seat,A flight attendant and passenger asleep in the...,a baby sleeping in the aisle of an airplane,j,0.812988,0.045868,0.141235
510,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBw...,a painting of two people in an airplane,A pilots in the cockpit of a fighter aircraft ...,two pilots in an airplane with a fire in the b...,j,0.004925,0.675781,0.319336


In [32]:
tmp_df.describe().round(4)

Unnamed: 0,bf_score,aw_score,af_score
count,512.0,512.0,512.0
mean,0.355,0.2797,0.3653
std,0.3042,0.3177,0.3088
min,0.0,0.0,0.0
25%,0.0696,0.0189,0.0885
50%,0.3116,0.1458,0.2966
75%,0.5602,0.492,0.5701
max,1.0,0.999,1.0


In [37]:
28 + 37 +36 

101

In [33]:
gk = tmp_df.groupby('topics')
pg_dict = dict()
for el in gk.groups.keys():
    pg_dict[el] = gk.get_group(el)
    pg_dict[el] = pg_dict[el].reset_index(drop=True)
# before


In [34]:
data = []
for el in pg_dict:
    data.append([el, pg_dict[el].describe().round(2).at['mean','aw_score'], pg_dict[el].describe().round(2).at['mean','af_score'], pg_dict[el].describe().round(2).at['mean','bf_score']])

In [35]:
df_clip = pd.DataFrame(data, columns = ['Topic', 'App_1', 'App_2', 'Base'])
df_clip

Unnamed: 0,Topic,App_1,App_2,Base
0,a,0.28,0.33,0.39
1,b,0.2,0.37,0.43
2,c,0.17,0.46,0.37
3,d,0.32,0.39,0.28
4,e,0.41,0.28,0.3
5,f,0.32,0.34,0.34
6,g,0.39,0.31,0.3
7,h,0.28,0.34,0.38
8,i,0.27,0.43,0.3
9,j,0.22,0.43,0.35


In [36]:
df_clip.to_csv(f'air_clip.csv', index=False)

In [None]:
for el in pg_dict:
    print(el)
    print(f"{pg_dict[el].describe().round(2).at['mean','aw_score']}, {pg_dict[el].describe().round(2).at['mean','af_score']}, {pg_dict[el].describe().round(2).at['mean','bf_score']}")
    print()

In [74]:
gk = tmp_df.groupby('topics')
pg_dict = dict()
for el in gk.groups.keys():
    pg_dict[el] = gk.get_group(el)
    pg_dict[el] = pg_dict[el].reset_index(drop=True)
# before
for el in pg_dict:
    print(el)
    print(f"{pg_dict[el].describe().round(2).at['mean','af_score']}, {pg_dict[el].describe().round(2).at['mean','bf_score']}")
    print()

a
0.48, 0.52

b
0.48, 0.52

c
0.56, 0.44

d
0.55, 0.45

e
0.53, 0.47

f
0.49, 0.51

g
0.5, 0.5

h
0.45, 0.55

i
0.56, 0.44

j
0.55, 0.45



In [48]:
gk = tmp_df.groupby('topics')
pg_dict = dict()
for el in gk.groups.keys():
    pg_dict[el] = gk.get_group(el)
    pg_dict[el] = pg_dict[el].reset_index(drop=True)
# before
for el in pg_dict:
    print(el)
    print(f"{pg_dict[el].describe().round(2).at['mean','aw_score']}, {pg_dict[el].describe().round(2).at['mean','bf_score']}")
    print()

a
0.38, 0.62

b
0.33, 0.67

c
0.34, 0.66

d
0.53, 0.47

e
0.57, 0.43

f
0.45, 0.55

g
0.48, 0.52

h
0.39, 0.61

i
0.47, 0.53

j
0.38, 0.62



In [49]:
gk = tmp_df.groupby('topics')
pg_dict = dict()
for el in gk.groups.keys():
    pg_dict[el] = gk.get_group(el)
    pg_dict[el] = pg_dict[el].reset_index(drop=True)
# before
for el in pg_dict:
    print(el)
    display(pg_dict[el].describe().round(2))

a


Unnamed: 0,bf_score,aw_score
count,12.0,12.0
mean,0.62,0.38
std,0.36,0.36
min,0.1,0.0
25%,0.24,0.04
50%,0.67,0.33
75%,0.96,0.77
max,1.0,0.9


b


Unnamed: 0,bf_score,aw_score
count,56.0,56.0
mean,0.67,0.33
std,0.34,0.34
min,0.0,0.0
25%,0.48,0.05
50%,0.82,0.18
75%,0.95,0.52
max,1.0,1.0


c


Unnamed: 0,bf_score,aw_score
count,33.0,33.0
mean,0.66,0.34
std,0.35,0.35
min,0.03,0.0
25%,0.46,0.03
50%,0.84,0.16
75%,0.97,0.54
max,1.0,0.97


d


Unnamed: 0,bf_score,aw_score
count,36.0,36.0
mean,0.47,0.53
std,0.42,0.42
min,0.01,0.0
25%,0.06,0.04
50%,0.31,0.69
75%,0.96,0.94
max,1.0,0.99


e


Unnamed: 0,bf_score,aw_score
count,61.0,61.0
mean,0.43,0.57
std,0.38,0.38
min,0.0,0.0
25%,0.04,0.2
50%,0.32,0.68
75%,0.8,0.96
max,1.0,1.0


f


Unnamed: 0,bf_score,aw_score
count,25.0,25.0
mean,0.55,0.45
std,0.39,0.39
min,0.0,0.0
25%,0.25,0.06
50%,0.61,0.39
75%,0.94,0.75
max,1.0,1.0


g


Unnamed: 0,bf_score,aw_score
count,17.0,17.0
mean,0.52,0.48
std,0.41,0.41
min,0.01,0.01
25%,0.12,0.05
50%,0.67,0.33
75%,0.95,0.88
max,0.99,0.99


h


Unnamed: 0,bf_score,aw_score
count,168.0,168.0
mean,0.61,0.39
std,0.35,0.35
min,0.0,0.0
25%,0.31,0.04
50%,0.67,0.33
75%,0.96,0.69
max,1.0,1.0


i


Unnamed: 0,bf_score,aw_score
count,24.0,24.0
mean,0.53,0.47
std,0.4,0.4
min,0.0,0.0
25%,0.05,0.08
50%,0.57,0.43
75%,0.91,0.95
max,1.0,1.0


j


Unnamed: 0,bf_score,aw_score
count,80.0,80.0
mean,0.62,0.38
std,0.34,0.34
min,0.0,0.0
25%,0.31,0.06
50%,0.76,0.24
75%,0.94,0.69
max,1.0,1.0
