In [2]:
import pandas as pd
import copy
import re
import os
from SPARQLWrapper import SPARQLWrapper, JSON
import sys

sys.path.append(os.path.abspath('../src'))
from src import database as db
from src import clusterize

In [3]:
ENDPOINT = "https://okapi.ina.fr/antract/api/saphir/sparql_search"
sparql = SPARQLWrapper(ENDPOINT)

db.init(conf='../config/config.yaml')

In [4]:
with open('segment_person.rq', 'r') as f:
  query = f.read()

In [5]:
sparql.setQuery(query)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()["results"]["bindings"]

In [6]:
def sparql_to_pandas(results):
    res = copy.deepcopy(results)
    for r in res:
        for p in r:
            value = r[p]['value']
            if p in ['start', 'end']:
                value = re.sub(r"T(\d{2}:\d{2}:\d{2}).+", "\g<1>", value)
            if p == 'prop':
                value = re.split(r"[/#]", value)[-1]
            r[p] = value
    return pd.DataFrame.from_dict(res)

In [7]:
data = sparql_to_pandas(results)
data.head()

Unnamed: 0,notice,prop,title,start,end,analysis,media,url
0,http://www.ina.fr/emission/AFE85004910,aPourParticipant,PREMIERE REUNION DE L'EQUIPE MINISTERIELLE D'E...,00:00:13,00:00:49,http://www.ina.fr/analysis/AFE86004832,http://www.ina.fr/media/AFE86004832,/Media/AF/AFE86004832.mp4
1,http://www.ina.fr/emission/AFE85004918,aPourParticipant,Eisenhower Président,00:06:20,00:08:16,http://www.ina.fr/analysis/AFE86004832,http://www.ina.fr/media/AFE86004832,/Media/AF/AFE86004832.mp4
2,http://www.ina.fr/emission/AFE85005763,aPourParticipant,Voyage de Geneviève de Galard aux Etats Unis,00:02:53,00:03:30,http://www.ina.fr/analysis/AFE86003422,http://www.ina.fr/media/AFE86003422,/Media/AF/AFE86003422.mp4
3,http://www.ina.fr/emission/AFE85007869,aPourParticipant,L'ODYSSEE SUD AMERICAINE DU VICE PRESIDENT NIXON,00:02:07,00:03:11,http://www.ina.fr/analysis/AFE86003620,http://www.ina.fr/media/AFE86003620,/Media/AF/AFE86003620.mp4
4,http://www.ina.fr/emission/AFE85008771,aPourParticipant,Nixon candidat du Parti Républicain,00:02:03,00:02:35,http://www.ina.fr/analysis/AFE86003735,http://www.ina.fr/media/AFE86003735,/Media/AF/AFE86003735.mp4


In [8]:
def to_sec(value):
    hh,mm,ss = value.split(':')
    return 3600 * int(hh) + 60 * int(mm) + int(ss)

In [9]:
def check(segment):
    start = to_sec(segment['start'])
    end = to_sec(segment['end'])
    facerec = db.get_all_about(segment['media'], 'antract')
    if not facerec or 'tracks' not in facerec or len(facerec['tracks']) < 1:
        return None
    
    tracks = facerec['tracks']
    tracks = clusterize.main(clusterize.from_dict(tracks), confidence_threshold=0.5, merge_cluster=True)

    hits = 0
    duration = 0
    for t in tracks:
        t_start = t['start_npt']
        t_end = t['end_npt']
        if t['name'] == 'Dwight Eisenhower' and t_start >= start and t_end <= end:
            hits += 1
            duration += t_end - t_start
            
    return hits, duration

In [14]:
not_analysed = 0
found = 0
hits = 0
secs = 0
missed = 0

img_cont_found = 0
img_cont_hits = 0
img_cont_secs = 0
img_cont_missed = 0
partic_found = 0
partic_hits = 0
partic_secs = 0
partic_missed = 0


for index, segment in data.iterrows():
    res = check(segment)
    img_cont = segment['prop'] == 'imageContient'


    if res is None:
        not_analysed += 1
        continue
    
    count, secs = res
    
    if count > 0:
        found += 1
        hits += count
        secs += secs
        if img_cont:
            img_cont_found += 1
            img_cont_hits += count
            img_cont_secs += secs
        else:
            partic_found += 1
            partic_hits += count
            partic_secs += secs
            
    else:
        missed += 1
        if img_cont:
            img_cont_missed += 1
        else:
            partic_missed += 1


print('Not analysed: %d' % not_analysed)
print('Missed: %d' % missed)
print('Found: %d' % found)
if found >0:
    print('Avg hits: %.2f' % (hits / found))
    print('Avg secs: %.2f' % (secs / found))

print('##############')
print('ina:imageContient')
print('Missed: %d' % img_cont_missed)
print('Found: %d' % img_cont_found)
if img_cont_found >0 :
    print('Avg hits: %.2f' % (img_cont_hits / img_cont_found))
    print('Avg secs: %.2f' % (img_cont_secs / img_cont_found))

print('##############')
print('ina:aPourParticipant')
print('Missed: %d' % partic_missed)
print('Found: %d' % partic_found)
if partic_found >0 :
    print('Avg hits: %.2f' % (partic_hits / partic_found))
    print('Avg secs: %.2f' % (partic_secs / partic_found))


Not analysed: 91
Missed: 0
Found: 3
Avg hits: 1.33
Avg secs: 0.67
##############
ina:imageContient
Missed: 0
Found: 0
##############
ina:aPourParticipant
Missed: 0
Found: 3
Avg hits: 1.33
Avg secs: 4.00
