In [1]:
import pandas as pd
import copy
import re
import os
from SPARQLWrapper import SPARQLWrapper, JSON
import sys

sys.path.append(os.path.abspath('../src'))
from src import database as db
from src import clusterize

In [2]:
# SPARQL endpoint that the queries of this notebook are sent to.
ENDPOINT = "https://okapi.ina.fr/antract/api/saphir/sparql_search"
sparql = SPARQLWrapper(ENDPOINT)

# Call the project's db.init with the path of the YAML configuration file
# (the path is relative to the current working directory).
db.init(conf='../config/config.yaml')

In [3]:
# Read the SPARQL query text from the .rq file in the working directory.
# The encoding is given explicitly so the file is decoded the same way on
# every platform instead of depending on the locale's default encoding.
with open('segment_by_person.rq', 'r', encoding='utf-8') as f:
    query = f.read()

In [4]:
# Submit the query, asking the endpoint for a JSON response.
sparql.setReturnFormat(JSON)
sparql.setQuery(query)

# Convert the response and keep only the list of result bindings.
response = sparql.query().convert()
results = response["results"]["bindings"]

In [5]:
def sparql_to_pandas(results):
    """Flatten SPARQL JSON result bindings into a pandas DataFrame.

    Each binding maps a variable name to a dict that has a ``'value'`` entry;
    only that value is kept. Two kinds of variables are simplified further:

    * ``start`` / ``end``: a ``T<HH:MM:SS><rest>`` portion of the value is
      replaced by just ``HH:MM:SS``.
    * ``prop``: only the last piece after a ``/`` or ``#`` is kept.

    The input is not modified.

    :param results: iterable of bindings (dicts of ``name -> {'value': ...}``).
    :return: DataFrame with one row per binding and one column per variable.
    """
    def simplify(name, value):
        # Reduce a raw binding value to the form used in the DataFrame.
        if name in ('start', 'end'):
            # The replacement must be a raw string: "\g" inside a normal
            # literal is an invalid escape sequence and triggers a warning.
            value = re.sub(r"T(\d{2}:\d{2}:\d{2}).+", r"\g<1>", value)
        if name == 'prop':
            value = re.split(r"[/#]", value)[-1]
        return value

    # Build fresh records rather than deep-copying and mutating the input.
    records = [
        {name: simplify(name, cell['value']) for name, cell in binding.items()}
        for binding in results
    ]
    return pd.DataFrame.from_dict(records)

In [6]:
# Turn the raw bindings into a DataFrame and preview its first rows.
data = sparql_to_pandas(results)
data.head()

Unnamed: 0,notice,prop,title,start,end,analysis,media,url
0,http://www.ina.fr/emission/AFE85004910,aPourParticipant,PREMIERE REUNION DE L'EQUIPE MINISTERIELLE D'E...,00:00:13,00:00:49,http://www.ina.fr/analysis/AFE86004832,http://www.ina.fr/media/AFE86004832,/Media/AF/AFE86004832.mp4
1,http://www.ina.fr/emission/AFE85004918,aPourParticipant,Eisenhower Président,00:06:20,00:08:16,http://www.ina.fr/analysis/AFE86004832,http://www.ina.fr/media/AFE86004832,/Media/AF/AFE86004832.mp4
2,http://www.ina.fr/emission/AFE85005763,aPourParticipant,Voyage de Geneviève de Galard aux Etats Unis,00:02:53,00:03:30,http://www.ina.fr/analysis/AFE86003422,http://www.ina.fr/media/AFE86003422,/Media/AF/AFE86003422.mp4
3,http://www.ina.fr/emission/AFE85007869,aPourParticipant,L'ODYSSEE SUD AMERICAINE DU VICE PRESIDENT NIXON,00:02:07,00:03:11,http://www.ina.fr/analysis/AFE86003620,http://www.ina.fr/media/AFE86003620,/Media/AF/AFE86003620.mp4
4,http://www.ina.fr/emission/AFE85008771,aPourParticipant,Nixon candidat du Parti Républicain,00:02:03,00:02:35,http://www.ina.fr/analysis/AFE86003735,http://www.ina.fr/media/AFE86003735,/Media/AF/AFE86003735.mp4


In [7]:
def to_sec(value):
    """Convert an ``HH:MM:SS`` string into a whole number of seconds."""
    hours, minutes, seconds = (int(part) for part in value.split(':'))
    return (hours * 60 + minutes) * 60 + seconds

In [8]:
def check(segment, person='Dwight Eisenhower', confidence_threshold=0.5):
    """Count the face tracks named ``person`` that lie inside a segment.

    :param segment: mapping (e.g. a DataFrame row) providing ``'start'`` and
        ``'end'`` as ``HH:MM:SS`` strings and ``'media'``, which is passed to
        ``db.get_all_about``.
    :param person: value a track's ``'name'`` must equal to be counted.
        Defaults to the name that used to be hard-coded here.
    :param confidence_threshold: forwarded to ``clusterize.main``. Defaults
        to the value that used to be hard-coded here.
    :return: ``None`` when the lookup yields nothing or no tracks; otherwise
        a tuple ``(hits, duration)`` where ``hits`` is the number of matching
        tracks and ``duration`` the sum of their ``end_npt - start_npt``.
    """
    start = to_sec(segment['start'])
    end = to_sec(segment['end'])

    facerec = db.get_all_about(segment['media'], 'antract')
    if not facerec or 'tracks' not in facerec or len(facerec['tracks']) < 1:
        return None

    tracks = clusterize.main(
        clusterize.from_dict(facerec['tracks']),
        confidence_threshold=confidence_threshold,
        merge_cluster=True,
    )

    hits = 0
    duration = 0
    for track in tracks:
        # NOTE(review): start_npt / end_npt are compared directly with the
        # segment bounds in seconds, so they are presumably seconds too.
        t_start = track['start_npt']
        t_end = track['end_npt']
        # Only tracks lying entirely within the segment are counted.
        if track['name'] == person and t_start >= start and t_end <= end:
            hits += 1
            duration += t_end - t_start

    return hits, duration

In [9]:
# Tally, over every segment of `data`, whether check() found matching tracks.
# Totals are kept overall and separately for the two values of `prop`
# ('imageContient' versus everything else, reported as aPourParticipant).
not_analysed = 0
found = 0
hits = 0
tot_secs = 0
missed = 0

img_cont_found = 0
img_cont_hits = 0
img_cont_secs = 0
img_cont_missed = 0
partic_found = 0
partic_hits = 0
partic_secs = 0
partic_missed = 0

# One entry per row of `data`, in row order.
array_hits = []

for index, segment in data.iterrows():
    res = check(segment)
    img_cont = segment['prop'] == 'imageContient'

    if res is None:
        not_analysed += 1
        # Keep array_hits aligned with the rows of `data`: without this
        # placeholder the list is shorter than the frame and cannot be
        # assigned as a column afterwards.
        array_hits.append(None)
        continue

    count, secs = res

    if count > 0:
        found += 1
        hits += count
        tot_secs += secs
        if img_cont:
            img_cont_found += 1
            img_cont_hits += count
            img_cont_secs += secs
        else:
            partic_found += 1
            partic_hits += count
            partic_secs += secs
    else:
        missed += 1
        if img_cont:
            img_cont_missed += 1
        else:
            partic_missed += 1

    array_hits.append(count)

# Averages are per found segment, so they are skipped when nothing was found.
print('Not analysed: %d' % not_analysed)
print('Missed: %d' % missed)
print('Found: %d' % found)
if found > 0:
    print('Avg hits: %.2f' % (hits / found))
    print('Avg secs: %.2f' % (tot_secs / found))

print('##############')
print('ina:imageContient')
print('Missed: %d' % img_cont_missed)
print('Found: %d' % img_cont_found)
if img_cont_found > 0:
    print('Avg hits: %.2f' % (img_cont_hits / img_cont_found))
    print('Avg secs: %.2f' % (img_cont_secs / img_cont_found))

print('##############')
print('ina:aPourParticipant')
print('Missed: %d' % partic_missed)
print('Found: %d' % partic_found)
if partic_found > 0:
    print('Avg hits: %.2f' % (partic_hits / partic_found))
    print('Avg secs: %.2f' % (partic_secs / partic_found))


Not analysed: 0
Missed: 61
Found: 33
Avg hits: 1.36
Avg secs: 5.36
##############
ina:imageContient
Missed: 28
Found: 7
Avg hits: 1.29
Avg secs: 6.00
##############
ina:aPourParticipant
Missed: 33
Found: 26
Avg hits: 1.38
Avg secs: 5.19


In [10]:
# Store the per-segment hit counts as a new column; array_hits must hold
# exactly one entry per row of `data` for this assignment to succeed.
data['hits'] = array_hits

In [11]:
# List the segments ordered from fewest to most hits.
data[['hits', 'media', 'prop', 'start', 'end']].sort_values('hits')

Unnamed: 0,hits,media,prop,start,end
46,0,http://www.ina.fr/media/AFE86004844,aPourParticipant,00:02:46,00:03:13
58,0,http://www.ina.fr/media/AFE86004878,aPourParticipant,00:05:14,00:06:08
57,0,http://www.ina.fr/media/AFE86003760,imageContient,00:00:13,00:02:07
55,0,http://www.ina.fr/media/AFE86004824,aPourParticipant,00:05:28,00:06:01
54,0,http://www.ina.fr/media/AFE86004572,aPourParticipant,00:02:21,00:02:48
...,...,...,...,...,...
64,2,http://www.ina.fr/media/AFE86003841,aPourParticipant,00:06:23,00:06:51
13,2,http://www.ina.fr/media/AFE86004876,aPourParticipant,00:02:56,00:03:45
1,2,http://www.ina.fr/media/AFE86004832,aPourParticipant,00:06:20,00:08:16
19,2,http://www.ina.fr/media/AFE86004831,aPourParticipant,00:00:54,00:01:26
