Create the MeMAD Ground Truth
==========================

In [1]:
import os
import sys
sys.path.insert(0, os.path.abspath('../'))

import re
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
from frame_collector import FrameCollector
from src.utils.media_fragment import convert_to_seconds_npt
from src.connectors import memad_connector as memad
from src.connectors import limecraft_connector as limecraft

The [MeMAD Knowledge Graph](http://data.memad.eu) contains information about the presence of people in particular video segments.

The following cells retrieve the 6 people with the highest number of appearence, for then collecting all segments (+ media uri and start/end time) involving them.

For clarity, we will call _media_ an entire video resource (e.g. a mpeg4 file), while with _segment_ a temporal fragment of variable length, to not be confused with _shots_. See definitions of MediaResource and Part in the [ebucore ontology](https://www.ebu.ch/metadata/ontologies/ebucore/)

In [2]:
ENDPOINT = "http://data.memad.eu/sparql-endpoint"
sparql = SPARQLWrapper(ENDPOINT)
sparql.setReturnFormat(JSON)

In [3]:
with open('queries/memad_people_part.rq') as file:
    query = file.read()

sparql.setQuery(query)
people = sparql.query().convert()["results"]["bindings"]
for x in people:
    for k in x:
        x[k] = x[k]['value']
df_people = pd.DataFrame.from_dict(people)
df_people['nb'] = df_people['nb'].apply(pd.to_numeric)
df_people

Unnamed: 0,g,c,label,nb
0,http://data.memad.eu/graph/ina-pa,http://data.memad.eu/agent/rincquesen-nathanae...,"Rincquesen, Nathanaël de",172
1,http://data.memad.eu/graph/ina-ld,http://data.memad.eu/agent/rincquesen-nathanae...,"Rincquesen, Nathanaël de",172
2,http://data.memad.eu/graph/ina-ld,http://data.memad.eu/agent/lucet-elise,"Lucet, Elise",135
3,http://data.memad.eu/graph/ina-pa,http://data.memad.eu/agent/lucet-elise,"Lucet, Elise",135
4,http://data.memad.eu/graph/ina-pa,http://data.memad.eu/agent/le-saint-sophie,"Le Saint, Sophie",55
...,...,...,...,...
962,http://data.memad.eu/graph/ina-ld,http://data.memad.eu/agent/merah-zoulikha,"Merah, Zoulikha",1
963,http://data.memad.eu/graph/ina-ld,http://data.memad.eu/agent/grappe-frederic,"Grappe, Frédéric",1
964,http://data.memad.eu/graph/ina-ld,http://data.memad.eu/agent/dardenne-luc,"Dardenne, Luc",1
965,http://data.memad.eu/graph/ina-ld,http://data.memad.eu/agent/dugarry-christophe,"Dugarry, Christophe",1


Select top 6 celebrities

In [4]:
people_uri = []
people_labels = []
iterator = df_people.itertuples()
while len(people_uri) < 6:
    p = next(iterator)
    if p.c not in people_uri:
        people_uri.append(p.c)
        people_labels.append(p.label)
list(zip(people_uri, people_labels))

[('http://data.memad.eu/agent/rincquesen-nathanael-de',
  'Rincquesen, Nathanaël de'),
 ('http://data.memad.eu/agent/lucet-elise', 'Lucet, Elise'),
 ('http://data.memad.eu/agent/le-saint-sophie', 'Le Saint, Sophie'),
 ('http://data.memad.eu/agent/delahousse-laurent', 'Delahousse, Laurent'),
 ('http://data.memad.eu/agent/gastrin-sophie', 'Gastrin, Sophie'),
 ('http://data.memad.eu/agent/drucker-marie', 'Drucker, Marie')]

In [5]:
with open('queries/memad_parts_by_person.rq') as file:
    query = file.read()

In [6]:
def pad_spaces(text):
    pad = 25 - len(text)
    return text + ''.join([' ' for _ in range(0,pad)])
    
def get_segments(person, name):
    q = query.replace('?person', '<%s>' % person)
    sparql.setQuery(q)
    results = sparql.query().convert()["results"]["bindings"]
    for r in results:
        for p in r:
            value = r[p]['value']
            if p in ['start', 'end']:
                value = convert_to_seconds_npt(re.sub(r"T(\d{2}:\d{2}:\d{2}).+", "\g<1>", value))
            if p == 'prop':
                value = re.split(r"[/#]", value)[-1]
            r[p] = value
        r['person'] = name
        r['person_uri'] = person
    
    results = [r for r in results if 'start' in r and r['start'] < r['end']]
    print('- %s\t%s'%(pad_spaces(name), len(results)))
    
    
    return pd.DataFrame.from_dict(results) 

print('Num. results per person:')
df = pd.concat([get_segments(uri, name) 
                     for uri, name in tqdm(zip(people_uri, people_labels), total=len(people_uri))],
               ignore_index=True)
df

Num. results per person:


HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

- Rincquesen, Nathanaël de 	172
- Lucet, Elise             	142
- Le Saint, Sophie         	55
- Delahousse, Laurent      	44
- Gastrin, Sophie          	39
- Drucker, Marie           	79



Unnamed: 0,p,media,start,end,locator,person,person_uri
0,http://data.memad.eu/fr2/7h00-le-journal/107a2...,http://data.memad.eu/fr2/7h00-le-journal/11c0b...,158.0,188.0,https://platform.limecraft.com/api/production/...,"Rincquesen, Nathanaël de",http://data.memad.eu/agent/rincquesen-nathanae...
1,http://data.memad.eu/fr2/7h00-le-journal/1890a...,http://data.memad.eu/fr2/7h00-le-journal/11c0b...,22.0,74.0,https://platform.limecraft.com/api/production/...,"Rincquesen, Nathanaël de",http://data.memad.eu/agent/rincquesen-nathanae...
2,http://data.memad.eu/fr2/7h00-le-journal/1d8ce...,http://data.memad.eu/fr2/7h00-le-journal/11c0b...,242.0,245.0,https://platform.limecraft.com/api/production/...,"Rincquesen, Nathanaël de",http://data.memad.eu/agent/rincquesen-nathanae...
3,http://data.memad.eu/fr2/7h00-le-journal/21ac2...,http://data.memad.eu/fr2/7h00-le-journal/11c0b...,476.0,506.0,https://platform.limecraft.com/api/production/...,"Rincquesen, Nathanaël de",http://data.memad.eu/agent/rincquesen-nathanae...
4,http://data.memad.eu/fr2/7h00-le-journal/43b30...,http://data.memad.eu/fr2/7h00-le-journal/11c0b...,188.0,209.0,https://platform.limecraft.com/api/production/...,"Rincquesen, Nathanaël de",http://data.memad.eu/agent/rincquesen-nathanae...
...,...,...,...,...,...,...,...
526,http://data.memad.eu/fr2/20-heures/f7f9040681d...,http://data.memad.eu/fr2/20-heures/68bd349849e...,2186.0,2203.0,https://platform.limecraft.com/api/production/...,"Drucker, Marie",http://data.memad.eu/agent/drucker-marie
527,http://data.memad.eu/fr2/20-heures/cfe1c51a95a...,http://data.memad.eu/fr2/20-heures/55e49abf526...,544.0,561.0,https://platform.limecraft.com/api/production/...,"Drucker, Marie",http://data.memad.eu/agent/drucker-marie
528,http://data.memad.eu/fr2/20-heures/f7f9040681d...,http://data.memad.eu/fr2/20-heures/68bd349849e...,2185.0,2204.0,https://platform.limecraft.com/api/production/...,"Drucker, Marie",http://data.memad.eu/agent/drucker-marie
529,http://data.memad.eu/fr2/20-heures/f7f9040681d...,http://data.memad.eu/fr2/20-heures/68bd349849e...,2186.0,2204.0,https://platform.limecraft.com/api/production/...,"Drucker, Marie",http://data.memad.eu/agent/drucker-marie


Some media are repeated (meaning that people appear in more than a segment for each media).

In [7]:
df['media'].value_counts()

http://data.memad.eu/fr2/20-heures/68bd349849eaaeaa3986725f4b0533b63327e23b          41
http://data.memad.eu/fr2/13-heures/f0c2067e0351dd639fcdf0843e4fedf1886be320          29
http://data.memad.eu/fr2/13-heures/312f56e3ca31ea7ab20b540d127830a0ccf9d591          25
http://data.memad.eu/fr2/20-heures/55e49abf5266c5f0942b53b547a2c290693b5dfb          25
http://data.memad.eu/fr2/13-heures/b8919ac070443f2b9482f3f577ea7af34c04e6f4          23
http://data.memad.eu/fr2/13-heures/72322c5279865668cc4d5c9ff1d7646366939f6a          22
http://data.memad.eu/fr2/13-heures/ba98967dff964c47c89ba23d187dd241aaa28b55          22
http://data.memad.eu/fr2/20-heures/bc94cb0df1a39668823168bae83067b81f5d5a28          22
http://data.memad.eu/fr2/13-heures/dd5bc1b8a44809fb736ac6fbbf21a257abaef046          21
http://data.memad.eu/fr2/8h00-le-journal/92c68cd73e858dea2f7fb32ab94dc6b6d3dd0fbf    19
http://data.memad.eu/fr2/13-heures/3ac1dfecf3092bedab3ef74252ceb05f572d2269          18
http://data.memad.eu/fr2/7h00-le

Also some parts are repeated, meaning that in those cases more people appear

In [8]:
df['p'].value_counts()

http://data.memad.eu/fr2/20-heures/8e091b64bcd316b068f30e42c69bab89bf4a4d85          4
http://data.memad.eu/fr2/20-heures/632223105927ac0bca2fd5d2af6a8a4ab17e7b35          4
http://data.memad.eu/fr2/20-heures/341912ffa39e2082cdb4d280dab0ac401aa37ba2          4
http://data.memad.eu/fr2/20-heures/f7f9040681d94f67f69cf0ddb00dfa3bc763ca95          4
http://data.memad.eu/fr2/20-heures/c34e177f830a029e2c8456644d08ce2de28ba154          4
                                                                                    ..
http://data.memad.eu/fr2/8h00-le-journal/a928c7a6d6719d5385614f0bf164a64e8e1a0982    1
http://data.memad.eu/fr2/7h00-le-journal/281746dd802ed78623391b3668dca5569487e287    1
http://data.memad.eu/fr2/13-heures/63fbd6c78d247a7f7b969ecaba6b4b2e74655f84          1
http://data.memad.eu/fr2/7h00-le-journal/58fb2156d600479af85a0a311640e0631a764d88    1
http://data.memad.eu/fr2/13-heures/1abec2cd8a0cf56192f7205cde92e136ddeb56a1          1
Name: p, Length: 476, dtype: int64

In [9]:
df['duration'] = df['end'] - df['start']
df['duration'].describe()

count    531.000000
mean      61.361582
std       69.655034
min        1.000000
25%       20.000000
50%       49.000000
75%       74.000000
max      585.000000
Name: duration, dtype: float64

Given that the segments are not divided in shots, I would focus on the shortest ones.

In [10]:
df_subset = df[df['duration'] < 120]
print('Subset length: %d' %len(df_subset))

df_subset['person_uri'].value_counts()

Subset length: 483


http://data.memad.eu/agent/rincquesen-nathanael-de    172
http://data.memad.eu/agent/lucet-elise                103
http://data.memad.eu/agent/drucker-marie               75
http://data.memad.eu/agent/le-saint-sophie             55
http://data.memad.eu/agent/gastrin-sophie              39
http://data.memad.eu/agent/delahousse-laurent          39
Name: person_uri, dtype: int64

Run the frame collector, which extract for each segment frames at positions n/4, n/2 and 3n/4 (n = center of the segment).

Sometimes the collector will fail (e.g. video not available or no faces found). We will remove those cases later.

In [32]:
old_loc = ''
fc = None

df_subset.sort_values(by='media', ascending=False)

table = df_subset

def ex_frame(row):
    global fc
    global old_loc
    
    loc = memad.get_locator_for(row['media'])
    if not loc:
        print('No loc for: '+ row['media'])
        return 0
    
    loc = limecraft.locator2video(loc['locator']['value'])
    if loc != old_loc:
        fc = FrameCollector(loc, 'memad_gt', id=row['media'])
        old_loc = loc
    
    start = row['start']
    end = row['end']
    duration = end - start
    quarter = int(duration / 4)
    quarter1 = start + quarter 
    center = start + quarter * 2
    quarter2 = start + quarter * 3
    
    try:
        x = fc.run(frame_no=quarter1)
        if x == 0:
            x = fc.run(frame_no=center)
        if x == 0:
            x = fc.run(frame_no=quarter2)
        return x
    except:
        print('Error: ' + loc)
        return 0

frames = [ex_frame(row) for idx, row in tqdm(table.iterrows(), total=len(table))]

HBox(children=(FloatProgress(value=0.0, max=483.0), HTML(value='')))





In [33]:
sframes = [l if l is not None else 0 for l in frames]
frames

['./frames/memad_gt/7h00-le-journal_11c0beb19cddc328195dc91f781280d013756b6b_165.jpg',
 './frames/memad_gt/7h00-le-journal_11c0beb19cddc328195dc91f781280d013756b6b_35.jpg',
 './frames/memad_gt/7h00-le-journal_11c0beb19cddc328195dc91f781280d013756b6b_242.jpg',
 './frames/memad_gt/7h00-le-journal_11c0beb19cddc328195dc91f781280d013756b6b_483.jpg',
 './frames/memad_gt/7h00-le-journal_11c0beb19cddc328195dc91f781280d013756b6b_193.jpg',
 './frames/memad_gt/7h00-le-journal_11c0beb19cddc328195dc91f781280d013756b6b_344.jpg',
 './frames/memad_gt/7h00-le-journal_11c0beb19cddc328195dc91f781280d013756b6b_453.jpg',
 0,
 './frames/memad_gt/7h00-le-journal_11c0beb19cddc328195dc91f781280d013756b6b_217.jpg',
 './frames/memad_gt/7h00-le-journal_11c0beb19cddc328195dc91f781280d013756b6b_513.jpg',
 './frames/memad_gt/7h00-le-journal_11c0beb19cddc328195dc91f781280d013756b6b_660.jpg',
 './frames/memad_gt/7h00-le-journal_11c0beb19cddc328195dc91f781280d013756b6b_405.jpg',
 './frames/memad_gt/7h00-le-journal_11c0

In [34]:
df_subset = df_subset.reset_index(drop=True)

df_subset['frame'] = frames
df_extracted = df_subset[df_subset['frame'] != 0]
df_extracted

Unnamed: 0,p,media,start,end,locator,person,person_uri,duration,frame
0,http://data.memad.eu/fr2/7h00-le-journal/107a2...,http://data.memad.eu/fr2/7h00-le-journal/11c0b...,158.0,188.0,https://platform.limecraft.com/api/production/...,"Rincquesen, Nathanaël de",http://data.memad.eu/agent/rincquesen-nathanae...,30.0,./frames/memad_gt/7h00-le-journal_11c0beb19cdd...
1,http://data.memad.eu/fr2/7h00-le-journal/1890a...,http://data.memad.eu/fr2/7h00-le-journal/11c0b...,22.0,74.0,https://platform.limecraft.com/api/production/...,"Rincquesen, Nathanaël de",http://data.memad.eu/agent/rincquesen-nathanae...,52.0,./frames/memad_gt/7h00-le-journal_11c0beb19cdd...
2,http://data.memad.eu/fr2/7h00-le-journal/1d8ce...,http://data.memad.eu/fr2/7h00-le-journal/11c0b...,242.0,245.0,https://platform.limecraft.com/api/production/...,"Rincquesen, Nathanaël de",http://data.memad.eu/agent/rincquesen-nathanae...,3.0,./frames/memad_gt/7h00-le-journal_11c0beb19cdd...
3,http://data.memad.eu/fr2/7h00-le-journal/21ac2...,http://data.memad.eu/fr2/7h00-le-journal/11c0b...,476.0,506.0,https://platform.limecraft.com/api/production/...,"Rincquesen, Nathanaël de",http://data.memad.eu/agent/rincquesen-nathanae...,30.0,./frames/memad_gt/7h00-le-journal_11c0beb19cdd...
4,http://data.memad.eu/fr2/7h00-le-journal/43b30...,http://data.memad.eu/fr2/7h00-le-journal/11c0b...,188.0,209.0,https://platform.limecraft.com/api/production/...,"Rincquesen, Nathanaël de",http://data.memad.eu/agent/rincquesen-nathanae...,21.0,./frames/memad_gt/7h00-le-journal_11c0beb19cdd...
...,...,...,...,...,...,...,...,...,...
477,http://data.memad.eu/fr2/20-heures/f7f9040681d...,http://data.memad.eu/fr2/20-heures/68bd349849e...,2185.0,2203.0,https://platform.limecraft.com/api/production/...,"Drucker, Marie",http://data.memad.eu/agent/drucker-marie,18.0,./frames/memad_gt/20-heures_68bd349849eaaeaa39...
478,http://data.memad.eu/fr2/20-heures/f7f9040681d...,http://data.memad.eu/fr2/20-heures/68bd349849e...,2186.0,2203.0,https://platform.limecraft.com/api/production/...,"Drucker, Marie",http://data.memad.eu/agent/drucker-marie,17.0,./frames/memad_gt/20-heures_68bd349849eaaeaa39...
480,http://data.memad.eu/fr2/20-heures/f7f9040681d...,http://data.memad.eu/fr2/20-heures/68bd349849e...,2185.0,2204.0,https://platform.limecraft.com/api/production/...,"Drucker, Marie",http://data.memad.eu/agent/drucker-marie,19.0,./frames/memad_gt/20-heures_68bd349849eaaeaa39...
481,http://data.memad.eu/fr2/20-heures/f7f9040681d...,http://data.memad.eu/fr2/20-heures/68bd349849e...,2186.0,2204.0,https://platform.limecraft.com/api/production/...,"Drucker, Marie",http://data.memad.eu/agent/drucker-marie,18.0,./frames/memad_gt/20-heures_68bd349849eaaeaa39...


In [35]:
df_extracted['person'].value_counts()

Rincquesen, Nathanaël de    125
Lucet, Elise                 62
Drucker, Marie               56
Le Saint, Sophie             35
Gastrin, Sophie              30
Delahousse, Laurent          27
Name: person, dtype: int64

In [36]:
df_extracted.to_csv(r'intermediate/memad_parts.csv', index = False)

In [None]:
df_extracted = pd.read_csv('intermediate/memad_parts.csv')
df_extracted

Among the taken segments, some are going "outside" the length of the original media (because of metadata errors).
We mark those as `overflowing`

In [13]:
duration_query = '''  
SELECT SAMPLE(?duration) as ?duration WHERE {
    ?p a ebucore:TVProgramme ;
        ebucore:duration ?duration }
'''
overflowing = []
for i, x in tqdm(df_extracted.iterrows(), total=len(df_extracted)):
    sparql.setQuery(duration_query.replace('?p', f'<{x["media"]}>'))
    res = sparql.query().convert()["results"]["bindings"]
    duration = float(res[0]['duration']['value'])
    overflowing.append(x['end'] > duration)
df_extracted['overflowing'] = overflowing

HBox(children=(FloatProgress(value=0.0, max=335.0), HTML(value='')))




### Manually check and correct the dataset

1. Extract 1 folders for each person
2. Manually move those to 2 groups: containing and not containing the person
3. Re-import in this notebook and fix the table

##### 1. Extract 1 folder for each person

In [49]:
from shutil import copyfile

for p in df_extracted['person'].unique():
    out_path = './dataset_memad/%s' % p.replace(' ', '_')
    os.makedirs(out_path, exist_ok=True)

    for i, row in df_extracted[df_extracted['person'] == p].iterrows():
        x = row['frame']
        fname = x.split('/')[-1]
        copyfile(x, os.path.join(out_path, fname))

##### 2. Manually move those to 2 groups: containing and not containing the person


_(This action is made offline)_

##### 3. Re-import in this notebook and fix the table

In [5]:
manual_good = []
manual_bad = []
for p in os.listdir('./dataset_memad'):
    if p == '.DS_Store':
        continue
        

    for f in os.listdir(os.path.join('./dataset_memad', p)):
        if f == '.DS_Store':
            continue
        if p == 'unknown':
            manual_bad.append('./frames/memad_gt/'+f)
        else:
            manual_good.append((p,'./frames/memad_gt/'+f))

In [6]:
unique, counts = np.unique(np.array(manual_good)[:,0], return_counts=True)
for a,b in zip(unique,counts):
    print(a, b)

Delahousse,_Laurent 10
Drucker,_Marie 18
Gastrin,_Sophie 25
Le_Saint,_Sophie 26
Lucet,_Elise 28
Rincquesen,_Nathanaël_de 65


In [30]:
to_take = []
for p in unique:
    g = [b for a,b in manual_good if a == p]
    g = [b for b in g if df_extracted[df_extracted['frame']==b]['overflowing'].iloc[0] == False]
    to_take = np.append(to_take, np.random.choice(g, size=np.min([len(g),10]), replace=False))
# valid_shots[valid_shots.person.str.contains('Mitterrand')][valid_shots.confidence > 0.4]

Number of distinct segments with known faces:

In [31]:
tot_segments = len(np.unique(to_take))
tot_segments

55

We want to arrive to 100 segments, so we include some without known people

In [32]:
tot_unknown = 100 - tot_segments
tot_unknown

45

In [33]:
dataset = []
for p, f in manual_good:
    if f not in to_take:
        continue 
    person = p.replace('_', ' ')
    row = df_extracted[df_extracted.frame == f][['media', 'locator', 'p', 'start', 'end']].iloc[0].to_dict()
    row['person'] = person
    dataset.append(row)

for f in np.random.choice(manual_bad, tot_unknown, replace=False):
    row = df_extracted[df_extracted.frame == f][['media', 'locator', 'p', 'start', 'end']].iloc[0].to_dict()
    row['person'] = 0
    dataset.append(row)

df_dataset = pd.DataFrame.from_dict(dataset)
df_dataset

Unnamed: 0,media,locator,p,start,end,person
0,http://data.memad.eu/fr2/7h30-le-journal/95864...,https://platform.limecraft.com/api/production/...,http://data.memad.eu/fr2/7h30-le-journal/7d0d1...,462.0,487.0,"Le Saint, Sophie"
1,http://data.memad.eu/fr2/7h30-le-journal/25fd9...,https://platform.limecraft.com/api/production/...,http://data.memad.eu/fr2/7h30-le-journal/af6f8...,358.0,421.0,"Le Saint, Sophie"
2,http://data.memad.eu/fr2/7h30-le-journal/4feee...,https://platform.limecraft.com/api/production/...,http://data.memad.eu/fr2/7h30-le-journal/842d9...,364.0,413.0,"Le Saint, Sophie"
3,http://data.memad.eu/fr2/6h30-le-journal/0474b...,https://platform.limecraft.com/api/production/...,http://data.memad.eu/fr2/6h30-le-journal/48865...,234.0,236.0,"Le Saint, Sophie"
4,http://data.memad.eu/fr2/6h30-le-journal/4f852...,https://platform.limecraft.com/api/production/...,http://data.memad.eu/fr2/6h30-le-journal/498e9...,195.0,217.0,"Le Saint, Sophie"
...,...,...,...,...,...,...
95,http://data.memad.eu/fr2/13-heures/3ac1dfecf30...,https://platform.limecraft.com/api/production/...,http://data.memad.eu/fr2/13-heures/f3a2bc632d9...,340.0,364.0,0
96,http://data.memad.eu/fr2/13-heures/f0c2067e035...,https://platform.limecraft.com/api/production/...,http://data.memad.eu/fr2/13-heures/8311ffe9315...,925.0,968.0,0
97,http://data.memad.eu/fr2/13-heures/2c506ee4e8d...,https://platform.limecraft.com/api/production/...,http://data.memad.eu/fr2/13-heures/9c802bb4d1c...,485.0,524.0,0
98,http://data.memad.eu/fr2/20-heures/bc94cb0df1a...,https://platform.limecraft.com/api/production/...,http://data.memad.eu/fr2/20-heures/a1cd9fbd25b...,1761.0,1786.0,0


Distribution of people in the dataset:

In [34]:
df_dataset['person'].value_counts()

0                            45
Le Saint, Sophie             10
Drucker, Marie               10
Lucet, Elise                 10
Rincquesen, Nathanaël de    10
Gastrin, Sophie              10
Delahousse, Laurent           5
Name: person, dtype: int64

How many known faces we find in the same segment?

In [35]:
df_dataset['p'].value_counts()

http://data.memad.eu/fr2/20-heures/a9847a680cfc4491f40ae831e4ddce5379236e9e          2
http://data.memad.eu/fr2/20-heures/cc39d21635c8a943d384dde70afebdb371fc2fe3          2
http://data.memad.eu/fr2/20-heures/c34e177f830a029e2c8456644d08ce2de28ba154          2
http://data.memad.eu/fr2/20-heures/9fc09633ea05d40826c3fe49e1028160e9384fea          2
http://data.memad.eu/fr2/20-heures/341912ffa39e2082cdb4d280dab0ac401aa37ba2          2
                                                                                    ..
http://data.memad.eu/fr2/6h30-le-journal/498e926f332cddfd8639acd97d3230c36a4390e8    1
http://data.memad.eu/fr2/7h30-le-journal/ec13d7b303640cb9f3fa8e26b624fd8b5d2bf585    1
http://data.memad.eu/fr2/13-heures/2235aab3afd0ced435f2cca4cc1c8b08aaff125e          1
http://data.memad.eu/fr2/13-heures/8311ffe93154950e83b7d2bd367d01a193e5af1b          1
http://data.memad.eu/fr2/13-heures/aa59fe47af2a17462bc03f2f56577fcc2d56048a          1
Name: p, Length: 95, dtype: int64

In [36]:
np.unique(df_dataset['p'].value_counts().to_list(), return_counts=True)

(array([1, 2]), array([90,  5]))

In [None]:
df_dataset.to_csv('dataset_memad.csv', index=False)  

In [5]:
df_dataset = pd.read_csv('dataset_memad.csv')
df_dataset

Unnamed: 0,media,locator,p,start,end,person
0,http://data.memad.eu/fr2/7h30-le-journal/95864...,https://platform.limecraft.com/api/production/...,http://data.memad.eu/fr2/7h30-le-journal/7d0d1...,462,487,"Le Saint, Sophie"
1,http://data.memad.eu/fr2/7h30-le-journal/25fd9...,https://platform.limecraft.com/api/production/...,http://data.memad.eu/fr2/7h30-le-journal/af6f8...,358,421,"Le Saint, Sophie"
2,http://data.memad.eu/fr2/7h30-le-journal/4feee...,https://platform.limecraft.com/api/production/...,http://data.memad.eu/fr2/7h30-le-journal/842d9...,364,413,"Le Saint, Sophie"
3,http://data.memad.eu/fr2/6h30-le-journal/0474b...,https://platform.limecraft.com/api/production/...,http://data.memad.eu/fr2/6h30-le-journal/48865...,229,239,"Le Saint, Sophie"
4,http://data.memad.eu/fr2/6h30-le-journal/4f852...,https://platform.limecraft.com/api/production/...,http://data.memad.eu/fr2/6h30-le-journal/498e9...,195,217,"Le Saint, Sophie"
...,...,...,...,...,...,...
95,http://data.memad.eu/fr2/13-heures/3ac1dfecf30...,https://platform.limecraft.com/api/production/...,http://data.memad.eu/fr2/13-heures/f3a2bc632d9...,340,364,0
96,http://data.memad.eu/fr2/13-heures/f0c2067e035...,https://platform.limecraft.com/api/production/...,http://data.memad.eu/fr2/13-heures/8311ffe9315...,925,968,0
97,http://data.memad.eu/fr2/13-heures/2c506ee4e8d...,https://platform.limecraft.com/api/production/...,http://data.memad.eu/fr2/13-heures/9c802bb4d1c...,485,524,0
98,http://data.memad.eu/fr2/20-heures/bc94cb0df1a...,https://platform.limecraft.com/api/production/...,http://data.memad.eu/fr2/20-heures/a1cd9fbd25b...,1761,1786,0
