## New dataset
1. List of ids from jsons
2. Retrieve metadata from manifest file
3. Create dataframe with timestamps

#### Interval: 2019, 01 through 04

In [3]:
# Manifest CSV; it is read with pandas further down and filtered on its "ID" column
path_csv = "/AI/data/samenvattingen/ArchiefExport_2019.csv"
# Root folder that is walked recursively to find the per-ID files
path_2019 = "/AI/data/samenvattingen/2019"

In [4]:
import pandas as pd
import os
import numpy as np
import json

### 1 List of IDs from summarized files

In [5]:
def get_ids(path):
    """
    Collect the names of all files found anywhere below ``path``.

    Only the bare file names (including extension) are returned, not their
    directories. Every file is included, whatever its extension.
    """
    file_names = []
    for _folder, _subfolders, names in os.walk(path):
        file_names.extend(names)
    return file_names
 
# File names of everything found below the 2019 folder
id_json = get_ids(path_2019)

# The ID is the part of the file name before the first "."
# A comprehension is used so no loop variable named `id` is left behind in the
# notebook namespace, where it would shadow the builtin `id`.
ids = [file_name.split(".")[0] for file_name in id_json]

ids[:5]

['895C5577-0CDA-45A3-835A-060EF2454191',
 '30BCF69B-7CCE-4097-A7AC-05594E837479',
 'BA791733-2732-449A-8DC4-077B642085EB',
 '906D9A09-6E9B-4B01-ADEB-0A13D29D4A71',
 'E94B5735-A16D-41A9-A05A-09EE40E240A3']

### 2 Process manifest file

In [6]:
# Load the manifest, keep only rows whose ID was found in the folder,
# and renumber the rows from 0
df = (
    pd.read_csv(path_csv)
    .loc[lambda frame: frame["ID"].isin(ids)]
    .copy()
    .reset_index(drop=True)
)
print(f'Leght dataframe: {len(df)}\nLenght ids in folder: {len(ids)}')

# Remove columns that are not needed; names missing from the CSV are skipped
df = df.drop(columns=["AlertTime", "SpeechUrl", "MediaUrl", "Summary"], errors='ignore')


Leght dataframe: 22775
Lenght ids in folder: 22775


In [7]:
# Util functions
def get_ms(timestamp):
    """
    Convert the minutes and seconds of a timestamp string into milliseconds.

    Everything after the first "." is discarded, the rest is split on ":" and
    only the second and third fields (minutes, seconds) are used. The first
    field is not part of the result.
    """
    whole_part = timestamp.split(".")[0]
    fields = whole_part.split(":")
    minutes = int(fields[1])
    seconds = int(fields[2])
    return (minutes * 60 + seconds) * 1000
  
def adj_start_ms(x):
    """
    Cap a start time (in ms) for use inside a recording.

    Per the original author's note: a recording starts roughly 5 minutes
    (300000 ms) before the segment, while the recorded start times can lie
    anywhere within a broadcast. Any value above 300000 is therefore replaced
    by 310000 (5 minutes plus an extra 10000 ms for robustness); values of
    300000 or less are returned unchanged.
    """
    return 310000 if x > 300000 else x

In [8]:
# Adjust timings and create duration column
if "StartPosition" in df.keys():
    start_ms = df["StartPosition"].apply(get_ms)
    end_ms = df["EndPosition"].apply(get_ms)
    df["duration_ms"] = end_ms - start_ms
    df["adj_start_ms"] = start_ms.apply(lambda x: adj_start_ms(x))
    df["adj_end_ms"] = df["adj_start_ms"] + df["duration_ms"]
    df = df.drop(columns=["StartPosition", "EndPosition", "start_ms", "end_ms"], errors="ignore")
df.head(5)

Unnamed: 0,ID,ProgrammeDateTime,duration_ms,adj_start_ms,adj_end_ms
0,E94B5735-A16D-41A9-A05A-09EE40E240A3,2019-01-01 05:00:00.000,331000,310000,641000
1,168D1E30-8F21-4C9D-A5DD-149D77E233B1,2019-01-01 06:00:00.000,17000,137000,154000
2,FCDD8B88-9BC5-4F84-BA71-610BFA889314,2019-01-01 06:00:00.000,17000,163000,180000
3,9F461187-A082-42DF-B09B-7566B290657F,2019-01-01 06:00:00.000,486000,310000,796000
4,3376B065-A0BD-4E3D-AC05-14B278077F90,2019-01-01 07:00:00.000,333000,310000,643000


### 3. Parse the transcripts from the json files
- Only relevant transcripts for summaries
- relevant for Gio
- parse from "text" instead of "result"

In [9]:
def get_id_dict(path):
    """
    Map each ID to the full path of its ``.json`` file below ``path``.

    The ID is the part of the file name before the first ".". Files that do
    not end in ".json" are skipped. If the same ID occurs more than once, the
    file visited last wins.
    """
    return {
        name.split(".")[0]: os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(path)
        for name in names
        if name.endswith(".json")
    }
 
def get_paths_by_id(id_dict, id):
    """Return the value stored in ``id_dict`` under ``id``; raises KeyError if it is absent."""
    path = id_dict[id]
    return path

In [10]:
# Look up the JSON file location for every row
id_dict = get_id_dict(path_2019)
# NOTE: `dft` is a second name for the same object as `df`, not a copy;
# columns added to `dft` also appear on `df`.
dft = df
dft['Path'] = dft['ID'].map(lambda row_id: get_paths_by_id(id_dict, row_id))

In [28]:
"""
Dataset JSON loader
Recordings are divided into sentences like so:
[
    {
        "result": [
            [
                "word",
                starttime,
                endtime,
                confidence
            ],
            ...
        ],
        "text": entire sentence,
        "speaker": "unk"??
    },
    ...
]
"""
def load_json(path):
    """
    Read and parse the JSON file at ``path``.

    The file is opened explicitly as UTF-8 so that non-ASCII characters in the
    transcripts are decoded the same way on every machine, instead of
    depending on the platform's default text encoding.

    Returns whatever ``json.load`` produces for the file content.
    Raises OSError if the file cannot be opened and json.JSONDecodeError if
    the content is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)
 
def get_index(data, start, end):
    """
    Find the sentences whose first-word start time is closest to ``start``
    and to ``end``.

    For every entry in ``data`` the value at ``['result'][0][1]`` is taken as
    its beginning-of-sentence time. Returns a tuple
    ``(start_index, end_index)`` of positions in ``data``; on a tie the
    earliest position wins. Returns ``(0, 0)`` when ``data`` is empty.
    """
    bos_times = [sentence['result'][0][1] for sentence in data]

    # Nothing to search in
    if not bos_times:
        return 0, 0

    def closest_position(target):
        nearest = min(bos_times, key=lambda t: abs(t - target))
        return bos_times.index(nearest)

    return closest_position(start), closest_position(end)
 
def get_sentences(data, start_index, end_index):
    """
    Return the ``'text'`` of the entries of ``data`` at positions
    ``start_index`` up to, but not including, ``end_index``.
    """
    return [data[position]['text'] for position in range(start_index, end_index)]
 
def combine_functions(data, start_ms, end_ms):
    """
    Return the sentence texts of ``data`` between the positions that
    ``get_index`` finds for ``start_ms`` and ``end_ms``.
    """
    boundaries = get_index(data, start_ms, end_ms)
    return get_sentences(data, *boundaries)


In [29]:
def get_index_from_path(path, start_ms, end_ms):
    """
    Load the JSON file at ``path`` and return the ``(start_index, end_index)``
    that ``get_index`` finds for ``start_ms`` and ``end_ms``.
    """
    return get_index(load_json(path), start_ms, end_ms)
 
def get_senteces_from_index(path, index):
    """
    Load the JSON file at ``path`` and return the sentence texts from
    position ``index[0]`` up to, but not including, ``index[1]``.
    """
    return get_sentences(load_json(path), index[0], index[1])

In [30]:
def _row_index(row):
    # (start, end) sentence positions for this row's adjusted time window
    return get_index_from_path(row["Path"], row["adj_start_ms"], row["adj_end_ms"])


def _row_transcript(row):
    # Sentence texts between the positions stored in the row's Index column
    return get_senteces_from_index(row["Path"], row["Index"])


dft['Index'] = dft.apply(_row_index, axis=1)
dft['Transcript'] = dft.apply(_row_transcript, axis=1)
dft.head()

Unnamed: 0,ID,ProgrammeDateTime,duration_ms,adj_start_ms,adj_end_ms,Path,Index,Transcript
0,E94B5735-A16D-41A9-A05A-09EE40E240A3,2019-01-01 05:00:00.000,331000,310000,641000,/AI/data/samenvattingen/2019/01/01/E94B5735-A1...,"(36, 96)","[Het gehad., Nou ja goed we gaan eens even kij..."
1,168D1E30-8F21-4C9D-A5DD-149D77E233B1,2019-01-01 06:00:00.000,17000,137000,154000,/AI/data/samenvattingen/2019/01/01/168D1E30-8F...,"(19, 23)","[Meerdere verdachten zijn aangehouden., Vanaf ..."
2,FCDD8B88-9BC5-4F84-BA71-610BFA889314,2019-01-01 06:00:00.000,17000,163000,180000,/AI/data/samenvattingen/2019/01/01/FCDD8B88-9B...,"(25, 28)",[Vanaf vandaag is het lage btw tarief verhoogd...
3,9F461187-A082-42DF-B09B-7566B290657F,2019-01-01 06:00:00.000,486000,310000,796000,/AI/data/samenvattingen/2019/01/01/9F461187-A0...,"(37, 116)",[André Meinema van de economieredactie die hee...
4,3376B065-A0BD-4E3D-AC05-14B278077F90,2019-01-01 07:00:00.000,333000,310000,643000,/AI/data/samenvattingen/2019/01/01/3376B065-A0...,"(65, 120)",[Zeventien vuurwerk en oud en nieuw onlosmakel...


In [31]:
# Final dataframe cleaned from (near) empty transcripts
empty = dft[dft.Transcript.apply(lambda x: len(x) < 4)]
dfc = dft.drop(empty.index, axis=0)
len(dfc)

# Check for empty transcripts
# uncomment to save, already saved in csv!
# dfc.to_csv("/AI/data/samenvattingen/csv/data_2019_04_clean.csv", index=False)

3693

### Dataset Analysis

In [138]:
# Average number of sentences per segment
dfc["Transcript"].apply(len).mean()

50.73625778499865

In [139]:
# Total number of sentences over all segments
dfc["Transcript"].apply(len).sum()

187369

In [141]:
# (rows, columns) of the cleaned dataframe
dfc.shape

(3693, 9)