## New dataset
1. List of ids from jsons
2. Retrieve metadata from manifest file
3. Create dataframe with timestamps

#### Interval: 2019, months 01 through 04

In [1]:
import pandas as pd
import os
import numpy as np
import json

In [11]:
# Manifest CSV; loaded further down with pd.read_csv
path_csv = '../../../data/samenvattingen/ArchiefExport_2019.csv'
# Root folder that is walked recursively for processed transcript .json files
path_2019 = "../../../data/samenvattingen/2019"

### Process manifest file

In [12]:
# Build a lookup from transcript ID (file name up to the first ".") to the
# location of its .json file, covering every sub-folder of path_2019
id_dict = {
    fname.split(".")[0]: os.path.join(folder, fname)
    for folder, _, fnames in os.walk(path_2019)
    for fname in fnames
    if fname.endswith(".json")
}

# Keep only the manifest rows that belong to a processed file and attach the path
manifest = pd.read_csv(path_csv)
is_processed = manifest["ID"].isin(id_dict.keys())
df = manifest.loc[is_processed].copy().reset_index(drop=True)
df['Path'] = [id_dict[key] for key in df['ID']]

# Discard manifest columns that are not used further on (absent ones are skipped)
unused_columns = ["AlertTime", "SpeechUrl", "MediaUrl", "Summary"]
df = df.drop(columns=unused_columns, errors='ignore')

print(f'Number of processed files: {len(df)}')

Number of processed files: 22775


In [115]:
# Util functions
def get_ms(timestamp):
    """
    Convert an "HH:MM:SS[.fraction]" position string into milliseconds.

    Everything after the first "." is discarded, so the result always has
    whole-second resolution. The hours field is included in the result, so
    positions at or beyond one hour (and durations that cross an hour
    boundary) are computed correctly.

    :param timestamp: Position string, e.g. "00:05:10.0000000".
    :return: Position in milliseconds as an int.
    :raises IndexError: If the string has fewer than three ":"-separated fields.
    :raises ValueError: If a field is not an integer.
    """
    clock = timestamp.split(".")[0]
    fields = clock.split(":")
    hours = int(fields[0])
    minutes = int(fields[1])
    seconds = int(fields[2])
    return ((hours * 60 + minutes) * 60 + seconds) * 1000
  
def adj_start_ms(x, threshold_ms=300000, padding_ms=10000):
    """
    Cap a segment start time so that it falls inside the recording window.

    Per the original author's note: a recording starts ~5 minutes (300000 ms)
    before its segment starts, but the recorded start time can be any time
    within a broadcast. Start times above the threshold are therefore replaced
    by a fixed value: the threshold plus some extra padding for robustness.
    Start times at or below the threshold are returned unchanged.

    :param x: Start time in milliseconds.
    :param threshold_ms: Largest start time that is kept as-is (default 5 min).
    :param padding_ms: Extra milliseconds added on top of the threshold when
        the start time is capped (default 10000).
    :return: ``x`` if ``x <= threshold_ms``, else ``threshold_ms + padding_ms``.
    """
    if x > threshold_ms:
        return threshold_ms + padding_ms
    return x

In [116]:
# Derive the timing columns (duration, adjusted start, adjusted end) from the
# raw position strings. The guard makes the cell a no-op once the raw position
# columns have been dropped by an earlier run.
if "StartPosition" in df.columns:
    raw_start = df["StartPosition"].apply(get_ms)
    raw_end = df["EndPosition"].apply(get_ms)
    segment_length = raw_end - raw_start
    shifted_start = raw_start.apply(adj_start_ms)

    df["duration_ms"] = segment_length
    df["adj_start_ms"] = shifted_start
    df["adj_end_ms"] = shifted_start + segment_length

    obsolete = ["StartPosition", "EndPosition", "start_ms", "end_ms"]
    df = df.drop(columns=obsolete, errors="ignore")
df.head(5)

Unnamed: 0,ID,ProgrammeDateTime,Path,duration_ms,adj_start_ms,adj_end_ms
0,E94B5735-A16D-41A9-A05A-09EE40E240A3,2019-01-01 05:00:00.000,/AI/data/samenvattingen/2019/01/01/E94B5735-A1...,331000,310000,641000
1,168D1E30-8F21-4C9D-A5DD-149D77E233B1,2019-01-01 06:00:00.000,/AI/data/samenvattingen/2019/01/01/168D1E30-8F...,17000,137000,154000
2,FCDD8B88-9BC5-4F84-BA71-610BFA889314,2019-01-01 06:00:00.000,/AI/data/samenvattingen/2019/01/01/FCDD8B88-9B...,17000,163000,180000
3,9F461187-A082-42DF-B09B-7566B290657F,2019-01-01 06:00:00.000,/AI/data/samenvattingen/2019/01/01/9F461187-A0...,486000,310000,796000
4,3376B065-A0BD-4E3D-AC05-14B278077F90,2019-01-01 07:00:00.000,/AI/data/samenvattingen/2019/01/01/3376B065-A0...,333000,310000,643000


### Parse the transcripts from the json files
- Only relevant transcripts for summaries
- Parse from "text" instead of "result"

In [117]:
"""
Dataset JSON loader
Recordings are divided into sentences like so:
[
    {
        "result": [
            [
                "word",
                starttime,
                endtime,
                confidence
            ],
            ...
        ],
        "text": entire sentence,
        "speaker": "unk"??
    },
    ...
]
""" 
def get_index(data, start, end):
    """
    Find the sentences that begin closest to the given start and end times.

    The begin time of a sentence is the start time of its first word
    (``sentence['result'][0][1]``). On ties the earliest sentence wins.

    :param data: List of sentence dicts as loaded from a transcript .json.
    :param start: Target start time, in the same unit as the word times.
    :param end: Target end time, in the same unit as the word times.
    :return: Tuple ``(start_index, end_index)`` of sentence positions, or
        ``(0, 0)`` when ``data`` is empty.
    """
    onsets = [sentence['result'][0][1] for sentence in data]

    if not onsets:
        return 0, 0

    positions = range(len(onsets))
    start_index = min(positions, key=lambda pos: abs(onsets[pos] - start))
    end_index = min(positions, key=lambda pos: abs(onsets[pos] - end))
    return start_index, end_index

def get_text_from_index(transcript, index):
    """
    Cut the relevant segment out of a full transcript.

    :param transcript: Sequence of sentences.
    :param index: Pair ``(start_index, end_index)``; the end is exclusive.
    :return: ``transcript[start_index:end_index]``.
    """
    first_sentence = index[0]
    stop_sentence = index[1]
    segment = transcript[first_sentence:stop_sentence]
    return segment

In [118]:
# Load every transcript .json and keep two views of it per row:
#   - Transcript_full: all sentences of the recording
#   - Transcript_sub:  the sentences between the adjusted start and end times
t_full = []
t_sub = []
for path, seg_start, seg_end in zip(df["Path"], df["adj_start_ms"], df["adj_end_ms"]):
    # Read as UTF-8 explicitly: the transcripts contain non-ASCII characters
    # and the platform default encoding is not guaranteed to be UTF-8.
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    index = get_index(data, seg_start, seg_end)
    transcript = [t['text'] for t in data]
    t_full.append(transcript)
    t_sub.append(get_text_from_index(transcript, index))

df['Transcript_full'] = t_full
df['Transcript_sub'] = t_sub

In [119]:
# Remove rows whose extracted segment is (nearly) empty, i.e. has fewer than
# 4 sentences, and renumber the remaining rows from 0
empty = df[df["Transcript_sub"].apply(len) < 4]
df = df.drop(index=empty.index).reset_index(drop=True)
df.head()

# Check for empty transcripts
# uncomment to save, already saved in csv!
# dfc.to_csv("/AI/data/samenvattingen/csv/data_2019_04_clean.csv", index=False)

Unnamed: 0,ID,ProgrammeDateTime,Path,duration_ms,adj_start_ms,adj_end_ms,Transcript_full,Transcript_sub
0,E94B5735-A16D-41A9-A05A-09EE40E240A3,2019-01-01 05:00:00.000,/AI/data/samenvattingen/2019/01/01/E94B5735-A1...,331000,310000,641000,[Straat waren rode cirkels getekend en niemand...,"[Het gehad., Nou ja goed we gaan eens even kij..."
1,168D1E30-8F21-4C9D-A5DD-149D77E233B1,2019-01-01 06:00:00.000,/AI/data/samenvattingen/2019/01/01/168D1E30-8F...,17000,137000,154000,[Radio m. Utrecht in Midden Nederland op FM dr...,"[Meerdere verdachten zijn aangehouden., Vanaf ..."
2,9F461187-A082-42DF-B09B-7566B290657F,2019-01-01 06:00:00.000,/AI/data/samenvattingen/2019/01/01/9F461187-A0...,486000,310000,796000,[Een zinzen St Regis was dat is kwart over zes...,[André Meinema van de economieredactie die hee...
3,3376B065-A0BD-4E3D-AC05-14B278077F90,2019-01-01 07:00:00.000,/AI/data/samenvattingen/2019/01/01/3376B065-A0...,333000,310000,643000,[En zeggen we waar we zijn en wat er aan de ha...,[Zeventien vuurwerk en oud en nieuw onlosmakel...
4,906D9A09-6E9B-4B01-ADEB-0A13D29D4A71,2019-01-01 07:00:00.000,/AI/data/samenvattingen/2019/01/01/906D9A09-6E...,355000,259000,614000,[In tweeduizendnegentien reden we nog op benzi...,[Daar is de brandweer nog steeds bezig met het...


In [99]:
# Summary statistics: row count and mean number of sentences per segment
sentence_counts = df["Transcript_sub"].apply(len)
avg_sentences = np.mean(sentence_counts)
print(f'Number of segments: {len(df)}')
print(f'Avg. segment length: {avg_sentences:.4}')

Number of segments: 3693
Avg. number of sentences per segment: 50.74


### Concatenated Dataset

In [100]:
from datetime import datetime
from tqdm import tqdm
# Shared random generator used by create_doc to group segments.
# NOTE(review): no seed is passed, so the grouping differs on every run;
# pass a fixed seed here if reproducible documents are needed.
rng = np.random.default_rng()

In [142]:
def create_doc(data: pd.DataFrame, save=True):
    """
    Creates documents by concatenating randomly grouped transcript segments.

    Rows are drawn without replacement until none remain. While 6 or more rows
    are left a group has 2-4 rows, with exactly 5 left it has 2-3 rows, and
    with 4 or fewer left all remaining rows form the last group. Inside a
    document every segment is preceded by a ``===<ID>===`` header line and its
    sentences are joined with single spaces.

    Grouping uses the module-level ``rng`` generator; ``data`` is not modified.

    :param data: Dataframe containing the data to be processed. Must have an
        "ID" column and a "Transcript_sub" column holding a list of sentences.
    :param save: If truthy, write the documents as zero-padded, numbered .txt
        files into a new ``../NLAuVi/data_<ddmm-HHMM>`` folder. Otherwise the
        documents are collected in memory and returned.
    :return: List of document strings when ``save`` is falsy, otherwise None.
    :raises FileExistsError: If ``save`` is truthy and the target folder
        already exists (e.g. a second run within the same minute).
    """
    # Work on a re-indexed copy: rows are selected by position below, so the
    # index must be 0..n-1 regardless of how the caller filtered the frame.
    d = data.reset_index(drop=True)

    dt = datetime.now().strftime("%d%m-%H%M")
    i = 0
    # Number of digits of the row count (at least 1, so empty input is safe);
    # used to zero-pad the file names.
    pad = len(str(max(len(d), 1)))

    doc_lengths = []
    docs = []

    if save:
        new_dir = f'../NLAuVi/data_{dt}'
        os.mkdir(new_dir)

    with tqdm(total=len(d), desc='Transcripts processed') as pbar:
        while len(d) > 0:
            remaining = len(d)
            # Upper bounds of rng.integers are exclusive: 2-4 resp. 2-3 rows.
            # These sizes never leave exactly one row behind for the next round.
            if remaining >= 6:
                n = rng.integers(2, 5)
            elif remaining == 5:
                n = rng.integers(2, 4)
            else:
                n = remaining
            picked = rng.choice(remaining, size=n, replace=False)

            parts = []
            n_sentences = 0
            for pos in picked:
                text = d["Transcript_sub"].iloc[pos]
                parts.append(f'==={d["ID"].iloc[pos]}===\n{" ".join(text)}\n')
                n_sentences += len(text)
            doc = ''.join(parts)

            # NOTE(review): this is the mean number of sentences per segment in
            # the document, not the document's total sentence count.
            doc_lengths.append(n_sentences / n)

            # Translate positions to labels before dropping, then renumber
            d = d.drop(index=d.index[picked]).reset_index(drop=True)

            if save:
                with open(f'{new_dir}/{i:0{pad}}.txt', 'w', encoding='utf-8') as tfile:
                    tfile.write(doc)
            else:
                docs.append(doc)
            i += 1
            pbar.update(n)

    print(f'Created {i} Documents')
    if doc_lengths:
        print(f'   Average document length = {np.mean(doc_lengths)} sentences')

    if not save:
        return docs

In [126]:
# Preview the first three rows
df.head(3)

Unnamed: 0,ID,ProgrammeDateTime,Path,duration_ms,adj_start_ms,adj_end_ms,Transcript_full,Transcript_sub
0,E94B5735-A16D-41A9-A05A-09EE40E240A3,2019-01-01 05:00:00.000,/AI/data/samenvattingen/2019/01/01/E94B5735-A1...,331000,310000,641000,[Straat waren rode cirkels getekend en niemand...,"[Het gehad., Nou ja goed we gaan eens even kij..."
1,168D1E30-8F21-4C9D-A5DD-149D77E233B1,2019-01-01 06:00:00.000,/AI/data/samenvattingen/2019/01/01/168D1E30-8F...,17000,137000,154000,[Radio m. Utrecht in Midden Nederland op FM dr...,"[Meerdere verdachten zijn aangehouden., Vanaf ..."
2,9F461187-A082-42DF-B09B-7566B290657F,2019-01-01 06:00:00.000,/AI/data/samenvattingen/2019/01/01/9F461187-A0...,486000,310000,796000,[Een zinzen St Regis was dat is kwart over zes...,[André Meinema van de economieredactie die hee...


In [143]:
# Build the concatenated documents; save defaults to True, so this creates a
# new timestamped folder and writes one .txt file per document.
create_doc(df)

Transcripts processed: 100%|██████████| 3693/3693 [00:01<00:00, 2591.90it/s]

Created 1232 Documents
   Average document length = 50.642518939393945 sentences



