In [1]:
import pandas as pd
import numpy as np
import re
import os

In [2]:
# Switch the working directory so the relative paths used in later cells resolve against it.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not run
# on another machine without editing this line.
os.chdir('/home/tompickard/H_Drive/srv/studat/cdt')

In [3]:
# Master metadata CSV. It is read into meta_df and later overwritten in place
# by the batch-allocation cell, so re-running the notebook changes this file.
INFILE = './team2/data/legasee/metadata/master_metadata.csv'
# Directory for the per-batch test/train CSVs. File names are appended by plain
# string concatenation, so the trailing slash is required.
OUTPATH = './team2/data/legasee/metadata/'

In [4]:
# Load the master metadata table; later cells update this frame in place.
meta_df = pd.read_csv(INFILE)

In [5]:
# Allocation counts as loaded; dropna=False keeps the not-yet-allocated (NaN) rows in the tally.
meta_df.Allocation.value_counts(dropna=False)

NaN           653
Train/Eval     22
Test            8
Name: Allocation, dtype: int64

In [6]:
# Identify key items from tags, interviewee name

def get_priority_items(inrow):
    """Extract the key terms from a row's pipe-separated tag list.

    Each tag is reduced to the text after its last '>' and then broken on
    ' - ' into individual, whitespace-stripped items. Any item of the form
    "<CAPITALS> <rest>" (e.g. "RAF Halesworth") additionally contributes
    "<rest>" ("Halesworth") at the end of the list; contributed items are
    themselves examined the same way.

    Parameters
    ----------
    inrow : object with a ``Tags`` attribute (e.g. a DataFrame row)

    Returns
    -------
    list of str
        The extracted items, duplicates included. Empty when ``Tags`` is
        not a ``str`` (e.g. NaN).
    """
    tags = inrow.Tags
    if type(tags) is not str:
        return []

    service_loc = re.compile(r"[A-Z]+\s(.+)")

    # Leaf of each tag (text after the last '>'), split into its ' - ' parts
    items = [
        part.strip()
        for tag in tags.split('|')
        for part in re.split(' - ', re.split('[>]', tag)[-1])
    ]

    # Walk the list with an explicit cursor: shortened names are appended to
    # the end and are visited in turn once the cursor reaches them.
    pos = 0
    while pos < len(items):
        hit = service_loc.fullmatch(items[pos])
        if hit is not None:
            items.append(hit.group(1))
        pos += 1

    return items

In [7]:
def get_name_words(inrow):
    """Return the individual words of the interviewee's name.

    Parameters
    ----------
    inrow : object with a ``Title`` attribute (e.g. a DataFrame row)

    Returns
    -------
    list of str
        ``Title`` split on single spaces, with surrounding spaces and
        straight quote marks stripped from each piece (so nicknames such as
        'Ernest' come out unquoted) and empty pieces dropped. An empty list
        is returned when ``Title`` is not a string.
    """
    title = inrow.Title

    # A missing title is not a str (pandas reads it as NaN); calling .split on
    # it would raise AttributeError, so treat it as "no name words".
    if not isinstance(title, str):
        return []

    words = []
    for piece in title.split(" "):
        # Strip quote marks from nicknames
        piece = piece.strip(" \"'")
        if piece:
            words.append(piece)

    return words

In [8]:
def add_meta_words(df):
    """Add the 'Priority Words' and 'Name Words' columns to ``df`` in place.

    Each column is filled by applying the matching row-level extractor to
    every row of ``df``. Existing columns of the same name are overwritten.
    Nothing is returned.
    """
    extractors = (
        ('Priority Words', get_priority_items),
        ('Name Words', get_name_words),
    )

    for column, extractor in extractors:
        df[column] = df.apply(extractor, axis=1)

In [9]:
# For each batch, take the still-unallocated interviews that have transcripts,
# move the requested number of them to the test set and mark the remainder
# as Train/Eval. Results are written back to INFILE (and, when a test set is
# drawn, to per-batch test/train CSVs under OUTPATH).

# Number of transcribed interviews to send to the test set, keyed by batch id.
batch_allocation = {-1 : 4,
                     0 : 0,
                     1 : 0}

# Seed for the random test draw. None keeps the draw non-deterministic;
# set an int to make the selection repeatable.
SAMPLE_SEED = None

for b,test_target in batch_allocation.items():

    # Get unallocated items from this batch
    filt = (meta_df.Batch == b) & (pd.isnull(meta_df.Allocation))
    b_df = meta_df[filt].copy()

    # If none selected, do nothing - all allocated already
    if len(b_df) == 0:
        print("Batch {} has no unallocated items. Skipping.".format(str(b)))
        continue

    # Look for entries with transcripts, check there are enough
    b_gold = b_df[b_df.Transcript > 0]

    if len(b_gold) < test_target:
        raise ValueError("Batch {} has {} items with gold standard, but {} requested. Aborting.".format(str(b),len(b_gold),test_target))

    if test_target < 0:
        raise ValueError("Target count must be >= 0, but {} requested. Aborting.".format(test_target))

    if test_target == 0:
        # Nothing to go to test: every transcribed item becomes Train/Eval
        b_gold = b_gold.assign(Allocation = 'Train/Eval')
        meta_df.update(b_gold[['Allocation']])

        add_meta_words(meta_df)
        meta_df.to_csv(INFILE, index=False)
        continue

    # Randomly pick the test rows; the rest of the transcribed rows go to Train/Eval
    b_to_test = b_gold.sample(test_target, random_state=SAMPLE_SEED)
    b_to_train = b_gold.drop(b_to_test.index)

    b_to_test = b_to_test.assign(Allocation = 'Test')
    b_to_train = b_to_train.assign(Allocation = 'Train/Eval')

    # Write the new allocations back into the master frame (aligned on index)
    meta_df.update(b_to_test[['Allocation']])
    meta_df.update(b_to_train[['Allocation']])

    add_meta_words(meta_df)

    # Re-select both subsets from the master frame so the exported rows carry
    # the freshly added word columns. The indices are labels, so use .loc, and
    # take the train rows from the train index rather than the test index.
    b_to_test = meta_df.loc[b_to_test.index]
    b_to_train = meta_df.loc[b_to_train.index]

    b_to_test.to_csv(OUTPATH+'batch_{}_test.csv'.format(b),index=False)
    b_to_train.to_csv(OUTPATH+'batch_{}_train.csv'.format(b),index=False)
    meta_df.to_csv(INFILE, index=False)
        

In [10]:
# Allocation counts after the batch-allocation loop; NaN rows are those still unallocated.
meta_df.Allocation.value_counts(dropna=False)

NaN           640
Train/Eval     31
Test           12
Name: Allocation, dtype: int64

In [11]:
# Show every row currently allocated to the test set.
# NOTE(review): the rendered output contains interviewee names; consider clearing it before sharing.
meta_df[meta_df.Allocation == 'Test']

Unnamed: 0,id,Title,Batch,Transcript,Allocation,Content,biografy,vimeo_promo_id,vimeo_description,Service Types,Project Types,Tags,related_videos,Priority Words,Name Words
1,2071.0,Alex Owens,1,1,Test,,The delightful Alex Owens provides a classic s...,90101380.0,,Navy,Keeping Britain Afloat|The Veterans' video arc...,Places>England - HMS Ganges (Stone Frigate)|Mi...,"a:7:{i:0;s:4:""2074"";i:1;s:4:""2075"";i:2;s:4:""20...","[England, HMS Ganges (Stone Frigate), Descript...","[Alex, Owens]"
9,2573.0,Gordon Hooton,1,1,Test,,Gordon ran away from home and the Navy and the...,,,Navy,Keeping Britain Afloat|The Veterans' video arc...,Places>Russia|Places>Russia - Polyarny|Places>...,"a:5:{i:0;s:4:""2576"";i:1;s:4:""2577"";i:2;s:4:""25...","[Russia, Russia, Polyarny, The Far East, Russi...","[Gordon, Hooton]"
10,2374.0,Irene Bellamy,1,1,Test,,Irene Bellamy provides an entertaining and det...,100237326.0,,Navy,The Secret War|The Veterans' video archive|Kee...,Places>Normandy - Granville|Miscellaneous>Meda...,"a:9:{i:0;s:4:""2377"";i:1;s:4:""2378"";i:2;s:4:""23...","[Normandy, Granville, Medal, British Empire Me...","[Irene, Bellamy]"
13,2536.0,Joan Field,1,1,Test,,Joan Field was a WREN stationed at an extraord...,107398595.0,"""I got this rifle and I knew it wasn't loaded ...",Navy,The Veterans' video archive|Keeping Britain Af...,Vehicles>Southern Wave|Miscellaneous>Descripti...,"a:7:{i:0;s:4:""2540"";i:1;s:4:""2541"";i:2;s:4:""25...","[Southern Wave, Description, Camp / Accom, Des...","[Joan, Field]"
14,2410.0,Joe Pitcher,1,1,Test,,Joe Pitcher was a DEMS gunner on many Merchant...,97151742.0,"In this short extract from his interview, Joe ...",Navy,The Veterans' video archive|Keeping Britain Af...,Role>DEMS Gunner|Vehicles>HMS Kenrix|Role>Chat...,"a:7:{i:0;s:4:""2413"";i:1;s:4:""2414"";i:2;s:4:""24...","[DEMS Gunner, HMS Kenrix, Chatham Rating, HMS ...","[Joe, Pitcher]"
15,1465.0,John Woodward,1,1,Test,,John Woodward worked on a Minesweeper during W...,72025251.0,"In this short extract from his interview, John...",Navy,The Veterans' video archive|Keeping Britain Af...,Battles>Operation - Neptune / Overlord (D-Day)...,"a:5:{i:0;s:4:""1468"";i:1;s:4:""1469"";i:2;s:4:""14...","[Operation, Neptune / Overlord (D-Day), Reserv...","[John, Woodward]"
20,1545.0,Rodney Newham,1,1,Test,,Rodney Newham lives in the RNBT’s Pembroke Hou...,90101415.0,Rodney details some of his work at the Chatham...,Civilian,The Veterans' video archive|Keeping Britain Af...,Miscellaneous>Servicing - Maintenance|Miscella...,"a:5:{i:0;s:4:""1548"";i:1;s:4:""1549"";i:2;s:4:""15...","[Servicing, Maintenance, Servicing, Repair, Sh...","[Rodney, Newham]"
22,2021.0,Vic Ould,1,1,Test,,Vic Ould gives a fascinating account of his li...,84855907.0,,Navy,Keeping Britain Afloat|The Veterans' video arc...,Places>Scotland - HMS Scotia (Stone Frigate)|P...,"a:12:{i:0;s:4:""2024"";i:1;s:4:""2025"";i:2;s:4:""2...","[Scotland, HMS Scotia (Stone Frigate), Scotlan...","[Vic, Ould]"
64,831.0,Alec 'Ernest' Kellaway,-1,1,Test,,Alec Kellaway is the only veteran in our archi...,45786858.0,It comes to something when a man who served on...,Navy,The Veterans' video archive|Keeping Britain Af...,Battles>Operation - Neptune / Overlord (D-Day)...,"a:9:{i:0;s:3:""834"";i:1;s:3:""835"";i:2;s:3:""836""...","[Operation, Neptune / Overlord (D-Day), Railto...","[Alec, Ernest, Kellaway]"
69,2420.0,Colette Cook,-1,1,Test,,Colette Cook gives an entertaining account of ...,97151743.0,Colette gives some nice detail about what it w...,Navy,The Veterans' video archive|Keeping Britain Af...,War/Conflict>WWII|Service Type>Navy|Places>Eng...,"a:7:{i:0;s:4:""2423"";i:1;s:4:""2424"";i:2;s:4:""24...","[WWII, Navy, England, London ( Eastcote), WRNS...","[Colette, Cook]"
