# Import

In [1]:
import os
import json
import pandas as pd
import numpy as np
import shutil

# Define

In [2]:
def parse_fname(file, split_by='_'):
    fname = file.split('.')[0]
    fname_list = fname.split(split_by)
    
    iteration = fname_list[1][-1]
    comb = '_separate' if len(fname_list) == 3 else ''
    
    return iteration, comb, fname

In [3]:
def read_examples(fname):
    with open(fname, 'r') as f:
        return [json.loads(ex) for ex in f]

def get_files_by_type(fdir, ftype='.jsonl', start='train'):
    all_files = next(os.walk(fdir))[2]
    
    return [file for file in all_files if file.endswith(ftype) and file.startswith(start)]

def write_jsonl(df, fname):
    with open(fname, 'w') as f:
        for idx, row in df.iterrows():
            f.write(f'{json.dumps(row.to_dict())}\n')

In [4]:
def cross_val(df, leave_p=0.1, sort_by='AnonId', iteration=None):
    rows, cols = df.shape
#     print(rows)
    
    if sort_by is None:
        df_sorted = df
    else:
        df_sorted = df.sort_values(sort_by, ignore_index=True)
    
    sampled = []
    step = int(leave_p*rows)
    prev = 0
    stop = rows+step if rows%step == 0 else rows
    for split in np.arange(step, stop, step):
        sampled.append(pd.concat([
            df_sorted.iloc[:prev, :],
            df_sorted.iloc[split:, :]
        ]))
#         if not iteration is None and int(iteration) == 2:
#             print(prev, split)
        
        prev = split
        
#         print(sampled[-1].shape[0])
    
    return sampled

# Run

In [5]:
repo = os.path.dirname(os.path.dirname(os.path.abspath('__file__')))
data = os.path.join(repo, 'NLI_data')
out_base = os.path.join(repo, 'tasks', 'data')
treats = {
    '1_Baseline_protocol': 'baseline', 
    '2_Ling_on_side_protocol': 'LotS', 
    '3_Ling_in_loop_protocol': 'LitL',
}
rounds = range(1,6)

## Explore Examples by Annotator

In [6]:
iteration = 5

datasets = {}
for fbase, treat in treats.items():
    datasets[treat] = pd.DataFrame(read_examples(os.path.join(data, fbase, f'train_round{iteration}_{treat}_combined.jsonl')))

In [21]:
keeps = ['AnonId', 'round', 'heuristic', 'heuristic_checked']

for treat, dataset in datasets.items():
    keeps = ['AnonId', 'round']
    grouping = ['round', 'AnonId']
    if treat != 'baseline':
        keeps.extend(['heuristic', 'heuristic_checked'])
        grouping.extend(['heuristic', 'heuristic_checked'])
    
    temp = dataset[keeps]
    temp['counts'] = 1
    
    grouped = temp.groupby(grouping).count()
    grouped.to_csv(os.path.join(repo, 'eval_summary', 'annotator_counts', f'{treat}.csv'))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


## Sample

In [6]:
leave_p = 0.1
sort_by = 'AnonId'

In [7]:
overwrite = True

In [8]:
for treat_in, treat_out in treats.items():
    treat_dir = os.path.join(data, treat_in)    
    files = get_files_by_type(treat_dir)
    
    for file in files:
        iteration, comb, fname = parse_fname(file)
        
        run_dir = os.path.join(out_base, f'{treat_out}_{iteration}{comb}')
        cross_dir = os.path.join(run_dir, 'cross_eval')
        if overwrite and os.path.exists(cross_dir):
            shutil.rmtree(cross_dir)

        samples = cross_val(
            pd.DataFrame(read_examples(os.path.join(treat_dir, file))),
            leave_p=leave_p,
            sort_by=sort_by,
            iteration=iteration
        )

        for idx, sample in enumerate(samples):
            out_dir = os.path.join(cross_dir, f'{(idx+1)*leave_p:.1f}')
            os.makedirs(out_dir, exist_ok=True)

            write_jsonl(sample, os.path.join(out_dir, file))