In [None]:
import pandas as pd
from pathlib import Path
import json

In [None]:
def read_jsonl(path):
    """Load a JSON Lines file into a DataFrame.

    Each non-blank line of the file is parsed as one JSON document and
    becomes one record of the resulting frame.

    Parameters
    ----------
    path : str or Path
        Location of the ``.jsonl`` file.

    Returns
    -------
    pd.DataFrame
        One row per parsed line; an empty frame if the file has no records.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    with Path(path).open(encoding='utf-8') as f:
        # Lines read from a file keep their trailing newline, so a length
        # check never filters anything; strip to skip blank/whitespace lines.
        records = [json.loads(line) for line in f if line.strip()]
    return pd.DataFrame.from_records(records)

def to_jsonl(df, path):
    """Write ``df`` to ``path`` in JSON Lines format.

    Every row is converted to a dict and serialised with ``json.dumps`` on
    its own line. The target file is overwritten if it already exists.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose rows are written out.
    path : str or Path
        Destination file.
    """
    with Path(path).open('w') as sink:
        sink.writelines(
            f"{json.dumps(row)}\n" for row in df.to_dict('records')
        )

In [None]:
def _lists_to_tuples(df):
    """Return a copy of ``df`` in which list cells are replaced by tuples.

    Lists are unhashable, which makes ``drop_duplicates``/``duplicated``
    raise. Every column is scanned in full (not just its first row), nested
    lists are converted recursively, and non-list cells are left untouched.
    """
    def freeze(value):
        if isinstance(value, list):
            return tuple(freeze(item) for item in value)
        return value

    df = df.copy()
    for col in df.columns:
        if df[col].map(lambda value: isinstance(value, list)).any():
            df[col] = df[col].map(freeze)
    return df


datasets_dir = Path('datasets')
for dataset_dir in sorted(datasets_dir.glob('[a-z_]*')):

    # The glob pattern also matches plain files; only directories are datasets
    if not dataset_dir.is_dir():
        continue

    # Load dataset, making list-valued cells hashable. Each frame is converted
    # on its own so the two splits need not share the exact same columns.
    train = _lists_to_tuples(read_jsonl(dataset_dir / 'train.jsonl'))
    test = _lists_to_tuples(read_jsonl(dataset_dir / 'test.jsonl'))

    # Drop duplicates within train/test
    train = train.drop_duplicates().reset_index(drop=True)
    test = test.drop_duplicates().reset_index(drop=True)

    # Get cross-duplicates: both splits are already duplicate-free, so any row
    # flagged in the concatenation occurs in both train and test. The first
    # len(train) positions of the mask correspond to the train rows.
    merged = pd.concat([train, test], ignore_index=True)
    is_xdup = merged.duplicated(keep=False).to_numpy()[:len(train)]

    # Remove cross-duplicates from train only; test is kept intact
    train = train.loc[~is_xdup].reset_index(drop=True)

    # Double-check that all duplicates have been removed
    new_merged = pd.concat([train, test], ignore_index=True)
    assert not new_merged.duplicated().any(), \
        f'Duplicates remain between train and test in {dataset_dir}'

    # Store the new train and test sets (overwrites the original files;
    # json.dumps writes the tuples back as JSON arrays)
    to_jsonl(train, dataset_dir / 'train.jsonl')
    to_jsonl(test, dataset_dir / 'test.jsonl')