In [1]:
import csv, math
from polyleven import levenshtein
from tqdm import trange

In [2]:
# Fold-annotated input CSVs; every file is processed by the cells below.
DATASETS = [
    '../Dataset/Text-Data/RP-Crowd-2-folds.csv',
    '../Dataset/Text-Data/RP-Crowd-3-folds.csv',
    '../Dataset/Text-Data/RP-Mod-folds.csv',
]

# Name of the CSV column that holds each row's fold id.
FOLD_COLUMN = 'ten_folds'

# Fold ids treated as the validation/test split. They are strings because
# csv.DictReader yields every field as text.
VAL_FOLDS = ['8', '9']

# Output template for the combined-test-set cell; '{}' is filled with the
# dataset's file name (without the '.csv' extension).
OUT_PATH = '../Dataset/Text-Data/Cross Evaluation Without Duplicates/evaluate_on_{}.csv'

In [None]:
### Files for simple Evaluation
# For each dataset: split the rows into training folds and validation folds,
# drop every validation row whose text also occurs verbatim in a training row
# of the same dataset, and write the training rows followed by the surviving
# validation rows to '<...>-folds-without-duplicates.csv'.

for data_set in DATASETS:
    rows_list_training = []
    rows_list_testing = []
    # newline='' hands line-ending handling to the csv module, so quoted
    # fields that contain line breaks are read back unchanged.
    with open(data_set, newline='') as f:
        reader = csv.DictReader(f)
        # Take the header from the reader rather than from the first training
        # row, so writing still works when the training split is empty.
        fieldnames = reader.fieldnames or []
        for row in reader:
            if row[FOLD_COLUMN] in VAL_FOLDS:
                rows_list_testing.append(row)
            else:
                rows_list_training.append(row)

    print(data_set.split('/')[-1], f'Length Test: {len(rows_list_testing)}')

    # A Levenshtein distance of 0 means the two strings are identical, so the
    # duplicate check is an exact-match lookup. A set makes that one hash
    # probe per validation row instead of a scan over all training rows.
    # (Near-duplicate matching would need a real distance threshold again.)
    training_texts = {row['text'] for row in rows_list_training}
    rows_list_no_duplicates = [row for row in rows_list_testing
                               if row['text'] not in training_texts]
    dropped = len(rows_list_testing) - len(rows_list_no_duplicates)

    print(data_set.split('/')[-1], f'Dropped: {dropped}')
    rows_list_training.extend(rows_list_no_duplicates)
    # newline='' prevents '\r\r\n' line endings on platforms that translate '\n'.
    with open(data_set.replace('folds', 'folds-without-duplicates'), 'w', newline='') as f:
        csv_writer = csv.DictWriter(f, fieldnames)
        csv_writer.writeheader()
        csv_writer.writerows(rows_list_training)

In [3]:
### Files for combined test set
# For each dataset, keep only those validation-fold rows whose text does not
# occur verbatim in the training folds of ANY dataset (its own included), and
# write them to OUT_PATH.

## Collect Training Texts
# The training texts do not depend on which validation set is being filtered,
# so they are gathered once up front instead of once per validation set.
train_texts = set()
for train_set in DATASETS:
    # newline='' hands line-ending handling to the csv module, so quoted
    # fields that contain line breaks are read back unchanged.
    with open(train_set, newline='') as f:
        for row in csv.DictReader(f):
            if row[FOLD_COLUMN] not in VAL_FOLDS:
                train_texts.add(row['text'])

for valid_set in DATASETS:
    valid_set_name = valid_set.split('/')[-1][:-4]

    ## Collect Validation Samples
    with open(valid_set, newline='') as f:
        reader = csv.DictReader(f)
        # Take the header from the reader so the output file can still be
        # written when every validation row turns out to be a duplicate.
        fieldnames = reader.fieldnames or []
        rows_list_valid = [row for row in reader if row[FOLD_COLUMN] in VAL_FOLDS]
    num_valid = len(rows_list_valid)

    # A capped Levenshtein distance >= 1 only says "the strings differ", so
    # the filter is an exact-match exclusion. A set lookup keeps the same rows
    # in the same order without comparing every validation/training pair.
    rows_list_valid = [row for row in rows_list_valid
                       if row['text'] not in train_texts]

    dropped = num_valid - len(rows_list_valid)
    print(f'{valid_set_name}: dropped {dropped} from {num_valid} samples')
    # newline='' prevents '\r\r\n' line endings on platforms that translate '\n'.
    with open(OUT_PATH.format(valid_set_name), 'w', newline='') as f:
        csv_writer = csv.DictWriter(f, fieldnames)
        csv_writer.writeheader()
        csv_writer.writerows(rows_list_valid)

RP-Crowd-2-folds: dropped 1430 from 3472 samples
RP-Crowd-3-folds: dropped 1081 from 1260 samples
RP-Mod-folds: dropped 743 from 2856 samples
