# data split

In [1]:
import os
import argparse
import time
import random
import shutil

from pathlib import Path
from math import ceil

from somenlp.utils import get_time_marker
from somenlp.feature_engineering import calculate_features_parallel


import os
from os import listdir 
from collections import Counter

import os
from os import listdir 
from collections import Counter


in_path = '/home/beck/Desktop/SoMeNLP/data/PLoS_methods_bio/'
out_path = '/home/beck/Desktop/SoMeNLP/data/PLoS_methods/'

def split_data(in_path, out_path, seed_, file_ext =".data.txt", 
               ratio =[60, 20, 20], set_names = ['train', 'devel', 'test'], 
               ):
    
    in_path = in_path.rstrip('/')
    out_path = out_path.rstrip('/')

    if not os.path.isdir(in_path):
        raise(RuntimeError("Input path does not exist"))
        
    if not os.path.isdir(out_path):
        os.mkdir(out_path)
    
    if sum(ratio) != 100:
        raise(RuntimeError("Input ratio {} does not sum to 100".format(ratio)))
    if len(ratio) != len(set_names):
        raise(RuntimeError("Number of ratios and setnames has to match: {} vs {}".format(ratio, set_names)))

    #print("Loading files")
    single_files = list(Path(in_path).rglob('*{}'.format(file_ext)))
    
    all_files = []
    for entry in single_files:
        
        base_file_name = entry.name.split(file_ext)[0]
        
        base_file_entries = list(Path(in_path).rglob('{}*'.format(base_file_name)))
        
        all_files.append(base_file_entries)
    
    random.seed(seed_)
    random.shuffle(all_files)

    #print("Copying files")
    cut_sum = 0
    prev_cut_idx = 0
    for cut, name in zip(ratio, set_names):
        cut_sum += cut
        cut_idx = ceil(len(all_files) * cut_sum / 100) 
        
        out_folder_name = in_path.rsplit('/')[-1]
        new_output_location = '{}/{}_{}'.format(out_path.rstrip('/'), out_folder_name, name) 
        
        if not os.path.isdir(new_output_location):
            os.makedirs(new_output_location)
            
        for files in all_files[prev_cut_idx:cut_idx]:
            for f in files:
                source_path = str(f)
                target_path = '{}/{}'.format(new_output_location, f.name)
                shutil.copy(source_path, target_path)
        prev_cut_idx = cut_idx
    
    print("Done")
    

def purposeLabel_counter(path):
    
    import os
    from os import listdir 
    from collections import Counter
    
    
    def list_file_names(path, file_ext='.labels.txt'):
    
        file_names_list = []

        for file_name in os.listdir(path):
            if not file_name.endswith(file_ext): continue
            file_names_list.append(file_name) 

        file_names_list.sort()

        return file_names_list
    
    file_name_list = list_file_names(path)
    
    interest_list = ["Analysis", "Modelling", "Stimulation", "DataCollection", "DataPreProcss", 
                 "Simulation", "Visualization", "Programming"]
    
    all_purpose_labels = []
    
    for file_name in file_name_list[:]:
        file_path = path + file_name
        
        with open(file_path, 'r') as f:
            
            list_lines = f.readlines()
            
            #print(list_of_lebs)
            list_of_tokens = ' '.join(list_lines).split()
            
            for tok in list_of_tokens:
                if tok != 'O' and (len(tok.split('-')[1].split('_')) == 3 ):
                    if tok.split('-')[1].split('_')[2] in interest_list:
                        
                        purpose = tok.split('-')[1].split('_')[2]
                        
                        all_purpose_labels.append(purpose)
    
                        
    return  dict(Counter(all_purpose_labels))

def train_test_dev(whole_path, train_path, test_path, dev_path):
    
    _whole = purposeLabel_counter(whole_path)
    _train = purposeLabel_counter(train_path)
    _test  = purposeLabel_counter(test_path)
    _dev   = purposeLabel_counter(dev_path)
    
    
    
    _whole_sorted = dict(sorted(_whole.items()))
    _train_sorted = dict(sorted(_train.items()))
    _test_sorted  = dict(sorted(_test.items()))
    _dev_sorted   = dict(sorted(_dev.items()))
    
    return _whole_sorted, _train_sorted, _test_sorted, _dev_sorted

def __proportion_calculator_with_key(whole_dict, train_dict, test_dict, dev_dict ):
    
    train_purpose_props = []
    for (k,v), (k2,v2) in zip(whole_dict.items(), train_dict.items()):
        if k2 == k: 
            train_percentage = int(v2 / v * 100)
            train_purpose_props.append((train_percentage, k))
            
    dev_purpose_props = []
    for (k,v), (k2,v2) in zip(whole_dict.items(), dev_dict.items()):
        if k2 == k:
            
            dev_percentage = int(v2 / v * 100)
            dev_purpose_props.append((dev_percentage,k))
            
    test_purpose_props = []       
    for (k,v), (k2,v2) in zip(whole_dict.items(), test_dict.items()):
        if k2 == k:
            
            test_percentage = int(v2 / v * 100)
            test_purpose_props.append((test_percentage, k))
            
    return train_purpose_props, dev_purpose_props, test_purpose_props


def __proportion_calculator(whole_dict, train_dict, test_dict, dev_dict ):
    
    train_purpose_props = []
    for (k,v), (k2,v2) in zip(whole_dict.items(), train_dict.items()):
        if k2 == k: 
            train_percentage = int(v2 / v * 100)
            train_purpose_props.append(train_percentage)
            
    dev_purpose_props = []
    for (k,v), (k2,v2) in zip(whole_dict.items(), dev_dict.items()):
        if k2 == k:
            
            dev_percentage = int(v2 / v * 100)
            dev_purpose_props.append(dev_percentage)
            
    test_purpose_props = []       
    for (k,v), (k2,v2) in zip(whole_dict.items(), test_dict.items()):
        if k2 == k:
            
            test_percentage = int(v2 / v * 100)
            test_purpose_props.append(test_percentage)
            
    return train_purpose_props, dev_purpose_props, test_purpose_props



def _within_range(list_):
    
    bool_list = []
    for val in list_:
        if val in range(55,65):
            bool_list.append(True)
        else:
            bool_list.append(False)
            
    return all(bool_list) #, bool_list   # apply & to all and return


def _within_range_pubmed(list_):
    
    bool_list = []
    for val in list_:
        if val in range(50,70):
            bool_list.append(True)
        else:
            bool_list.append(False)
            
            
    withn_range_bool = all(bool_list) # apply & to all and return
    
    # check if all class labels are included , class label stimulation is missing in the PubMed 
    if (len(bool_list) == 7):
        all_classes_included = True
    else:
        all_classes_included = False
        
    result = all_classes_included and withn_range_bool
            
    return result   


def _within_range_dev_test(list_):
    
    bool_list = []
    for val in list_:
        if val in range(15,25):
            bool_list.append(True)
        else:
            bool_list.append(False)
            
    return all(bool_list) #, bool_list   # apply & to all and return

# plos_pubmed_combined

In [None]:
# combined plos pubmed bio   train , test , dev


in_path_comb = '/home/beck/Desktop/SoMeNLP/data/plos_pubmed_combined_bio/'
out_path_comb = '/home/beck/Desktop/SoMeNLP/data/plos_pubmed_combined/'

stable_split_seeds = [204, 1623, 1686, 2075, 2183, 2665, 2699, 3259, 4680, 4709, 4893, 5648, 7088, 7798,
                     8204, 8990, 9392, 10390, 10790, 11148, 11624, 11701, 12053, 12379]

i = 12545

split_data(in_path_comb, out_path_comb, i)


comb_path_whole = '/home/beck/Desktop/SoMeNLP/data/plos_pubmed_combined_bio/'
comb_path_train = "/home/beck/Desktop/SoMeNLP/data/plos_pubmed_combined/plos_pubmed_combined_bio_train/" 
comb_path_test = "/home/beck/Desktop/SoMeNLP/data/plos_pubmed_combined/plos_pubmed_combined_bio_test/" 
comb_path_dev = "/home/beck/Desktop/SoMeNLP/data/plos_pubmed_combined/plos_pubmed_combined_bio_devel/"
    
    
#calculate the distribution of class_labels for cmbined set
comb_whole_dict, comb_train_dict, comb_test_dict, comb_dev_dict = train_test_dev(comb_path_whole, 
                                                                                 comb_path_train, 
                                                                                 comb_path_test, 
                                                                                 comb_path_dev)


# calculate the proportion for each class label (8 - purposes)
comb_train_props, comb_dev_props, comb_test_props = __proportion_calculator(comb_whole_dict, 
                                                                            comb_train_dict, 
                                                                            comb_test_dict, 
                                                                            comb_dev_dict)


comb_train_result_ = _within_range_pubmed(comb_train_props)
comb_test_result_  = _within_range_dev_test(comb_test_props)
comb_dev_result_   = _within_range_dev_test(comb_dev_props)

bool_ = (comb_train_result_) and (comb_test_result_) and (comb_dev_result_)

stable_split_seeds = []

while not(bool_):
    
    print(f'splitting ... with seed: {i}')
    
    split_data(in_path_comb, out_path_comb, i)
    
    i += 1
    
    
    
    #calculate the distribution of class_labels for cmbined set
    comb_whole_dict, comb_train_dict, comb_test_dict, comb_dev_dict = train_test_dev(comb_path_whole, 
                                                                                 comb_path_train, 
                                                                                 comb_path_test, 
                                                                                 comb_path_dev)


    # calculate the proportion for each class label (8 - purposes)
    comb_train_props, comb_dev_props, comb_test_props = __proportion_calculator(comb_whole_dict, 
                                                                            comb_train_dict, 
                                                                            comb_test_dict, 
                                                                            comb_dev_dict)
    #print('is the split fair ?')
    comb_train_result_ = _within_range(comb_train_props)
    comb_test_result  = _within_range_dev_test(comb_test_props)
    comb_dev_result   = _within_range_dev_test(comb_dev_props)
    print()
    
    bool_ = (comb_train_result_) and (comb_test_result_) and (comb_dev_result_)
    

    
    
    if bool_:
        print()
        print('plos_pubmed split is balanced. stopped splitting')
        print(f'result:  {comb_train_result_, comb_test_result, comb_dev_result}') 
        stable_split_seeds.append(i)
        break
        
    else:
        
        shutil.rmtree(comb_path_train, ignore_errors=False, onerror=None)
        shutil.rmtree(comb_path_test, ignore_errors=False, onerror=None)
        shutil.rmtree(comb_path_dev, ignore_errors=False, onerror=None)
        
        print(f'result:  {comb_train_result_, comb_test_result, comb_dev_result}')   
        print(f'plos_pubmed split is not yet balanced {comb_train_props, comb_dev_props, comb_test_props}.  splitting further ...')
        print()    

Done
splitting ... with seed: 12545
Done

result:  (False, True, False)
plos_pubmed split is not yet balanced ([61, 61, 54, 60, 58, 82, 53, 66], [18, 21, 25, 22, 25], [20, 16, 19, 16, 16, 17, 15, 16]).  splitting further ...

splitting ... with seed: 12546
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([57, 60, 67, 60, 42, 57, 52, 74], [19, 9, 10, 13, 26, 32, 25, 9], [23, 30, 22, 25, 31, 10, 22, 16]).  splitting further ...

splitting ... with seed: 12547
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([59, 50, 58, 55, 55, 64, 61, 58], [21, 20, 22, 30, 21, 3, 28, 16], [18, 28, 18, 14, 23, 32, 9, 25]).  splitting further ...

splitting ... with seed: 12548
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([63, 67, 60, 58, 83, 75, 59, 74], [19, 10, 26, 25, 8, 14, 16, 17], [17, 22, 12, 15, 7, 10, 23, 8]).  splitting further ...

splitting ... with seed: 12549
Done

result:  (False, False, False)
plos_pubmed

Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([64, 67, 66, 72, 58, 17, 32, 56], [19, 13, 16, 14, 35, 42, 26, 26], [16, 19, 17, 12, 6, 39, 40, 17]).  splitting further ...

splitting ... with seed: 12582
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([61, 60, 69, 65, 45, 42, 42, 54], [15, 15, 12, 16, 2, 25, 26, 16], [23, 24, 18, 18, 52, 32, 30, 29]).  splitting further ...

splitting ... with seed: 12583
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([61, 54, 66, 55, 75, 50, 64, 53], [15, 30, 20, 18, 10, 3, 14, 21], [22, 15, 12, 25, 15, 46, 21, 24]).  splitting further ...

splitting ... with seed: 12584
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([58, 63, 64, 52, 37, 57, 60, 66], [18, 20, 22, 33, 40, 32, 22, 19], [22, 16, 13, 14, 22, 10, 16, 13]).  splitting further ...

splitting ... with seed: 12585
Done

result:  (False, False, False)
plos_pubmed split is not yet b

Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([56, 56, 50, 55, 33, 46, 56, 52], [23, 16, 28, 19, 25, 21, 22, 26], [20, 26, 21, 25, 41, 32, 21, 21]).  splitting further ...

splitting ... with seed: 12618
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([56, 57, 50, 52, 73, 60, 66, 52], [20, 22, 22, 19, 8, 7, 25, 35], [22, 19, 26, 27, 17, 32, 8, 12]).  splitting further ...

splitting ... with seed: 12619
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([61, 63, 73, 65, 52, 46, 63, 40], [17, 17, 10, 15, 27, 28, 28, 29], [20, 19, 16, 19, 20, 25, 8, 30]).  splitting further ...

splitting ... with seed: 12620
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([58, 60, 51, 53, 72, 60, 78, 52], [20, 18, 20, 25, 12], [21, 20, 28, 20, 15, 39]).  splitting further ...

splitting ... with seed: 12621
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([53, 59, 39,

Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([55, 65, 47, 54, 65, 35, 54, 52], [19, 19, 19, 25, 10, 25, 23, 24], [25, 14, 32, 20, 25, 39, 21, 23]).  splitting further ...

splitting ... with seed: 12654
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([61, 62, 58, 66, 60, 10, 66, 61], [17, 21, 19, 22, 31, 21, 14, 17], [20, 16, 22, 11, 8, 67, 19, 20]).  splitting further ...

splitting ... with seed: 12655
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([55, 63, 53, 57, 50, 82, 47, 60], [20, 21, 24, 21, 20, 7, 29, 15], [23, 14, 21, 21, 30, 10, 22, 23]).  splitting further ...

splitting ... with seed: 12656
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([57, 56, 56, 63, 68, 39, 87, 64], [24, 25, 25, 23, 15, 21, 7, 20], [18, 18, 18, 12, 16, 39, 5, 15]).  splitting further ...

splitting ... with seed: 12657
Done

result:  (False, False, False)
plos_pubmed split is not yet ba

Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([60, 65, 54, 54, 52, 60, 54, 57], [17, 15, 22, 25, 10, 3, 22, 27], [21, 19, 22, 19, 37, 35, 22, 15]).  splitting further ...

splitting ... with seed: 12690
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([60, 65, 69, 62, 81, 71, 59, 60], [18, 20, 11, 18, 6, 7, 18, 23], [20, 14, 18, 19, 12, 21, 22, 16]).  splitting further ...

splitting ... with seed: 12691
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([50, 67, 43, 52, 72, 53, 42, 60], [27, 14, 29, 29, 5, 32, 26, 23], [21, 17, 27, 18, 22, 14, 30, 16]).  splitting further ...

splitting ... with seed: 12692
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([57, 60, 61, 56, 62, 42, 67, 54], [19, 19, 13, 23, 30, 25, 9, 23], [23, 19, 25, 19, 7, 32, 22, 22]).  splitting further ...

splitting ... with seed: 12693
Done

result:  (False, False, False)
plos_pubmed split is not yet bala

Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([63, 67, 62, 54, 48, 75, 54, 66], [16, 17, 14, 14, 26, 25, 22, 12], [19, 14, 22, 31, 25]).  splitting further ...

splitting ... with seed: 12726
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([62, 58, 69, 66, 47, 42, 46, 68], [21, 22, 11, 22, 6, 42, 18, 13], [16, 19, 19, 11, 46, 14, 35, 17]).  splitting further ...

splitting ... with seed: 12727
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([61, 65, 64, 51, 57, 64, 74, 63], [20, 18, 14, 20, 10, 7, 5, 18], [18, 16, 20, 28, 32, 28, 19, 17]).  splitting further ...

splitting ... with seed: 12728
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([64, 52, 68, 73, 55, 32, 49, 51], [19, 27, 15, 17, 35, 35, 22, 29], [16, 19, 15, 8, 10, 32, 28, 19]).  splitting further ...

splitting ... with seed: 12729
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([59,

In [None]:
in_path_comb = '/home/beck/Desktop/Otimal_split/plos_pubmed_combined_bio/'
out_path_comb = '/home/beck/Desktop/Otimal_split/plos_pubmed_combined/'

#split_data(in_path_comb, out_path_comb, 204)


comb_path_whole = '/home/beck/Desktop/Otimal_split/plos_pubmed_combined_bio/'
comb_path_train = "/home/beck/Desktop/Otimal_split/plos_pubmed_combined/plos_pubmed_combined_bio_train/" 
comb_path_test = "/home/beck/Desktop/Otimal_split/plos_pubmed_combined/plos_pubmed_combined_bio_test/" 
comb_path_dev = "/home/beck/Desktop/Otimal_split/plos_pubmed_combined/plos_pubmed_combined_bio_devel/"
    
    
#calculate the distribution of class_labels for cmbined set
comb_whole_dict, comb_train_dict, comb_test_dict, comb_dev_dict = train_test_dev(comb_path_whole, 
                                                                                 comb_path_train, 
                                                                                 comb_path_test, 
                                                                                 comb_path_dev)


# calculate the proportion for each class label (8 - purposes)
comb_train_props, comb_dev_props, comb_test_props = __proportion_calculator(comb_whole_dict, 
                                                                            comb_train_dict, 
                                                                            comb_test_dict, 
                                                                            comb_dev_dict)

comb_train_result_ = _within_range_pubmed(comb_train_props)
comb_test_result_  = _within_range_dev_test(comb_test_props)
comb_dev_result_   = _within_range_dev_test(comb_dev_props)


print(f'plos+pubmed train {comb_train_props, _within_range(comb_train_props)[1]}')
print(f'plos+pubmed test  {comb_test_props, _within_range_dev_test(comb_test_props)[1]}')
print(f'plos+pubmed dev   {comb_dev_props, _within_range_dev_test(comb_dev_props)[1]}')
print()

print(f"plos+pubmed train within +/- 60% ? {_within_range(comb_train_props)[0]}")
print(f"plos+pubmed test  within +/- 15% ? {_within_range_dev_test(comb_test_props)[0]}")
print(f"plos+pubmed dev   within +/- 15% ? {_within_range_dev_test(comb_dev_props)[0]}")
print()

# train percentage 60%
    
for (k,v), (k2,v2) in zip(comb_whole_dict.items(),  comb_train_dict.items()):
    if k2 == k:
        print(f"{k2 :{18}}  {v2}/{v} train {v2/v*100: .2f}% ")

# test percentage 20% 
print()

for (k,v), (k2,v2) in zip(comb_whole_dict.items(),  comb_test_dict.items()):
    if k2 == k:
        print(f"{k2 :{18}}  {v2}/{v} test {v2/v*100: .2f}% ")
        
# dev percentage 20% 
print()

for (k,v), (k2,v2) in zip(comb_whole_dict.items(),  comb_dev_dict.items()):
    if k2 == k:
        print(f"{k2 :{18}}  {v2}/{v} dev {v2/v*100: .2f}% ")