# data split

In [1]:
import os
import argparse
import time
import random
import shutil

from pathlib import Path
from math import ceil

from somenlp.utils import get_time_marker
from somenlp.feature_engineering import calculate_features_parallel


import os
from os import listdir 
from collections import Counter

import os
from os import listdir 
from collections import Counter


in_path = '/home/beck/Desktop/SoMeNLP/data/PLoS_methods_bio/'
out_path = '/home/beck/Desktop/SoMeNLP/data/PLoS_methods/'

def split_data(in_path, out_path, seed_, file_ext =".data.txt", 
               ratio =[60, 20, 20], set_names = ['train', 'devel', 'test'], 
               ):
    
    in_path = in_path.rstrip('/')
    out_path = out_path.rstrip('/')

    if not os.path.isdir(in_path):
        raise(RuntimeError("Input path does not exist"))
        
    if not os.path.isdir(out_path):
        os.mkdir(out_path)
    
    if sum(ratio) != 100:
        raise(RuntimeError("Input ratio {} does not sum to 100".format(ratio)))
    if len(ratio) != len(set_names):
        raise(RuntimeError("Number of ratios and setnames has to match: {} vs {}".format(ratio, set_names)))

    #print("Loading files")
    single_files = list(Path(in_path).rglob('*{}'.format(file_ext)))
    
    all_files = []
    for entry in single_files:
        
        base_file_name = entry.name.split(file_ext)[0]
        
        base_file_entries = list(Path(in_path).rglob('{}*'.format(base_file_name)))
        
        all_files.append(base_file_entries)
    
    random.seed(seed_)
    random.shuffle(all_files)

    #print("Copying files")
    cut_sum = 0
    prev_cut_idx = 0
    for cut, name in zip(ratio, set_names):
        cut_sum += cut
        cut_idx = ceil(len(all_files) * cut_sum / 100) 
        
        out_folder_name = in_path.rsplit('/')[-1]
        new_output_location = '{}/{}_{}'.format(out_path.rstrip('/'), out_folder_name, name) 
        
        if not os.path.isdir(new_output_location):
            os.makedirs(new_output_location)
            
        for files in all_files[prev_cut_idx:cut_idx]:
            for f in files:
                source_path = str(f)
                target_path = '{}/{}'.format(new_output_location, f.name)
                shutil.copy(source_path, target_path)
        prev_cut_idx = cut_idx
    
    print("Done")
    

def purposeLabel_counter(path):
    
    import os
    from os import listdir 
    from collections import Counter
    
    
    def list_file_names(path, file_ext='.labels.txt'):
    
        file_names_list = []

        for file_name in os.listdir(path):
            if not file_name.endswith(file_ext): continue
            file_names_list.append(file_name) 

        file_names_list.sort()

        return file_names_list
    
    file_name_list = list_file_names(path)
    
    interest_list = ["Analysis", "Modelling", "Stimulation", "DataCollection", "DataPreProcss", 
                 "Simulation", "Visualization", "Programming"]
    
    all_purpose_labels = []
    
    for file_name in file_name_list[:]:
        file_path = path + file_name
        
        with open(file_path, 'r') as f:
            
            list_lines = f.readlines()
            
            #print(list_of_lebs)
            list_of_tokens = ' '.join(list_lines).split()
            
            for tok in list_of_tokens:
                if tok != 'O' and (len(tok.split('-')[1].split('_')) == 3 ):
                    if tok.split('-')[1].split('_')[2] in interest_list:
                        
                        purpose = tok.split('-')[1].split('_')[2]
                        
                        all_purpose_labels.append(purpose)
    
                        
    return  dict(Counter(all_purpose_labels))

def train_test_dev(whole_path, train_path, test_path, dev_path):
    
    _whole = purposeLabel_counter(whole_path)
    _train = purposeLabel_counter(train_path)
    _test  = purposeLabel_counter(test_path)
    _dev   = purposeLabel_counter(dev_path)
    
    
    
    _whole_sorted = dict(sorted(_whole.items()))
    _train_sorted = dict(sorted(_train.items()))
    _test_sorted  = dict(sorted(_test.items()))
    _dev_sorted   = dict(sorted(_dev.items()))
    
    return _whole_sorted, _train_sorted, _test_sorted, _dev_sorted

def __proportion_calculator_with_key(whole_dict, train_dict, test_dict, dev_dict ):
    
    train_purpose_props = []
    for (k,v), (k2,v2) in zip(whole_dict.items(), train_dict.items()):
        if k2 == k: 
            train_percentage = int(v2 / v * 100)
            train_purpose_props.append((train_percentage, k))
            
    dev_purpose_props = []
    for (k,v), (k2,v2) in zip(whole_dict.items(), dev_dict.items()):
        if k2 == k:
            
            dev_percentage = int(v2 / v * 100)
            dev_purpose_props.append((dev_percentage,k))
            
    test_purpose_props = []       
    for (k,v), (k2,v2) in zip(whole_dict.items(), test_dict.items()):
        if k2 == k:
            
            test_percentage = int(v2 / v * 100)
            test_purpose_props.append((test_percentage, k))
            
    return train_purpose_props, dev_purpose_props, test_purpose_props


def __proportion_calculator(whole_dict, train_dict, test_dict, dev_dict ):
    
    train_purpose_props = []
    for (k,v), (k2,v2) in zip(whole_dict.items(), train_dict.items()):
        if k2 == k: 
            train_percentage = int(v2 / v * 100)
            train_purpose_props.append(train_percentage)
            
    dev_purpose_props = []
    for (k,v), (k2,v2) in zip(whole_dict.items(), dev_dict.items()):
        if k2 == k:
            
            dev_percentage = int(v2 / v * 100)
            dev_purpose_props.append(dev_percentage)
            
    test_purpose_props = []       
    for (k,v), (k2,v2) in zip(whole_dict.items(), test_dict.items()):
        if k2 == k:
            
            test_percentage = int(v2 / v * 100)
            test_purpose_props.append(test_percentage)
            
    return train_purpose_props, dev_purpose_props, test_purpose_props



def _within_range(list_):
    
    bool_list = []
    for val in list_:
        if val in range(55,65):
            bool_list.append(True)
        else:
            bool_list.append(False)
            
    return all(bool_list) #, bool_list   # apply & to all and return


def _within_range_pubmed(list_):
    
    bool_list = []
    for val in list_:
        if val in range(50,70):
            bool_list.append(True)
        else:
            bool_list.append(False)
            
            
    withn_range_bool = all(bool_list) # apply & to all and return
    
    # check if all class labels are included , class label stimulation is missing in the PubMed 
    if (len(bool_list) == 7):
        all_classes_included = True
    else:
        all_classes_included = False
        
    result = all_classes_included and withn_range_bool
            
    return result   


def _within_range_dev_test(list_):
    
    bool_list = []
    for val in list_:
        if val in range(15,25):
            bool_list.append(True)
        else:
            bool_list.append(False)
            
    return all(bool_list) #, bool_list   # apply & to all and return

# plos_pubmed_combined

In [15]:
# combined plos pubmed bio   train , test , dev


in_path_comb = '/home/beck/Desktop/SoMeNLP/data/plos_pubmed_combined_bio/'
out_path_comb = '/home/beck/Desktop/SoMeNLP/data/plos_pubmed_combined/'


i = 10390

split_data(in_path_comb, out_path_comb, i)


comb_path_whole = '/home/beck/Desktop/SoMeNLP/data/plos_pubmed_combined_bio/'
comb_path_train = "/home/beck/Desktop/SoMeNLP/data/plos_pubmed_combined/plos_pubmed_combined_bio_train/" 
comb_path_test = "/home/beck/Desktop/SoMeNLP/data/plos_pubmed_combined/plos_pubmed_combined_bio_test/" 
comb_path_dev = "/home/beck/Desktop/SoMeNLP/data/plos_pubmed_combined/plos_pubmed_combined_bio_devel/"
    
    
#calculate the distribution of class_labels for cmbined set
comb_whole_dict, comb_train_dict, comb_test_dict, comb_dev_dict = train_test_dev(comb_path_whole, 
                                                                                 comb_path_train, 
                                                                                 comb_path_test, 
                                                                                 comb_path_dev)


# calculate the proportion for each class label (8 - purposes)
comb_train_props, comb_dev_props, comb_test_props = __proportion_calculator(comb_whole_dict, 
                                                                            comb_train_dict, 
                                                                            comb_test_dict, 
                                                                            comb_dev_dict)


comb_train_result_ = _within_range_pubmed(comb_train_props)
comb_test_result_  = _within_range_dev_test(comb_test_props)
comb_dev_result_   = _within_range_dev_test(comb_dev_props)

print(f'plos+pubmed train {comb_train_props, _within_range(comb_train_props)}')
print(f'plos+pubmed test  {comb_test_props, _within_range_dev_test(comb_test_props)}')
print(f'plos+pubmed dev   {comb_dev_props, _within_range_dev_test(comb_dev_props)}')
print()

print(f"plos+pubmed train within +/- 60% ? {_within_range(comb_train_props)}")
print(f"plos+pubmed test  within +/- 15% ? {_within_range_dev_test(comb_test_props)}")
print(f"plos+pubmed dev   within +/- 15% ? {_within_range_dev_test(comb_dev_props)}")
print()

# train percentage 60%
    
for (k,v), (k2,v2) in zip(comb_whole_dict.items(),  comb_train_dict.items()):
    if k2 == k:
        print(f"{k2 :{18}}  {v2}/{v} train {v2/v*100: .2f}% ")

# test percentage 20% 
print()

for (k,v), (k2,v2) in zip(comb_whole_dict.items(),  comb_test_dict.items()):
    if k2 == k:
        print(f"{k2 :{18}}  {v2}/{v} test {v2/v*100: .2f}% ")
        
# dev percentage 20% 
print()

for (k,v), (k2,v2) in zip(comb_whole_dict.items(),  comb_dev_dict.items()):
    if k2 == k:
        print(f"{k2 :{18}}  {v2}/{v} dev {v2/v*100: .2f}% ")

Done
plos+pubmed train ([64, 56, 65, 69, 60, 85, 49, 59], False)
plos+pubmed test  ([18, 22, 22, 20, 28, 7, 19, 23], False)
plos+pubmed dev   ([16, 21, 12, 10, 11, 7, 30, 17], False)

plos+pubmed train within +/- 60% ? False
plos+pubmed test  within +/- 15% ? False
plos+pubmed dev   within +/- 15% ? False

Analysis            932/1436 train  64.90% 
DataCollection      138/246 train  56.10% 
DataPreProcss       220/335 train  65.67% 
Modelling           158/227 train  69.60% 
Programming         48/80 train  60.00% 
Simulation          24/28 train  85.71% 
Stimulation         35/71 train  49.30% 
Visualization       70/117 train  59.83% 

Analysis            269/1436 test  18.73% 
DataCollection      56/246 test  22.76% 
DataPreProcss       74/335 test  22.09% 
Modelling           46/227 test  20.26% 
Programming         23/80 test  28.75% 
Simulation          2/28 test  7.14% 
Stimulation         14/71 test  19.72% 
Visualization       27/117 test  23.08% 

Analysis            235/143

In [None]:
# combined plos pubmed bio   train , test , dev


in_path_comb = '/home/beck/Desktop/SoMeNLP/data/plos_pubmed_combined_bio/'
out_path_comb = '/home/beck/Desktop/SoMeNLP/data/plos_pubmed_combined/'

stable_split_seeds = []

i = 0

split_data(in_path_comb, out_path_comb, i)


comb_path_whole = '/home/beck/Desktop/SoMeNLP/data/plos_pubmed_combined_bio/'
comb_path_train = "/home/beck/Desktop/SoMeNLP/data/plos_pubmed_combined/plos_pubmed_combined_bio_train/" 
comb_path_test = "/home/beck/Desktop/SoMeNLP/data/plos_pubmed_combined/plos_pubmed_combined_bio_test/" 
comb_path_dev = "/home/beck/Desktop/SoMeNLP/data/plos_pubmed_combined/plos_pubmed_combined_bio_devel/"
    
    
#calculate the distribution of class_labels for cmbined set
comb_whole_dict, comb_train_dict, comb_test_dict, comb_dev_dict = train_test_dev(comb_path_whole, 
                                                                                 comb_path_train, 
                                                                                 comb_path_test, 
                                                                                 comb_path_dev)


# calculate the proportion for each class label (8 - purposes)
comb_train_props, comb_dev_props, comb_test_props = __proportion_calculator(comb_whole_dict, 
                                                                            comb_train_dict, 
                                                                            comb_test_dict, 
                                                                            comb_dev_dict)


comb_train_result_ = _within_range_pubmed(comb_train_props)
comb_test_result_  = _within_range_dev_test(comb_test_props)
comb_dev_result_   = _within_range_dev_test(comb_dev_props)

bool_ = (comb_train_result_) and (comb_test_result_) and (comb_dev_result_)



while not(bool_):
    
    print(f'splitting ... with seed: {i}')
    
    split_data(in_path_comb, out_path_comb, i)
    
    i += 1
    
    
    
    #calculate the distribution of class_labels for cmbined set
    comb_whole_dict, comb_train_dict, comb_test_dict, comb_dev_dict = train_test_dev(comb_path_whole, 
                                                                                 comb_path_train, 
                                                                                 comb_path_test, 
                                                                                 comb_path_dev)


    # calculate the proportion for each class label (8 - purposes)
    comb_train_props, comb_dev_props, comb_test_props = __proportion_calculator(comb_whole_dict, 
                                                                            comb_train_dict, 
                                                                            comb_test_dict, 
                                                                            comb_dev_dict)
    #print('is the split fair ?')
    comb_train_result_ = _within_range(comb_train_props)
    comb_test_result  = _within_range_dev_test(comb_test_props)
    comb_dev_result   = _within_range_dev_test(comb_dev_props)
    print()
    
    bool_ = (comb_train_result_) and (comb_test_result_) and (comb_dev_result_)
    

    
    
    if bool_:
        print()
        print('plos_pubmed split is balanced. stopped splitting')
        print(f'result:  {comb_train_result_, comb_test_result, comb_dev_result}') 
        stable_split_seeds.append(i)
        break
        
    else:
        
        shutil.rmtree(comb_path_train, ignore_errors=False, onerror=None)
        shutil.rmtree(comb_path_test, ignore_errors=False, onerror=None)
        shutil.rmtree(comb_path_dev, ignore_errors=False, onerror=None)
        
        print(f'result:  {comb_train_result_, comb_test_result, comb_dev_result}')   
        print(f'plos_pubmed split is not yet balanced {comb_train_props, comb_dev_props, comb_test_props}.  splitting further ...')
        print()    

Done
splitting ... with seed: 0
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([62, 48, 59, 68, 46, 53, 33, 45], [17, 13, 15, 11, 41, 25, 45, 17], [20, 38, 24, 20, 12, 21, 21, 37]).  splitting further ...

splitting ... with seed: 1
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([60, 56, 65, 73, 72, 75, 50, 56], [21, 21, 20, 10, 13, 7, 22, 32], [17, 21, 14, 15, 13, 17, 26, 11]).  splitting further ...

splitting ... with seed: 2
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([61, 61, 58, 66, 60, 46, 59, 55], [19, 21, 23, 20, 26, 32, 16, 25], [19, 16, 18, 13, 13, 21, 23, 18]).  splitting further ...

splitting ... with seed: 3
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([60, 60, 62, 60, 46, 71, 38, 68], [20, 13, 15, 20, 16, 21, 18, 5], [19, 26, 21, 18, 37, 7, 43, 25]).  splitting further ...

splitting ... with seed: 4
Done

result:  (False, False, False)
plos_pubmed sp

Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([52, 59, 51, 55, 77, 67, 74, 54], [24, 21, 26, 14, 3, 10, 4, 17], [23, 18, 22, 29, 18, 21, 21, 27]).  splitting further ...

splitting ... with seed: 37
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([66, 68, 72, 70, 72, 53, 66, 59], [12, 12, 10, 11, 17, 17, 23, 18], [20, 19, 17, 18, 10, 28, 9, 21]).  splitting further ...

splitting ... with seed: 38
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([59, 62, 60, 66, 82, 64, 42, 64], [21, 24, 16, 16, 6, 28, 43, 25], [18, 13, 22, 17, 11, 7, 14, 9]).  splitting further ...

splitting ... with seed: 39
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([54, 48, 52, 47, 25, 82, 38, 47], [15, 19, 11, 19, 41, 3, 46, 22], [30, 31, 36, 33, 33, 14, 15, 29]).  splitting further ...

splitting ... with seed: 40
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([60, 57

Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([60, 59, 59, 52, 68, 42, 45, 61], [16, 19, 16, 24, 11, 42, 9, 19], [22, 21, 24, 22, 20, 14, 45, 18]).  splitting further ...

splitting ... with seed: 73
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([58, 56, 62, 50, 72, 39, 67, 54], [21, 22, 24, 34, 15, 10, 19, 15], [19, 21, 13, 14, 12, 50, 12, 29]).  splitting further ...

splitting ... with seed: 74
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([58, 52, 65, 59, 53, 46, 60, 61], [19, 21, 14, 19, 11, 25, 15, 23], [22, 26, 20, 21, 35, 28, 23, 15]).  splitting further ...

splitting ... with seed: 75
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([64, 59, 60, 68, 46, 57, 60, 67], [20, 21, 21, 17, 31, 35, 21, 22], [15, 18, 17, 14, 22, 7, 18, 10]).  splitting further ...

splitting ... with seed: 76
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([5

Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([60, 61, 54, 63, 50, 67, 40, 53], [23, 23, 36, 17, 18, 21, 26, 17], [16, 15, 9, 18, 31, 10, 32, 28]).  splitting further ...

splitting ... with seed: 109
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([60, 62, 59, 56, 72, 67, 53, 60], [19, 20, 13, 29, 5, 7, 33, 13], [20, 17, 27, 14, 22, 25, 12, 25]).  splitting further ...

splitting ... with seed: 110
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([59, 64, 60, 62, 77, 53, 64, 58], [16, 13, 14, 15, 10, 14, 25, 23], [23, 21, 25, 21, 12, 32, 9, 17]).  splitting further ...

splitting ... with seed: 111
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([53, 56, 48, 45, 58, 67, 53, 52], [26, 23, 32, 19, 20, 28, 26, 19], [20, 19, 18, 34, 21, 3, 19, 27]).  splitting further ...

splitting ... with seed: 112
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([

Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([63, 60, 63, 56, 82, 82, 49, 60], [21, 21, 17, 19, 15, 10, 35, 24], [15, 18, 18, 23, 2, 7, 15, 14]).  splitting further ...

splitting ... with seed: 145
Done

result:  (False, False, True)
plos_pubmed split is not yet balanced ([60, 67, 68, 63, 55, 71, 85, 63], [21, 18, 17, 17, 18], [18, 14, 14, 18, 26, 28, 9, 19]).  splitting further ...

splitting ... with seed: 146
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([63, 63, 68, 58, 80, 75, 61, 62], [20, 22, 21, 19, 5, 10, 32, 19], [16, 13, 10, 22, 15, 14, 5, 17]).  splitting further ...

splitting ... with seed: 147
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([63, 59, 67, 70, 68, 64, 71, 64], [18, 20, 21, 17, 5, 3, 15, 15], [17, 20, 11, 12, 26, 32, 12, 19]).  splitting further ...

splitting ... with seed: 148
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([61, 44, 65, 68,

Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([59, 61, 54, 54, 46, 71, 66, 63], [21, 19, 29, 26, 21, 17, 9, 15], [19, 18, 15, 19, 32, 10, 23, 21]).  splitting further ...

splitting ... with seed: 181
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([64, 66, 69, 72, 53, 57, 46, 76], [15, 19, 13, 12, 33, 17, 22, 6], [19, 14, 16, 14, 12, 25, 30, 16]).  splitting further ...

splitting ... with seed: 182
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([56, 55, 60, 51, 56, 64, 50, 52], [19, 23, 14, 16, 35, 7, 19, 18], [23, 20, 24, 31, 8, 28, 29, 29]).  splitting further ...

splitting ... with seed: 183
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([58, 69, 56, 51, 83, 39, 64, 52], [22, 20, 22, 27, 10, 14, 22, 29], [19, 9, 21, 20, 6, 46, 12, 18]).  splitting further ...

splitting ... with seed: 184
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([6

Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([62, 73, 63, 60, 45, 71, 56, 63], [20, 10, 26, 22, 10, 3, 30, 13], [16, 15, 9, 17, 45, 25, 12, 23]).  splitting further ...

splitting ... with seed: 217
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([62, 58, 59, 50, 41, 60, 57, 62], [18, 24, 15, 23, 17, 21, 12, 18], [19, 16, 25, 25, 41, 17, 29, 18]).  splitting further ...

splitting ... with seed: 218
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([61, 65, 66, 67, 48, 50, 56, 52], [22, 19, 21, 14, 10, 17, 25, 26], [16, 15, 11, 18, 41, 32, 18, 21]).  splitting further ...

splitting ... with seed: 219
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([61, 59, 56, 62, 28, 67, 63, 58], [22, 22, 24, 21, 50, 21, 14, 23], [15, 18, 18, 15, 21, 10, 22, 18]).  splitting further ...

splitting ... with seed: 220
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced

Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([58, 64, 61, 59, 72, 82, 63, 65], [22, 19, 22, 25, 7, 7, 22, 28], [19, 16, 15, 14, 20, 10, 14, 5]).  splitting further ...

splitting ... with seed: 253
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([57, 51, 52, 53, 57, 50, 71, 67], [23, 32, 20, 24, 15, 39, 11, 21], [19, 16, 27, 21, 27, 10, 16, 11]).  splitting further ...

splitting ... with seed: 254
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([58, 67, 54, 58, 53, 57, 66, 58], [19, 17, 15, 18, 27, 14, 16, 11], [21, 14, 30, 23, 18, 28, 16, 29]).  splitting further ...

splitting ... with seed: 255
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([58, 68, 53, 59, 72, 75, 63, 49], [21, 21, 28, 20, 13, 14, 9, 22], [19, 9, 17, 19, 13, 10, 26, 28]).  splitting further ...

splitting ... with seed: 256
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([

Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([59, 50, 58, 59, 71, 85, 59, 70], [17, 22, 13, 9, 5, 3, 26, 22], [23, 26, 28, 31, 23, 10, 14, 6]).  splitting further ...

splitting ... with seed: 289
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([68, 63, 76, 68, 77, 82, 50, 73], [13, 17, 16, 22, 16, 3, 18, 10], [18, 19, 7, 9, 6, 14, 30, 16]).  splitting further ...

splitting ... with seed: 290
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([57, 56, 61, 62, 82, 92, 54, 64], [19, 17, 14, 20, 7, 3, 28, 13], [23, 25, 23, 17, 10, 3, 16, 21]).  splitting further ...

splitting ... with seed: 291
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([59, 56, 58, 63, 80, 46, 42, 70], [20, 15, 21, 15, 7, 21, 23, 16], [19, 27, 19, 20, 12, 32, 33, 13]).  splitting further ...

splitting ... with seed: 292
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([64, 48,

Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([57, 66, 64, 62, 68, 42, 60, 59], [18, 19, 18, 13, 23, 3, 5, 11], [24, 13, 17, 24, 7, 53, 33, 28]).  splitting further ...

splitting ... with seed: 325
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([62, 63, 62, 65, 48, 53, 42, 54], [18, 21, 15, 14, 18, 32, 54, 28], [19, 15, 22, 19, 32, 14, 2, 17]).  splitting further ...

splitting ... with seed: 326
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([61, 57, 56, 56, 31, 75, 80, 58], [13, 17, 11, 15, 45, 3, 11, 22], [25, 24, 32, 27, 23, 21, 8, 19]).  splitting further ...

splitting ... with seed: 327
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([62, 66, 52, 59, 53, 78, 43, 58], [16, 16, 19, 20, 28, 14, 21, 17], [21, 17, 27, 20, 17, 7, 35, 24]).  splitting further ...

splitting ... with seed: 328
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([56

Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([57, 69, 60, 53, 87, 82, 53, 58], [18, 18, 26, 22, 5, 3, 30, 22], [24, 11, 13, 24, 7, 14, 15, 19]).  splitting further ...

splitting ... with seed: 361
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([58, 56, 65, 48, 67, 57, 67, 59], [19, 23, 17, 30, 13, 28, 16, 17], [21, 19, 17, 20, 18, 14, 15, 22]).  splitting further ...

splitting ... with seed: 362
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([65, 62, 63, 58, 71, 71, 59, 64], [15, 20, 17, 22, 17, 25, 18, 13], [19, 17, 19, 19, 11, 3, 22, 22]).  splitting further ...

splitting ... with seed: 363
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([59, 63, 59, 55, 55, 46, 67, 65], [21, 23, 24, 22, 25, 7, 18, 12], [19, 13, 15, 22, 20, 46, 14, 21]).  splitting further ...

splitting ... with seed: 364
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([

Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([59, 61, 50, 59, 42, 35, 54, 56], [19, 25, 12, 15, 20, 14, 26, 20], [20, 13, 37, 24, 37, 50, 18, 23]).  splitting further ...

splitting ... with seed: 397
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([58, 54, 55, 53, 52, 67, 67, 50], [21, 15, 22, 22, 11, 14, 23, 22], [19, 29, 21, 23, 36, 17, 8, 27]).  splitting further ...

splitting ... with seed: 398
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([62, 60, 61, 56, 60, 71, 49, 61], [17, 21, 20, 24, 31, 10, 21, 24], [19, 17, 18, 18, 8, 17, 29, 13]).  splitting further ...

splitting ... with seed: 399
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([62, 57, 55, 54, 45, 60, 60, 66], [23, 25, 28, 24, 32, 25, 22, 18], [14, 17, 16, 21, 22, 14, 16, 14]).  splitting further ...

splitting ... with seed: 400
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced

Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([61, 50, 64, 56, 75, 53, 59, 48], [20, 32, 22, 22, 8, 17, 22, 23], [18, 16, 12, 20, 16, 28, 18, 28]).  splitting further ...

splitting ... with seed: 433
Done

result:  (False, False, True)
plos_pubmed split is not yet balanced ([57, 63, 54, 60, 45, 57, 56, 54], [19, 17, 17, 18, 23, 21, 16, 17], [23, 19, 28, 20, 31, 21, 26, 27]).  splitting further ...

splitting ... with seed: 434
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([63, 55, 74, 61, 83, 57, 53, 58], [17, 23, 9, 16, 10, 21, 19, 20], [19, 20, 16, 22, 6, 21, 26, 20]).  splitting further ...

splitting ... with seed: 435
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([63, 55, 53, 68, 48, 46, 52, 65], [22, 18, 23, 20, 42, 21, 28, 12], [14, 26, 22, 11, 8, 32, 19, 21]).  splitting further ...

splitting ... with seed: 436
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([

Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([63, 58, 68, 53, 48, 60, 74, 70], [19, 26, 14, 18, 22, 10, 14, 15], [17, 14, 16, 27, 28, 28, 11, 13]).  splitting further ...

splitting ... with seed: 469
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([59, 57, 52, 44, 66, 85, 61, 61], [20, 19, 16, 24, 12, 14, 25, 24], [20, 22, 31, 30, 21]).  splitting further ...

splitting ... with seed: 470
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([58, 63, 70, 57, 28, 50, 76, 57], [20, 23, 11, 26, 6, 21, 12, 25], [21, 13, 17, 15, 65, 28, 11, 17]).  splitting further ...

splitting ... with seed: 471
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([63, 56, 56, 52, 50, 53, 47, 72], [18, 21, 26, 31, 22, 21, 32, 17], [18, 22, 17, 15, 27, 25, 19, 10]).  splitting further ...

splitting ... with seed: 472
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([60, 68, 

Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([57, 56, 59, 57, 45, 67, 78, 64], [21, 23, 12, 12, 18, 10, 18, 12], [21, 19, 28, 30, 36, 21, 2, 22]).  splitting further ...

splitting ... with seed: 505
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([61, 59, 60, 56, 73, 78, 53, 62], [16, 22, 12, 18, 7, 21, 16, 20], [21, 17, 26, 25, 18]).  splitting further ...

splitting ... with seed: 506
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([58, 58, 63, 59, 38, 46, 56, 64], [19, 19, 18, 16, 33, 39, 33, 27], [22, 21, 18, 24, 27, 14, 9, 7]).  splitting further ...

splitting ... with seed: 507
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([56, 57, 59, 53, 55, 57, 54, 64], [19, 13, 22, 15, 27, 17, 15, 11], [23, 28, 18, 30, 17, 25, 29, 24]).  splitting further ...

splitting ... with seed: 508
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([56, 60, 58,

Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([62, 62, 65, 62, 43, 53, 64, 54], [17, 17, 12, 14, 31, 39, 15, 26], [19, 19, 22, 22, 25, 7, 19, 18]).  splitting further ...

splitting ... with seed: 541
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([64, 62, 68, 62, 43, 46, 70, 58], [17, 28, 22, 18, 26, 7, 5, 29], [17, 9, 8, 18, 30, 46, 23, 11]).  splitting further ...

splitting ... with seed: 542
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([58, 58, 54, 59, 70, 75, 81, 70], [23, 28, 22, 18, 10, 7, 9, 19], [18, 12, 22, 21, 20, 17, 8, 9]).  splitting further ...

splitting ... with seed: 543
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([65, 55, 61, 71, 68, 85, 47, 72], [20, 20, 26, 14, 8, 7, 12, 9], [13, 24, 11, 13, 22, 7, 39, 17]).  splitting further ...

splitting ... with seed: 544
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([58, 63, 

Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([63, 54, 69, 54, 58, 42, 74, 64], [21, 22, 17, 19, 11, 46, 18, 17], [15, 23, 12, 25, 30, 10, 7, 17]).  splitting further ...

splitting ... with seed: 577
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([60, 61, 56, 63, 27, 89, 60, 62], [15, 15, 12, 21, 60, 3, 25, 19], [24, 23, 30, 14, 12, 7, 14, 17]).  splitting further ...

splitting ... with seed: 578
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([61, 59, 62, 68, 65, 67, 59, 65], [17, 19, 21, 15, 28, 21, 12, 19], [21, 21, 16, 15, 6, 10, 28, 14]).  splitting further ...

splitting ... with seed: 579
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([54, 55, 46, 62, 71, 46, 59, 50], [26, 28, 30, 15, 21, 39, 26, 36], [18, 16, 22, 22, 7, 14, 14, 12]).  splitting further ...

splitting ... with seed: 580
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([

Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([60, 60, 61, 53, 51, 64, 54, 52], [20, 21, 19, 25, 12, 17, 26, 24], [18, 17, 19, 21, 36, 17, 18, 23]).  splitting further ...

splitting ... with seed: 613
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([62, 58, 53, 61, 73, 64, 59, 64], [17, 13, 18, 18, 16, 21, 14, 10], [20, 28, 28, 20, 10, 14, 26, 24]).  splitting further ...

splitting ... with seed: 614
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([64, 54, 68, 61, 52, 85, 70, 54], [18, 16, 17, 18, 36, 3, 21, 19], [17, 28, 14, 19, 11, 10, 8, 25]).  splitting further ...

splitting ... with seed: 615
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([61, 58, 67, 60, 57, 67, 67, 76], [18, 15, 16, 13, 31, 32, 15, 8], [19, 25, 16, 26, 11]).  splitting further ...

splitting ... with seed: 616
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([62, 63, 60

Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([63, 64, 74, 59, 47, 71, 70, 43], [13, 15, 5, 16, 30, 7, 23, 29], [22, 19, 20, 23, 22, 21, 5, 27]).  splitting further ...

splitting ... with seed: 649
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([58, 53, 57, 66, 65, 71, 56, 65], [20, 25, 20, 13, 12, 21, 25, 16], [20, 21, 22, 19, 22, 7, 18, 17]).  splitting further ...

splitting ... with seed: 650
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([58, 63, 47, 70, 52, 64, 46, 63], [26, 19, 35, 17, 38, 28, 21, 27], [14, 17, 17, 11, 8, 7, 32, 9]).  splitting further ...

splitting ... with seed: 651
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([61, 54, 65, 64, 31, 78, 53, 59], [22, 24, 21, 20, 23, 14, 12, 15], [15, 20, 13, 14, 45, 7, 33, 24]).  splitting further ...

splitting ... with seed: 652
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([61,

Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([61, 63, 71, 66, 85, 46, 71, 66], [19, 19, 11, 13, 7, 28, 21, 14], [19, 16, 16, 19, 7, 25, 7, 18]).  splitting further ...

splitting ... with seed: 685
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([63, 65, 68, 69, 43, 46, 53, 65], [17, 24, 12, 11, 42, 42, 23, 16], [18, 10, 19, 18, 13, 10, 22, 17]).  splitting further ...

splitting ... with seed: 686
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([61, 60, 58, 44, 36, 46, 63, 61], [20, 22, 22, 26, 36, 28, 15, 17], [17, 17, 19, 28, 27, 25, 21, 21]).  splitting further ...

splitting ... with seed: 687
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([60, 60, 66, 51, 63, 75, 63, 64], [16, 17, 16, 17, 22, 10, 22, 10], [22, 21, 17, 30, 13, 14, 14, 25]).  splitting further ...

splitting ... with seed: 688
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced 

Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([61, 59, 59, 56, 78, 53, 60, 66], [19, 24, 25, 24, 7, 39, 21, 19], [19, 15, 14, 19, 13, 7, 18, 13]).  splitting further ...

splitting ... with seed: 721
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([64, 51, 59, 65, 78, 42, 49, 64], [18, 29, 20, 18, 8, 35, 29, 24], [16, 19, 20, 16, 12, 21, 21, 11]).  splitting further ...

splitting ... with seed: 722
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([59, 64, 62, 64, 45, 46, 49, 41], [22, 14, 22, 18, 33, 32, 30, 25], [18, 21, 14, 16, 21, 21, 19, 33]).  splitting further ...

splitting ... with seed: 723
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([58, 45, 50, 53, 75, 75, 77, 47], [21, 34, 27, 22, 15, 21, 8, 30], [19, 19, 22, 24, 10, 3, 14, 22]).  splitting further ...

splitting ... with seed: 724
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([

Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([61, 65, 59, 57, 51, 42, 54, 73], [19, 13, 22, 23, 23, 39, 18, 8], [19, 20, 17, 19, 25, 17, 26, 17]).  splitting further ...

splitting ... with seed: 757
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([64, 59, 62, 57, 30, 67, 66, 57], [16, 14, 21, 26, 7, 7, 7, 17], [19, 26, 15, 15, 62, 25, 26, 24]).  splitting further ...

splitting ... with seed: 758
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([61, 57, 60, 71, 76, 57, 66, 56], [18, 14, 15, 12, 10, 32, 5, 21], [20, 28, 24, 15, 13, 10, 28, 22]).  splitting further ...

splitting ... with seed: 759
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([66, 63, 71, 66, 57, 39, 32, 55], [18, 18, 20, 21, 13, 39, 43, 23], [14, 17, 8, 11, 28, 21, 23, 21]).  splitting further ...

splitting ... with seed: 760
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([6

Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([55, 65, 52, 53, 63, 67, 57, 61], [22, 17, 23, 17, 20, 10, 16, 18], [22, 17, 23, 29, 16, 21, 25, 19]).  splitting further ...

splitting ... with seed: 793
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([64, 60, 68, 66, 42, 57, 61, 72], [17, 19, 14, 14, 52, 35, 35, 7], [17, 19, 16, 19, 5, 7, 2, 19]).  splitting further ...

splitting ... with seed: 794
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([66, 64, 75, 70, 43, 64, 76, 57], [16, 13, 11, 7, 40, 25, 11, 20], [17, 21, 12, 22, 16, 10, 12, 22]).  splitting further ...

splitting ... with seed: 795
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([64, 57, 66, 66, 57, 57, 69, 52], [15, 16, 13, 10, 2, 3, 8, 25], [20, 26, 19, 23, 40, 39, 22, 21]).  splitting further ...

splitting ... with seed: 796
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([64,

Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([53, 55, 45, 59, 65, 64, 49, 62], [22, 26, 24, 15, 30, 17, 8, 14], [23, 18, 30, 24, 5, 17, 42, 23]).  splitting further ...

splitting ... with seed: 829
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([60, 59, 63, 55, 57, 57, 69, 47], [16, 19, 8, 13, 11, 10, 5, 16], [23, 20, 27, 30, 31, 32, 25, 35]).  splitting further ...

splitting ... with seed: 830
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([62, 55, 65, 66, 68, 89, 69, 57], [20, 26, 21, 17, 13, 7, 18, 27], [17, 18, 13, 15, 17, 3, 12, 15]).  splitting further ...

splitting ... with seed: 831
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([56, 62, 64, 53, 43, 32, 57, 60], [20, 21, 19, 18, 36, 32, 28, 17], [22, 15, 16, 27, 20, 35, 14, 22]).  splitting further ...

splitting ... with seed: 832
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([5

Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([68, 65, 71, 67, 51, 42, 77, 60], [15, 14, 10, 9, 35, 21, 16, 19], [16, 19, 17, 23, 13, 35, 5, 19]).  splitting further ...

splitting ... with seed: 865
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([57, 60, 62, 51, 48, 50, 40, 75], [22, 22, 23, 28, 12, 35, 21, 13], [19, 17, 14, 20, 38, 14, 38, 11]).  splitting further ...

splitting ... with seed: 866
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([60, 59, 66, 59, 65, 71, 60, 58], [20, 19, 15, 17, 12, 17, 22, 22], [19, 21, 17, 23, 22, 10, 16, 19]).  splitting further ...

splitting ... with seed: 867
Done

result:  (False, False, False)
plos_pubmed split is not yet balanced ([63, 58, 62, 59, 53, 82, 77, 66], [23, 25, 24, 18, 28, 10, 8, 18], [13, 16, 12, 22, 17, 7, 14, 14]).  splitting further ...

splitting ... with seed: 868


In [None]:
in_path_comb = '/home/beck/Desktop/Otimal_split/plos_pubmed_combined_bio/'
out_path_comb = '/home/beck/Desktop/Otimal_split/plos_pubmed_combined/'

#split_data(in_path_comb, out_path_comb, 204)


comb_path_whole = '/home/beck/Desktop/Otimal_split/plos_pubmed_combined_bio/'
comb_path_train = "/home/beck/Desktop/Otimal_split/plos_pubmed_combined/plos_pubmed_combined_bio_train/" 
comb_path_test = "/home/beck/Desktop/Otimal_split/plos_pubmed_combined/plos_pubmed_combined_bio_test/" 
comb_path_dev = "/home/beck/Desktop/Otimal_split/plos_pubmed_combined/plos_pubmed_combined_bio_devel/"
    
    
#calculate the distribution of class_labels for cmbined set
comb_whole_dict, comb_train_dict, comb_test_dict, comb_dev_dict = train_test_dev(comb_path_whole, 
                                                                                 comb_path_train, 
                                                                                 comb_path_test, 
                                                                                 comb_path_dev)


# calculate the proportion for each class label (8 - purposes)
comb_train_props, comb_dev_props, comb_test_props = __proportion_calculator(comb_whole_dict, 
                                                                            comb_train_dict, 
                                                                            comb_test_dict, 
                                                                            comb_dev_dict)

comb_train_result_ = _within_range_pubmed(comb_train_props)
comb_test_result_  = _within_range_dev_test(comb_test_props)
comb_dev_result_   = _within_range_dev_test(comb_dev_props)


print(f'plos+pubmed train {comb_train_props, _within_range(comb_train_props)[1]}')
print(f'plos+pubmed test  {comb_test_props, _within_range_dev_test(comb_test_props)[1]}')
print(f'plos+pubmed dev   {comb_dev_props, _within_range_dev_test(comb_dev_props)[1]}')
print()

print(f"plos+pubmed train within +/- 60% ? {_within_range(comb_train_props)[0]}")
print(f"plos+pubmed test  within +/- 15% ? {_within_range_dev_test(comb_test_props)[0]}")
print(f"plos+pubmed dev   within +/- 15% ? {_within_range_dev_test(comb_dev_props)[0]}")
print()

# train percentage 60%
    
for (k,v), (k2,v2) in zip(comb_whole_dict.items(),  comb_train_dict.items()):
    if k2 == k:
        print(f"{k2 :{18}}  {v2}/{v} train {v2/v*100: .2f}% ")

# test percentage 20% 
print()

for (k,v), (k2,v2) in zip(comb_whole_dict.items(),  comb_test_dict.items()):
    if k2 == k:
        print(f"{k2 :{18}}  {v2}/{v} test {v2/v*100: .2f}% ")
        
# dev percentage 20% 
print()

for (k,v), (k2,v2) in zip(comb_whole_dict.items(),  comb_dev_dict.items()):
    if k2 == k:
        print(f"{k2 :{18}}  {v2}/{v} dev {v2/v*100: .2f}% ")