In [105]:
from collections import Counter, defaultdict
import random
from sklearn.model_selection import train_test_split
import json
import copy
import math
import re

In [106]:
class ZaebucDoc:
    """Metadata record describing a single document.

    Stores the seven values given to the constructor as instance attributes
    and can render itself either as a plain dictionary or as an indented
    JSON string (which is also what ``repr()`` shows).
    """

    def __init__(self, writer_id, course, gender,
                 school_language, school_type, topic, CEFR):
        # The assignment order below fixes the key order of to_dict() and
        # therefore of the JSON representation.
        fields = (
            ('writer_id', writer_id),
            ('course', course),
            ('gender', gender),
            ('school_language', school_language),
            ('school_type', school_type),
            ('topic', topic),
            ('CEFR', CEFR),
        )
        for name, value in fields:
            setattr(self, name, value)

    def __repr__(self):
        """Return the pretty-printed JSON form of the record."""
        as_json = self.to_json_str()
        return str(as_json)

    def to_json_str(self):
        """Serialize the record to JSON (2-space indent, non-ASCII kept as-is)."""
        payload = self.to_dict()
        return json.dumps(payload, ensure_ascii=False, indent=2)

    def to_dict(self):
        """Return an independent (deep-copied) dictionary of the attributes."""
        return copy.deepcopy(vars(self))

In [107]:
def read_data(path, encoding='utf-8'):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's default encoding.

    Returns:
        A list with one stripped string per line of the file.
    """
    with open(path, encoding=encoding) as f:
        # Iterating the handle streams the lines instead of materialising
        # them twice (readlines() + the comprehension).
        return [line.strip() for line in f]

In [108]:
def write_data(data, path):
    """Write the records to ``path`` as a tab-separated table with a header row.

    Each row holds the ``writer_id``, ``course``, ``gender``,
    ``school_language``, ``school_type``, ``topic`` and ``CEFR`` attributes of
    one record, in that order. Values are converted with ``str()``, so a
    missing (``None``) value is written as the text ``None``.

    Args:
        data: Iterable of objects exposing the seven attributes listed above.
        path: Destination file; it is overwritten if it already exists.
    """
    columns = ('writer_id', 'course', 'gender', 'school_language',
               'school_type', 'topic', 'CEFR')
    # Explicit UTF-8 so the output does not depend on the platform default.
    with open(path, mode='w', encoding='utf-8') as f:
        f.write('\t'.join(columns) + '\n')
        for ex in data:
            # No trailing tab: every row has exactly as many cells as the header.
            f.write('\t'.join(str(getattr(ex, col)) for col in columns) + '\n')

In [124]:
def parse_data(raw_data, verbose=True):
    """Turn raw ``<doc .../>`` lines into a list of ``ZaebucDoc`` records.

    Each non-blank line is handed to ``extract_info``; the attributes
    ``writer_id``, ``course``, ``gender``, ``school_language``,
    ``school_type``, ``topic`` and ``CEFR`` are looked up in the result, with
    ``None`` used for any attribute that is absent.

    Args:
        raw_data: Iterable of strings, one document header per item.
        verbose: When true (the default), print the extracted values of every
            record, space-separated, as they are parsed.

    Returns:
        A list of ``ZaebucDoc`` objects, one per non-blank input line, in
        input order.
    """
    # Order matches both the ZaebucDoc constructor and the printed summary.
    fields = ('writer_id', 'course', 'gender', 'school_language',
              'school_type', 'topic', 'CEFR')
    info = []
    for line in raw_data:
        # Blank lines carry no attributes; skip them instead of failing
        # inside extract_info.
        if not line.strip():
            continue
        lookup = extract_info(line)
        values = [lookup.get(field, None) for field in fields]
        if verbose:
            print(*values)
        info.append(ZaebucDoc(*values))
    return info
    
# def extract_info(seed, line):
#     if seed in line:
#         start_idx = line.index(seed)
#         end_idx = start_idx
#         while line[end_idx] != " " and line[end_idx] != ">":
#             end_idx += 1

#         return line[start_idx:end_idx].replace(seed+"=", "").replace('"', "")

#     else:
#         return None

def extract_info(line):
    """Parse the attributes of one ``<doc key="value" ... />`` line.

    The line may also be wrapped in an outer pair of double quotes with the
    inner quotes doubled (``""``); that wrapping is undone before the
    attributes are split.

    Args:
        line: A single line of text containing the ``<doc ...>`` tag.

    Returns:
        A dict mapping each attribute name to its value with all double
        quotes removed. Fragments that contain no ``=`` (for example an empty
        line) are ignored, so the result may be empty.
    """
    # removing line boundaries
    line = line.replace('<doc', '').replace('/>', '')
    # collapsing doubled double quotes ("") into a single double quote (")
    line = line.replace('""', '"')
    # dropping the outer quote pair (or the last value's closing quote)
    line = re.sub('^"', '', line)
    line = re.sub('"$', '', line)

    lookup = {}
    for fragment in line.strip().split('" '):
        # Split on the FIRST '=' only so values that contain '=' stay intact.
        key, separator, value = fragment.partition('=')
        if not separator:
            continue
        lookup[key.strip()] = value.replace('"', '')

    return lookup

In [125]:
def round_half_up(n, decimals=0):
    """Round ``n`` with ties going towards positive infinity.

    Unlike the built-in ``round`` (which rounds ties to the nearest even
    digit), ``2.5`` becomes ``3`` and ``-2.5`` becomes ``-2``.

    Args:
        n: The number to round.
        decimals: Number of decimal places to keep. Defaults to 0.

    Returns:
        An ``int`` when ``decimals`` is 0 or negative, otherwise a ``float``
        rounded to ``decimals`` places.
    """
    multiplier = 10 ** decimals
    rounded = math.floor(n * multiplier + 0.5) / multiplier
    if decimals <= 0:
        # Whole-number result: hand back an int so it can be used as an index.
        return int(rounded)
    # Keep the fractional digits that were asked for; wrapping this in int()
    # would throw them away again.
    return rounded

In [133]:
def split_and_balance(data, train_split=0.7, dev_split=0.15, test_split=0.15):
    """Split the examples into Train/Dev/Test while keeping CEFR proportions.

    A deep copy of ``data`` is shuffled with a fixed seed (42), grouped by
    each example's ``CEFR`` attribute, and every group is cut into train, dev
    and test slices. The train and dev slice sizes are the group size times
    ``train_split`` / ``dev_split``, rounded half up; the test slice receives
    whatever is left of the group. A per-split summary of the level
    distribution is printed.

    Args:
        data: Sequence of examples exposing ``CEFR`` and ``writer_id``.
        train_split: Fraction of each CEFR group assigned to Train.
        dev_split: Fraction of each CEFR group assigned to Dev.
        test_split: Fraction of each CEFR group intended for Test. It is only
            used for validation because Test always gets the remainder.

    Returns:
        A tuple ``(ids_by_split, splits, splits_by_level)`` where
        ``ids_by_split`` maps ``writer_id`` to the split name, ``splits`` maps
        each split name to a flat list of (copied) examples, and
        ``splits_by_level`` maps split name -> CEFR level -> list of examples.

    Raises:
        ValueError: If a fraction is negative or the fractions do not sum to 1.
        AssertionError: If a group is not fully covered by its three slices,
            or if ``writer_id`` values are not unique across the data.
    """
    fractions = (train_split, dev_split, test_split)
    if min(fractions) < 0 or not math.isclose(sum(fractions), 1.0):
        raise ValueError(
            'train_split, dev_split and test_split must be non-negative '
            f'and sum to 1, got {fractions}')

    # Shuffling a copy of the data (fixed seed keeps the split reproducible)
    data_copy = copy.deepcopy(data)
    random.Random(42).shuffle(data_copy)

    # grouping examples by cefr
    cefr_map = defaultdict(list)
    for ex in data_copy:
        cefr_map[ex.CEFR].append(ex)

    # creating train, dev, and test splits while maintaining
    # CEFR level proportions
    splits_by_level = {'Train': dict(), 'Dev': dict(), 'Test': dict()}

    for cefr_level, examples in cefr_map.items():
        train_size = round_half_up(train_split * len(examples))
        dev_size = round_half_up(dev_split * len(examples))

        splits_by_level['Train'][cefr_level] = examples[:train_size]
        splits_by_level['Dev'][cefr_level] = examples[train_size:train_size + dev_size]
        # Test takes the remainder so that no example is dropped by rounding.
        splits_by_level['Test'][cefr_level] = examples[train_size + dev_size:]

        assert (len(splits_by_level['Train'][cefr_level]) + len(splits_by_level['Dev'][cefr_level])
                + len(splits_by_level['Test'][cefr_level]) == len(examples))

    # flattening the per-level groups into one list of examples per split
    splits = {'Train': list(), 'Dev': list(), 'Test': list()}
    for split in splits_by_level:
        for level in splits_by_level[split]:
            splits[split].extend(splits_by_level[split][level])

    # printing summary: size of each split and its CEFR distribution in percent
    for position, (split, examples) in enumerate(splits.items()):
        if position:
            print()
        level_counts = Counter(ex.CEFR for ex in examples)
        total = sum(level_counts.values())
        print(f'{split}: {total} examples')
        print('===========')
        for level, count in level_counts.items():
            print(f'{level}: {100 * (count / total):.2f}%')

    ids_by_split = {}
    for split in splits:
        for ex in splits[split]:
            ids_by_split[ex.writer_id] = split

    # A smaller mapping means two examples share a writer_id.
    assert len(ids_by_split) == len(data_copy)

    return ids_by_split, splits, splits_by_level

In [134]:
def write_split(data, ids_by_split, output_path):
    """Write one ``writer_id<TAB>split<TAB>CEFR`` line per example.

    Values are formatted with f-strings, so non-string values (for example a
    ``None`` CEFR level) are written via ``str()`` rather than raising a
    ``TypeError``.

    Args:
        data: Iterable of examples exposing ``writer_id`` and ``CEFR``.
        ids_by_split: Mapping from ``writer_id`` to the split name.
        output_path: Destination file; it is overwritten if it already exists.

    Raises:
        KeyError: If an example's ``writer_id`` is missing from ``ids_by_split``.
    """
    # Explicit UTF-8 so the output does not depend on the platform default.
    with open(output_path, mode='w', encoding='utf-8') as f:
        for ex in data:
            f.write(f'{ex.writer_id}\t{ids_by_split[ex.writer_id]}\t{ex.CEFR}\n')

In [136]:
# Read the lines of arabic_docs.txt, parse each one into a ZaebucDoc record
# (parse_data also prints every record), and export the records as a
# tab-separated file.
raw_data = read_data('arabic_docs.txt')
organized_data = parse_data(raw_data)
write_data(organized_data, 'arabic_data_extracted.txt')

268469 ARA-030 Male English Private Social Media B1
386369 ARA-030 Male Arabic Government Social Media B2
81027 ARA-030 Female English Government Social Media A2
81757 ARA-030 Female Arabic Government Social Media B2
83625 ARA-030 Female English Private Social Media Unassessable
85891 ARA-030 Female Arabic Government Social Media A2
87953 ARA-030 Female English Private Tolerance B1
89249 ARA-030 Female English Private Tolerance B1
89881 ARA-030 Female English Private Social Media B1
90528 ARA-030 Female English Government Social Media A2
92524 ARA-030 Female Arabic Government Tolerance B1
92800 ARA-030 Female English Government Tolerance B1
93360 ARA-030 Female Arabic Government Tolerance B2
96339 ARA-030 Female English Private Social Media B1
96683 ARA-030 Female English Government Social Media B1
97791 ARA-030 Female English Government Development B1
119813 ARA-130 Female Arabic Government Social Media A2
119831 ARA-130 Female Arabic Government Tolerance Unassessable
121991965 ARA-13

In [137]:
# Display the parsed records; each one is rendered as JSON by ZaebucDoc.__repr__.
organized_data

[{
   "writer_id": "268469",
   "course": "ARA-030",
   "gender": "Male",
   "school_language": "English",
   "school_type": "Private",
   "topic": "Social Media",
   "CEFR": "B1"
 },
 {
   "writer_id": "386369",
   "course": "ARA-030",
   "gender": "Male",
   "school_language": "Arabic",
   "school_type": "Government",
   "topic": "Social Media",
   "CEFR": "B2"
 },
 {
   "writer_id": "81027",
   "course": "ARA-030",
   "gender": "Female",
   "school_language": "English",
   "school_type": "Government",
   "topic": "Social Media",
   "CEFR": "A2"
 },
 {
   "writer_id": "81757",
   "course": "ARA-030",
   "gender": "Female",
   "school_language": "Arabic",
   "school_type": "Government",
   "topic": "Social Media",
   "CEFR": "B2"
 },
 {
   "writer_id": "83625",
   "course": "ARA-030",
   "gender": "Female",
   "school_language": "English",
   "school_type": "Private",
   "topic": "Social Media",
   "CEFR": "Unassessable"
 },
 {
   "writer_id": "85891",
   "course": "ARA-030",
   "gend

In [138]:
# Split the records currently held in organized_data (loaded from
# arabic_docs.txt at this point) and write the writer_id -> split assignment.
ids_by_split, splits, splits_by_level = split_and_balance(organized_data)
write_split(organized_data, ids_by_split, output_path='ar_splits.txt')

Train: 150 examples
B2: 37.33%
B1: 51.33%
C1: 5.33%
A2: 3.33%
Unassessable: 2.67%

Dev: 33 examples
B2: 36.36%
B1: 51.52%
C1: 6.06%
A2: 3.03%
Unassessable: 3.03%

Test: 31 examples
B2: 38.71%
B1: 51.61%
C1: 3.23%
A2: 3.23%
Unassessable: 3.23%


In [139]:
# Same pipeline for english_docs.txt.
# NOTE: this rebinds raw_data and organized_data, so the records parsed from
# arabic_docs.txt are no longer reachable through these names afterwards.
raw_data = read_data('english_docs.txt')
organized_data = parse_data(raw_data)
write_data(organized_data, 'english_data_extracted.txt')

116710 GEN-140 Female English Private Tolerance B1
117417 GEN-140 Female Arabic Government Social Media A2
119150 GEN-140 Female English Government Social Media A2
119813 GEN-140 Female Arabic Government Social Media A2
119831 GEN-140 Female Arabic Government Social Media A2
121991844 GEN-140 Female Korean Korean High School  Social Media B1
121991965 GEN-140 Female Arabic Government Social Media B1
165859 GEN-140 Female Arabic Government Social Media A2
165866 GEN-140 Male Arabic Government Tolerance B1
165904 GEN-140 Female Arabic Government Social Media B1
165949 GEN-140 Female Arabic Government Social Media B1
166095 GEN-140 Male English Private Development A2
166122 GEN-140 Male English Private Social Media B1
168692 GEN-140 Female Arabic Government Social Media B1
168718 GEN-140 Female Arabic Government Social Media A2
168839 GEN-140 Female English Private Social Media B1
186196 GEN-140 Female Arabic Government Social Media B1
186798 GEN-140 Female English Private Development A2


In [140]:
# Split the records currently held in organized_data (loaded from
# english_docs.txt at this point) and write the writer_id -> split assignment.
ids_by_split, splits, splits_by_level = split_and_balance(organized_data)
write_split(organized_data, ids_by_split, output_path='en_splits.txt')

Train: 272 examples
A2: 24.63%
B1: 50.00%
B2: 20.96%
A1: 1.84%
C1: 2.57%

Dev: 58 examples
A2: 24.14%
B1: 50.00%
B2: 20.69%
A1: 1.72%
C1: 3.45%

Test: 58 examples
A2: 24.14%
B1: 50.00%
B2: 22.41%
A1: 1.72%
C1: 1.72%


In [141]:
# NOTE(review): identical to the preceding cell. split_and_balance shuffles
# with a fixed seed, so this re-run overwrites en_splits.txt with the same
# assignment; the cell looks redundant.
ids_by_split, splits, splits_by_level = split_and_balance(organized_data)
write_split(organized_data, ids_by_split, output_path='en_splits.txt')

Train: 272 examples
A2: 24.63%
B1: 50.00%
B2: 20.96%
A1: 1.84%
C1: 2.57%

Dev: 58 examples
A2: 24.14%
B1: 50.00%
B2: 20.69%
A1: 1.72%
C1: 3.45%

Test: 58 examples
A2: 24.14%
B1: 50.00%
B2: 22.41%
A1: 1.72%
C1: 1.72%
