#### Step 1: Data Pre-processing

In [1]:
import sys
import json

# Locations of the JSON corpora (Train, Dev, Test) that are converted to word
# lists for fitting the ML models; the same files as in the Dev script.
# Forward slashes are used because they resolve on Windows as well as on
# POSIX systems, whereas a backslash is an ordinary filename character on POSIX.
Path_Dev_en = "Train_Dev_sets/Dev_en_new.json"
Path_Dev_fr = "Train_Dev_sets/Dev_fr_new.json"

Path_Train_en = "Train_Dev_sets/Train_en_new.json"
Path_Train_fr = "Train_Dev_sets/Train_fr_new.json"

Path_Test_en = "Test_set/Test_en.json"
Path_Test_fr = "Test_set/Test_fr.json"

def load_with_index(path): # for Train and Dev files
    """Load a labelled corpus file and return its words and boundary indices.

    The JSON document at ``path`` must provide the keys ``text``,
    ``begin_sentence`` and ``end_sentence``.

    Returns:
        (words, begins, ends) where ``words`` is ``text`` split on single
        spaces, and ``begins`` / ``ends`` are the values stored under
        ``begin_sentence`` / ``end_sentence``, returned unchanged.

    Any word containing a newline has its leading/trailing newlines removed
    and the marker ``_Enter`` appended (``_Enter`` stands in for ``\\n``).
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, so accented characters load the same way on every OS.
    with open(path, encoding='utf-8') as json_file:
        data = json.load(json_file)

    list_convert_words = [
        word.strip('\n') + '_Enter' if '\n' in word else word
        for word in data['text'].split(' ')
    ]

    return list_convert_words, data['begin_sentence'], data['end_sentence']

def load_without_index(path): # for Test files
    """Load an unlabelled corpus file and return its list of words.

    The JSON document at ``path`` must provide the key ``text``. The text is
    split on single spaces; any word containing a newline has its
    leading/trailing newlines removed and the marker ``_Enter`` appended.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, so accented characters load the same way on every OS.
    with open(path, encoding='utf-8') as json_file:
        data = json.load(json_file)

    return [
        word.strip('\n') + '_Enter' if '\n' in word else word
        for word in data['text'].split(' ')
    ]
    
def labelled_text(convert_text, list_begins, list_ends):
    """Append boundary markers to the words of a text.

    Args:
        convert_text: sequence of word strings.
        list_begins: positions in ``convert_text`` that start a sentence.
        list_ends: positions in ``convert_text`` that end a sentence.

    Returns:
        A new list where the word at a begin position gets the suffix
        ``_BEGIN``, the word at an end position gets ``_END`` and every other
        word is copied unchanged. A position listed in both collections is
        labelled ``_BEGIN`` only.
    """
    # Sets give constant-time membership tests; scanning the index lists for
    # every word would make the whole pass quadratic on large corpora.
    begin_positions = set(list_begins)
    end_positions = set(list_ends)

    labelled_list_words = []
    for index, item in enumerate(convert_text):
        if index in begin_positions:
            labelled_list_words.append(item + '_BEGIN')
        elif index in end_positions:
            labelled_list_words.append(item + '_END')
        else:
            labelled_list_words.append(item)

    return labelled_list_words

def write_text(file, content):
    """Write every item of ``content`` to ``file`` as UTF-8, one item per line.

    Args:
        file: path of the output file; it is created or overwritten.
        content: iterable of strings.
    """
    # The context manager closes the handle even when a write fails.
    with open(file, 'w', encoding='utf-8') as outfile:
        for word in content:
            outfile.write(word+'\n')

print ("Pre-process functions defined~")

Pre-process functions defined~


In [None]:
# load Train, Dev, Test files
Dev_en_words, Dev_en_begins, Dev_en_ends = load_with_index(Path_Dev_en)
Dev_fr_words, Dev_fr_begins, Dev_fr_ends = load_with_index(Path_Dev_fr)
Train_en_words, Train_en_begins, Train_en_ends = load_with_index(Path_Train_en)
Train_fr_words, Train_fr_begins, Train_fr_ends = load_with_index(Path_Train_fr)
Test_en_words = load_without_index(Path_Test_en)
Test_fr_words = load_without_index(Path_Test_fr)

# generate labelled texts
Dev_en_labelled = labelled_text(Dev_en_words, Dev_en_begins, Dev_en_ends)
Dev_fr_labelled = labelled_text(Dev_fr_words, Dev_fr_begins, Dev_fr_ends)
Train_en_labelled = labelled_text(Train_en_words, Train_en_begins, Train_en_ends)
Train_fr_labelled = labelled_text(Train_fr_words, Train_fr_begins, Train_fr_ends)

# write Train, Dev, Test text files
# The paths use "/" so that they name the same files that load_both_train
# reads back (it builds 'Pre-processed Datasets' + "/..."); with "\\" the
# files would end up under a different name on non-Windows systems.
# The target folder must already exist.
write_text("Pre-processed Datasets/Dev_en_text.txt", Dev_en_words)
write_text("Pre-processed Datasets/Dev_en_text_labelled.txt", Dev_en_labelled)

write_text("Pre-processed Datasets/Dev_fr_text.txt", Dev_fr_words)
write_text("Pre-processed Datasets/Dev_fr_text_labelled.txt", Dev_fr_labelled)

write_text("Pre-processed Datasets/Train_en_text.txt", Train_en_words)
write_text("Pre-processed Datasets/Train_en_text_labelled.txt", Train_en_labelled)

write_text("Pre-processed Datasets/Train_fr_text.txt", Train_fr_words)
write_text("Pre-processed Datasets/Train_fr_text_labelled.txt", Train_fr_labelled)

write_text("Pre-processed Datasets/Test_en_text.txt", Test_en_words)
write_text("Pre-processed Datasets/Test_fr_text.txt", Test_fr_words)

print("Data pre-processing completed~") 

#### Step 2: Classifier Selection 
Classifier: Random Forest Classifier (sklearn rfc). We also tried mnb, lr, dtc, knc and svm; rfc performed best on this task.
##### The best parameters found:
min_samples_split = 8, max_features = "log2", oob_score = True, random_state = 10

In [2]:
import numpy as np
import copy
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

def rfc( X_train, Y_train, X_test):
    """Fit a random forest on the training data and predict labels for X_test.

    The classifier is built with min_samples_split = 8, max_features = "log2",
    oob_score = True and random_state = 10.

    Args:
        X_train: training feature matrix.
        Y_train: training labels, one per row of X_train.
        X_test: feature matrix to predict on.

    Returns:
        The predictions produced by the fitted model for X_test.
    """
    forest = RandomForestClassifier( min_samples_split = 8, max_features = "log2", oob_score = True, random_state = 10 )
    forest.fit( X_train, Y_train )

    # When gold labels for the test rows are available, per-class scores can
    # be inspected with:
    #target_names = ["BS","ES","O"]
    #print( classification_report( Y_test, pre_test, target_names = target_names, digits=5 ) )

    return forest.predict( X_test )

print("The rfc classifier defined with min_samples_split = 8, max_features = log2, oob_score = True, random_state = 10 ~") 

The rfc classifier defined with min_samples_split = 8, max_features = log2, oob_score = True, random_state = 10 ~


#### Step 3: Feature Engineering
We tried the following features:
1. wordcount2vectors: performed poorly and was slow;
2. individual features for punctuation, initially capitalized words, acronyms, _Enter (\n), digits, letters (including Roman numerals) and POS tags;
3. feature fusion of all the features in 2;
4. rule-based validation with a keyword list extracted from the Train and Dev datasets.

In [3]:
import string
from sklearn import preprocessing
from numpy import array

# Disabled setting, kept for reference.
#ERROR_INSTANCE_FN = 'error_cases.txt'
# Folder names used as prefixes when building the paths of the token files
# (FOLDER_TEXT) and of the POS tag files (FOLDER_POS).
FOLDER_TEXT = 'Pre-processed Datasets'
FOLDER_POS = 'POS Tagged Datasets'
# Disabled settings, kept for reference.
#SAMPLE_WEIGHT_BEGIN = 1
#SAMPLE_WEIGHT_END = 1
# Punctuation sets handed to find_punc, which tests each entry as a substring
# of every token.
PUNC_SET1  = ['.']
# NOTE(review): the entry between '*' and '>' reads as an empty string here.
# If it really is '', it is a substring of every token, so a substring test
# against this set matches everything. Possibly a symbol lost in an encoding
# round-trip -- confirm the intended character.
PUNC_SET2  = ['?', '!', ';', ',', '%', '-', '/', '\"', '\\', '\'', ')', '(', '*', '', '>', '≥', '<', '≤', '•', '€', '$', '£', '``', '©', '℗', '®']
# Every single ASCII letter, lower and upper case.
LETTER_SET = list(string.ascii_letters)

def load_filecontent(fn):
    """Read the UTF-8 text file ``fn`` and return its lines as a list.

    Newline characters are stripped from both ends of every line; all other
    whitespace is kept.
    """
    # The context manager guarantees the handle is closed after reading.
    with open( fn, "r", encoding = 'utf-8' ) as handle:
        return [line.strip('\n') for line in handle.readlines()]

def load_both_train():
    """Load the labelled training tokens and the two unlabelled test sets.

    Returns:
        (train_data, test_en, test_fr). ``train_data`` is the concatenation,
        in this order, of the labelled English Train, French Train, English
        Dev and French Dev files; ``test_en`` and ``test_fr`` are the English
        and French Test files. All files are read from FOLDER_TEXT.
    """
    def read( relative_name ):
        # One list entry per line of the file.
        return load_filecontent( FOLDER_TEXT + relative_name )

    labelled_parts = [
        read( "/Train_en_text_labelled.txt" ),
        read( "/Train_fr_text_labelled.txt" ),
        read( "/Dev_en_text_labelled.txt" ),
        read( "/Dev_fr_text_labelled.txt" ),
    ]

    train_data = []
    for part in labelled_parts:
        train_data = train_data + part

    test_en = read( "/Test_en_text.txt" )
    test_fr = read( "/Test_fr_text.txt" )

    return train_data, test_en, test_fr

def load_pos():
    """Load the POS tag files matching the training and test token files.

    Returns:
        (train_pos, test_en_pos, test_fr_pos). ``train_pos`` is the
        concatenation, in this order, of the English Train, French Train,
        English Dev and French Dev POS files; the other two values are the
        English and French Test POS files. All files are read from FOLDER_POS.
    """
    def read( relative_name ):
        # One list entry per line of the file.
        return load_filecontent( FOLDER_POS + relative_name )

    tagged_parts = [
        read( "/Train_en_udpipe_pos.txt" ),
        read( "/Train_fr_udpipe_pos.txt" ),
        read( "/Dev_en_udpipe_pos.txt" ),
        read( "/Dev_fr_udpipe_pos.txt" ),
    ]

    train_pos = []
    for part in tagged_parts:
        train_pos = train_pos + part

    test_en_pos = read( "/Test_en_udpipe_pos.txt" )
    test_fr_pos = read( "/Test_fr_udpipe_pos.txt" )

    return train_pos, test_en_pos, test_fr_pos

def find_pos_alter( dataset, trainlen ):
    """One-hot encode a five-tag window of POS ids centred on every token.

    Args:
        dataset: sequence of POS ids, each convertible with ``int()``; the
            training rows come first, followed by the test rows.
        trainlen: number of leading rows that belong to the training part.

    Returns:
        (train_matrix, test_matrix): dense arrays holding the encoded rows
        ``[0:trainlen]`` and ``[trainlen:]`` respectively.
    """
    # Filler id for window slots that fall outside the sequence
    # (presumably a value not used by any real tag -- confirm).
    INV = 18

    tags = [int( x ) for x in dataset]

    # Padding both ends lets one slice expression build every window, so
    # sequences shorter than four tags no longer hit hard-coded indices.
    padded = [INV, INV] + tags + [INV, INV]
    pos_vec = [padded[i:i + 5] for i in range( len( tags ) )]

    # The encoder returns a sparse matrix by default; densify it explicitly.
    # The former ``sparse = False`` keyword was renamed to ``sparse_output``
    # and later removed, so passing it fails on recent scikit-learn releases.
    one_hot = preprocessing.OneHotEncoder()
    temp = one_hot.fit_transform( pos_vec ).toarray()

    return np.array( temp[0:trainlen] ), np.array( temp[trainlen:] )

def find_cap(dataset):
    """Build the capitalization feature for every token.

    Args:
        dataset: sequence of token strings.

    Returns:
        An integer array of shape (len(dataset), 3). Each row is
        [previous, current, next], where a flag is 1 when the first character
        of that token is upper case and 0 otherwise; positions outside the
        sequence count as 0.
    """
    # word[:1] is '' for an empty token, so empty tokens yield 0 instead of
    # raising IndexError as word[0] would.
    cap_bool = [1 if word[:1].isupper() else 0 for word in dataset]

    # Zero padding on both sides supplies the missing neighbours of the first
    # and last token and also covers sequences with fewer than two tokens.
    padded = [0] + cap_bool + [0]
    cap_feature = [padded[i:i + 3] for i in range( len( cap_bool ) )]

    return np.array( cap_feature, dtype = int ).reshape( -1, 3 )

def find_num(dataset):
    """Build the leading-digit feature for every token.

    Args:
        dataset: sequence of token strings.

    Returns:
        An integer array of shape (len(dataset), 3). Each row is
        [previous, current, next], where a flag is 1 when the first character
        of that token is a digit and 0 otherwise; positions outside the
        sequence count as 0.
    """
    # word[:1] is '' for an empty token, so empty tokens yield 0 instead of
    # raising IndexError as word[0] would.
    num_bool = [1 if word[:1].isdigit() else 0 for word in dataset]

    # Zero padding on both sides supplies the missing neighbours of the first
    # and last token and also covers sequences with fewer than two tokens.
    padded = [0] + num_bool + [0]
    num_feature = [padded[i:i + 3] for i in range( len( num_bool ) )]

    return np.array( num_feature, dtype = int ).reshape( -1, 3 )

def find_letter(dataset):
    """Build the single-letter feature for every token.

    Args:
        dataset: sequence of token strings.

    Returns:
        An integer array of shape (len(dataset), 3). Each row is
        [previous, current, next], where a flag is 1 when that token is an
        element of LETTER_SET and 0 otherwise; positions outside the sequence
        count as 0.
    """
    letter_bool = [1 if word in LETTER_SET else 0 for word in dataset]

    # Zero padding on both sides supplies the missing neighbours of the first
    # and last token and also covers sequences with fewer than two tokens,
    # which previously raised IndexError.
    padded = [0] + letter_bool + [0]
    letter_feature = [padded[i:i + 3] for i in range( len( letter_bool ) )]

    return np.array( letter_feature, dtype = int ).reshape( -1, 3 )

def find_Acronyms(dataset):
    """Build the all-upper-case (acronym) feature for every token.

    Args:
        dataset: sequence of token strings.

    Returns:
        An integer array of shape (len(dataset), 3). Each row is
        [previous, current, next], where a flag is 1 when that token is
        non-empty and every one of its characters is upper case, and 0
        otherwise; positions outside the sequence count as 0.
    """
    # A single digit, punctuation mark or lower-case character disqualifies
    # the token, because str.isupper() is False for each of them.
    abbs_bool = [
        1 if word and all( letter.isupper() for letter in word ) else 0
        for word in dataset
    ]

    # Zero padding on both sides supplies the missing neighbours of the first
    # and last token and also covers sequences with fewer than two tokens,
    # which previously raised IndexError.
    padded = [0] + abbs_bool + [0]
    abbs_feature = [padded[i:i + 3] for i in range( len( abbs_bool ) )]

    return np.array( abbs_feature, dtype = int ).reshape( -1, 3 )

def find_punc(dataset, PUNC_SET):
    """Build the punctuation feature for every token.

    Args:
        dataset: sequence of token strings.
        PUNC_SET: iterable of punctuation strings to look for.

    Returns:
        An integer array of shape (len(dataset), 3). Each row is
        [previous, current, next], where a flag is 1 when that token contains
        at least one entry of PUNC_SET as a substring and 0 otherwise;
        positions outside the sequence count as 0.
    """
    # The rule is deliberately loose (substring match anywhere in the token),
    # so it can introduce some noise.
    # Empty entries are skipped: '' is a substring of every string and would
    # otherwise switch the flag on for all tokens.
    punc_bool = [
        1 if any( punc and punc in word for punc in PUNC_SET ) else 0
        for word in dataset
    ]

    # Zero padding on both sides supplies the missing neighbours of the first
    # and last token and also covers sequences with fewer than two tokens,
    # which previously raised IndexError.
    padded = [0] + punc_bool + [0]
    punc_feature = [padded[i:i + 3] for i in range( len( punc_bool ) )]

    return np.array( punc_feature, dtype = int ).reshape( -1, 3 )

def find_ent(dataset):
    """Build the line-break feature for every token.

    Args:
        dataset: sequence of token strings.

    Returns:
        An integer array of shape (len(dataset), 3). Each row is
        [previous, current, next], where a flag is 1 when that token contains
        the marker '_Enter' (the stand-in for a newline) and 0 otherwise;
        positions outside the sequence count as 0.
    """
    # A plain containment test is used here. The former check
    # ``word.find('_Enter') >= -1`` was true for every token, because
    # str.find returns -1 when the marker is absent.
    ent_bool = [1 if '_Enter' in word else 0 for word in dataset]

    # Zero padding on both sides supplies the missing neighbours of the first
    # and last token and also covers sequences with fewer than two tokens.
    padded = [0] + ent_bool + [0]
    ent_feature = [padded[i:i + 3] for i in range( len( ent_bool ) )]

    return np.array( ent_feature, dtype = int ).reshape( -1, 3 )

def construct_keyword_pattern_dict( dataset ):
    """Count the token runs that lie between two labelled sentences.

    A run starts right after a token containing '_END' and extends up to (not
    including) the next token containing '_BEGIN'. Its tokens are joined with
    the separator '<sssss>' to form a pattern string.

    Args:
        dataset: sequence of labelled token strings.

    Returns:
        dict mapping each pattern string to the number of times it occurred.
        A trailing run that is never closed by a '_BEGIN' token is not
        counted.
    """
    pattern_dict = {}
    total = len( dataset )

    for i in range( total - 1 ):
        # Only gaps are of interest: an '_END' token that is not directly
        # followed by a '_BEGIN' token.
        if '_END' not in dataset[i] or '_BEGIN' in dataset[i + 1]:
            continue

        # Explicit bounds check instead of catching IndexError.
        stop = i + 1
        while stop < total and '_BEGIN' not in dataset[stop]:
            stop += 1

        if stop == total:
            # The data ended before another sentence began; nothing after
            # this point can form a closed run either.
            break

        pattern = '<sssss>'.join( dataset[i + 1:stop] )
        pattern_dict[pattern] = pattern_dict.get( pattern, 0 ) + 1

    return pattern_dict

def find_negbdry_pattern( train_data):
    """Return the pattern counts that construct_keyword_pattern_dict builds
    from the labelled training tokens.
    """
    return construct_keyword_pattern_dict( train_data )

# Default thresholds read by rule_based_validation at call time: a pattern is
# applied only when it has at least LEN_LIMIT tokens and a count of at least
# FREQ_LIMIT.
LEN_LIMIT = 5
FREQ_LIMIT = 5

def rule_based_validation(pattern_dict, dataset, pre_test):
    """Reset predictions to non-boundary wherever a known pattern occurs.

    Args:
        pattern_dict: dict mapping pattern strings (tokens joined with
            '<sssss>') to their counts.
        dataset: sequence of token strings the predictions refer to.
        pre_test: predicted labels, one per token of ``dataset``.

    Returns:
        A deep copy of ``pre_test`` in which every position covered by a
        complete occurrence of a qualifying pattern is set to "O".
        ``pre_test`` itself is not modified.
    """
    temp_test = copy.deepcopy( pre_test )

    NON_BOUNDARY_MARK = "O"

    tokens = list( dataset )

    for pattern_str, frequency in pattern_dict.items():
        pattern_series = pattern_str.split('<sssss>')
        pattern_len = len(pattern_series)

        # filter out patterns that are too short or too rare
        if pattern_len < LEN_LIMIT or frequency < FREQ_LIMIT:
            continue

        # "+ 1" so that an occurrence ending on the very last token is found.
        for i in range(len(tokens) - pattern_len + 1):
            # Require the whole window to equal the pattern; a match on only
            # the leading token(s) must not trigger a correction.
            if tokens[i:i + pattern_len] == pattern_series:
                for j in range( pattern_len ):
                    # correct prediction
                    temp_test[i + j] = NON_BOUNDARY_MARK

    return temp_test

def dict_refinement(dict_keyword):
    """Split multi-line patterns at '_Enter' and merge the counts per piece.

    Args:
        dict_keyword: dict mapping pattern strings (tokens joined with
            '<sssss>') to counts.

    Returns:
        dict mapping refined keys to summed counts. A pattern without
        '_Enter' is kept as is. A pattern with '_Enter' is split at every
        occurrence of the marker; each piece gets '_Enter' appended again,
        and for every piece after the first the leading seven characters
        (the '<sssss>' separator) are dropped. Each resulting key receives
        the count of the original pattern.

    Side effect:
        Writes 'keywordlist.txt' (UTF-8) to the working directory with one
        "<key>\\t<count>" line per key, sorted by descending count, where
        '<sssss>' is shown as a space and '_Enter' is removed.
    """
    dict_refined = {}

    for item, count in dict_keyword.items():
        if '_Enter' in item:
            keywords = item.split('_Enter')
            refined_keys = [keywords[0] + '_Enter']
            # [7:] removes the '<sssss>' separator that precedes each later piece.
            refined_keys += [( keyword + '_Enter' )[7:] for keyword in keywords[1:]]
        else:
            refined_keys = [item]

        for key in refined_keys:
            # Test and update the same key. Checking the unsliced key while
            # updating the sliced one overwrote counts already collected
            # (or raised KeyError).
            dict_refined[key] = dict_refined.get( key, 0 ) + count

    sorted_list = sorted(dict_refined.items(), key=lambda item:item[1], reverse =True)

    # The context manager closes the file even if a write fails.
    with open('keywordlist.txt', 'w', encoding='utf8') as fp:
        for item, value in sorted_list:
            newitem = item.replace('<sssss>',' ')
            newitem = newitem.replace('_Enter','')
            fp.write(newitem + '\t' + str(value) + '\n')

    return dict_refined

def gen_label( dataset ):
    """Derive the class label of every labelled token.

    Returns a numpy array with "BS" for tokens containing '_BEGIN', "ES" for
    tokens containing '_END' (and no '_BEGIN'), and "O" for all others.
    """
    def label_of( line ):
        # '_BEGIN' takes precedence over '_END'.
        if '_BEGIN' in line:
            return "BS"
        if '_END' in line:
            return "ES"
        return "O"

    return np.array( [label_of( line ) for line in dataset] )

def gen_sample_weight( dataset, weight_begin = 1, weight_end = 1 ):
    """Derive a per-token sample weight from the boundary labels.

    Args:
        dataset: sequence of labelled token strings.
        weight_begin: weight for tokens containing '_BEGIN' (default 1).
        weight_end: weight for tokens containing '_END' and no '_BEGIN'
            (default 1).

    Returns:
        numpy array with one weight per token; unlabelled tokens get 1.

    The weights are parameters because the module-level constants
    SAMPLE_WEIGHT_BEGIN / SAMPLE_WEIGHT_END this function used to read are
    commented out in this file, which made any call on labelled data raise
    NameError. The defaults equal the values shown in those commented lines.
    """
    sample_weight = []

    for line in dataset:
        if '_BEGIN' in line:
            sample_weight.append( weight_begin )  # 'BEGIN')
        elif '_END' in line:
            sample_weight.append( weight_end )  # 'END')
        else:
            sample_weight.append( 1 )  # '_')

    return np.array( sample_weight )

def error_print(Y_test, pre_test, test_data, error_fn = 'error_cases.txt'):
    """Write the tokens to a file, flagging every wrong prediction.

    Args:
        Y_test: gold labels.
        pre_test: predicted labels, aligned with ``Y_test``.
        test_data: token strings, aligned with ``Y_test``.
        error_fn: output path (UTF-8). The default is the file name from the
            commented-out ERROR_INSTANCE_FN setting of this file; the former
            lookup of that constant raised NameError.

    Each token is written on its own line; when the prediction differs from
    the gold label the line reads "<token>____ERROR with <predicted label>".
    """
    readable = {"BS": 'BEGIN', "ES": 'END', "O": 'NoBoun'}

    # The context manager flushes and closes the file on every exit path.
    with open( error_fn, 'w', encoding = 'utf8' ) as fp:
        for i in range( len( Y_test ) ):
            # Unknown labels are shown verbatim instead of leaving the
            # readable name undefined (or stale from an earlier row).
            pre_label = readable.get( pre_test[i], str( pre_test[i] ) )

            if Y_test[i] != pre_test[i]:
                fp.write( test_data[i] + '____ERROR with ' + pre_label + '\n' )
            else:
                fp.write( test_data[i] + '\n' )

def train_test( model):
    """Run ``model`` on the fused feature vectors and return its predictions.

    ``model`` is called as ``model(train_vec, train_lbl, test_vec)``; these
    three names are read from module scope, so they must be defined before
    this function is called.
    """
    print( 'using punc_set1, punc_set2, cap, acro, num, letter, enter, pos' )

    # Optional error dump when gold labels are at hand:
    #error_print(Y_test, pre_test, test_data)
    return model( train_vec, train_lbl, test_vec )

print("Feature sets defined.")

Feature sets defined.


#### Step 4: Prediction and Evaluation

In [7]:
if __name__ == "__main__":

    train_data, test_en, test_fr = load_both_train()
    train_pos, test_en_pos, test_fr_pos = load_pos()
    
    #test on the English Test dataset
    test_data = test_en
    test_pos = test_en_pos
    
    #test on the French Test dataset
    #test_data = test_fr
    #test_pos = test_fr_pos

    # train_data carries the gold labels as '_BEGIN' / '_END' suffixes (they
    # are appended by labelled_text). Features must be computed on the bare
    # tokens: otherwise the acronym and single-letter features are 0 for
    # every boundary token during training, which leaks the label and does
    # not match the unlabelled test tokens.
    def _strip_boundary_label( token ):
        for suffix in ( '_BEGIN', '_END' ):
            if token.endswith( suffix ):
                # Keep the token as is if nothing would remain, so the
                # feature extractors never receive an empty string from here.
                return token[:-len( suffix )] or token
        return token

    train_tokens = [_strip_boundary_label( token ) for token in train_data]
    
    train_pos_feature, test_pos_feature = find_pos_alter( train_pos + test_pos, len( train_pos ) )

    train_cap = find_cap( train_tokens )
    test_cap = find_cap( test_data )
    
    train_num = find_num( train_tokens )
    test_num = find_num( test_data )

    train_abbs = find_Acronyms( train_tokens )
    test_abbs = find_Acronyms( test_data )

    train_letter = find_letter( train_tokens )
    test_letter = find_letter( test_data )

    train_punc1 = find_punc( train_tokens, PUNC_SET1 )
    test_punc1 = find_punc( test_data , PUNC_SET1 )
    
    # computed for the alternative fusion below; not part of the active vectors
    train_punc2 = find_punc( train_tokens, PUNC_SET2 )
    test_punc2 = find_punc( test_data , PUNC_SET2 )

    train_ent = find_ent( train_tokens )
    test_ent = find_ent( test_data )
    
    # feature fusion
    #train_vec = np.concatenate( (train_punc1, train_punc2, train_num, train_cap, train_abbs, train_ent, train_letter, train_pos_feature ), axis = 1 )
    #test_vec = np.concatenate( ( test_punc1, test_punc2, test_num, test_cap, test_abbs, test_ent, test_letter, test_pos_feature), axis = 1 )
    train_vec = np.concatenate( (train_punc1, train_num, train_cap, train_abbs, train_ent, train_letter, train_pos_feature ), axis = 1 )
    test_vec = np.concatenate( ( test_punc1, test_num, test_cap, test_abbs, test_ent, test_letter, test_pos_feature), axis = 1 )
    # build label and weight vector (labels come from the labelled tokens)
    train_lbl = gen_label( train_data )
    #test_lbl = gen_label( test_data )

    #train_wgt = gen_sample_weight( train_data )
    #test_wgt = gen_sample_weight( test_data )

    print( '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ using all features $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$:' )
    model = rfc
    pre_test = train_test( model)
    

    print( '$$$$$$$$$$$$$$$$$$$$ post processing with keyword validation $$$$$$$$$$$$$$$$$$$$:' )
    # pattern extraction needs the boundary labels, so it uses train_data
    negbdry_set_train = find_negbdry_pattern( train_data)

    dict_refined = dict_refinement( negbdry_set_train )

    # thresholds read by rule_based_validation
    LEN_LIMIT = 3
    FREQ_LIMIT = 16
    print(  LEN_LIMIT, FREQ_LIMIT )
    pre_test_valid = rule_based_validation( dict_refined, test_data, pre_test )
    
    print(  "Predictions completed." )

            
    

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ using all features $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$:
using punc_set1, punc_set2, cap, acro, num, letter, enter, pos


  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


$$$$$$$$$$$$$$$$$$$$ post processing with keyword validation $$$$$$$$$$$$$$$$$$$$:
3 16
Predictions completed.


#### Write predictions to files (text file)

In [15]:
 # to write the prediction to a file
# Persist both prediction variants as text, one label per line.
for out_path, predicted_labels in (
    ("Output/Predict_Test_en_valid.txt", pre_test_valid),
    ("Output/Predict_Test_en_novalid.txt", pre_test.tolist()),
    # For the French run use these targets instead:
    #("Output/Predict_Test_fr_valid.txt", pre_test_valid),
    #("Output/Predict_Test_fr_novalid.txt", pre_test.tolist()),
):
    write_text(out_path, predicted_labels)


print("Prediction files are written~")

Prediction files are written~


#### Write predictions to a json file (for the organizer to validate the classification)

In [5]:
import json

def index_generation(list_label):
    """Collect the positions of the sentence-begin and sentence-end labels.

    Returns a pair (index_begin, index_end): the 0-based positions in
    ``list_label`` whose label equals "BS" and "ES" respectively, each in
    ascending order.
    """
    positions = {"BS": [], "ES": []}

    for position, label in enumerate(list_label):
        if label == "BS":
            positions["BS"].append(position)
        elif label == "ES":
            positions["ES"].append(position)

    return positions["BS"], positions["ES"]

# Boundary positions from the raw classifier output and from the
# rule-validated output.
Index_begins, Index_ends = index_generation(pre_test)
Index_begins_valid, Index_ends_valid = index_generation(pre_test_valid)

# Rebuild the text from the word list, turning the _Enter marker back into "\n".
Test_en_words = load_without_index(Path_Test_en)
Test_en_words = [word.replace("_Enter", "\n") for word in Test_en_words]
#Test_fr_words = load_without_index(Path_Test_fr)
#Test_fr_words = [word.replace("_Enter", "\n") for word in Test_fr_words]

text = ' '.join(Test_en_words)
#text = ' '.join(Test_fr_words)

# mydict: predictions without validation; mydict_valid: after validation.
mydict = {}
mydict_valid = {}

mydict["text"] = text
mydict["begin_sentence"] = Index_begins
mydict["end_sentence"] = Index_ends

mydict_valid["text"] = text
mydict_valid["begin_sentence"] = Index_begins_valid
mydict_valid["end_sentence"] = Index_ends_valid

# Each dictionary goes to the file whose name matches its content; the two
# were previously written to each other's file.
with open('Test_Predicted/Test_en_pred_valid.json', 'w') as fp:
    json.dump(mydict_valid, fp)
with open('Test_Predicted/Test_en_pred_novalid.json', 'w') as fp:
    json.dump(mydict, fp)
    
#with open('Test_Predicted/Test_fr_pred_valid.json', 'w') as fp:
    #json.dump(mydict_valid, fp)
#with open('Test_Predicted/Test_fr_pred_novalid.json', 'w') as fp:
    #json.dump(mydict, fp)
    
print("Test json files written~ Ready for validation")


Test json files written~ Ready for validation
