In [None]:
import numpy as np
from collections import defaultdict
from tqdm import tqdm
from Levenshtein import distance
from sklearn.model_selection import KFold

In [1]:
def find_duplicates(liste):
    # put in dict
    dico = dict()

    for i in tqdm(liste):
        dico[i] = dico.get(i, 0) + 1

    output_dict = {}    
    for i in tqdm(dico):
        if dico[i] > 1: # for dico's element where value >= 2
            output_dict[i] = [a+1 for a, b in enumerate(liste) if b == i]

    return output_dict, dico


def load_citation_sentiment_corpus(filepath):
    texts = []
    polarities = []
    n_num = 0
    o_num = 0
    p_num = 0

    with open(filepath) as f:
        for line in f:
            line = line.strip()
            # look for invalid lines
            if (len(line) is 0):
                continue
            if line.startswith('#'):
                continue

            # divide the line into columns
            pieces = line.split('\t')
            if (len(pieces) != 4):
                print("Warning: incorrect number of fields in the data file for line:", line)
                continue
            
            text = pieces[3]
            # remove start/end quotes
            text = text[1:len(text) - 1]
            
            # create the labels and count them
            if pieces[2] is 'n':
                n_num += 1
                polarities.append(0)
            if pieces[2] is 'o':
                o_num += 1
                polarities.append(1)
            if pieces[2] is 'p':
                p_num += 1
                polarities.append(2)
            texts.append(text)

    print("o_num= ", o_num)
    print("p_num= ", p_num)
    print("n_num= ", n_num)

    return np.asarray(texts), np.asarray(polarities)


def add_to_dict(dictonary,key,value):   
    if key in dictonary:
        dictonary[key].add(value)
    else:
        dictonary[key] = {value}
        
def prepare_labels_dict(text_list,labels):
    d1 = defaultdict(set)
    # create data + label dict
    for text,label in tqdm(zip(text_list,labels)):
        add_to_dict(d1,text,label)

    #find all the text with more then 1 labels assigned to it.
    key_list=[]
    for i in d1:
        if len(d1[i])>1:
            key_list.append(i)

    return d1,key_list

def check_multi_label(text_list,label_dict):
    for text in text_list:
        if len(label_dict[text])>1:
            print("ERROR", text)
            return 
    print("No Multi label text found")

Reads the complete corpus and counts the labels.

In [4]:
text_list,labels_list=load_citation_sentiment_corpus('../data/complete_corpus.txt')

o_num=  7627
p_num=  829
n_num=  280


Cleans the corpus by removing all incorrect labeled instances.

In [5]:
# finds duplicates
duplicates_dic,complete_count_dict=find_duplicates(text_list)
# prepare the label dictionary
complete_labels_dict, multi_label_text_list=prepare_labels_dict(text_list,labels_list)
# setup the required lists of data and label dicts
duplicate_texts=list(duplicates_dic.keys())
complete_texts=list(complete_count_dict.keys())
duplicates_removed=list(set(complete_texts).intersection(multi_label_text_list))
final_texts_list=list(set(complete_texts)^set(multi_label_text_list))

if len(duplicates_removed)!= len(multi_label_text_list):
    print("ERROR! Something is wrong!! The number of removed samples should be equal to the number of samples with multiple labels.")
# check if a text has multiple labels
check_multi_label(final_texts_list,complete_labels_dict)
# create final label list
final_labels_list=[]
for text in final_texts_list:
    if len(complete_labels_dict[text])==1:
        label_set=complete_labels_dict[text]
        label=next(iter(label_set))
        final_labels_list.append(label)
    else:
        print("ERROR Muilti label",text,complete_labels_dict[text])

100%|██████████| 8736/8736 [00:00<00:00, 14835.94it/s]
100%|██████████| 8059/8059 [05:40<00:00, 23.69it/s] 
8736it [00:00, 13997.34it/s]

No Multi label text found





Create the duplicates handling file that covers information about the removed text samples and their labels.

In [6]:
def get_labels_string(label_set):
    # creates the label string by converting the label set to the corresponding string.
    # used to see the different labels for each sample.
    result=''
    for label in label_set:
        if label == 0:
            result=result+' NEGATIVE '
        elif label ==1:
            result=result+' NEUTRAL '
        else:
            result=result+' POSITIVE '

    return result

# collects the text for the duplicate handling file          
text_to_write='\n\n'

text_to_write=text_to_write+'========DUPLICATES REMOVED========\n\n'            
for text in duplicates_removed:
    text_to_write=text_to_write+text+'\n LABELS:'
    label_set=complete_labels_dict[text];
    text_to_write=text_to_write+get_labels_string(label_set)+'\n\n\n'


text_to_write=text_to_write+'==========ALL DUPLICATES=========\n'
for text in duplicate_texts:
    text_to_write=text_to_write+text+'\n LABELS:'
    label_set=complete_labels_dict[text];
    text_to_write=text_to_write+get_labels_string(label_set)+'\n\n\n'
    

text_to_write=text_to_write+'==========COMPLETE DATASET AFTER HANDLING DUPLICATES=========\n'
for text in final_texts_list:
    text_to_write=text_to_write+text+'\n LABELS:'
    label_set=complete_labels_dict[text];
    text_to_write=text_to_write+get_labels_string(label_set)+'\n\n\n'
    
# save the duplicate handling files    
text_file = open("Duplicates_handling.txt", "w")
n = text_file.write(text_to_write)
text_file.close()

Statistics for the data that is left.

In [7]:
print("Text length after removing duplicates",len(final_texts_list))
print("Labels length after removing duplicates",len(final_labels_list))

if(len(final_texts_list)!=len(final_labels_list)):
    print('Something is not right check again! The number of labels and data samples is not the same.')

Text length after removing duplicates 7980
Labels length after removing duplicates 7980


In [8]:
def get_labels_letter(label):
    # convert the labels to the corresponding character.
    if label == 0:
        return "n"+"\t"
    elif label ==1:
        return "o"+"\t"
    elif label ==2:
        return "p"+"\t"
    else:
        print("CONTROL SHOULD NEVER COME HERE!")

def write_data_txt(text_list,test=False,prefix=''):
    # method to write the output files that include additional columns to process them using XLNet with the imdb processor
    file_name=""
    if test==True:
        # test folds
        print('For Test data with',prefix,)
        file_name='../data/output/'+prefix+'test.txt'
    else:
        # train data
        print('For Train data with',prefix,)
        file_name='../data/output/'+prefix+'train.txt'
    # count instances
    negative_count = 0
    neutral_count = 0
    positive_count = 0
    text_to_write=''
    for line in text_list:
        # add dummy columns
        text_to_write=text_to_write+"AA"+"\t"+"AA"+"\t"
        label=''
        if len(complete_labels_dict[line])==1:
            label_set=complete_labels_dict[line]
            label=next(iter(label_set))
            #print(label)
            if label == 0:
                negative_count=negative_count+1
            elif label ==1:
                neutral_count=neutral_count+1
            elif label ==2:
                positive_count=positive_count+1
            else:
                print("CONTROL SHOULD NEVER COME HERE! Wrong label detected")           
        else:
            print("ERROR! THIS SHOULD NOT HAPPEN! Wrong number of labels e.g. multi label.")
        text_to_write=text_to_write+get_labels_letter(label)+line+"\n"
    
    # statistics
    print("Number of POSITIVE examples:",positive_count)
    print("Number of NEGATIVE examples:",negative_count)
    print("Number of NEUTRAL examples:",neutral_count)
    
    # save the fold
    text_file = open(file_name, "w")
    n = text_file.write(text_to_write)
    text_file.close()

Complete data computed using cosine similarity.

In [11]:
write_data_txt(final_texts_list,prefix='cosine')

For Train data with cosine
Number of POSITIVE examples: 728
Number of NEGATIVE examples: 253
Number of NEUTRAL examples: 6999


Compute the 10 folds for cross validation.

In [23]:
# split into ten folds.
kf = KFold(n_splits=10)
kf.get_n_splits(final_labels_list)
final_texts_list=np.array(final_texts_list)
print(kf)
count=1
for train_index, test_index in kf.split(final_texts_list):
    # split into trian and test
    X_train, X_test = final_texts_list[train_index], final_texts_list[test_index]
    file_name='Fold_'+str(count)
    count+=1
    # process fold
    write_data_txt(X_train,prefix=file_name)
    write_data_txt(X_test,test=True,prefix=file_name)

KFold(n_splits=10, random_state=None, shuffle=False)
For Train data with Fold_1
Number of POSITIVE examples: 641
Number of NEGATIVE examples: 228
Number of NEUTRAL examples: 6313
For Test data with Fold_1
Number of POSITIVE examples: 87
Number of NEGATIVE examples: 25
Number of NEUTRAL examples: 686


For Train data with Fold_2
Number of POSITIVE examples: 651
Number of NEGATIVE examples: 234
Number of NEUTRAL examples: 6297
For Test data with Fold_2
Number of POSITIVE examples: 77
Number of NEGATIVE examples: 19
Number of NEUTRAL examples: 702


For Train data with Fold_3
Number of POSITIVE examples: 635
Number of NEGATIVE examples: 231
Number of NEUTRAL examples: 6316
For Test data with Fold_3
Number of POSITIVE examples: 93
Number of NEGATIVE examples: 22
Number of NEUTRAL examples: 683


For Train data with Fold_4
Number of POSITIVE examples: 674
Number of NEGATIVE examples: 221
Number of NEUTRAL examples: 6287
For Test data with Fold_4
Number of POSITIVE examples: 54
Number of NEG