In [1]:
import nltk
import pandas as pd
from pandas import DataFrame
import re
import numpy as np

# Data Processing

In [8]:
df = pd.read_csv('../../data/train.tsv', sep='\t')

In [9]:
print(df.head(5))
print(df.count()[0])

Id  EssaySet  Score1  Score2  \
0   1         1       1       1   
1   2         1       1       1   
2   3         1       1       1   
3   4         1       0       0   
4   5         1       2       2   

                                           EssayText  
0  Some additional information that we would need...  
1  After reading the expirement, I realized that ...  
2  What you need is more trials, a control set up...  
3  The student should list what rock is better an...  
4  For the students to be able to make a replicat...  
17207


In [10]:
essay_set_list = (df['EssaySet'].unique())
max_score_list = []
for i in essay_set_list:
    max_score_list.append(df[df['EssaySet']==i]['Score1'].max())
    print('Max score for essay {} is {}'.format(i, max_score_list[i-1]))

Max score for essay 1 is 3
Max score for essay 2 is 3
Max score for essay 3 is 2
Max score for essay 4 is 2
Max score for essay 5 is 3
Max score for essay 6 is 3
Max score for essay 7 is 2
Max score for essay 8 is 2
Max score for essay 9 is 2
Max score for essay 10 is 2


In [11]:
reference = pd.DataFrame()
candidates = pd.DataFrame()

for i in essay_set_list:
    ref = df[(df['EssaySet']==i) & (df['Score1']==max_score_list[i-1])]
    ref_list = [reference, ref]
    reference = pd.concat(ref_list)
    cands = df[(df['EssaySet']==i) & (df['Score1']!=max_score_list[i-1])]
    cand_list = [candidates, cands]
    candidates = pd.concat(cand_list)

In [12]:
candidates.head(5)

Unnamed: 0,Id,EssaySet,Score1,Score2,EssayText
0,1,1,1,1,Some additional information that we would need...
1,2,1,1,1,"After reading the expirement, I realized that ..."
2,3,1,1,1,"What you need is more trials, a control set up..."
3,4,1,0,0,The student should list what rock is better an...
4,5,1,2,2,For the students to be able to make a replicat...


In [13]:
total_ref = reference.count()[0]
total_cand = candidates.count()[0]
print(total_ref, total_cand, total_ref+total_cand)

essay_set_list_ref = (reference['EssaySet'].unique())
essay_set_list_cand = (candidates['EssaySet'].unique())
print(essay_set_list_ref, essay_set_list_cand)

3712 13495 17207
[ 1  2  3  4  5  6  7  8  9 10] [ 1  2  3  4  5  6  7  8  9 10]


In [7]:
def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]','', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

In [15]:
ref = df.loc[(df['Score1']==1) & (df['EssaySet']!=3)]
ref.head(5)
ref.loc[0]['EssaySet']

1

In [16]:
# Genearting the corpus

reference_corpus = []
candidate_corpus = []

for i in essay_set_list:
    ref = reference.loc[reference['EssaySet']==i]
    cand = candidates.loc[candidates['EssaySet']==i]
    
    count_ref = ref.count()[0]
    count_cand = cand.count()[0]
    
    ref_list = []
    cand_list = []
    
    for j in range(count_ref):
        ref_list.append(list(ref.iloc[j]['EssayText'].split()))
    ref_tuple = (i, ref_list)
    reference_corpus.append(ref_tuple)
    
    for j in range(count_cand):
        cand_list.append(list(cand.iloc[j]['EssayText'].split()))
    cand_tuple = (i, cand_list)
    candidate_corpus.append(cand_tuple)

reference_corpus = dict(reference_corpus)
candidate_corpus = dict(candidate_corpus)

In [17]:
reference_corpus = list(reference_corpus.values())
candidate_corpus = list(candidate_corpus.values())

In [18]:
new_reference_corpus = []
new_candidate_corpus = []

for i in essay_set_list:
    ref_list = []
    cand_list = []
    for j in range(len(reference_corpus[i-1])):
        ref_list.append(to_lowercase(remove_punctuation(reference_corpus[i-1][j])))
    for j in range(len(candidate_corpus[i-1])):
        cand_list.append(to_lowercase(remove_punctuation(candidate_corpus[i-1][j])))
    ref_tuple = (i, ref_list)
    cand_tuple = (i, cand_list)
    new_reference_corpus.append(ref_tuple)
    new_candidate_corpus.append(cand_tuple)

new_reference_corpus = dict(new_reference_corpus)
new_candidate_corpus = dict(new_candidate_corpus)

In [19]:
reference_corpus = list(new_reference_corpus.values())
candidate_corpus = list(new_candidate_corpus.values())

# BLEU Implementation

In [2]:
import collections
import math

In [3]:
def get_ngrams(segment, max_order=4):
    ngram_counts = collections.Counter()
    for order in range(1, max_order + 1):
        for i in range(0, len(segment) - order + 1):
            ngram = tuple(segment[i:i+order])
            ngram_counts[ngram] += 1

    return ngram_counts
    
def best_match_length(reference, candidate):
    ref_length_list = []
    for ref in reference:
        ref_length_list.append(len(ref))
    cand_length_list = [len(candidate)]*len(ref_length_list)
    difference = (np.abs(np.asarray(ref_length_list) - np.asarray(cand_length_list)))
    return ref_length_list[np.argmin(difference)]
            

def modified_precision(reference, candidate, order=4):
    candidate_counts = get_ngrams(candidate, order)
    
    max_counts = {}
    
    for ref in reference:
        ref_counts = get_ngrams(ref, order)
        
        for ngrams in candidate_counts:
            max_counts[ngrams] = max(max_counts.get(ngrams, 0), ref_counts[ngrams])
    
    clipped_counts = {
        ngram: min(count, max_counts[ngram]) for ngram, count in candidate_counts.items()
    }
    numer = sum(clipped_counts.values())
    denom = max(1, sum(candidate_counts.values()))
    
    return numer/denom

def BP(r, c):
    if c>r:
        return 1
    elif c == 0:
        return 0
    else:
        return math.exp(1-(r/c))
# print(best_match_length(reference_corpus[0][0], candidate_corpus[0][500]), len(candidate_corpus[0][500]))
# print(modified_precision(reference_corpus[0], candidate_corpus[0][0], 7))

In [4]:
def BLEU(reference, candidate, order=4):
    precision = np.zeros((1, order))
    p_log_sum = 0
    
    no_references = len(reference)
    candidate_length = len(candidate)
    for i in range(order):
        precision[0][i] = modified_precision(reference, candidate, i+1)
    
    r = best_match_length(reference, candidate)
    c = candidate_length
    
    bp = BP(r,c)
    
    weight = 1/order
    
    if (np.min(precision)>0):
        for i in range(order):
            p_log_sum += (weight * math.log(precision[0][i]))
        geo_mean = math.exp(p_log_sum)
    else:
        geo_mean = 0
    
    bleu = bp*geo_mean
    
    return bleu    

In [23]:
print(BLEU(reference_corpus[0], candidate_corpus[0][1000], 4))

0.9190499638456741


In [24]:
ref = []
ref.append(list(reference_corpus[0]))
print(len(ref))
cand = []
cand.append(list(candidate_corpus[0][1000]))

1


In [25]:
nltk.translate.bleu_score.corpus_bleu(ref, cand)

0.8574312041458294

In [26]:
import os
# path = '../../data/WMT18Data/system-outputs/newstest2018/'
# os.listdir(path)
refs = []
cands=[]
path = '../../data/WMT18Data/system-outputs/newstest2018/cs-en/newstest2018.CUNI-Transformer.5560.cs-en'
with open (path,  encoding='utf-8') as f:
    line = f.readline()
    while(line):
        line = line.split()
        line = remove_punctuation(to_lowercase(line))
        newline=[]
        newline.append((line))
        refs.append(newline)
        line = f.readline()
print(len(refs))

path2 = '../../data/WMT18Data/references/newstest2018-csen-ref.en'
with open (path2,  encoding='utf-8') as f:
    line = f.readline()
    while(line):
        line = line.split()
        line = remove_punctuation(to_lowercase(line))
        cands.append(line)
        line = f.readline()
print(len(cands))


2983
2983


In [27]:
for i in range(len(cands)):
    print(BLEU(refs[i], cands[i], 4))

0.5083071544779264
0.6300445187720716
0.3423731420046536
0.26074607611317624
0.16432214649845076
0.42998361161843524
0.08828781887996916
0.30398382447928807
0.608468284054153
0.4125636684615636
0.32316595856342234
0.5139458920479512
0.3472641646291465
0.2830818656902643
0.6621094939730368
0.49582587670602896
0.6226709461963417
0.33993372981186565
0.6938765901437532
0.6767781116542884
0.614457428633834
0.4216064288789155
0.6751429141444918
0.5990401480351641
0.4077590765349244
0.7138957847176474
0.7519623371954001
0.6512347732642607
0.5980765669982732
0.4228063691081178
0.4048925796859826
0.7031286991797443
0.205120024252585
0.8645707301556367
0.5347572026073356
0.46924700641055994
0.9402167052972139
0.4496214833476127
0.6082119331252117
0.41362119020498994
0.5452575656287083
0.5622235684474696
0.722160038719837
0.6177280077628728
1.0
1.0
0.3684475025010143
0.576778917407034
0.5707539894547234
0.39011199988267686
0.40578981533640907
0.47093801401403856
0.5131122288704895
1.0
0.399203976

0.6938765901437532
0.6431190788518708
0.2955622941561161
0.3366128437700189
0.5069985161983269
0.5090608483649872
0.6180325302502597
0.28223741754371784
0.32171863386439614
0.2590983059591702
0.2784311666119889
0.26406472953135074
0.3227752343981492
0.16986511750128763
0.1608392033141991
0.36871942484506776
0.44210155461102674
0.5463862306428988
0.25841624152398335
0.31375417556543367
0.24564325678381407
0.5007208889679271
0.378857532444933
0.3713290491373274
0.40557543269257
0.45414934778763577
0.08507434356707748
0.5777200101855126
0.32231753559379556
0.18439261424940698
0.3234131223423055
0.42348852280747445
0.210991081374606
0.4729432254143207
0.4545664248310859
0.41046017710215116
0.16105651305421365
0.43827750275375416
0.6126497077616663
0.5381636832180277
0.3917196589390866
0.6252459331855557
0.4920094022701888
0.4862807306898919
0.62185696937892
0.32748777383992694
0.42334910422975774
0.7899756945192723
0.6719681789958402
0.13743248009839384
0.6654160150301653
0.583202161653735

0.4262107013226344
0.5957955953285643
0.46921225640504743
0.7405807906661027
0.4506715990631179
0.23645503236482487
0.7931266214883685
0.5144391898338826
0.5494178975272831
0.3033668865762665
0.27618177741751665
0.20781099595546784
0.36415145424411394
0.30900582979324076
0.518938423049053
0.42296885731019174
0.6565921060967573
0.10724314736012695
0.5935334349227444
0.3286646799084113
0.5966969672846244
0.3627723905206598
0.41266825715677186
0.21816833090573948
0.19967826601419558
0.5142401605028263
0.5298765308495281
0.36889397323344053
0.3186602782541458
0.3208964297965531
0.13121946035894874
0.2979714705451885
0.3579705528418393
0.0
0.3904607700856769
0.4189884316277995
0.42348852280747445
0.13743248009839384
0.07257981475824929
0.29759282342490984
0.24004900871820206
0.5639481223782569
0.35317430677152795
0.803154665668484
0.5170143749560994
0.5024732783001399
0.08041960165709958
0.42946351998479326
0.4078085950989948
0.20874365970730743
0.48288776506986375
0.42131046394675825
0.447

0.2418068126014414
0.06250381527944884
0.4379330092538739
0.2313893843364534
0.4106942927288114
0.576778917407034
0.3774155035532907
0.6209080538777205
0.2530618805649334
0.839587623092576
0.34466302266420556
1.0
0.5475857123833053
0.42233495646071106
0.2701725131470283
0.588110678224876
0.39518515321133574
0.2765889213741105
0.4556979652275674
0.39541773433519367
0.4320581033598776
0.0
0.41776054812621966
0.3666062360796705
0.3496254556175599
0.3282571529073745
0.5659119256652702
0.5377817486205954
0.1889535254645286
0.3023751786103529
0.26533461480729753
0.5749780527646212
0.4945582901405199
0.6437353909858463
0.7774027499411065
0.14113991930789777
0.4775803229973401
0.4526862716715723
0.5292031904718658
0.23420658076712422
0.1960984223762515
0.37088694036438924
0.2814330309309699
0.26969010858675907
0.22269294917859717
0.40843987931831716
0.15948028086520064
0.1818958799213559
0.3947189890669352
0.3413131680765469
0.23622484472789615
0.14658444011790964
0.10355626069577599
0.3620331

0.3862198357256381
0.3872267128105133
0.5208084107734716
0.3165023575704978
0.411497768128925
0.45344493800370606
0.521158450438947
0.09786579426502802
0.14113991930789777
0.15723078586799477
0.19674979811155638
0.46383391542463204
0.21326270632927877
0.47515906701896976
0.22593184042564138
0.7305735430836253
0.5082870517488167
0.5728499335076644
0.30369161197198596
0.28259733605017556
0.32324590081212706
0.6869344118746369
0.8225608916472869
0.7438725047551571
0.5235573773970524
0.41412387656655203
0.46070641441067006
0.31627829182680633
0.4843568783926473
0.4011814474115119
0.42563044080810664
0.42677471890544716
0.8385766789076257
0.7016116562610203
0.411497768128925
0.4110433074789492
0.3884233955697982
0.3435441061496734
0.5919745852983377
0.40788270265283133
0.32190076219876473
0.5449424361325766
0.2574866101628968
0.5800367884532975
0.6082909473435885
0.2256523708684302
0.4887049965066992
0.7598356856515925
0.47191632023395513
0.4644298173813905
0.5323968753965502
0.507406664513

0.3446913316729032
1.0
0.29071536848410967
0.24912137744543908
0.8061898627027144
0.6803749333171202
0.49478520421821987
0
0.5634493922873826
0.6788877661016413
0.8038019482772603
0.23050898626566632
0.5249110810825803
0.2701725131470283
0.47053556867151264
0.6059856413067048
0.30008163675779
0.39853282581014765
0.3388714363186176
0.5237230188933742
0.6803749333171202
0.8070557274927982
0.18214876736584293
0.5228695260775044
0.519087769811935
1.0
0.5499346034742949
0.3964580245124456
0.23025981500764872
0.6670186310741493
0.39920397615494657
0.34252823678705807
0.3290203033026989
0.4017025650759059
0.6321270913115443
0.4608043899062472
0.41488865152565224
0.36725621997972874
0.6519665498965485
0.2043239733177272
0.13743248009839384
0.722160038719837
0.1731023926226892
0.44817006480885563
0.39192692850177596
0.538870674147565
0.5512830191543607
0.4775421317844965
0.5396045868350934
0.5648648593857069
0.6280169733958691
0.5420667523630728
0.4957849539570052
0.33313733631697073
0.52538197

0.4520990148005652
0.21448629472025388
1.0
0.8327622163884877
0.1875114458383465
0.5757761022114715
0.32084194299238733
0.754611188661124
0.40444853562971783
0.549015186730003
0.40293516672844226
0.4488789990521489
0.3498761149110956
0.7564885483357829
0.4702724616422749
0.07257981475824929
0.1250076305588977
0.2351296758029079
0.2211999787323959
0.30130404892785684
0.3290203033026989
