In [1]:
import nltk
import pandas as pd
from pandas import DataFrame
import re
import numpy as np

# Data Processing

In [8]:
df = pd.read_csv('../../data/train.tsv', sep='\t')

In [9]:
print(df.head(5))
print(df.count()[0])

Id  EssaySet  Score1  Score2  \
0   1         1       1       1   
1   2         1       1       1   
2   3         1       1       1   
3   4         1       0       0   
4   5         1       2       2   

                                           EssayText  
0  Some additional information that we would need...  
1  After reading the expirement, I realized that ...  
2  What you need is more trials, a control set up...  
3  The student should list what rock is better an...  
4  For the students to be able to make a replicat...  
17207


In [10]:
essay_set_list = (df['EssaySet'].unique())
max_score_list = []
for i in essay_set_list:
    max_score_list.append(df[df['EssaySet']==i]['Score1'].max())
    print('Max score for essay {} is {}'.format(i, max_score_list[i-1]))

Max score for essay 1 is 3
Max score for essay 2 is 3
Max score for essay 3 is 2
Max score for essay 4 is 2
Max score for essay 5 is 3
Max score for essay 6 is 3
Max score for essay 7 is 2
Max score for essay 8 is 2
Max score for essay 9 is 2
Max score for essay 10 is 2


In [11]:
reference = pd.DataFrame()
candidates = pd.DataFrame()

for i in essay_set_list:
    ref = df[(df['EssaySet']==i) & (df['Score1']==max_score_list[i-1])]
    ref_list = [reference, ref]
    reference = pd.concat(ref_list)
    cands = df[(df['EssaySet']==i) & (df['Score1']!=max_score_list[i-1])]
    cand_list = [candidates, cands]
    candidates = pd.concat(cand_list)

In [12]:
candidates.head(5)

Unnamed: 0,Id,EssaySet,Score1,Score2,EssayText
0,1,1,1,1,Some additional information that we would need...
1,2,1,1,1,"After reading the expirement, I realized that ..."
2,3,1,1,1,"What you need is more trials, a control set up..."
3,4,1,0,0,The student should list what rock is better an...
4,5,1,2,2,For the students to be able to make a replicat...


In [13]:
total_ref = reference.count()[0]
total_cand = candidates.count()[0]
print(total_ref, total_cand, total_ref+total_cand)

essay_set_list_ref = (reference['EssaySet'].unique())
essay_set_list_cand = (candidates['EssaySet'].unique())
print(essay_set_list_ref, essay_set_list_cand)

3712 13495 17207
[ 1  2  3  4  5  6  7  8  9 10] [ 1  2  3  4  5  6  7  8  9 10]


In [7]:
def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]','', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

In [15]:
ref = df.loc[(df['Score1']==1) & (df['EssaySet']!=3)]
ref.head(5)
ref.loc[0]['EssaySet']

1

In [16]:
# Genearting the corpus

reference_corpus = []
candidate_corpus = []

for i in essay_set_list:
    ref = reference.loc[reference['EssaySet']==i]
    cand = candidates.loc[candidates['EssaySet']==i]
    
    count_ref = ref.count()[0]
    count_cand = cand.count()[0]
    
    ref_list = []
    cand_list = []
    
    for j in range(count_ref):
        ref_list.append(list(ref.iloc[j]['EssayText'].split()))
    ref_tuple = (i, ref_list)
    reference_corpus.append(ref_tuple)
    
    for j in range(count_cand):
        cand_list.append(list(cand.iloc[j]['EssayText'].split()))
    cand_tuple = (i, cand_list)
    candidate_corpus.append(cand_tuple)

reference_corpus = dict(reference_corpus)
candidate_corpus = dict(candidate_corpus)

In [17]:
reference_corpus = list(reference_corpus.values())
candidate_corpus = list(candidate_corpus.values())

In [18]:
new_reference_corpus = []
new_candidate_corpus = []

for i in essay_set_list:
    ref_list = []
    cand_list = []
    for j in range(len(reference_corpus[i-1])):
        ref_list.append(to_lowercase(remove_punctuation(reference_corpus[i-1][j])))
    for j in range(len(candidate_corpus[i-1])):
        cand_list.append(to_lowercase(remove_punctuation(candidate_corpus[i-1][j])))
    ref_tuple = (i, ref_list)
    cand_tuple = (i, cand_list)
    new_reference_corpus.append(ref_tuple)
    new_candidate_corpus.append(cand_tuple)

new_reference_corpus = dict(new_reference_corpus)
new_candidate_corpus = dict(new_candidate_corpus)

In [19]:
reference_corpus = list(new_reference_corpus.values())
candidate_corpus = list(new_candidate_corpus.values())

# BLEU Implementation

In [2]:
import collections
import math

In [3]:
def get_ngrams(segment, max_order=4):
    ngram_counts = collections.Counter()
    for order in range(1, max_order + 1):
        for i in range(0, len(segment) - order + 1):
            ngram = tuple(segment[i:i+order])
            ngram_counts[ngram] += 1

    return ngram_counts
    
def best_match_length(reference, candidate):
    ref_length_list = []
    for ref in reference:
        ref_length_list.append(len(ref))
    cand_length_list = [len(candidate)]*len(ref_length_list)
    difference = (np.abs(np.asarray(ref_length_list) - np.asarray(cand_length_list)))
    return ref_length_list[np.argmin(difference)]
            

def modified_precision(reference, candidate, order=4):
    candidate_counts = get_ngrams(candidate, order)
    
    max_counts = {}
    
    for ref in reference:
        ref_counts = get_ngrams(ref, order)
        
        for ngrams in candidate_counts:
            max_counts[ngrams] = max(max_counts.get(ngrams, 0), ref_counts[ngrams])
    
    clipped_counts = {
        ngram: min(count, max_counts[ngram]) for ngram, count in candidate_counts.items()
    }
    numer = sum(clipped_counts.values())
    denom = max(1, sum(candidate_counts.values()))
    
    return numer/denom

def BP(r, c):
    if c>r:
        return 1
    elif c == 0:
        return 0
    else:
        return math.exp(1-(r/c))
# print(best_match_length(reference_corpus[0][0], candidate_corpus[0][500]), len(candidate_corpus[0][500]))
# print(modified_precision(reference_corpus[0], candidate_corpus[0][0], 7))

In [4]:
def BLEU(reference, candidate, order=4):
    precision = np.zeros((1, order))
    p_log_sum = 0
    
    no_references = len(reference)
    candidate_length = len(candidate)
    for i in range(order):
        precision[0][i] = modified_precision(reference, candidate, i+1)
    
    r = best_match_length(reference, candidate)
    c = candidate_length
    
    bp = BP(r,c)
    
    weight = 1/order
    
    if (np.min(precision)>0):
        for i in range(order):
            p_log_sum += (weight * math.log(precision[0][i]))
        geo_mean = math.exp(p_log_sum)
    else:
        geo_mean = 0
    
    bleu = bp*geo_mean
    
    return bleu    

In [23]:
print(BLEU(reference_corpus[0], candidate_corpus[0][1000], 4))

0.9190499638456741


In [24]:
ref = []
ref.append(list(reference_corpus[0]))
print(len(ref))
cand = []
cand.append(list(candidate_corpus[0][1000]))

1


In [25]:
nltk.translate.bleu_score.corpus_bleu(ref, cand)

0.8574312041458294

In [14]:
import os
# path = '../../data/WMT18Data/system-outputs/newstest2018/'
# os.listdir(path)
lst = []
path = '../../data/WMT18Data/system-outputs/newstest2018/cs-en/newstest2018.CUNI-Transformer.5560.cs-en'
with open (path,  encoding='utf-8') as f:
    line = f.readline()
    while(line):
        line = line.split()
        line = remove_punctuation(to_lowercase(line))
        lst.append(line)
        line = f.readline()
print((lst))

