### Imports

In [None]:
import pandas as pd
import scipy.stats as stats
from tqdm import tqdm
import textstat
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import sent_tokenize, TweetTokenizer
import numpy as np

### Readability Functions

In [34]:
# Word lists already read from disk, keyed by file path.  The wrappers below
# are called once per text, so re-reading the same file on every call is
# wasted I/O; each list is loaded on first use and reused afterwards.
_EASY_WORDS_CACHE = {}


def _load_easy_words(path):
    """Return the stripped lines of *path* as a frozenset, reading it only once."""
    if path not in _EASY_WORDS_CACHE:
        with open(path) as f:
            _EASY_WORDS_CACHE[path] = frozenset(line.strip() for line in f)
    return _EASY_WORDS_CACHE[path]


def spache_allen_readability_formula(text):
    """Score *text* with ``spache`` using the word list in ``spache_allen.txt``."""
    return spache(text, _load_easy_words("spache_allen.txt"))


def spache_extended_readability_formula(text):
    """Score *text* with ``spache`` using the word list in ``spache_sven.txt``."""
    return spache(text, _load_easy_words("spache_sven.txt"))


def spache_readability_formula(text):
    """Score *text* with ``spache`` using the word list in ``spache_easy.txt``."""
    return spache(text, _load_easy_words("spache_easy.txt"))

def spache(text, easy_words):
    """Compute a Spache-style readability score for *text*.

    The text is tokenized with NLTK's ``TweetTokenizer`` and only alphanumeric
    tokens are counted as words.  A word counts as difficult when its
    lowercased Porter stem is not a member of *easy_words*.

    Returns -1 when *text* contains no alphanumeric token; otherwise
    ``0.141 * words_per_sentence + 0.086 * percent_difficult + 0.839``.
    """
    words = [tok for tok in TweetTokenizer().tokenize(text) if tok.isalnum()]
    if not words:
        # Sentinel for "nothing to score".
        return -1

    word_count = len(words)
    sentence_count = len(sent_tokenize(text))

    porter = PorterStemmer()
    difficult_count = sum(
        porter.stem(word.lower()) not in easy_words for word in words
    )

    words_per_sentence = word_count / sentence_count
    difficult_pct = difficult_count / word_count * 100
    return (0.141 * words_per_sentence) + (0.086 * difficult_pct) + 0.839

In [35]:
# Ascending lower bounds of the grade bands for the LIX, RIX and Dale-Chall
# scores; every bound a score reaches moves it up one grade level.
_LIX_GRADE_BOUNDS = (10, 15, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56)
_RIX_GRADE_BOUNDS = (0.2, 0.5, 0.8, 1.3, 1.8, 2.4, 3, 3.7, 4.5, 5.3, 6.2, 7.2)
_DALE_GRADE_BOUNDS = (3, 3.5, 4, 4.5, 5, 5.5, 6, 6.5, 7, 7.5, 8, 8.5, 9)


def _bands_reached(score, bounds):
    """Count how many lower bounds in *bounds* are less than or equal to *score*."""
    return sum(1 for bound in bounds if score >= bound)


def _clamp_grade(score):
    """Map scores <= 0 to 0 and scores > 12 to 13; leave other scores unchanged."""
    if score <= 0:
        return 0
    if score > 12:
        return 13
    return score


def _cap_grade(score):
    """Map scores > 12 to 13 and leave everything else (including -1) unchanged."""
    return 13 if score > 12 else score


def get_readabilities(text):
    """Grade every string in *text* with ten readability formulas.

    Parameters
    ----------
    text : iterable of str
        The texts to score; iteration is wrapped in a ``tqdm`` progress bar.

    Returns
    -------
    list of dict
        One dict per text with the keys ``Kincaid``, ``ColemanLiau``,
        ``GunningFogIndex``, ``LIX``, ``SMOGIndex``, ``RIX``,
        ``DaleChallIndex``, ``Spache``, ``SpacheExpanded`` and
        ``Spache-Allen``.  Kincaid, Coleman-Liau, Gunning Fog and SMOG are
        clamped (<= 0 becomes 0, > 12 becomes 13).  LIX, RIX and Dale-Chall
        are converted to grade bands.  The Spache variants are only capped at
        13, so the -1 that ``spache`` returns for unscorable text is kept.
    """
    results_list = []
    for x in tqdm(text):
        kincaid = _clamp_grade(textstat.flesch_kincaid_grade(x))
        coleman = _clamp_grade(textstat.coleman_liau_index(x))
        fog = _clamp_grade(textstat.gunning_fog(x))
        smog = _clamp_grade(textstat.smog_index(x))

        # LIX: <= 5 is grade 0, (5, 10) is grade 1, then one grade per bound.
        # (The old elif ladder misspelled the grade-2 variable as `graelix`,
        # so scores in [10, 15) raised NameError or reused a stale grade.)
        lix = textstat.lix(x)
        if lix <= 5:
            gradelix = 0
        else:
            gradelix = 1 + _bands_reached(lix, _LIX_GRADE_BOUNDS)

        # RIX: anything below 0.2 is grade 1.
        graderix = 1 + _bands_reached(textstat.rix(x), _RIX_GRADE_BOUNDS)

        # Dale-Chall: anything below 3 is grade 0.
        gradedale = _bands_reached(
            textstat.dale_chall_readability_score_v2(x), _DALE_GRADE_BOUNDS
        )

        spache_og = _cap_grade(spache_readability_formula(x))
        spache_allen = _cap_grade(spache_allen_readability_formula(x))
        spache_ex = _cap_grade(spache_extended_readability_formula(x))

        # Key order is kept so the resulting DataFrame columns are unchanged.
        results_list.append({
            "Kincaid": kincaid,
            "ColemanLiau": coleman,
            "GunningFogIndex": fog,
            "LIX": gradelix,
            "SMOGIndex": smog,
            "RIX": graderix,
            "DaleChallIndex": gradedale,
            "Spache": spache_og,
            "SpacheExpanded": spache_ex,
            "Spache-Allen": spache_allen,
        })

    return results_list

### Helper Functions

In [36]:
def get_mer_s1s3(q, queries):
    """Build a per-text table of readability grades and their absolute errors.

    Parameters
    ----------
    q : list of str
        Texts to score, in the same row order as *queries*.
    queries : pandas.DataFrame
        Frame with a ``'grade'`` column holding the reference grade per text.

    Returns
    -------
    pandas.DataFrame
        One row per text that could be scored (rows whose ``'Spache'`` value
        is -1 are dropped), with the formula grades, ``'query'``, ``'grade'``
        and one ``*_err`` column per formula holding ``|grade - prediction|``.
    """
    formulas_queries = pd.DataFrame(get_readabilities(q))
    formulas_queries['query'] = q
    # `q` is a plain list, so rows are matched to grades by position, not by
    # index label; this stays correct even if `queries` has a non-default index.
    formulas_queries['grade'] = queries['grade'].to_numpy()

    # .copy() makes the filtered frame independent, so adding the error
    # columns below does not trigger pandas' SettingWithCopyWarning.
    formulas_queries = formulas_queries[formulas_queries['Spache'] != -1].copy()

    # Error column -> formula column, in the original column order.
    error_columns = {
        'fkg_err': 'Kincaid',
        'cl_err': 'ColemanLiau',
        'gf_err': 'GunningFogIndex',
        'lix_err': 'LIX',
        'smog_err': 'SMOGIndex',
        'rix_err': 'RIX',
        'dc_err': 'DaleChallIndex',
        's_err': 'Spache',
        'se_err': 'SpacheExpanded',
        'sa_err': 'Spache-Allen',
    }
    for err_col, score_col in error_columns.items():
        formulas_queries[err_col] = (
            formulas_queries['grade'] - formulas_queries[score_col]
        ).abs()

    return formulas_queries

In [37]:
def print_mer_s1s3(formulas_queries):
    """Print the mean of every ``*_err`` column, rounded to two decimals."""
    report = (
        ('FKG Error: ', 'fkg_err'),
        ('DC Error: ', 'dc_err'),
        ('SMOG Error: ', 'smog_err'),
        ('GF Error: ', 'gf_err'),
        ('LIX Error: ', 'lix_err'),
        ('RIX Error: ', 'rix_err'),
        ('CL Error: ', 'cl_err'),
        ('S Error: ', 's_err'),
        ('SE Error: ', 'se_err'),
        ('SA Error: ', 'sa_err'),
    )
    for label, column in report:
        mean_error = formulas_queries[column].mean()
        print(label, round(mean_error, 2))

In [38]:
def print_sig_s1s3(formulas_queries):
    """Print paired t-test p-values of ``fkg_err`` against every other error column.

    Each line shows the p-value (element 1 of ``scipy.stats.ttest_rel``'s
    result) for the Flesch-Kincaid errors paired with one other formula's
    errors.
    """
    baseline = np.array(formulas_queries['fkg_err'])
    comparisons = (
        ('FK / DC Significance: pvalue = ', 'dc_err'),
        ('FK / SMOG Significance: pvalue = ', 'smog_err'),
        ('FK / GF Significance: pvalue = ', 'gf_err'),
        ('FK / LIX Significance: pvalue = ', 'lix_err'),
        ('FK / RIX Significance: pvalue = ', 'rix_err'),
        ('FK / CL Significance: pvalue = ', 'cl_err'),
        ('FK / S Significance: pvalue = ', 's_err'),
        ('FK / SE Significance: pvalue = ', 'se_err'),
        ('FK / SA Significance: pvalue = ', 'sa_err'),
    )
    for label, column in comparisons:
        other = np.array(formulas_queries[column])
        print(label, stats.ttest_rel(baseline, other)[1])

### Scenario 1

In [42]:
# Scenario 1 input: a table with a 'query' text column (a 'grade' column is
# read from it later by get_mer_s1s3).
queries = pd.read_csv('s1_queries.csv')
q = queries['query'].tolist()

In [43]:
# Grade every Scenario 1 query with each formula and attach the absolute
# error of each formula against the query's 'grade'.
mer_s1 = get_mer_s1s3(q, queries)

100%|██████████| 168/168 [00:08<00:00, 19.70it/s]


In [44]:
# Mean absolute error per formula for Scenario 1.
print_mer_s1s3(mer_s1)

FKG Error:  3.72
DC Error:  5.94
SMOG Error:  4.68
GF Error:  4.37
LIX Error:  4.49
RIX Error:  2.1
CL Error:  3.84
S Error:  2.74
SE Error:  2.3
SA Error:  2.34


In [45]:
# Paired t-test p-values: Flesch-Kincaid errors vs. each other formula's errors.
print_sig_s1s3(mer_s1)

FK / DC Significance: pvalue =  9.898694991043038e-16
FK / SMOG Significance: pvalue =  1.4157834876396462e-05
FK / GF Significance: pvalue =  0.0011807224103815212
FK / LIX Significance: pvalue =  0.00020119067244079428
FK / RIX Significance: pvalue =  1.2615723720299363e-12
FK / CL Significance: pvalue =  0.6042045233376176
FK / S Significance: pvalue =  1.37412321228267e-07
FK / SE Significance: pvalue =  4.844324749805791e-13
FK / SA Significance: pvalue =  1.894323110372182e-12


### Scenario 2

In [None]:
# Scenario 2 result tables, one CSV per configuration.  The summary cell
# labels them "Original", "DC/S", "SMOG/S", ..., "FK/SA"; presumably the
# file-name prefix encodes which formulas were combined -- TODO confirm.
data_original = pd.read_csv('fk_se_requik_results.csv')
data_dc = pd.read_csv('dc_se_requik_results.csv')
data_smog = pd.read_csv('smog_se_requik_results.csv')
data_gf = pd.read_csv('gf_se_requik_results.csv')
data_lix = pd.read_csv('lix_se_requik_results.csv')
data_rix = pd.read_csv('rix_se_requik_results.csv')
data_cl = pd.read_csv('cl_se_requik_results.csv')
data_s = pd.read_csv('s_se_requik_results.csv')
data_sa = pd.read_csv('sa_se_requik_results.csv')

data_fk_sa = pd.read_csv('fk_sa_requik_results.csv')

In [None]:
# Distinct values of the baseline table's 'query' column.
# NOTE: this rebinds `queries`, which previously held the Scenario 1 DataFrame.
queries = list(data_original['query'].unique())

In [None]:
def get_mrr_s2(data, queries):
    """Collect one reciprocal-rank value per query in *queries*.

    For each query, the rows of *data* whose 'query' column equals it are
    selected, and ``1 / rank`` is appended to the returned list.

    NOTE(review): ``q_data.rank(q)`` looks like a bug.  The first positional
    parameter of ``DataFrame.rank`` is ``axis``, so the query string is passed
    as an axis name; this is expected to raise for ordinary query text, and
    would otherwise yield a DataFrame rather than a single rank.  The intended
    value is presumably the position of the relevant result for the query
    (compare the Scenario 4 loop, which looks up the row where 'rank' == 1),
    but the column to read is not visible here -- confirm against the CSV
    schema before relying on this function.
    """
    s2_mrr = []
    for q in queries:
        # Rows belonging to the current query.
        q_data = data[data['query'] == q]
        rank = q_data.rank(q)
        s2_mrr.append(1/rank)
    return s2_mrr

In [None]:
# Per-query reciprocal ranks for every Scenario 2 result table, all evaluated
# on the query list taken from the baseline table.
og_mrr = get_mrr_s2(data_original, queries)
dc_mrr = get_mrr_s2(data_dc, queries)
smog_mrr = get_mrr_s2(data_smog, queries)
gf_mrr = get_mrr_s2(data_gf, queries)
lix_mrr = get_mrr_s2(data_lix, queries)
rix_mrr = get_mrr_s2(data_rix, queries)
cl_mrr = get_mrr_s2(data_cl, queries)
s_mrr = get_mrr_s2(data_s, queries)
sa_mrr = get_mrr_s2(data_sa, queries)
fk_sa_mrr = get_mrr_s2(data_fk_sa, queries)

In [None]:
# Mean reciprocal rank per configuration, rounded to two decimals.
for label, values in (
    ('Original MRR: ', og_mrr),
    ('DC/S MRR: ', dc_mrr),
    ('SMOG/S MRR: ', smog_mrr),
    ('GF/S MRR: ', gf_mrr),
    ('LIX/S MRR: ', lix_mrr),
    ('RIX/S MRR: ', rix_mrr),
    ('CL/S MRR: ', cl_mrr),
    ('S/S MRR: ', s_mrr),
    ('SA/S MRR: ', sa_mrr),
    ('FK/SA MRR: ', fk_sa_mrr),
):
    print(label, round(sum(values) / len(values), 2))

In [None]:
# Paired t-tests of the baseline reciprocal ranks against each alternative.
og_array = np.array(og_mrr)
for label, values in (
    ('OG / DC Significance: ', dc_mrr),
    ('OG / SMOG Significance: ', smog_mrr),
    ('OG / GF Significance: ', gf_mrr),
    ('OG / LIX Significance: ', lix_mrr),
    ('OG / RIX Significance: ', rix_mrr),
    ('OG / CL Significance: ', cl_mrr),
    ('OG / S Significance: ', s_mrr),
    ('OG / SA Significance: ', sa_mrr),
    ('OG / FK/SA Significance: ', fk_sa_mrr),
):
    print(label, stats.ttest_rel(og_array, np.array(values)))

### Scenario 3

In [None]:
# Scenario 3 input: load the snippets, replace the stored index with a fresh
# 0..n-1 index, and pull the snippet texts out as a list.
snippets = pd.read_csv('s3_snippets.csv', index_col=0).reset_index(drop=True)
s = snippets['snippet'].tolist()

In [None]:
# Grade every Scenario 3 snippet with each formula and attach the absolute
# error of each formula against the snippet's 'grade'.
mer_s3 = get_mer_s1s3(s, snippets)

In [None]:
# Mean absolute error per formula for Scenario 3.
print_mer_s1s3(mer_s3)

In [None]:
# Paired t-test p-values: Flesch-Kincaid errors vs. each other formula's errors.
print_sig_s1s3(mer_s3)

### Scenario 4

In [None]:
# Scenario 4 input; the first CSV column is used as the index.  The cells
# below read its 'qid', 'rank' and '<formula>_rank_score' columns.
data_s4 = pd.read_csv('data4korsce.csv', index_col=0)

In [None]:
# Uncomment for Scenario 4 (K-4)

# data_s4 = data_s4[data_s4['grade'] <= 4]
# data_s4.reset_index(drop=True, inplace=True)

In [None]:
# Column prefixes of the '<prefix>_rank_score' columns, one per formula.
formula_names = ('fkg', 'cl', 'gf', 'lix', 'smog', 'rix', 'dc', 's', 'se', 'sa')
reciprocal_ranks = {name: [] for name in formula_names}

# `query` is not assigned anywhere in the visible notebook, so the loop below
# used to fail with NameError.  Evaluate every distinct query id in the data.
query = data_s4['qid'].unique()

for x in tqdm(query):
    temp = data_s4[data_s4['qid'] == x]

    for name in formula_names:
        # Order this query's rows by the formula's score, best first, and
        # renumber them so the index is the 0-based position in that ordering.
        ordered = temp.sort_values(by=[name + '_rank_score'], ascending=False)
        ordered.reset_index(inplace=True, drop=True)

        # 1-based position of the first row whose 'rank' is 1.  Raises
        # IndexError if the query has no such row, as before.
        position = ordered[ordered['rank'] == 1].index[0] + 1
        reciprocal_ranks[name].append(1 / position)

# Keep the per-formula list names that the following cells read.
mrr_fkg = reciprocal_ranks['fkg']
mrr_cl = reciprocal_ranks['cl']
mrr_gf = reciprocal_ranks['gf']
mrr_lix = reciprocal_ranks['lix']
mrr_smog = reciprocal_ranks['smog']
mrr_rix = reciprocal_ranks['rix']
mrr_dc = reciprocal_ranks['dc']
mrr_s = reciprocal_ranks['s']
mrr_se = reciprocal_ranks['se']
mrr_sa = reciprocal_ranks['sa']

In [None]:
# Mean reciprocal rank per formula, rounded to two decimals.
for label, values in (
    ('Original MRR: ', mrr_fkg),
    ('DC MRR: ', mrr_dc),
    ('Smog MRR: ', mrr_smog),
    ('GF MRR: ', mrr_gf),
    ('LIX MRR: ', mrr_lix),
    ('RIX MRR: ', mrr_rix),
    ('CL MRR: ', mrr_cl),
    ('Spache MRR: ', mrr_s),
    ('Spache-E MRR: ', mrr_se),
    ('Spache-Allen MRR: ', mrr_sa),
):
    print(label, round(sum(values) / len(values), 2))

In [None]:
# Paired t-tests of the Flesch-Kincaid reciprocal ranks against each other formula.
fkg_array = np.array(mrr_fkg)
for label, values in (
    ('OG / DC Significance: ', mrr_dc),
    ('OG / SMOG Significance: ', mrr_smog),
    ('OG / GF Significance: ', mrr_gf),
    ('OG / LIX Significance: ', mrr_lix),
    ('OG / RIX Significance: ', mrr_rix),
    ('OG / CL Significance: ', mrr_cl),
    ('OG / S Significance: ', mrr_s),
    ('OG / SE Significance: ', mrr_se),
    ('OG / SA Significance: ', mrr_sa),
):
    print(label, stats.ttest_rel(fkg_array, np.array(values)))