In [1]:
from scipy.stats import pearsonr, spearmanr
import numpy as np
import json
from sklearn.metrics import auc
from nltk.tokenize import word_tokenize
from rouge import Rouge
import os
import json

In [2]:
def loadResultsFile(resultsFilepath):
    """Read the JSON results file at ``resultsFilepath`` and return the parsed object."""
    with open(resultsFilepath, 'r') as resultsFile:
        return json.load(resultsFile)

In [3]:
def getXYvalues(allData, metrics, minimumTotalLength):
    """Collect the per-session curves of summary length (X) versus metric score (Y).

    Sessions whose last recorded 'summary_len' is below ``minimumTotalLength``
    are skipped entirely.

    Returns a tuple ``(xAll, yAllRecall, yAllF1)``:
      * xAll:       topic -> workerId -> [summary lengths]
      * yAllRecall: metric -> topic -> workerId -> [recall values]
      * yAllF1:     metric -> topic -> workerId -> [f1 values]
    A value of -1.0 is stored wherever a metric (or its 'f1' entry) is missing
    from an iteration's results.
    """
    xAll = {}
    yAllRecall = {metricName: {} for metricName in metrics}
    yAllF1 = {metricName: {} for metricName in metrics}

    for topicId, topicSessions in allData.items():
        for sessionInfo in topicSessions:
            iterations = sessionInfo['scores']
            # skip sessions that never reached the required total length:
            if iterations[-1]['summary_len'] < minimumTotalLength:
                continue

            workerId = sessionInfo['id']
            summaryLengths = []
            recallCurves = {metricName: [] for metricName in metrics}
            f1Curves = {metricName: [] for metricName in metrics}

            for step in iterations:
                summaryLengths.append(step['summary_len'])
                for metricName in metrics:
                    if metricName not in step['results']:
                        # metric not computed at this step: placeholder for both curves
                        recallCurves[metricName].append(-1.0)
                        f1Curves[metricName].append(-1.0)
                        continue
                    metricScores = step['results'][metricName]
                    recallCurves[metricName].append(metricScores['recall'])
                    f1Curves[metricName].append(metricScores['f1'] if 'f1' in metricScores else -1.0)

            xAll.setdefault(topicId, {})[workerId] = summaryLengths
            for metricName in metrics:
                yAllRecall[metricName].setdefault(topicId, {})[workerId] = recallCurves[metricName]
                yAllF1[metricName].setdefault(topicId, {})[workerId] = f1Curves[metricName]

    return xAll, yAllRecall, yAllF1

def getIntersectingXYvalues(xAll, yAll):
    """Restrict every curve to the X range shared by all systems.

    Parameters:
        xAll: topic -> systemName -> [x values]
        yAll: metric -> topic -> systemName -> [y values]

    Returns ``(xAllNew, yAllNew, maxminX, minmaxX)`` where ``maxminX`` is the
    largest first-X over all systems of all topics (start of the shared range)
    and ``minmaxX`` is the smallest last-X (end of the shared range). Curves are
    clipped to ``[maxminX, minmaxX]``; Y values at the two boundaries are
    linearly interpolated when a boundary falls between two recorded points.

    A system with no point inside the shared range (e.g. when the ranges do not
    intersect at all) gets an empty X list and no entry in ``yAllNew``.
    """
    # minmaxX: the lowest of the maximum X values over all systems
    # maxminX: the highest of the minimum X values over all systems
    minmaxX = 9999999
    maxminX = -1
    for topic in xAll:
        for systemName in xAll[topic]:
            maxXOfSystem = xAll[topic][systemName][-1]  # the maximum X value of the system
            minXOfSystem = xAll[topic][systemName][0]  # the minimum X value of the system
            if maxXOfSystem < minmaxX:
                minmaxX = maxXOfSystem
            if minXOfSystem > maxminX:
                maxminX = minXOfSystem

    # create the new X and Y value lists only looking from the maxminX up to the minmaxX:
    xAllNew = {}
    yAllNew = {}
    for topic in xAll:
        xAllNew[topic] = {}
        for systemName in xAll[topic]:
            xVals = xAll[topic][systemName]
            lastIdx = len(xVals) - 1
            xAllNew[topic][systemName] = []
            for xIdx, x in enumerate(xVals):

                # the current x is the one right before the maxmin: interpolate the y value at the maxmin.
                # (the bounds check avoids an IndexError on the last point of a curve that ends before maxminX)
                if x < maxminX and xIdx < lastIdx and xVals[xIdx + 1] > maxminX:
                    xAllNew[topic][systemName].append(maxminX)  # start of intersecting region
                    for metric in yAll:
                        newYval = getInterpolatedYval(xVals, yAll[metric][topic][systemName], xIdx, maxminX)
                        yAllNew.setdefault(metric, {}).setdefault(topic, {}).setdefault(systemName, []).append(newYval)

                # the current x is the one right after the minmax: interpolate the y value at the minmax.
                # (xIdx > 0 prevents index -1 from silently wrapping around to the last point)
                elif x > minmaxX and xIdx > 0 and xVals[xIdx - 1] < minmaxX:
                    xAllNew[topic][systemName].append(minmaxX)  # end of intersecting region
                    for metric in yAll:
                        newYval = getInterpolatedYval(xVals, yAll[metric][topic][systemName], xIdx - 1, minmaxX)
                        yAllNew.setdefault(metric, {}).setdefault(topic, {}).setdefault(systemName, []).append(newYval)

                # the current x is the exact maxmin or minmax or between the two: keep the original values as is
                elif maxminX <= x <= minmaxX:
                    xAllNew[topic][systemName].append(x)
                    for metric in yAll:
                        yAllNew.setdefault(metric, {}).setdefault(topic, {}).setdefault(systemName, []).append(
                            yAll[metric][topic][systemName][xIdx])

    return xAllNew, yAllNew, maxminX, minmaxX

def getInterpolatedYval(xList, yList, idxStart, neededXval):
    """Linearly interpolate the y value at ``neededXval`` on the segment
    spanning indices ``idxStart`` and ``idxStart + 1`` of the curve."""
    segment = slice(idxStart, idxStart + 2)
    return np.interp(neededXval, xList[segment], yList[segment])

def getInterpolatedXval(xList, yList, idxStart, neededYval):
    """Linearly interpolate the x value at which the curve reaches ``neededYval``
    on the segment spanning indices ``idxStart`` and ``idxStart + 1``.

    The roles of the axes are swapped: the y values act as the interpolation
    grid and the x values as the quantity being interpolated.
    """
    segment = slice(idxStart, idxStart + 2)
    return np.interp(neededYval, yList[segment], xList[segment])

def getAUCvalues(xAll, yAll):
    """Compute the area under each (x, y) curve.

    Parameters:
        xAll: topic -> systemName -> [x values]
        yAll: metric -> topic -> systemName -> [y values]

    Returns: metric -> topic -> systemName -> AUC value (as returned by ``auc``).
    """
    aucAll = {}
    for metric, curvesByTopic in yAll.items():
        for topic, curvesBySystem in curvesByTopic.items():
            for systemName, yVals in curvesBySystem.items():
                areaUnderCurve = auc(xAll[topic][systemName], yVals)
                aucAll.setdefault(metric, {}).setdefault(topic, {})[systemName] = areaUnderCurve
    return aucAll

def getXatYlimits(xAll, yAll, metricLimits):
    """Find the X values (word counts) at which each curve first reaches its Y limit (score).

    Parameters:
        xAll:         topic -> sysName -> [x values]
        yAll:         metric -> topic -> sysName -> [y values]
        metricLimits: metric -> score limit to look for

    Returns: metric -> topic -> sysName -> {'xVal', 'xRelative'} where 'xVal' is
    the (possibly interpolated) x at which the limit is reached, and 'xRelative'
    is the position of that x within the curve's x range as a fraction. Both
    are None when the limit is never reached; 'xRelative' is also None when the
    curve's x range has zero width, since the relative position is undefined.
    """
    xAllAtMetricLimits = {}  # metric -> topic -> sysName -> {'xVal', 'xRelative'}
    for metric in yAll:
        curMetricLimit = metricLimits[metric]
        for topic in yAll[metric]:
            for sysName in yAll[metric][topic]:
                yVals = yAll[metric][topic][sysName]
                xValAtLimit = None
                xValRelativeInRange = None
                maxYidx = len(yVals) - 1
                # find the X value for the Y value needed:
                for yValIdx, yVal in enumerate(yVals):
                    # the current Y value is the limit, so just take the X value:
                    if yVal == curMetricLimit:
                        xValAtLimit = xAll[topic][sysName][yValIdx]
                        break
                    # the current Y value is smaller than the limit, and the next one is larger, so interpolate:
                    elif yValIdx < maxYidx and yVal < curMetricLimit and yVals[yValIdx + 1] > curMetricLimit:
                        xValAtLimit = getInterpolatedXval(xAll[topic][sysName], yVals, yValIdx, curMetricLimit)
                        break

                # if the X value was found, find the relative placement in the X range:
                if xValAtLimit is not None:
                    xVals = xAll[topic][sysName]
                    xRangeWidth = xVals[-1] - xVals[0]
                    # a single-point (or constant-x) curve has no width; avoid dividing by zero
                    if xRangeWidth != 0:
                        xValRelativeInRange = (xValAtLimit - xVals[0]) / xRangeWidth
                xAllAtMetricLimits.setdefault(metric, {}).setdefault(topic, {})[sysName] = {
                    'xVal': xValAtLimit, 'xRelative': xValRelativeInRange}

    return xAllAtMetricLimits

def getYatXlimits(xAll, yAll, xLimit):
    """Find the Y value (score) of every curve at the X position ``xLimit`` (word count).

    Parameters:
        xAll:   topic -> sysName -> [x values]
        yAll:   metric -> topic -> sysName -> [y values]
        xLimit: the x position to read the curves at

    Returns: metric -> topic -> sysName -> y value at ``xLimit`` (interpolated
    when ``xLimit`` falls strictly between two recorded points), or None when
    ``xLimit`` is not covered by the curve.
    """
    yAllAtWordLimits = {}
    for metric, curvesByTopic in yAll.items():
        for topic, curvesBySystem in curvesByTopic.items():
            for sysName, yVals in curvesBySystem.items():
                xVals = xAll[topic][sysName]
                finalIdx = len(xVals) - 1
                yAtLimit = None
                for idx, xVal in enumerate(xVals):
                    if xVal == xLimit:
                        # exact hit: take the recorded Y value
                        yAtLimit = yVals[idx]
                        break
                    if idx < finalIdx and xVal < xLimit and xVals[idx + 1] > xLimit:
                        # the limit lies strictly inside this segment: interpolate
                        yAtLimit = getInterpolatedYval(xVals, yVals, idx, xLimit)
                        break

                yAllAtWordLimits.setdefault(metric, {}).setdefault(topic, {})[sysName] = yAtLimit

    return yAllAtWordLimits

In [4]:
def getIterationIndicesWithinRange(sessionInfo, startSummLen, endSummLen):
    """Return ``(firstIdx, lastIdx)``: the indices into ``sessionInfo['scores']``
    delimiting the iterations whose 'summary_len' is at least ``startSummLen``
    and at most ``endSummLen``.

    The defaults are the first and last iteration. Scanning stops at the first
    iteration that exceeds ``endSummLen``.
    """
    iterations = sessionInfo['scores']
    firstIdx, lastIdx = 0, len(iterations) - 1

    for idx, iterationInfo in enumerate(iterations):
        currentLen = iterationInfo['summary_len']
        if currentLen < startSummLen:
            # still too short: the range can start at the next iteration at the earliest
            firstIdx = idx + 1
        elif currentLen > endSummLen:
            # first iteration past the range: the previous one is the last inside it
            lastIdx = idx - 1
            break

    return firstIdx, lastIdx

def getScoreDistanceFromAverage(scoresDict, relevantKey):
    """Return how far the score stored under ``relevantKey`` lies from the
    mean of all values in ``scoresDict`` (positive means above the mean)."""
    meanOfAllScores = np.mean([score for score in scoresDict.values()])
    return scoresDict[relevantKey] - meanOfAllScores

In [21]:
def getScoresPerSession(allData):
    """Compute one dictionary of session-level scores for every usable session.

    Parameters:
        allData: topicId -> list of session dicts, as loaded from a results JSON file.

    Returns: '<topicId>_<workerId>' -> {scoreName: value}. Only sessions kept by
    ``getXYvalues`` (final summary length of at least 250 words) are included.

    Side effects: prints the shared length limits and the number of sessions,
    and reads the reference summaries from '../data/DUC2006Clean'.
    """
    metrics = ['R1', 'R2', 'RL', 'RSU', 'litepyramid']
    # X axis is the word count, and Y axis is the ROUGE score
    # (yAll holds the recall scores, yAllF1 the F1 scores):
    xAll, yAll, yAllF1 = getXYvalues(allData, metrics, 250)
    # the X,Y values that the different sessions intersect on for all metrics/topics:
    xAllIntersecting, yAllIntersecting, lowerBoundX, upperBoundX = getIntersectingXYvalues(xAll, yAll)
    print('Limits: {} to {}'.format(lowerBoundX, upperBoundX))
    # the AUCs of the intersecting regions for all metrics/topics/sessions:
    aucAllIntersecting = getAUCvalues(xAllIntersecting, yAllIntersecting)
    # the F1 scores at the 250-word mark for all metrics/topics/sessions:
    yAllAtWordLimits = getYatXlimits(xAll, yAllF1, 250)
    refSummsByTopic = getReferenceSummaries('../data/DUC2006Clean')

    initialSummaryScoreNames = (
        'initialSummaryR1rec', 'initialSummaryR1prec', 'initialSummaryR1f1',
        'initialSummaryR2rec', 'initialSummaryR2prec', 'initialSummaryR2f1',
        'initialSummaryRLrec', 'initialSummaryRLprec', 'initialSummaryRLf1')

    scoresPerSession = {}
    for topicId in allData:
        # a topic is absent from xAll when none of its sessions passed the length
        # filter; treat that as "no kept sessions" instead of raising KeyError
        keptWorkersOfTopic = xAll.get(topicId, {})
        for sessionInfo in allData[topicId]:
            workerId = sessionInfo['id']
            if workerId not in keptWorkersOfTopic:
                continue

            sessionName = '{}_{}'.format(topicId, workerId)
            sessionScores = {}
            iterations = sessionInfo['scores']

            # last index of the iterations within the shared summary length range:
            _, lastIterationIdxWithinRange = getIterationIndicesWithinRange(sessionInfo, lowerBoundX, upperBoundX)

            # UMUX-lite 1 score, UMUX-lite 2 score, SUS score:
            questionnaire = sessionInfo['questionnaireRatings']
            sessionScores['umuxLite1'] = questionnaire['r1']['rating'] * 5
            sessionScores['umuxLite2'] = questionnaire['r2']['rating'] * 5
            # NOTE(review): looks like the UMUX-LITE -> SUS regression for 5-point items — confirm
            sessionScores['susScore'] = 0.65 * ((sessionScores['umuxLite1'] + sessionScores['umuxLite2'] - 2) * (100 / 8)) + 22.9
            # NOTE(review): this key exists only for sessions with an 'r3' rating, so
            # sessions may end up with different sets of score names
            if 'r3' in questionnaire:
                sessionScores['queryResponsiveness'] = questionnaire['r3']['rating'] * 5

            # Initial summary rating, average response/overall rating up to the end of the
            # shared range, and average/std over all iterations:
            sessionRatings = [scoreInfo['rating'] for scoreInfo in iterations]
            sessionScores['initialRating'] = sessionRatings[0]
            sessionScores['avgResponseRatingInRange'] = np.mean(sessionRatings[1:lastIterationIdxWithinRange+1])
            sessionScores['avgOverallRatingInRange'] = np.mean(sessionRatings[:lastIterationIdxWithinRange+1])
            sessionScores['avgResponseRatingAll'] = np.mean(sessionRatings[1:])
            sessionScores['stdResponseRatingAll'] = np.std(sessionRatings[1:])
            sessionScores['avgOverallRatingAll'] = np.mean(sessionRatings)

            # Total length of session in words:
            sessionScores['lengthOfSession'] = iterations[-1]['summary_len']

            # Time reading the initial summary (timestamp of the first query).
            # NOTE(review): assumes at least one query was issued (len(scores) >= 2) — confirm
            sessionScores['initialReadingTime'] = iterations[1]['query'][2]

            # Number of queries:
            sessionScores['numQueries'] = len(iterations) - 1

            # Time reading each response: gap between consecutive query timestamps,
            # and for the last response the gap up to the total explore time:
            responseReadingTimes = [
                iterations[iterIdx]['query'][2] - iterations[iterIdx-1]['query'][2]
                for iterIdx in range(2, len(iterations))]
            responseReadingTimes.append(sessionInfo['exploreTime'] - iterations[-1]['query'][2])
            sessionScores['avgResponseReadingTime'] = np.mean(responseReadingTimes)
            sessionScores['stdResponseReadingTime'] = np.std(responseReadingTimes)

            # ROUGE F1 scores at 250 words (R1, R2, RL, RSU);
            # the score used for a session is the difference from the average in its topic
            for metric, scoreName in (('R1', 'rouge1_F1_At250'), ('R2', 'rouge2_F1_At250'),
                                      ('RL', 'rougeL_F1_At250'), ('RSU', 'rougeSU_F1_At250')):
                sessionScores[scoreName] = getScoreDistanceFromAverage(yAllAtWordLimits[metric][topicId], workerId)
            # TODO: litepyramid

            # AUC score over the shared range;
            # the score used for a session is the difference from the average in its topic
            for metric in ('R1', 'R2', 'RL'):
                sessionScores['aucInRange_' + metric] = getScoreDistanceFromAverage(aucAllIntersecting[metric][topicId], workerId)
            # TODO: litepyramid

            # ROUGE of the initial summary against the topic's reference summaries
            # (getRougeScore returns R1 r/p/f, R2 r/p/f, RL r/p/f in that order):
            initialSummaryText = iterations[0]['summary']
            initialSummaryScores = getRougeScore(initialSummaryText, refSummsByTopic[topicId])
            for scoreName, scoreValue in zip(initialSummaryScoreNames, initialSummaryScores):
                sessionScores[scoreName] = scoreValue

            scoresPerSession[sessionName] = sessionScores

    print('Number of sessions: {}'.format(len(scoresPerSession)))
    return scoresPerSession

In [6]:
def getCorrelations(scoreVals1, scoreVals2):
    """Correlate two equally long lists of values paired by index.

    Returns {'pearson': <pearsonr result>, 'spearman': <spearmanr result>};
    each result holds the correlation at index 0 and the p-value at index 1.
    """
    correlationFuncs = (('pearson', pearsonr), ('spearman', spearmanr))
    return {name: correlate(scoreVals1, scoreVals2) for name, correlate in correlationFuncs}

In [7]:
def computeCorellations(scoresPerSession):
    """Compute pairwise correlations between all score types over all sessions.

    Parameters:
        scoresPerSession: sessionName -> {scoreName: value}

    Returns a symmetric dictionary:
        scoreNameI -> scoreNameJ -> {'pearson': (score, pVal), 'spearman': (score, pVal)}
    Returns an empty dictionary when there are no sessions.

    Only score names present in every session are correlated (in the key order
    of the first session). A score that some sessions lack cannot be paired up
    index-by-index, so it is left out rather than raising KeyError.
    """
    if not scoresPerSession:
        return {}

    sessionNames = list(scoresPerSession)
    scoreNames = [
        scoreName for scoreName in scoresPerSession[sessionNames[0]]
        if all(scoreName in scoresPerSession[sessionName] for sessionName in sessionNames)]
    scoreVals = {
        scoreName: [scoresPerSession[sessionName][scoreName] for sessionName in sessionNames]
        for scoreName in scoreNames}

    correlations = {}
    for scoreIdxI, scoreNameI in enumerate(scoreNames):
        # every unordered pair is computed once and stored in both directions
        for scoreNameJ in scoreNames[scoreIdxI+1:]:
            correlationVals = getCorrelations(scoreVals[scoreNameI], scoreVals[scoreNameJ])
            correlations.setdefault(scoreNameI, {})[scoreNameJ] = correlationVals
            correlations.setdefault(scoreNameJ, {})[scoreNameI] = correlationVals

    return correlations

In [8]:
def printCorrelations(correlations):
    """Print the correlations as tab-separated rows, one per ordered pair of
    distinct score names, preceded by a header row.

    ``correlations`` is the symmetric dictionary
    scoreNameI -> scoreNameJ -> {'pearson': (score, pVal), 'spearman': (score, pVal)}.
    """
    print('ScoreName1\tScoreName2\tPearson\tpVal\tSpearman\tpVal')
    rowTemplate = '{}\t{}\t{}\t{}\t{}\t{}'
    scoreNames = list(correlations.keys())
    for rowName in scoreNames:
        for colName in scoreNames:
            if rowName == colName:
                continue
            pairResult = correlations[rowName][colName]
            pearsonScore, pearsonPval = pairResult['pearson'][0], pairResult['pearson'][1]
            spearmanScore, spearmanPval = pairResult['spearman'][0], pairResult['spearman'][1]
            print(rowTemplate.format(rowName, colName, pearsonScore, pearsonPval, spearmanScore, spearmanPval))

In [23]:
#########################################################################
# Get correlations between features on the full session level.
#
# NOTE: getScoresPerSession calls getReferenceSummaries and getRougeScore,
# which are defined in cells further down in this notebook. On a fresh
# kernel those cells must be executed before this one.
#########################################################################

# Results file to analyze (alternatives are kept commented out):
#resultsFilepath = '../MechanicalTurk/results/wild/results_SummarizerClustering.json'
resultsFilepath = '../MechanicalTurk/RealSessions/results_SummarizerClustering.json'
#resultsFilepath = '../MechanicalTurk/RealSessions/results_SummarizerTextRankPlusLexical.json'

allData = loadResultsFile(resultsFilepath)
#startSummLen, endSummLen = getSummLenLimitsForAll(allData) # todo: get min and max range lengths where atleast 10 sessions exist
# one dictionary of scores per session, then all pairwise correlations between the score types:
scoresPerSession = getScoresPerSession(allData)#, startSummLen, endSummLen)
correlations = computeCorellations(scoresPerSession)
print()
printCorrelations(correlations)

Limits: 97 to 333
Number of sessions: 72

ScoreName1	ScoreName2	Pearson	pVal	Spearman	pVal
umuxLite1	umuxLite2	0.5879947383437663	5.610680817702432e-08	0.5695945039248201	1.7732822986640885e-07
umuxLite1	susScore	0.9280195891778085	9.788537843981647e-32	0.9488905188293444	8.711062832492827e-37
umuxLite1	queryResponsiveness	0.6814447480574208	4.4469393905393294e-11	0.7046953705307436	4.909260781544685e-12
umuxLite1	initialRating	0.34349473804068575	0.0031359544473523516	0.31475198953464584	0.007084868282575302
umuxLite1	avgResponseRatingInRange	0.5993988818528514	2.6507003471344716e-08	0.6301966377723456	2.999260366412061e-09
umuxLite1	avgOverallRatingInRange	0.6155866910803406	8.68286649955514e-09	0.6327347226607478	2.4793747038628625e-09
umuxLite1	avgResponseRatingAll	0.6060811880873825	1.684849691570129e-08	0.6196529556147062	6.494697813166397e-09
umuxLite1	stdResponseRatingAll	-0.15294723401410615	0.1996152210550503	-0.1412191446133838	0.23670880389406876
umuxLite1	avgOverallRatingA

In [10]:
# Display the per-session score dictionaries (sessionName -> {scoreName: value}).
scoresPerSession

{'D0601_A1171IQSWQS0K8': {'umuxLite1': 4.0,
  'umuxLite2': 4.0,
  'susScore': 71.65,
  'initialRating': 0.6,
  'avgResponseRatingInRange': 0.6399999999999999,
  'avgOverallRatingInRange': 0.6333333333333333,
  'avgResponseRatingAll': 0.6,
  'stdResponseRatingAll': 0.24494897427831783,
  'avgOverallRatingAll': 0.6,
  'lengthOfSession': 478,
  'initialReadingTime': 18.150031089782715,
  'numQueries': 8,
  'avgResponseReadingTime': 19.10624611377716,
  'stdResponseReadingTime': 4.79310359029101,
  'rouge1_F1_At250': -0.02852391463414633,
  'rouge2_F1_At250': -0.026387524390243902,
  'rougeL_F1_At250': -0.029164768292682886,
  'rougeSU_F1_At250': -0.013187048780487806,
  'aucInRange_R1': -2.840830637885915,
  'aucInRange_R2': -3.425128852891156,
  'aucInRange_RL': -3.4047682654369424},
 'D0601_A3K2VSBTT3WUTI': {'umuxLite1': 5.0,
  'umuxLite2': 5.0,
  'susScore': 87.9,
  'initialRating': 0.8,
  'avgResponseRatingInRange': 0.7666666666666667,
  'avgOverallRatingInRange': 0.7714285714285715,


In [10]:
def getKeyphrases(keyphrasesJsonPath):
    """Load and return the keyphrases dictionary stored as JSON at ``keyphrasesJsonPath``."""
    with open(keyphrasesJsonPath, 'r') as keyphrasesFile:
        return json.load(keyphrasesFile)

In [11]:
def getReferenceSummaries(dsFolder):
    """Read the reference summaries of every topic under ``dsFolder``.

    Each sub-folder of ``dsFolder`` that contains a 'referenceSummaries'
    directory is treated as a topic; every file in that directory is read as
    one reference summary.

    Returns: topicId (the sub-folder name) -> list of summary texts.
    """
    refSumms = {}
    for topicFolder in os.listdir(dsFolder):
        refsDir = os.path.join(dsFolder, topicFolder, 'referenceSummaries')
        if not os.path.isdir(refsDir):
            continue
        topicRefs = refSumms[topicFolder] = []
        for refFile in os.listdir(refsDir):
            with open(os.path.join(refsDir, refFile), 'r') as refIn:
                topicRefs.append(refIn.read())
    return refSumms

In [12]:
# cache: raw text -> its tokenized, lowercased, space-joined form
tokenizedText = {}
def wordTokenize(text):
    """Return ``text`` tokenized with ``word_tokenize``, joined by single spaces
    and lowercased. Results are memoized in ``tokenizedText``."""
    if text in tokenizedText:
        return tokenizedText[text]
    normalized = " ".join(word_tokenize(text)).lower()
    tokenizedText[text] = normalized
    return normalized

rouge = Rouge()
def getRougeScore(cand, refs):
    """Score the candidate text ``cand`` against every reference text in ``refs``.

    The candidate is paired with each reference and the scores are averaged
    over the pairs (``avg=True``).

    Returns a 9-tuple rounded to 4 decimals, in this order:
    ROUGE-1 recall, precision, F1; ROUGE-2 recall, precision, F1;
    ROUGE-L recall, precision, F1.
    """
    tokenizedRefs = [wordTokenize(ref) for ref in refs]
    # the scorer expects one hypothesis per reference, so repeat the candidate
    tokenizedCands = [wordTokenize(cand) for _ in range(len(refs))]
    scores = rouge.get_scores(tokenizedCands, tokenizedRefs, avg=True)
    return tuple(
        round(scores[variant][part], 4)
        for variant in ('rouge-1', 'rouge-2', 'rouge-l')
        for part in ('r', 'p', 'f'))

# example usage for one hypothesis vs one ref:
# print(getRougeScore('This is the first sentence.', ['This is the second sentence which is different.']))
# example usage for one hypothesis vs multiple refs:
# print(getRougeScore('This is the first sentence.', ['This is the second sentence which is different.', 'This is the third sentence which is different as well.']))

In [16]:
def getHighestYVal(xAll, yAll):
    """Find, per metric and topic, the highest Y value among all systems at a common X position.

    The common X position is the smallest, over all topics, of the largest
    final X value found in each topic. A system whose curve does not cover that
    position contributes its last Y value instead.

    Returns ``(maxY, minMaxX)``: ``maxY`` maps metric -> topicId -> highest
    value, and ``minMaxX`` is the common X position used.
    """
    # the highest final X value of each topic:
    maxXperTopic = {}
    for topicId, xBySystem in xAll.items():
        highestX = -1
        for xVals in xBySystem.values():
            if xVals[-1] > highestX:
                highestX = xVals[-1]
        maxXperTopic[topicId] = highestX

    # the lowest of those maxima is where the upper-bound Y values are read:
    minMaxX = min(maxXperTopic.values())

    # the y values of all curves at this x position:
    yAllAtWordLimit = getYatXlimits(xAll, yAll, minMaxX)

    # the highest of those y values for each metric and topic:
    maxY = {}
    for metric, yByTopic in yAllAtWordLimit.items():
        maxY[metric] = {}
        for topic, yBySystem in yByTopic.items():
            highestY = -1
            for systemName, yAtLimit in yBySystem.items():
                if yAtLimit is None:
                    # the curve does not cover the position: fall back to its last value
                    yAtLimit = yAll[metric][topic][systemName][-1]
                if yAtLimit > highestY:
                    highestY = yAtLimit
            maxY[metric][topic] = highestY

    return maxY, minMaxX

def getScoresPerIteration(allData, iterationIndicesToConsider=None):
    """Compute one dictionary of scores for every keyword-query iteration.

    Parameters:
        allData: topicId -> list of session dicts, as loaded from a results JSON file.
        iterationIndicesToConsider: optional collection of iteration indices;
            when given, only iterations at those indices are scored. None
            (the default) considers all iterations.

    Returns: '<topicId>_<workerId>_<iterationIdx>' -> {scoreName: value}.
    Iteration 0 (the initial summary) is never scored. Iterations whose query
    type is not 'keyword' or whose query text is empty are skipped, and a
    session stops being scanned at the first keyword iteration whose summary is
    longer than the common maximum length returned by ``getHighestYVal``.

    Side effects: prints progress lines, and reads the reference summaries from
    '../data/DUC2006Clean' and the keyphrases from
    'keyphrasesSuggestedQueriesNgramCount_server.json'.
    """
    metrics = ['R1', 'R2', 'RL', 'RSU', 'litepyramid']
    deltaMetrics = ('R1', 'R2', 'RL')
    rougeSuffixes = ('R1rec', 'R1prec', 'R1f1', 'R2rec', 'R2prec', 'R2f1', 'RLrec', 'RLprec', 'RLf1')

    # X axis is the word count, and Y axis is the ROUGE score
    # (yAll holds the recall scores, yAllF1 the F1 scores):
    xAll, yAll, yAllF1 = getXYvalues(allData, metrics, 250)
    # the shared X range is only reported here:
    _, _, lowerBoundX, upperBoundX = getIntersectingXYvalues(xAll, yAll)
    print('Limits: {} to {}'.format(lowerBoundX, upperBoundX))
    # the F1 scores at the 250-word mark for all metrics/topics/sessions:
    yAllAtWordLimits = getYatXlimits(xAll, yAllF1, 250)
    # the maximum score per metric (as an upper bound for each topic):
    maxYsPerTopic, maxSummaryLen = getHighestYVal(xAll, yAll)
    refSummsByTopic = getReferenceSummaries('../data/DUC2006Clean')
    # the keyphrases in order for each topic (as presented in the UI):
    keyphrasesByTopic = getKeyphrases('keyphrasesSuggestedQueriesNgramCount_server.json')

    scoresPerIteration = {}
    for topicId in allData:
        # a topic is absent from xAll when none of its sessions passed the length
        # filter; treat that as "no kept sessions" instead of raising KeyError
        keptWorkersOfTopic = xAll.get(topicId, {})
        for sessionInfo in allData[topicId]:
            workerId = sessionInfo['id']
            if workerId not in keptWorkersOfTopic:
                continue

            iterations = sessionInfo['scores']
            lastFullSummary = ''
            for iterationIdx, iterationInfo in enumerate(iterations):

                if iterationIdx == 0:
                    # the initial summary: only remember its text
                    print('---')
                    lastFullSummary = iterationInfo['summary']
                    continue
                if ((iterationIndicesToConsider is not None and iterationIdx not in iterationIndicesToConsider)
                        or iterationInfo['query'][1] == 'addmore'
                        or iterationInfo['query'][0].strip() == ''
                        or iterationInfo['query'][1] != 'keyword'):
                    # not requested, empty query text, or not a keyphrase query
                    lastFullSummary = iterationInfo['summary']
                    continue
                if iterationInfo['summary_len'] > maxSummaryLen:
                    break

                iterationInfoLast = iterations[iterationIdx-1]
                iterationInfoNext = iterations[iterationIdx+1] if iterationIdx < len(iterations) - 1 else None
                iterationName = '{}_{}_{}'.format(topicId, workerId, iterationIdx)

                # A fresh dictionary per iteration. Sharing one dictionary across the
                # session would make every stored iteration alias the last one's values.
                iterationScores = {}

                # rating of iteration:
                iterationScores['rating'] = iterationInfo['rating']

                # number of words added in current iteration:
                iterationScores['lengthDelta'] = iterationInfo['summary_len'] - iterationInfoLast['summary_len']

                # ROUGE recall and F1 deltas in this iteration:
                for metric in deltaMetrics:
                    currentResults = iterationInfo['results'][metric]
                    previousResults = iterationInfoLast['results'][metric]
                    iterationScores['metricRecallDelta' + metric] = currentResults['recall'] - previousResults['recall']
                    iterationScores['metricF1Delta' + metric] = currentResults['f1'] - previousResults['f1']
                # TODO: litepyramid

                # ROUGE increase relative to the score at 250 words (e.g. 5/30)
                # NOTE(review): the numerator is a recall delta while yAllAtWordLimits holds
                # F1 values (and may hold None or 0) — confirm this is intended
                for metric in deltaMetrics:
                    iterationScores['metricRelativeRecallDelta' + metric] = (
                        iterationScores['metricRecallDelta' + metric] / yAllAtWordLimits[metric][topicId][workerId])

                # ROUGE increase relative to amount of points left to upper bound:
                # e.g. if the upper bound is 40 and the last iteration was at 25 and we now increased by 8, then 8/15
                # NOTE(review): divides by zero when the previous recall equals the upper bound
                for metric in deltaMetrics:
                    remainingToUpperBound = maxYsPerTopic[metric][topicId] - iterationInfoLast['results'][metric]['recall']
                    iterationScores['metricRemainingRecallDelta' + metric] = (
                        iterationScores['metricRecallDelta' + metric] / remainingToUpperBound)

                # time reading current iteration (up to the next query, or to the end of the session):
                if iterationInfoNext is not None:
                    iterationScores['readingTime'] = iterationInfoNext['query'][2] - iterationInfo['query'][2]
                else:
                    iterationScores['readingTime'] = sessionInfo['exploreTime'] - iterationInfo['query'][2]

                # query index:
                iterationScores['iterationIdx'] = iterationIdx
                iterationScores['iterationIdxRelative'] = float(iterationIdx) / len(iterations)

                iterationScores['queryLength'] = len(word_tokenize(iterationInfo['query'][0]))

                # the response is the text added to the summary in this iteration:
                responseText = iterationInfo['summary'][len(lastFullSummary):]
                queryText = iterationInfo['query'][0].strip()

                # response text to query text ROUGE scores
                # (getRougeScore returns R1 r/p/f, R2 r/p/f, RL r/p/f in that order):
                queryResponseRouge = getRougeScore(responseText, [queryText])
                for suffix, scoreValue in zip(rougeSuffixes, queryResponseRouge):
                    iterationScores['queryResponse' + suffix] = scoreValue

                # response text to reference summaries ROUGE scores:
                responseRouge = getRougeScore(responseText, refSummsByTopic[topicId])
                for suffix, scoreValue in zip(rougeSuffixes, responseRouge):
                    iterationScores['response' + suffix] = scoreValue
                rR1f = responseRouge[2]

                # the index of the keyword in the list presented in the GUI (-1 when not in the list):
                if queryText in keyphrasesByTopic[topicId]:
                    iterationScores['keyphraseIndex'] = keyphrasesByTopic[topicId].index(queryText)
                else:
                    iterationScores['keyphraseIndex'] = -1

                scoresPerIteration[iterationName] = iterationScores
                print('|', iterationScores['metricRemainingRecallDeltaR1'], '|', rR1f, '|', iterationInfo['query'][1], ':', queryText, iterationScores['keyphraseIndex'])

                lastFullSummary = iterationInfo['summary']

    # NOTE: the count printed here is the number of scored iterations, not of sessions
    print('Number of sessions: {}'.format(len(scoresPerIteration)))
    return scoresPerIteration



In [17]:
#########################################################################
# Get correlations between features on the iteration level.
#
# NOTE: this cell rebinds `allData` and `correlations`, replacing the values
# computed for the session-level analysis.
#########################################################################

# Results file to analyze (alternatives are kept commented out):
#resultsFilepath = '../MechanicalTurk/results/batch3456/results_SummarizerClustering.json'
#resultsFilepath = '../MechanicalTurk/RealSessions/results_SummarizerClustering.json'
resultsFilepath = '../MechanicalTurk/RealSessions/results_SummarizerTextRankPlusLexical.json'

allData = loadResultsFile(resultsFilepath)
#startSummLen, endSummLen = getSummLenLimitsForAll(allData) # todo: get min and max range lengths where atleast 10 sessions exist
# one dictionary of scores per scored iteration, then all pairwise correlations between the score types:
scoresPerIteration = getScoresPerIteration(allData)#, iterationIndicesToConsider=[1])
correlations = computeCorellations(scoresPerIteration)
print()
printCorrelations(correlations)

Limits: 105 to 368
---
| 0.15919860698595759 | 0.1106 | keyword : american indian youth -1
| 0.2623916706474251 | 0.1732 | keyword : indian welfare -1
| 0.27921300216241063 | 0.1778 | keyword : indian culture -1
| 0.29365629561376405 | 0.1777 | keyword : american indian communities -1
| 0.19461171320997342 | 0.1194 | keyword : american indian gambling -1
| 0.30748844234027933 | 0.1489 | keyword : indian casinos -1
---
| 0.2653217391983337 | 0.1732 | keyword : indian welfare -1
| 0.14914271864648895 | 0.1081 | keyword : american indian communities -1
---
| 0.23458358425870707 | 0.1489 | keyword : indian casinos -1
| 0.23548616767122615 | 0.152 | keyword : american indian communities -1
| 0.15114495909264458 | 0.0934 | keyword : indian land -1
| 0.17116973764023366 | 0.1106 | keyword : american indian youth -1
| 0.43445605688690897 | 0.1134 | keyword : indian welfare -1
| 0.38410459715597395 | 0.1384 | keyword : tribal casinos -1
| 0.8315364273052205 | 0.1428 | keyword : indian culture -

| 0.14706077274835977 | 0.1307 | keyword : new radar data -1
| 0.1800699300699301 | 0.1461 | keyword : cockpit voice recorder 12
| 0.12617863065624255 | 0.1055 | keyword : u.s. accident investigators -1
| 0.16044897516538342 | 0.1204 | keyword : u.s. authorities -1
---
| 0.16515298474561305 | 0.1307 | keyword : new radar data -1
| 0.44109752460483126 | 0.1461 | keyword : cockpit voice recorder 12
---
| 0.19786357228066384 | 0.1507 | keyword : egyptair boeing -1
| 0.13999864800919326 | 0.0831 | keyword : u.s. investigators -1
| 0.17947829101308613 | 0.1307 | keyword : new radar data -1
---
| 0.12188170008828353 | 0.1055 | keyword : u.s. accident investigators -1
| 0.09631459512251174 | 0.1284 | keyword : the egyptair flight -1
| 0.2131595677050222 | 0.1461 | keyword : cockpit voice recorder 12
| 0.2222758731691614 | 0.1507 | keyword : egyptair boeing -1
| 0.1630794701986767 | 0.0662 | keyword : u.s. searchers -1
| 0.5121167161226493 | 0.1307 | keyword : new radar data -1
---
| 0.2052228

| 0.18969064705197616 | 0.1774 | keyword : russian efforts -1
| 0.28705071584566866 | 0.1808 | keyword : russian rescue capsule -1
| 0.14100149581095434 | 0.1528 | keyword : russian navy 2
| 0.1847943771766786 | 0.1127 | keyword : russian rescuers -1
| 0.4500835979778203 | 0.194 | keyword : time submarine -1
---
| 0.15897574456736313 | 0.1585 | keyword : russian officials 4
| 0.1606485097237685 | 0.1276 | keyword : russian vessels -1
| 0.22520523446883217 | 0.1774 | keyword : russian efforts -1
| 0.1162657490668205 | 0.1084 | keyword : russian officers -1
| 0.23884967604155358 | 0.1661 | keyword : sunken submarine kursk -1
| 0.4678626657454011 | 0.194 | keyword : time submarine -1
| 0.6331906079483094 | 0.1539 | keyword : russian rescuers -1
---
| 0.19274851553483624 | 0.1661 | keyword : sunken submarine kursk -1
| 0.10975631151423713 | 0.1336 | keyword : russian navy 2
| 0.24106783289411426 | 0.1726 | keyword : submarine rescue -1
---
---
| 0.23184485183663442 | 0.1601 | keyword : el 

In [23]:
# Metric names to extract from each session's per-length results. A metric
# missing from a result entry is recorded as -1.0 by getXYvalues.
metrics = ['R1', 'R2', 'RL', 'RSU', 'litepyramid']
# X axis is the word count, and Y axis is the ROUGE score
# get the X,Y values from the results json (yAll is the ROUGE recall scores which are the ones to use mostly):
# The third argument (250) is getXYvalues' `minimumTotalLength`: sessions whose
# final 'summary_len' is below it are skipped entirely.
# NOTE(review): this cell reads `allData`, which is assigned by the cell that
# loads the results file - that cell must have been run first.
xAll, yAll, yAllF1 = getXYvalues(allData, metrics, 250)
# NOTE(review): getHighestYVal is defined outside this cell; judging by the
# displayed output it yields, per metric and per topic, a highest Y value -
# confirm the exact return shape against its definition.
maxYsPerTopic = getHighestYVal(xAll, yAll)
# Bare last expression: shows the value as the cell's output.
maxYsPerTopic

({'R1': {'D0601': 0.5459585185185185,
   'D0603': 0.5714635593220339,
   'D0605': 0.5196433333333333,
   'D0608': 0.5236290476190477,
   'D0614': 0.60690625,
   'D0615': 0.51497,
   'D0616': 0.5830983720930232,
   'D0617': 0.5473606818181818,
   'D0620': 0.5144406557377049,
   'D0624': 0.6090732653061224,
   'D0627': 0.5515712500000001,
   'D0628': 0.586734,
   'D0629': 0.5206074468085107,
   'D0630': 0.495090652173913,
   'D0631': 0.562685,
   'D0640': 0.54006,
   'D0643': 0.5681553846153846,
   'D0645': 0.52473,
   'D0647': 0.6186591228070175,
   'D0650': 0.5288491489361702},
  'R2': {'D0601': 0.11115185185185185,
   'D0603': 0.11426830508474575,
   'D0605': 0.124105,
   'D0608': 0.090405,
   'D0614': 0.169645625,
   'D0615': 0.10721,
   'D0616': 0.13289953488372094,
   'D0617': 0.16415749999999998,
   'D0620': 0.09009,
   'D0624': 0.16192,
   'D0627': 0.119794375,
   'D0628': 0.120175,
   'D0629': 0.08988659574468086,
   'D0630': 0.10948847826086956,
   'D0631': 0.15905930232558138,