In [1]:
import json
import os
import csv

In [26]:
#######################################################
# Oracle simulation generation functionality
# (top 10 SCUs of each topic)
#######################################################

def getScusFromPyrFile(pyrFilepath):
    """Parse a pyramid file and return the SCUs it lists.

    The file is scanned line by line (no XML parser is used): a stripped line
    starting with '<scu ' opens an SCU, every following line starting with
    '<contributor ' adds one to its weight, and a line starting with '</scu>'
    closes it.

    Args:
        pyrFilepath: path of the pyramid file to read.

    Returns:
        A list of dicts in file order, one per SCU, with the keys:
            'id'     - value of the uid attribute (string)
            'text'   - value of the label attribute, with '&quot;' turned into '"'
            'weight' - number of contributor lines seen inside the SCU
    """
    scus = []
    with open(pyrFilepath, 'r') as fIn:
        inScu = False
        scuId = None
        scuText = None
        scuContributorCount = 0
        for line in fIn:
            lineStripped = line.strip()
            if not inScu:
                if lineStripped.startswith('<scu '):
                    # ex. <scu uid="2" label="Tribes differ widely">
                    scuId = lineStripped.split('uid=')[1].split('"')[1]
                    scuText = lineStripped.split('label=')[1].split('"')[1]
                    # A single variable holds the label, so the un-escaped
                    # text is the one that ends up in the result.
                    scuText = scuText.replace('&quot;', '"')
                    inScu = True
                    scuContributorCount = 0
            elif lineStripped.startswith('</scu>'):
                scus.append({'id': scuId, 'text': scuText, 'weight': scuContributorCount})
                inScu = False
                scuContributorCount = 0
            elif lineStripped.startswith('<contributor '):
                scuContributorCount += 1
    return scus

def getSCUinfo(pyramidFolderpath):
    """Collect the SCUs of every pyramid file in a folder.

    Args:
        pyramidFolderpath: folder whose entries are each parsed with
            getScusFromPyrFile.

    Returns:
        A dict mapping topic ID (the entry name up to its first '.') to the
        SCU list parsed from that entry.
    """
    scusPerTopic = {}
    for entryName in os.listdir(pyramidFolderpath):
        topicKey = entryName.partition('.')[0]
        entryPath = os.path.join(pyramidFolderpath, entryName)
        scusPerTopic[topicKey] = getScusFromPyrFile(entryPath)
    return scusPerTopic

def createOracleQueriesJson(scusInfo, outputJsonPath):
    """Write the oracle simulation queries of every topic to a JSON file.

    Each topic gets one 'initial' request (request_len 50, a length in tokens)
    followed by one 'freetext' request (request_len 2, a length in sentences)
    for each of the topic's first 10 SCUs, using the SCU text as the query.

    Args:
        scusInfo: dict mapping topic ID to a list of SCU dicts; only the 'text'
            key of each SCU is read.
        outputJsonPath: path of the JSON file to write (overwritten if present).
    """
    # An earlier variant, left disabled in the notebook, kept only SCUs with
    # weight >= 3 and sorted them by weight. The active logic simply takes the
    # first 10 SCUs, relying on the input order.
    # NOTE(review): the original comment states the SCUs are already sorted by
    # weight; nothing here enforces that -- confirm against the pyramid files.
    queriesPerTopic = {}
    for topicId, topicScus in scusInfo.items():
        topicQueries = [{'type': 'initial', 'request_len': 50}]
        for scu in topicScus[:10]:
            topicQueries.append({'type': 'freetext', 'text': scu['text'], 'request_len': 2})
        queriesPerTopic[topicId] = topicQueries

    with open(outputJsonPath, 'w') as jsonFile:
        json.dump(queriesPerTopic, jsonFile, indent=4)
        
        
        
# Folder whose entries are parsed as pyramid files (one topic per entry).
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
PYRAMIDS_FOLDERPATH = 'C:/Users/user/Data/DUC/2006_unzipped/allpyramids'
# Output file for the oracle queries; relative, so it lands in the working directory.
JSON_OUTPUT_PATH = 'simulationQueriesOracle.json'

# Parse all pyramid files, then write the per-topic query lists as JSON.
scuInfo = getSCUinfo(PYRAMIDS_FOLDERPATH)
createOracleQueriesJson(scuInfo, JSON_OUTPUT_PATH)

In [40]:
#######################################################
# Keyphrases simulation generation functionality
# (indicate to use the key phrases as queries)
#######################################################

def getTopicIds(pyramidFolderpath):
    """Return the topic ID of every entry in the pyramid folder.

    The topic ID is the part of the entry name that precedes its first '.'.
    IDs are returned in the order os.listdir yields the entries.
    """
    topicIds = []
    for entryName in os.listdir(pyramidFolderpath):
        topicIds.append(entryName.partition('.')[0])
    return topicIds

def createKeyphraseQueriesJson(topicIds, outputJsonPath):
    """Write the keyphrase simulation queries of every topic to a JSON file.

    Each topic gets one 'initial' request (request_len 175, a length in tokens)
    followed by five 'keyword' requests (request_len 3, a length in sentences)
    whose text is the placeholder '<[0]>' ... '<[4]>'. Per the notebook's own
    comment, the placeholder tells the consumer which keyphrase of its list to
    use.

    Args:
        topicIds: iterable of topic IDs to generate queries for.
        outputJsonPath: path of the JSON file to write (overwritten if present).
    """
    queriesPerTopic = {}
    for topicId in topicIds:
        topicQueries = [{'type': 'initial', 'request_len': 175}]
        for keyphraseIdx in range(5):
            topicQueries.append(
                {'type': 'keyword', 'text': '<[{}]>'.format(keyphraseIdx), 'request_len': 3})
        queriesPerTopic[topicId] = topicQueries

    with open(outputJsonPath, 'w') as jsonFile:
        json.dump(queriesPerTopic, jsonFile, indent=4)


# The pyramid folder is only listed here; its entry names provide the topic IDs.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
PYRAMIDS_FOLDERPATH = 'C:/Users/user/Data/DUC/2006_unzipped/allpyramids' # to get the relevant topic IDs
# Output file for the keyphrase queries (initial length 175, 3 sentences per keyphrase request).
JSON_OUTPUT_PATH = 'simulationQueriesKeyphrases175_3.json'
# List the topics, then write the per-topic query lists as JSON.
topicIds = getTopicIds(PYRAMIDS_FOLDERPATH)
createKeyphraseQueriesJson(topicIds, JSON_OUTPUT_PATH)

In [8]:
#######################################################
# Highlight simulation generation functionality
# (indicate to use the first few words of each previous output)
#######################################################

def getTopicIds(pyramidFolderpath):
    """List the topic IDs found in the pyramid folder.

    Every entry name is cut at its first '.', and the leading part is used as
    the topic ID. The result follows the order returned by os.listdir.

    NOTE(review): this helper is defined identically in several cells of this
    notebook; consider defining it once.
    """
    entryNames = os.listdir(pyramidFolderpath)
    return [name.partition('.')[0] for name in entryNames]

def createHighlightQueriesJson(topicIds, outputJsonPath):
    """Write the highlight simulation queries of every topic to a JSON file.

    Each topic gets one 'initial' request (request_len 50, a length in tokens)
    followed by ten identical 'highlight' requests (request_len 2, a length in
    sentences) whose text is the placeholder '<tokens[0:5]>', i.e. the first 5
    tokens of the last summary according to the notebook's notes.

    Placeholders the notebook lists for the 'text' field:
        '<tokens[i:j]>' for choosing a token range (i to j-1)
        '<chars[i:j]>'  for choosing a character range (i to j-1)
        '<np[i]>'       for the i-th noun phrase in the text
        '<ne[i]>'       for the i-th named entity in the text

    Args:
        topicIds: iterable of topic IDs to generate queries for.
        outputJsonPath: path of the JSON file to write (overwritten if present).
    """
    queriesPerTopic = {}
    for topicId in topicIds:
        topicQueries = [{'type': 'initial', 'request_len': 50}]
        for _ in range(10):
            topicQueries.append({'type': 'highlight', 'text': '<tokens[0:5]>', 'request_len': 2})
        queriesPerTopic[topicId] = topicQueries

    with open(outputJsonPath, 'w') as jsonFile:
        json.dump(queriesPerTopic, jsonFile, indent=4)


# The pyramid folder is only listed here; its entry names provide the topic IDs.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
PYRAMIDS_FOLDERPATH = 'C:/Users/user/Data/DUC/2006_unzipped/allpyramids' # to get the relevant topic IDs
# Output file for the token-range highlight queries.
JSON_OUTPUT_PATH = 'simulationQueriesHighlights.json'
# List the topics, then write the per-topic query lists as JSON.
topicIds = getTopicIds(PYRAMIDS_FOLDERPATH)
createHighlightQueriesJson(topicIds, JSON_OUTPUT_PATH)

In [9]:
#######################################################
# Highlight simulation generation functionality
# (indicate to use the first named entity of each previous output)
#######################################################

def getTopicIds(pyramidFolderpath):
    """List the topic IDs found in the pyramid folder.

    Every entry name is cut at its first '.', and the leading part is used as
    the topic ID. The result follows the order returned by os.listdir.

    NOTE(review): this helper is defined identically in several cells of this
    notebook; consider defining it once.
    """
    entryNames = os.listdir(pyramidFolderpath)
    return [name.partition('.')[0] for name in entryNames]

def createHighlightQueriesJson(topicIds, outputJsonPath):
    """Write the named-entity highlight queries of every topic to a JSON file.

    Each topic gets one 'initial' request (request_len 50, a length in tokens)
    followed by ten identical 'highlight' requests (request_len 2, a length in
    sentences) whose text is the placeholder '<ne[0]>'. Going by the notebook's
    placeholder notes and this cell's header, that is the first named entity of
    the previous output.

    Placeholders the notebook lists for the 'text' field:
        '<tokens[i:j]>' for choosing a token range (i to j-1)
        '<chars[i:j]>'  for choosing a character range (i to j-1)
        '<np[i]>'       for the i-th noun phrase in the text
        '<ne[i]>'       for the i-th named entity in the text

    Args:
        topicIds: iterable of topic IDs to generate queries for.
        outputJsonPath: path of the JSON file to write (overwritten if present).
    """
    queriesPerTopic = {}
    for topicId in topicIds:
        topicQueries = [{'type': 'initial', 'request_len': 50}]
        for _ in range(10):
            topicQueries.append({'type': 'highlight', 'text': '<ne[0]>', 'request_len': 2})
        queriesPerTopic[topicId] = topicQueries

    with open(outputJsonPath, 'w') as jsonFile:
        json.dump(queriesPerTopic, jsonFile, indent=4)


# The pyramid folder is only listed here; its entry names provide the topic IDs.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
PYRAMIDS_FOLDERPATH = 'C:/Users/user/Data/DUC/2006_unzipped/allpyramids' # to get the relevant topic IDs
# Output file for the named-entity highlight queries.
JSON_OUTPUT_PATH = 'simulationQueriesHighlightsNE.json'
# List the topics, then write the per-topic query lists as JSON.
topicIds = getTopicIds(PYRAMIDS_FOLDERPATH)
createHighlightQueriesJson(topicIds, JSON_OUTPUT_PATH)

In [10]:
#######################################################
# Highlight simulation generation functionality
# (indicate to use the first noun phrase of each previous output)
#######################################################

def getTopicIds(pyramidFolderpath):
    """List the topic IDs found in the pyramid folder.

    Every entry name is cut at its first '.', and the leading part is used as
    the topic ID. The result follows the order returned by os.listdir.

    NOTE(review): this helper is defined identically in several cells of this
    notebook; consider defining it once.
    """
    entryNames = os.listdir(pyramidFolderpath)
    return [name.partition('.')[0] for name in entryNames]

def createHighlightQueriesJson(topicIds, outputJsonPath):
    """Write the noun-phrase highlight queries of every topic to a JSON file.

    Each topic gets one 'initial' request (request_len 50, a length in tokens)
    followed by ten identical 'highlight' requests (request_len 2, a length in
    sentences) whose text is the placeholder '<np[0]>'. Going by the notebook's
    placeholder notes and this cell's header, that is the first noun phrase of
    the previous output.

    Placeholders the notebook lists for the 'text' field:
        '<tokens[i:j]>' for choosing a token range (i to j-1)
        '<chars[i:j]>'  for choosing a character range (i to j-1)
        '<np[i]>'       for the i-th noun phrase in the text
        '<ne[i]>'       for the i-th named entity in the text

    Args:
        topicIds: iterable of topic IDs to generate queries for.
        outputJsonPath: path of the JSON file to write (overwritten if present).
    """
    queriesPerTopic = {}
    for topicId in topicIds:
        topicQueries = [{'type': 'initial', 'request_len': 50}]
        for _ in range(10):
            topicQueries.append({'type': 'highlight', 'text': '<np[0]>', 'request_len': 2})
        queriesPerTopic[topicId] = topicQueries

    with open(outputJsonPath, 'w') as jsonFile:
        json.dump(queriesPerTopic, jsonFile, indent=4)


# The pyramid folder is only listed here; its entry names provide the topic IDs.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
PYRAMIDS_FOLDERPATH = 'C:/Users/user/Data/DUC/2006_unzipped/allpyramids' # to get the relevant topic IDs
# Output file for the noun-phrase highlight queries.
JSON_OUTPUT_PATH = 'simulationQueriesHighlightsNP.json'
# List the topics, then write the per-topic query lists as JSON.
topicIds = getTopicIds(PYRAMIDS_FOLDERPATH)
createHighlightQueriesJson(topicIds, JSON_OUTPUT_PATH)

In [6]:
#######################################################
# Almost-Oracle simulation generation functionality
# (10 Lite-pyramid SCUs of each topic)
#######################################################

def getScusFromLitePyrFile(pyrFilepath):
    """Read the usable SCUs from a Lite-pyramid questions CSV file.

    The file must have a header row with at least the columns 'forUse',
    'questionId' and 'questionText'. Only rows whose 'forUse' value is the
    string '1' are kept.

    Args:
        pyrFilepath: path of the CSV file to read.

    Returns:
        A list of dicts in file order, one per kept row, with the keys 'id'
        (questionId), 'text' (questionText) and 'weight' (always 1).

    Raises:
        KeyError: if a row lacks one of the required columns.
    """
    scus = []
    # newline='' is what the csv module requires of file objects; without it,
    # line breaks embedded in quoted fields are rewritten by newline translation.
    with open(pyrFilepath, 'r', newline='') as inF:
        csvReader = csv.DictReader(inF, delimiter=',', quotechar='"')
        for row in csvReader:
            if row['forUse'] == '1':
                scus.append({'id': row['questionId'], 'text': row['questionText'], 'weight': 1})

    return scus

def getLiteSCUinfo(litePyramidFolderpath):
    """Collect the Lite-pyramid SCUs of every topic folder.

    Every entry of the given folder is treated as a topic folder and probed
    for the file 'questions/batch10.csv'; entries without that file are
    skipped.

    Args:
        litePyramidFolderpath: folder containing one sub-folder per topic.

    Returns:
        A dict mapping the topic folder name to the SCU list read from its
        batch file via getScusFromLitePyrFile.
    """
    scusPerTopic = {}
    for topicName in os.listdir(litePyramidFolderpath):
        batchFilepath = os.path.join(litePyramidFolderpath, topicName, 'questions/batch10.csv')
        if not os.path.exists(batchFilepath):
            continue
        scusPerTopic[topicName] = getScusFromLitePyrFile(batchFilepath)

    return scusPerTopic

def createOracleLiteQueriesJson(scusInfo, outputJsonPath):
    """Write the Lite-pyramid oracle queries of every topic to a JSON file.

    Each topic gets one 'initial' request (request_len 50, a length in tokens)
    followed by one 'freetext' request (request_len 2, a length in sentences)
    for each of the topic's first 10 SCUs, using the SCU text as the query.

    Args:
        scusInfo: dict mapping topic ID to a list of SCU dicts; only the 'text'
            key of each SCU is read.
        outputJsonPath: path of the JSON file to write (overwritten if present).
    """
    # An earlier variant, left disabled in the notebook, kept only SCUs with
    # weight >= 3 and sorted them by weight. The active logic simply takes the
    # first 10 SCUs in input order.
    queriesPerTopic = {}
    for topicId, topicScus in scusInfo.items():
        topicQueries = [{'type': 'initial', 'request_len': 50}]
        for scu in topicScus[:10]:
            topicQueries.append({'type': 'freetext', 'text': scu['text'], 'request_len': 2})
        queriesPerTopic[topicId] = topicQueries

    with open(outputJsonPath, 'w') as jsonFile:
        json.dump(queriesPerTopic, jsonFile, indent=4)
        
        
        
# Folder whose entries are treated as topic folders; each is probed for
# 'questions/batch10.csv'.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
LITE_PYRAMIDS_FOLDERPATH = 'C:/Users/user/Google Drive/School/Thesis/Summarization/ExtractiveSystems/qfse_shared/data/DUC2006Clean'
# Output file for the Lite-pyramid oracle queries; relative to the working directory.
JSON_OUTPUT_PATH = 'simulationQueriesOracleLite.json'

# Read the usable questions of every topic, then write the per-topic query lists as JSON.
scuInfo = getLiteSCUinfo(LITE_PYRAMIDS_FOLDERPATH)
createOracleLiteQueriesJson(scuInfo, JSON_OUTPUT_PATH)

In [6]:
#######################################################
# Keyphrases simulation generation functionality
# (indicate to use the key phrases as queries)
# No initial summary -- just getting a sentence at a time for the top keyphrases.
# This is used as a static summarization system, and less as an interactive summary.
#######################################################

def getTopicIds(pyramidFolderpath):
    """List the topic IDs found in the pyramid folder.

    Every entry name is cut at its first '.', and the leading part is used as
    the topic ID. The result follows the order returned by os.listdir.

    NOTE(review): this helper is defined identically in several cells of this
    notebook; consider defining it once.
    """
    entryNames = os.listdir(pyramidFolderpath)
    return [name.partition('.')[0] for name in entryNames]

def createKeyphraseQueriesJson(topicIds, outputJsonPath):
    """Write the static top-keyphrase queries of every topic to a JSON file.

    Each topic gets one 'initial' request with request_len 0 (a length in
    tokens) followed by 17 identical 'keyword' requests (request_len 1, a
    length in sentences) whose text is the constant placeholder '<[1]>'.

    NOTE(review): a disabled variant in the notebook numbered the placeholder
    per request ('<[0]>', '<[1]>', ...); the active text is the same '<[1]>'
    for all 17 requests. Confirm the constant is intended.

    Args:
        topicIds: iterable of topic IDs to generate queries for.
        outputJsonPath: path of the JSON file to write (overwritten if present).
    """
    queriesPerTopic = {}
    for topicId in topicIds:
        topicQueries = [{'type': 'initial', 'request_len': 0}]
        for _ in range(17):
            topicQueries.append({'type': 'keyword', 'text': '<[1]>', 'request_len': 1})
        queriesPerTopic[topicId] = topicQueries

    with open(outputJsonPath, 'w') as jsonFile:
        json.dump(queriesPerTopic, jsonFile, indent=4)


# The pyramid folder is only listed here; its entry names provide the topic IDs.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
PYRAMIDS_FOLDERPATH = 'C:/Users/user/Data/DUC/2006_unzipped/allpyramids' # to get the relevant topic IDs
# Output file for the static top-keyphrase queries.
JSON_OUTPUT_PATH = 'simulationQueriesTopKeyphraseStatic.json'
# List the topics, then write the per-topic query lists as JSON.
topicIds = getTopicIds(PYRAMIDS_FOLDERPATH)
createKeyphraseQueriesJson(topicIds, JSON_OUTPUT_PATH)