In [1]:
import pandas as pd
from os import listdir
from os.path import isfile, join

# Folder with the crowdsourcing input CSVs; every regular file in it is read.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
input_folder = "/home/anca/Documents/frames/data/crowdsourcing/input/fn_corpus/"
# listdir() returns names in arbitrary order, so sort them to keep the order of
# `sent_ids` the same from one run to the next.
onlyfiles = sorted(f for f in listdir(input_folder) if isfile(join(input_folder, f)))

# All values of the "sent_id" column, file after file.
sent_ids = []

for filename in onlyfiles:
    input_sents = pd.read_csv(join(input_folder, filename))
    # extend() grows the list in place instead of rebuilding it for every file.
    sent_ids.extend(input_sents["sent_id"])
# Extra label besides the real sentence ids; FramesConfig.processJudgments
# rewrites the answer 'None of the above.' to this value.
sent_ids.append("none")

In [2]:
# Read data: locate the crowdsourcing output and make the project modules importable.

# Folder with the frame-centric crowdsourcing results; handed to processFile() further down.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
frames_folder = "/home/anca/Documents/frames/data/crowdsourcing/output/frame-centric/fn_corpus/"
# Leftover from reading a single results file directly; not used any more.
#frames_dataset = pd.read_csv(frames_filename)

#frames_dataset.head(n=10)
import sys
# Keep a handle on the current stdout; a later cell assigns it back to sys.stdout.
stdout = sys.stdout

# Add the directory two levels up to the import path so that `defaultconfig` can be
# imported — presumably the root of the project checkout; TODO confirm.
sys.path.append('../../')

from defaultconfig import Configuration

class FramesConfig(Configuration):
    """Configuration of the frame-centric sentence selection task.

    The task is closed: the set of possible answers is fixed and equals
    `sent_ids` (the sentence ids collected from the input files plus "none").
    """

    # Columns of the results file that describe the unit shown to the workers.
    inputColumns = [
        'Input.frame', 'Input.definition', 'Input.examples', 'Input.elements', 'Input.sentences', 'Input.sent_ids',
        'Input.words', 'Input.b', 'Input.e', 'Input.nr_sentences']
    # Column of the results file that holds the workers' answers.
    outputColumns = ["Answer.PhraseType"]

    # Closed task with a fixed annotation vector.
    open_ended_task = False
    annotation_vector = sent_ids

    def processJudgments(self, judgments):
        """Normalise the answer columns of `judgments` and return the same frame.

        Every value of every output column is converted to a string, '|' is
        replaced by ',' and the answer 'None of the above.' is replaced by
        'none'. The columns are overwritten in place.
        """
        def normalise(raw_answer):
            # Same two substitutions, in the same order, applied in one pass.
            with_commas = str(raw_answer).replace('|', ',')
            return with_commas.replace('None of the above.', 'none')

        for answer_col in self.outputColumns:
            judgments[answer_col] = judgments[answer_col].apply(normalise)
        return judgments

from controllers.inputController import processFile

# Instantiate the task configuration, then pre-process the data found in
# `frames_folder` and create the annotation vectors.
config = FramesConfig()
pre_processed_results = processFile(root=".", directory=frames_folder, filename="", config=config)

In [4]:
# %%debug 

# Point sys.stdout back at the handle saved in the setup cell — presumably because an
# earlier step replaced it and the progress lines would otherwise not show; TODO confirm.
sys.stdout = stdout

# Run the metrics on the pre-processed data. Later cells read the "units", "workers",
# "annotations" and "judgments" tables from the returned `processed_results`.
from models import Metrics
processed_results = Metrics.run(pre_processed_results, config)

1 iterations; max d= 1.0 ; wqs d= 0.801315690494; sqs d= 0.701970166652; rqs d= 0.687076517569
2 iterations; max d= 0.346226001147 ; wqs d= 0.107201253875; sqs d= 0.0437590018865; rqs d= 0.0553910053745
3 iterations; max d= 0.119485792687 ; wqs d= 0.017063944617; sqs d= 0.0152232793072; rqs d= 0.0134987804602
4 iterations; max d= 0.0829002408177 ; wqs d= 0.0103144943069; sqs d= 0.0106742078086; rqs d= 0.00637821755235
5 iterations; max d= 0.0483559051397 ; wqs d= 0.00371415229266; sqs d= 0.00330788662148; rqs d= 0.00247583435074
6 iterations; max d= 0.0222987011649 ; wqs d= 0.00246389552442; sqs d= 0.00273180238475; rqs d= 0.00160762609392
7 iterations; max d= 0.015269701391 ; wqs d= 0.000988202272523; sqs d= 0.000759313868489; rqs d= 0.000615491600808
8 iterations; max d= 0.0065461908613 ; wqs d= 0.000635496531906; sqs d= 0.000714061061573; rqs d= 0.000427385567402
9 iterations; max d= 0.0044201705288 ; wqs d= 0.000269217993246; sqs d= 0.000179941954747; rqs d= 0.000165883844683
10 it

In [5]:
# Show the per-annotation table: one row per sentence id with its `aqs` score.
processed_results["annotations"]

Unnamed: 0,output.PhraseType,aqs
FNC-0031,12020,0.343675
FNC-0036,12020,0.510825
FNC-0054,12020,0.456762
FNC-0057,12020,0.635841
FNC-0059,12020,0.349443
FNC-0060,12020,0.615425
FNC-0061,12020,0.294251
FNC-0062,12020,0.461565
FNC-0080,12020,0.264401
FNC-0082,12020,0.697545


In [6]:
import numpy as np

# NOTE: neither of these two names is read by the loop below; they are kept
# because they are notebook-level names.
unique_frames = np.unique(processed_results["units"]["input.frame"])
unique_sentences = config.annotation_vector

# Parallel lists with one entry per (unit, sentence) pair; they become the
# columns of `sent_frame_res`.
frames = []
sentences = []
numerator = []
denominator = []
judgments = []

# Looked up once instead of on every iteration of the inner loops.
units_table = processed_results["units"]
worker_wqs = processed_results["workers"]["wqs"]

for idx in units_table.index:
    frame_sent_arr = units_table["input.sent_ids"][idx].split(",")
    # Frame name without the 'f:' marker, lower-cased.
    frame_name = units_table["input.frame"][idx].replace('f:', '').lower()

    # Judgments of this unit; only the first row per index label is kept so
    # that every `.loc[jdx]` below returns exactly one row.
    frame_judg = processed_results["judgments"].loc[processed_results["judgments"]["unit"] == idx]
    frame_judg = frame_judg[~frame_judg.index.duplicated(keep='first')]

    # Vote vector and worker quality score (wqs) of every judgment. Both are
    # the same for all sentences of the unit, so they are fetched only once.
    votes = [frame_judg.loc[jdx]["output.PhraseType"] for jdx in frame_judg.index]
    weights = [worker_wqs[frame_judg["worker"][jdx]] for jdx in frame_judg.index]

    # Sum of the worker weights. Accumulated with an explicit loop starting at
    # 0.0 so the value is exactly what a per-sentence recomputation would give.
    den = 0.0
    for weight in weights:
        den += weight
    judgment_count = len(frame_judg.index)

    for sent in frame_sent_arr:
        frames.append(frame_name)
        sentences.append(sent)

        # Votes for this sentence, each weighted by the quality of its worker.
        num = 0.0
        for vote, weight in zip(votes, weights):
            num += vote[sent] * weight
        numerator.append(num)
        denominator.append(den)
        judgments.append(judgment_count)

sent_frame_res = pd.DataFrame({
    "frame" : frames,
    "sent" : sentences,
    "num" : numerator,
    "den" : denominator,
    "judgments" : judgments
})

In [7]:
# Add up the weighted votes ("num"), the worker weights ("den") and the judgment
# counts of all rows of `sent_frame_res` that share the same (frame, sentence) pair.
agg_res = sent_frame_res.groupby(["frame", "sent"]).sum()
agg_res

Unnamed: 0_level_0,Unnamed: 1_level_0,den,judgments,num
frame,sent,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
abounding_with,FNC-0728,4.631365,15,1.62828
abounding_with,FNC-0749,4.631365,15,2.423163
abounding_with,FNC-2632,4.631365,15,1.26414
accoutrements,FNC-0144,4.733359,15,0.576845
accoutrements,FNC-0824,4.733359,15,0.0
accoutrements,FNC-0887,4.733359,15,2.576313
accoutrements,FNC-1487,4.733359,15,0.0
accoutrements,FNC-2539,4.733359,15,0.973435
accoutrements,FNC-2881,4.733359,15,0.0
accoutrements,FNC-2889,4.733359,15,0.684219


In [8]:
# Write the per (frame, sentence) sums to disk; the group keys are saved as the index columns.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
agg_res.to_csv(
    "/home/anca/Documents/frames/data/crowdsourcing/output/frame-centric/fn_corpus.csv"
)

In [9]:
# Keep the first row for every index label of the annotations table, store the
# result back into `processed_results` and write it to disk.
annotations_table = processed_results["annotations"]
is_repeat = annotations_table.index.duplicated(keep='first')
processed_results["annotations"] = annotations_table[~is_repeat]
processed_results["annotations"].to_csv(
    "/home/anca/Documents/frames/data/crowdsourcing/output/frame-centric/fn_corpus_sqs.csv"
)

In [10]:
# Keep the first row for every index label of the units table, store the
# result back into `processed_results` and write it to disk.
units_table = processed_results["units"]
is_repeat = units_table.index.duplicated(keep='first')
processed_results["units"] = units_table[~is_repeat]
processed_results["units"].to_csv(
    "/home/anca/Documents/frames/data/crowdsourcing/output/frame-centric/fn_corpus_fqs.csv"
)

In [11]:
# Keep the first row for every index label of the workers table, store the
# result back into `processed_results` and write it to disk.
workers_table = processed_results["workers"]
is_repeat = workers_table.index.duplicated(keep='first')
processed_results["workers"] = workers_table[~is_repeat]
processed_results["workers"].to_csv(
    "/home/anca/Documents/frames/data/crowdsourcing/output/frame-centric/fn_corpus_wqs.csv"
)

In [12]:
# Keep the first row for every index label of the judgments table, store the
# result back into `processed_results` and write it to disk.
judgments_table = processed_results["judgments"]
is_repeat = judgments_table.index.duplicated(keep='first')
processed_results["judgments"] = judgments_table[~is_repeat]
processed_results["judgments"].to_csv(
    "/home/anca/Documents/frames/data/crowdsourcing/output/frame-centric/fn_corpus_judgments.csv"
)

In [13]:
# Per-frame score, in the same order as `frames`.
#
# For every unit of a frame, every ordered pair (i, j) of distinct judgments and
# every sentence that judgment i selected, the loop accumulates
#     nominator   += aqs(sent) * vote_i(sent) * wqs_i * vote_j(sent) * wqs_j
#     denominator += aqs(sent) * vote_i(sent) * wqs_i * wqs_j
# and the score of the frame is nominator / denominator.
rel_score = list()

# Looked up once instead of inside the innermost loop.
# The scalar lookups below assume unique index labels in the annotations and
# workers tables (the export cells above drop duplicates from both).
units_table = processed_results["units"]
judgments_table = processed_results["judgments"]
sentence_ids = processed_results["annotations"].index
sentence_aqs = processed_results["annotations"]["aqs"]
worker_wqs = processed_results["workers"]["wqs"]

frames = np.unique(units_table["input.frame"])
for fr in frames:
    frame_units = units_table.loc[units_table["input.frame"] == fr]
    nominator = 0.0
    denominator = 0.0

    for unit_id in frame_units.index:
        frame_judg = judgments_table.loc[judgments_table["unit"] == unit_id]
        # Same guard as in the per-sentence cell: one row per index label, so
        # this cell does not depend on the judgments export cell having run.
        frame_judg = frame_judg[~frame_judg.index.duplicated(keep='first')]

        # Vote vector and worker quality score of every judgment of the unit.
        votes = frame_judg["output.PhraseType"].tolist()
        weights = [worker_wqs.loc[worker] for worker in frame_judg["worker"]]

        for i, (vote_i, weight_i) in enumerate(zip(votes, weights)):
            for sent_id in sentence_ids:
                if vote_i[sent_id] > 0:
                    # Factor shared by both sums for all partners j. The
                    # multiplication order is kept so the rounding is unchanged.
                    base = sentence_aqs.loc[sent_id] * vote_i[sent_id] * weight_i
                    for j, (vote_j, weight_j) in enumerate(zip(votes, weights)):
                        if i != j:
                            nominator += base * vote_j[sent_id] * weight_j
                            denominator += base * weight_j

    # No contributing pair of judgments (e.g. a single judgment per unit):
    # report NaN instead of failing on a division by zero.
    if denominator == 0:
        score = float("nan")
    else:
        score = nominator / denominator
    rel_score.append(score)
    # A single parenthesised argument prints the same on Python 2 and Python 3.
    print(fr + " = " + str(score) + " = " + str(nominator) + " / " + str(denominator))

f:Abounding_with = 0.405908258719 = 0.829791183226 / 2.04428258209
f:Accoutrements = 0.325391805101 = 1.03327143471 / 3.17546852289
f:Achieving_first = 0.681655511638 = 4.1691118372 / 6.11615657179
f:Active_substance = 0.90259594833 = 3.81662023714 / 4.22849254332
f:Activity_resume = 0.643868480279 = 2.47395865226 / 3.84233539618
f:Adding_up = 0.743008531282 = 2.6062206585 / 3.50765913011
f:Adducing = 0.764475860686 = 5.1374271504 / 6.72019538432
f:Adjusting = 0.420513396942 = 1.26648261138 / 3.01175330107
f:Adopt_selection = 0.780339647352 = 3.01111359573 / 3.85872178345
f:Adorning = 0.619690758668 = 4.5288562078 / 7.30825196997
f:Aggregate = 0.258577117853 = 5.46229569152 / 21.1244356688
f:Aiming = 0.619801881179 = 3.45321435022 / 5.57148091201
f:Amalgamation = 0.540590406159 = 3.05619693881 / 5.65344279883
f:Amounting_to = 0.57475744013 = 5.34703940294 / 9.30312342148
f:Appellations = 0.534210612689 = 1.11629919512 / 2.08962377123
f:Appointing = 0.396841412973 = 1.95645684762 / 4.93

In [14]:
# Pair every frame with its score. `rel_score` was filled with one value per entry
# of `frames`, in the same order, by the loop in the previous cell.
frame_quality_prob = pd.DataFrame({
    "frame": frames,
    "fqs" : rel_score
})
frame_quality_prob

Unnamed: 0,fqs,frame
0,0.725990,f:Achieving_first
1,0.863036,f:Adducing
2,0.756714,f:Adorning
3,0.385290,f:Aggregate
4,0.562265,f:Aiming
5,0.695184,f:Amalgamation
6,0.707989,f:Amounting_to
7,0.522341,f:Appellations
8,0.648963,f:Architectural_part
9,0.501854,f:Arranging


In [15]:
# Write the frame scores to disk without the positional row index.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
frame_quality_prob.to_csv(
    "/home/anca/Documents/frames/data/crowdsourcing/output/frame-centric/fn_corpus_fqs_prob.csv",
    index = False
)