# Process raw results

We add the judgment time and convert the dataset from wide to long format (one row per paraphrase) so that each paraphrase can be tagged for correctness.

In [1]:
import ast
import os
import pathlib

# let's just make sure we are at the root
# NOTE: this moves the working directory to the parent of whatever the current
# directory is, so re-running this cell climbs one more level each time.
os.chdir(pathlib.Path().absolute().parent)
os.getcwd()

'/Users/jorge/Development/work/lyon/ConversationalAI'

In [2]:
%matplotlib inline

pwd = os.getcwd()
%env CORENLP_HOME $pwd/lib/corenlp

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
from lib.preprocessing import text

env: CORENLP_HOME=/Users/jorge/Development/work/lyon/ConversationalAI/lib/corenlp


2021-11-20 13:47:16 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |

2021-11-20 13:47:16 INFO: Use device: cpu
2021-11-20 13:47:16 INFO: Loading: tokenize
2021-11-20 13:47:16 INFO: Loading: pos
2021-11-20 13:47:16 INFO: Loading: lemma
2021-11-20 13:47:16 INFO: Done loading processors!
2021-11-20 13:47:16 INFO: Writing properties to tmp file: corenlp_server-e7aa5e1738164ed8.props


In [36]:
# Load the wide-format results (the three paraphrases sit in separate columns).
df = pd.read_csv("exps/input/main-all-wide.csv")
df.shape

(2040, 54)

In [37]:
# let's add the judgment time column (stored in seconds)
#
# The original plan was to read it from the trace, but on_focus_time did not work
# and Toloka did not manage to store the TRACE column, so that code stays disabled:
#df["OUTPUT:trace"] = df["OUTPUT:trace"].apply(lambda t: json.loads(t))
#df["judgment_time"] = df["OUTPUT:trace"].apply(lambda t: t["total_task_time"]["milliseconds"] / 1000)

# Instead we use the assignment timestamps from Toloka.
# The unit is spelled out because pandas >= 2.0 refuses to cast to a unit-less
# 'datetime64' dtype (TypeError); 'datetime64[ns]' works on old and new versions.
df["ASSIGNMENT:submitted"] = df["ASSIGNMENT:submitted"].astype("datetime64[ns]")
df["ASSIGNMENT:started"] = df["ASSIGNMENT:started"].astype("datetime64[ns]")

# judgment_time_3p: seconds between start and submission of the whole assignment
# (three paraphrases); judgment_time_1p: the average per paraphrase.
df["judgment_time_3p"] = (df["ASSIGNMENT:submitted"] - df["ASSIGNMENT:started"]).dt.total_seconds()
df["judgment_time_1p"] = df["judgment_time_3p"] / 3
df.head()

Unnamed: 0,INPUT:tag,INPUT:row_pk,INPUT:intent,INPUT:input_utterance,INPUT:parameters,OUTPUT:paraphrase_0,OUTPUT:paraphrase_1,OUTPUT:paraphrase_2,INPUT:source,INPUT:pool_id,...,ASSIGNMENT:status,ASSIGNMENT:started,ASSIGNMENT:submitted,ASSIGNMENT:accepted,ASSIGNMENT:rejected,ASSIGNMENT:skipped,ASSIGNMENT:expired,ASSIGNMENT:reward,judgment_time_3p,judgment_time_1p
0,,5,CheckDevice,"""are the burglars alarms in the office good?""","[['location', 'office']]",are the burglars alarms in the office works?,are the burglars alarms in the office cool?,are the buglars alarms in the office the awesone?,ParaQuality,baseline,...,APPROVED,2021-11-16 17:14:53.853,2021-11-16 17:19:11.537,2021-11-16T17:19:11.537,,,,0.15,257.684,85.894667
1,,5,CheckDevice,"""are the burglars alarms in the office good?""","[['location', 'office']]",Does the office have good burglars alarms?,Is the security alarm in the office fine?,Does the office have good signaling?,ParaQuality,baseline,...,APPROVED,2021-11-16 17:15:45.418,2021-11-16 17:33:09.950,2021-11-16T17:33:09.950,,,,0.15,1044.532,348.177333
2,,5,CheckDevice,"""are the burglars alarms in the office good?""","[['location', 'office']]",does are the burglars alarms good in the office?,does in the office the burglars good?,what is the good in the office?,ParaQuality,baseline,...,APPROVED,2021-11-16 17:16:52.852,2021-11-16 17:19:54.037,2021-11-16T17:19:54.037,,,,0.15,181.185,60.395
3,,5,CheckDevice,"""are the burglars alarms in the office good?""","[['location', 'office']]",has the office a good alarm system?,is the alarm system in the office good?,is the security in the office good?,ParaQuality,baseline,...,APPROVED,2021-11-16 17:42:07.972,2021-11-16 17:50:27.700,2021-11-16T17:50:27.700,,,,0.15,499.728,166.576
4,,13,NavigatePlaylist,"""skip to previous song""",[],jump to previous song,skip to previous tune,jump to past song,ParaQuality,baseline,...,APPROVED,2021-11-16 17:43:11.843,2021-11-16 17:44:44.343,2021-11-16T17:44:44.343,,,,0.15,92.5,30.833333


In [38]:
# wide to long
def column_mask(column: str):
  """Return True when ``column`` carries the ``GOLDEN:`` or ``HINT:`` prefix."""
  return column.startswith(("GOLDEN:", "HINT:"))

# Drop every GOLDEN:/HINT: column from the wide frame (in place).
to_remove = [name for name in df.columns if column_mask(name)]
df.drop(columns=to_remove, inplace=True)

df.columns

Index(['INPUT:tag', 'INPUT:row_pk', 'INPUT:intent', 'INPUT:input_utterance',
       'INPUT:parameters', 'OUTPUT:paraphrase_0', 'OUTPUT:paraphrase_1',
       'OUTPUT:paraphrase_2', 'INPUT:source', 'INPUT:pool_id', 'INPUT:rand_cw',
       'INPUT:is_correct', 'INPUT:input_pattern', 'INPUT:masked_ngrams',
       'INPUT:rand_baseline', 'INPUT:prompt_context', 'INPUT:target_patterns',
       'INPUT:distance_to_mean', 'INPUT:distance_to_seed',
       'INPUT:input_utterance_words', 'INPUT:input_utterance_bootstrap',
       'INPUT:input_utterance_lemmatized_words', 'OUTPUT:trace',
       'OUTPUT:worker_uuid', 'OUTPUT:screen_width', 'OUTPUT:screen_height',
       'OUTPUT:page_started_at', 'OUTPUT:provided_ngrams',
       'OUTPUT:page_started_at_string', 'ASSIGNMENT:link',
       'ASSIGNMENT:task_id', 'ASSIGNMENT:assignment_id',
       'ASSIGNMENT:task_suite_id', 'ASSIGNMENT:worker_id', 'ASSIGNMENT:status',
       'ASSIGNMENT:started', 'ASSIGNMENT:submitted', 'ASSIGNMENT:accepted',
       'ASSIGN

In [39]:
# The three paraphrase columns are melted into rows; every other column is kept
# as an identifier and repeated on each of the resulting rows.
value_vars = ["OUTPUT:paraphrase_0", "OUTPUT:paraphrase_1", "OUTPUT:paraphrase_2"]

def id_var_mask(column: str):
  """Return True for columns that are kept as identifiers (i.e. not melted)."""
  return column not in value_vars

id_vars = [name for name in df.columns if id_var_mask(name)]
df_long = (
  pd.melt(df, id_vars=id_vars, value_vars=value_vars)
  .rename(columns={"variable": "paraphrase_key", "value": "paraphrase_value"})
  .sort_values(by=["INPUT:row_pk", "ASSIGNMENT:worker_id"])
)
df_long.shape

(6120, 43)

In [41]:
# add paraphrase_patterns column
# and update input_pattern just in case
# we turn it to canonical form first as this is what we did in the UI as well before calling our backend services.


def get_pattern(paraphrase: str):
  """Return the parse template of ``paraphrase`` as a single string.

  The pieces returned by ``text.get_parse_template`` are joined with single
  spaces (presumably a sequence of bracket/label tokens -- see that helper).
  """
  return " ".join(text.get_parse_template(paraphrase))


from typing import List

def get_key_for_parse(parameter_name, idx):
  """Build the placeholder token that replaces a parameter value.

  The parameter name is wrapped in its positional index on both sides and the
  result is lowercased, e.g. ``("VM", 0)`` -> ``"0vm0"``.
  """
  return f"{idx}{parameter_name}{idx}".lower()


def to_canonical(input_utterance: str, parameters: List[List[str]], get_key_fn=get_key_for_parse):
  """Replace parameter values in an utterance with placeholder keys.

  We need to parameterize the utterance to make tokenization work for
  parameters.

  Args:
    input_utterance: the text to canonicalize.
    parameters: ``[name, value]`` pairs; empty entries are ignored.
    get_key_fn: callable ``(name, position) -> placeholder``.

  Returns:
    ``(canonical_utterance, skips)`` where ``skips`` counts the parameters
    whose value could not be found in the utterance.

  Only the first occurrence of each value is replaced. The search is
  case-insensitive to avoid issues with parameter values that were
  lower/upper cased by workers in their paraphrases.
  """
  iu_canonical: str = input_utterance
  skips = 0

  for i, p in enumerate(parameters):
    if len(p) == 0:
      continue
    p_name, p_val = p
    needle = p_val.lower()
    # str.find instead of try/except around str.index: a missing value is the
    # only condition we want to tolerate, any other error must surface.
    pos = iu_canonical.lower().find(needle)
    if pos < 0:
      print("\tvalue " + needle + " not found... we SKIP")
      skips += 1
      continue
    iu_canonical = iu_canonical[:pos] + get_key_fn(p_name, i) + iu_canonical[pos + len(p_val):]
  return iu_canonical, skips


# add paraphrase_pattern column (and recompute INPUT:input_pattern the same way)
input_patterns_rows = []
paraphrase_patterns_rows = []
skips = 0

for idx, row in df_long.iterrows():
  # INPUT:parameters is the text of a Python list literal such as
  # "[['location', 'office']]" or "[]". ast.literal_eval only accepts
  # literals, so a malformed or malicious cell cannot execute code the way
  # eval() would.
  parameters = ast.literal_eval(row["INPUT:parameters"])

  # Only skips in the worker's paraphrase are counted.
  p_canonical, local_skips = to_canonical(row["paraphrase_value"], parameters)
  skips += local_skips
  paraphrase_patterns_rows.append(get_pattern(p_canonical))

  iu_canonical, _ = to_canonical(row["INPUT:input_utterance"], parameters)
  input_patterns_rows.append(get_pattern(iu_canonical))
print("Processing done... skipped parameters " + str(skips))

# The lists were filled in df_long's row order, so positional assignment lines up.
df_long["paraphrase_pattern"] = paraphrase_patterns_rows
df_long["INPUT:input_pattern"] = input_patterns_rows

2021-11-20 15:23:23 INFO: Starting server with command: java -Xmx5G -cp /Users/jorge/Development/work/lyon/ConversationalAI/lib/corenlp/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 10000 -timeout 30000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-e7aa5e1738164ed8.props -annotators tokenize,ssplit,parse -preload -outputFormat json


	value 7-daynot found... we SKIP
Processing done... skipped parameters 1


In [42]:
# Preview the long frame, including the pattern columns computed above.
df_long.head()

Unnamed: 0,INPUT:tag,INPUT:row_pk,INPUT:intent,INPUT:input_utterance,INPUT:parameters,INPUT:source,INPUT:pool_id,INPUT:rand_cw,INPUT:is_correct,INPUT:input_pattern,...,ASSIGNMENT:accepted,ASSIGNMENT:rejected,ASSIGNMENT:skipped,ASSIGNMENT:expired,ASSIGNMENT:reward,judgment_time_3p,judgment_time_1p,paraphrase_key,paraphrase_value,paraphrase_pattern
667,,1,EndEC2Instance,"""Finish i-a541 now""","[['VM', 'i-a541']]",ParaQuality,baseline-cw,0.947585,1,( ROOT ( NP ( FRAG ) ) ),...,2021-11-18T08:14:26.315,,,,0.15,97.699,32.566333,OUTPUT:paraphrase_0,Get over with i-a541,( ROOT ( S ( VP ) ) )
2707,,1,EndEC2Instance,"""Finish i-a541 now""","[['VM', 'i-a541']]",ParaQuality,baseline-cw,0.947585,1,( ROOT ( NP ( FRAG ) ) ),...,2021-11-18T08:14:26.315,,,,0.15,97.699,32.566333,OUTPUT:paraphrase_1,Get i-a541 done,( ROOT ( S ( VP ) ) )
4747,,1,EndEC2Instance,"""Finish i-a541 now""","[['VM', 'i-a541']]",ParaQuality,baseline-cw,0.947585,1,( ROOT ( NP ( FRAG ) ) ),...,2021-11-18T08:14:26.315,,,,0.15,97.699,32.566333,OUTPUT:paraphrase_2,Do I-a541,( ROOT ( NP ( NNP ) ( NNPS ) ) )
1686,,1,EndEC2Instance,Terminate i-a541 now,"[['VM', 'i-a541']]",ParaQuality,taboo-patterns,0.244053,1,( ROOT ( FRAG ( NP ) ( ADVP ) ) ),...,2021-11-18 16:48:51,,,,0.15,49.0,16.333333,OUTPUT:paraphrase_0,I want to visit terminate i-a541,( ROOT ( S ( NP ) ( VP ) ) )
3726,,1,EndEC2Instance,Terminate i-a541 now,"[['VM', 'i-a541']]",ParaQuality,taboo-patterns,0.244053,1,( ROOT ( FRAG ( NP ) ( ADVP ) ) ),...,2021-11-18 16:48:51,,,,0.15,49.0,16.333333,OUTPUT:paraphrase_1,Is terminate i-a541 opened?,( ROOT ( SQ ( VBZ ) ( NP ) ( VP ) ( . ) ) )


Let's do some data cleaning. We remove the punctuation node `( . )` from the patterns of the provided paraphrases and also from the input patterns.

In [43]:
# Flag the rows whose paraphrase pattern contains a " ( . )" node.
to_remove_mask = df_long["paraphrase_pattern"].str.contains(" \\( \\. \\)")
df_long.loc[to_remove_mask].shape

(2342, 44)

In [44]:
# Patterns of the flagged rows, before cleaning.
df_long.loc[to_remove_mask, "paraphrase_pattern"]

3726    ( ROOT ( SQ ( VBZ ) ( NP ) ( VP ) ( . ) ) )
1423                    ( ROOT ( S ( VP ) ( . ) ) )
3463                    ( ROOT ( S ( VP ) ( . ) ) )
5503                    ( ROOT ( S ( VP ) ( . ) ) )
993                     ( ROOT ( S ( VP ) ( . ) ) )
                           ...                     
2421     ( ROOT ( SBARQ ( WHADVP ) ( SQ ) ( . ) ) )
4461     ( ROOT ( SQ ( MD ) ( NP ) ( VP ) ( . ) ) )
379        ( ROOT ( SBARQ ( WHNP ) ( SQ ) ( . ) ) )
2419       ( ROOT ( SBARQ ( WHNP ) ( SQ ) ( . ) ) )
4459       ( ROOT ( SBARQ ( WHNP ) ( SQ ) ( . ) ) )
Name: paraphrase_pattern, Length: 2342, dtype: object

In [45]:
# Strip the literal " ( . )" node, then confirm that no pattern contains it any more.
df_long["paraphrase_pattern"] = df_long["paraphrase_pattern"].str.replace(" ( . )", "", regex=False)
df_long.loc[df_long["paraphrase_pattern"].str.contains(" \\( \\. \\)")].shape

(0, 44)

In [46]:
# Same rows as flagged before the replacement, now without the " ( . )" node.
df_long.loc[to_remove_mask, "paraphrase_pattern"]

3726    ( ROOT ( SQ ( VBZ ) ( NP ) ( VP ) ) )
1423                    ( ROOT ( S ( VP ) ) )
3463                    ( ROOT ( S ( VP ) ) )
5503                    ( ROOT ( S ( VP ) ) )
993                     ( ROOT ( S ( VP ) ) )
                        ...                  
2421     ( ROOT ( SBARQ ( WHADVP ) ( SQ ) ) )
4461     ( ROOT ( SQ ( MD ) ( NP ) ( VP ) ) )
379        ( ROOT ( SBARQ ( WHNP ) ( SQ ) ) )
2419       ( ROOT ( SBARQ ( WHNP ) ( SQ ) ) )
4459       ( ROOT ( SBARQ ( WHNP ) ( SQ ) ) )
Name: paraphrase_pattern, Length: 2342, dtype: object

In [47]:
# Flag the rows whose input pattern contains a " ( . )" node
# (this rebinds to_remove_mask, replacing the paraphrase mask).
to_remove_mask = df_long["INPUT:input_pattern"].str.contains(" \\( \\. \\)")
df_long.loc[to_remove_mask].shape

(1860, 44)

In [48]:
# Input patterns of the flagged rows, before cleaning.
df_long.loc[to_remove_mask, "INPUT:input_pattern"]

1009            ( ROOT ( S ( `` ) ( VP ) ( . ) ( '' ) ) )
3049            ( ROOT ( S ( `` ) ( VP ) ( . ) ( '' ) ) )
5089            ( ROOT ( S ( `` ) ( VP ) ( . ) ( '' ) ) )
1015            ( ROOT ( S ( `` ) ( VP ) ( . ) ( '' ) ) )
3055            ( ROOT ( S ( `` ) ( VP ) ( . ) ( '' ) ) )
                              ...                        
2657          ( ROOT ( S ( `` ) ( SBAR ) ( VP ) ( . ) ) )
4697          ( ROOT ( S ( `` ) ( SBAR ) ( VP ) ( . ) ) )
452     ( ROOT ( SBARQ ( `` ) ( WHNP ) ( SQ ) ( . ) ( ...
2492    ( ROOT ( SBARQ ( `` ) ( WHNP ) ( SQ ) ( . ) ( ...
4532    ( ROOT ( SBARQ ( `` ) ( WHNP ) ( SQ ) ( . ) ( ...
Name: INPUT:input_pattern, Length: 1860, dtype: object

In [50]:
# Strip the literal " ( . )" node from the input patterns and confirm none is left.
df_long["INPUT:input_pattern"] = df_long["INPUT:input_pattern"].str.replace(" ( . )", "", regex=False)
df_long.loc[df_long["INPUT:input_pattern"].str.contains(" \\( \\. \\)")].shape

(0, 44)

In [51]:
# Same rows as flagged before the replacement, now without the " ( . )" node.
df_long.loc[to_remove_mask, "INPUT:input_pattern"]

1009                 ( ROOT ( S ( `` ) ( VP ) ( '' ) ) )
3049                 ( ROOT ( S ( `` ) ( VP ) ( '' ) ) )
5089                 ( ROOT ( S ( `` ) ( VP ) ( '' ) ) )
1015                 ( ROOT ( S ( `` ) ( VP ) ( '' ) ) )
3055                 ( ROOT ( S ( `` ) ( VP ) ( '' ) ) )
                              ...                       
2657               ( ROOT ( S ( `` ) ( SBAR ) ( VP ) ) )
4697               ( ROOT ( S ( `` ) ( SBAR ) ( VP ) ) )
452     ( ROOT ( SBARQ ( `` ) ( WHNP ) ( SQ ) ( '' ) ) )
2492    ( ROOT ( SBARQ ( `` ) ( WHNP ) ( SQ ) ( '' ) ) )
4532    ( ROOT ( SBARQ ( `` ) ( WHNP ) ( SQ ) ( '' ) ) )
Name: INPUT:input_pattern, Length: 1860, dtype: object

In [52]:
# Write the long-format frame to disk; index=False leaves the row index out of the file.
df_long.to_csv("exps/output/main-all.csv", index=False)