This notebook reads data from a file in a specific (ASR'd) format,
queries a pre-existing alignment matrix, and generates an expanded vocabulary for each question. The bottom half takes questions that are broken up into sentences and merges each one into a single question.

# Generating Expanded Data

In [9]:
import json
import numpy as np
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
from typing import List, Dict, NamedTuple, Tuple
from tqdm import tqdm



In [None]:
class Translation(NamedTuple):
    """A candidate word paired with its probability.

    Instances can be added together (and therefore passed to ``sum``): the
    words are joined with ``', '`` and the probabilities are added.
    """
    word: str
    prob: float

    def __add__(self, other):
        """Merge two translations into a single combined ``Translation``."""
        merged_word = ', '.join((self.word, other.word))
        merged_prob = self.prob + other.prob
        return Translation(merged_word, merged_prob)

    def __radd__(self, other):
        """Support ``sum``, whose implicit start value is the integer 0."""
        # Any other left operand is treated exactly like ``self + other``.
        return self if other == 0 else self.__add__(other)


class TPM(defaultdict):
    """Translation probability matrix: a mapping from a word to its candidate translations."""

    def up_to(self, k: str, p: float = 0.9) -> List[Translation]:
        """Collect the most probable translations of ``k`` until their total probability reaches ``p``.

        Candidates are consumed in descending probability order. If they run
        out before the running total reaches ``p`` (which includes the case of
        a word with no candidates at all), the list lookup raises
        ``IndexError``.
        """
        ranked = self.sorted(k)
        chosen = []
        mass = 0.
        position = 0
        while mass < p:
            # Deliberately unguarded: an exhausted list surfaces as IndexError.
            candidate = ranked[position]
            chosen.append(candidate)
            mass += candidate.prob
            position += 1
        return chosen

    def sorted(self, k: str) -> List[Translation]:
        """Return a new list with the translations stored under ``k``, most probable first.

        The lookup goes through ``self[k]``, so on an instance created with a
        default factory an unseen key is inserted as a side effect.
        """
        ranked = list(self[k])
        ranked.sort(key=lambda t: t.prob, reverse=True)
        return ranked


def read_translation_probability_matrix(fp) -> TPM:
    """Build a ``TPM`` from an iterable of text lines (e.g. an open file).

    Every line must hold exactly three whitespace-separated fields. The second
    field is the lookup key; the first field, together with the third parsed
    as a float, is stored as a ``Translation`` under that key. A line with a
    different number of fields raises ``ValueError`` when unpacked.
    """
    matrix = TPM(list)
    for raw_line in tqdm(fp):
        fields = raw_line.strip().split()
        candidate, key, raw_prob = fields
        matrix[key].append(Translation(candidate, float(raw_prob)))
    return matrix


if __name__ == '__main__':
    # Alignment table to load. An alternative table that was used previously:
    #   /fs/clip-scratch/yogarshi/qanta-joe/align-try2/train/lex.f2e
    LEX_PATH = '/fs/clip-scratch/jdbarrow/qanta/lex.out'
    with open(LEX_PATH) as fp:
        tpm = read_translation_probability_matrix(fp)

### Make sure you specify the correct file

In [None]:
# Question file to expand; edit this path to switch to a different data split.
ASR_INPUT_PATH = '/fs/clip-quiz/dpeskov/data/asr_qanta.test.2018.04.18.json'
with open(ASR_INPUT_PATH) as f:
    data = json.load(f)

In [10]:
# Expansion settings.
MASS_THRESHOLD = .9                 # probability mass requested from tpm.up_to
MAX_EXPANSIONS = 5                  # keep at most this many candidates per word
SKIPPED_TOKENS = ('unk', 'NULL')    # candidates that are never emitted

# Replace every word by its most probable translations. `data` is updated in
# place: each question's 'sentences' and 'confidences' lists are overwritten.
for question in data['questions']:
    new_question = []
    new_question_confidences = []
    for q_index, sentence in enumerate(question['sentences']):
        new_sentence = []
        new_sentence_confidences = []
        for s_index, word in enumerate(sentence):
            try:
                # Look the word up once; both the words and the probabilities
                # are taken from the same candidate list.
                candidates = tpm.up_to(word, MASS_THRESHOLD)[:MAX_EXPANSIONS]
            except (IndexError, KeyError):
                # up_to fails with a lookup error when the word has no entries
                # or its entries never reach the requested mass. In that case
                # keep the original word with its original confidence.
                new_sentence.append(word)
                new_sentence_confidences.append(question['confidences'][q_index][s_index])
                continue

            for candidate in candidates:
                if candidate[0] in SKIPPED_TOKENS:
                    continue
                new_sentence.append(candidate[0])
                # NOTE: for expanded words the stored "confidence" is the
                # translation probability, not the value from the input file.
                new_sentence_confidences.append(candidate[1])

        new_question.append(new_sentence)
        new_question_confidences.append(new_sentence_confidences)
    question['sentences'] = new_question
    question['confidences'] = new_question_confidences

### Specify the write location of the new file

In [11]:
# Report how many questions are about to be saved, then dump them as JSON.
print("Writing out file of length ", len(data['questions']))
EXPANDED_OUTPUT_PATH = '/fs/clip-quiz/dpeskov/data/asr_qanta.test_expanded_v2.2018.04.18.json'
with open(EXPANDED_OUTPUT_PATH, 'w') as fp:
    json.dump({"questions": data['questions']}, fp)

Writing out file of length  2151


# Converting Data from Individual Sentences to One Question

In [12]:
import json
# Input for the joining step: an expanded file with one word list per sentence.
# NOTE(review): this is the dev split, while the expansion cells above read and
# write the test split -- confirm the path matches the file you intend to join.
EXPANDED_INPUT_PATH = '/fs/clip-quiz/dpeskov/data/asr_qanta.dev_expanded.2018.04.18.json'
with open(EXPANDED_INPUT_PATH) as f:
    data = json.load(f)

In [13]:
# Merge the per-sentence lists of every question into one single "sentence".
# `data` is updated in place; each question ends up with exactly one word list
# and one matching confidence list.
for question in data['questions']:
    # Flatten in a single pass. Concatenating with sum(..., []) copies the
    # growing list at every step and is quadratic in the number of words.
    sentences = [word for sentence in question['sentences'] for word in sentence]
    confidences = [value for values in question['confidences'] for value in values]
    question['sentences'] = [sentences]
    question['confidences'] = [confidences]

print (data['questions'][:5])

[{'qnum': 93135, 'sentences': [['five', 'years', 'after', 'a', 'the', 'uh', 'payment', 'payments', 'painting', 'if', 'its', 'is', 'ten', 'pretend', 'million', 'dollars', 'dollars', 'healthy', 'area', 'issue', 'issues', 'you', '<unk>', 'anthony', 'jones', 'joan', 'propose', 'proposed', 'plan', 'planned', 'plans', 'planning', '<unk>', 'power', 'reserves', 'reserve', 'reserved', 'a', 'the', 'uh', 'week', 'weekend', 'weak', 'weeks', 'or', 'are', 'our', 'so', 'that', 'oh', 'i', 'first', 'one', 'had', 'has', 'have', 'to', 'too', 'write', 'right', 'ride', 'split', 'but', 'led', 'into', 'to', 'in', '<unk>', 'in', 'and', 'you', 'the', 'a', 'that', 'future', 'i', 'first', 'one', '<unk>', 'began', 'begin', 'again', 'discussing', 'disgusting', 'this', 'the', 'it', 'plan', 'planned', 'plans', 'planning', 'was', 'with', 'would', 'able', 'to', 'too', 'lobster', 'before', 'for', 'or', 'are', 'our', 'after', 'he', 'you', 'hes', 'died', 'regulator', 'conduct', 'linear', 'moderator', 'with', 'was', 'with

In [14]:
# Report how many questions are about to be saved, then dump them as JSON.
print("Writing out file of length ", len(data['questions']))
JOINED_OUTPUT_PATH = '/fs/clip-quiz/dpeskov/data/asr_qanta.dev_expanded_joined.2018.04.18.json'
with open(JOINED_OUTPUT_PATH, 'w') as fp:
    json.dump({"questions": data['questions']}, fp)

Writing out file of length  1054
