In [1]:
import getpass
import json
import operator
import os
import re
from urllib.parse import quote_plus

import pandas as pd
import pymysql
from sqlalchemy import create_engine

# Prompt for the database password interactively instead of hard-coding it in the notebook.
password = getpass.getpass()

 ········


In [2]:
# Percent-encode the password before embedding it in the database URLs: characters such
# as '@', ':' or '/' would otherwise be parsed as URL delimiters and break the connection.
_db_password = quote_plus(password)

# One engine per schema; both point at the same MySQL server on localhost:3307.
engine_vision = create_engine(f'mysql+pymysql://fnbrasil:{_db_password}@localhost:3307/vision_db', pool_recycle=3600)
engine_fnbrdb = create_engine(f'mysql+pymysql://fnbrasil:{_db_password}@localhost:3307/fnbr_db', pool_recycle=3600)

In [3]:
# Fetch the names (entry.idLanguage = 2) of every frame and every frame element.
frame_rows = pd.read_sql('''
    select frame.idFrame, entry.name
    from frame
    join entry on entry.entry = frame.entry
    where entry.idLanguage = 2;''', engine_fnbrdb)

fe_rows = pd.read_sql('''
    select frameelement.idFrameElement , entry.name
    from frameelement 
    join entry on entry.entry = frameelement.entry
    where entry.idLanguage = 2;''', engine_fnbrdb)

# id -> name lookup tables consumed by get_name().
frame_names = dict(zip(frame_rows["idFrame"], frame_rows["name"]))
fe_names = dict(zip(fe_rows["idFrameElement"], fe_rows["name"]))

In [4]:
def write_to_txt(file_path, series):
    """
    Write a pd.Series into a .txt file, one element per line.

    The file is always encoded as UTF-8 so that non-ASCII sentences are written
    identically regardless of the platform's default locale encoding.

    Args:
    file_path (str): The path to the .txt file.
    series: The pd.Series (or any iterable of str) being stored.
    """
    with open(file_path, "w", encoding="utf-8") as fp:
        fp.writelines(sent + "\n" for sent in series)


def read_jsonl(file_path):
    """
    Read a JSONL file and return a list of dictionaries.

    The file is decoded as UTF-8 and lines containing only whitespace (such as a
    trailing empty line) are skipped instead of raising a JSON decoding error.

    Args:
    file_path (str): The path to the JSONL file.

    Returns:
    list: A list of dictionaries representing JSON objects.
    """
    data = []
    with open(file_path, 'r', encoding="utf-8") as file:
        for line in file:
            if line.strip():
                data.append(json.loads(line))
    return data


def get_name(id, type=None):
    """
    Gets the name of a FrameNet entity.

    Args:
    id: The integer id of the entity or a string with the type and the id
        joined by an underscore (e.g. "frm_12").
    type (str): The type of entity ("frm" or "fe"). Must be specified if id is int.

    Returns:
    The name stored for that id in `frame_names` or `fe_names`.

    Raises:
    ValueError: If a string id does not follow the "<type>_<number>" pattern or
        the entity type is neither "frm" nor "fe".
    KeyError: If the id is not present in the corresponding lookup table.
    """
    if isinstance(id, str) and type is None:
        matches = re.match(r"(\w+)_(\d+)", id)
        if matches is None:
            # Fail with the offending label instead of an AttributeError on None.
            raise ValueError(f"Cannot parse entity label: {id!r}")
        type = matches.group(1)
        id = int(matches.group(2))

    if type == "frm":
        return frame_names[id]
    elif type == "fe":
        return fe_names[id]
    else:
        # Raising a plain string is itself a TypeError; use a real exception.
        raise ValueError(f"Failed to infer entity type (id={id!r}, type={type!r})")


def lome_to_release(annotation):
    """
    Converts a LOME output tree to the release format used in Framed30k.

    Every child of the tree becomes one frame record holding its resolved name,
    its span and the list of its frame elements; children labelled
    "@@PADDING@@" are left out of the frame element list.

    Args:
    annotation (dict): The dict representing the output tree of the instance.

    Returns:
    list: One dict per frame with the keys "id", "span" and "frameElements".
    """
    released = []
    for frame_node in annotation["children"]:
        record = {
            "id": get_name(frame_node["label"]),
            "span": frame_node["span"],
        }

        elements = []
        for fe_node in frame_node["children"]:
            if fe_node["label"] == "@@PADDING@@":
                continue
            elements.append({
                "id": get_name(fe_node["label"]),
                "span": fe_node["span"],
            })

        record["frameElements"] = elements
        released.append(record)
    return released


def to_multi30k_task1(df, set):
    """
    Stores subsets of the dataframe sets according to the Multi30k's task1 splits.

    The split file lists one image per line. Rows of `df` are matched to it on the
    shared "imageFile" column (inner merge, so rows for images outside the split are
    dropped) and written in the order of the split file. Two files are produced:
    the raw sentences and the lower-cased, space-joined tokens. All files are read
    and written as UTF-8, and the output directories are created when missing.

    Args:
    df (pd.DataFrame): The dataframe containing the release data. Needs the
        columns "imageFile", "sentence" and "tokens".
    set (str): The subset being stored. Can be 'train', 'val', 'test_2016_flickr' or 'test_2016_images'.
    """
    split_path = os.path.join("multi30k-dataset", "data", "task1", "image_splits", f"{set}.txt")
    output_base = os.path.join("release", "multi30k", "task1")
    raw_path = os.path.join(output_base, "raw", f"{set}.pt")
    tok_path = os.path.join(output_base, "tok", f"{set}.lc.norm.tok.pt")

    with open(split_path, encoding="utf-8") as fp:
        images = [line.rstrip('\n') for line in fp]

    # "index" holds each image's position in the split file; a stable sort keeps
    # the original relative order of rows that share an image.
    merged = pd.DataFrame(pd.Series(images, name="imageFile")).reset_index().merge(df)
    merged = merged.sort_values("index", kind="stable")

    for path in (raw_path, tok_path):
        os.makedirs(os.path.dirname(path), exist_ok=True)

    # save raw
    with open(raw_path, "w", encoding="utf-8") as fp:
        fp.writelines(sent + "\n" for sent in merged["sentence"])

    # save tok
    with open(tok_path, "w", encoding="utf-8") as fp:
        fp.writelines(' '.join(tokens).lower() + "\n" for tokens in merged["tokens"])


def to_multi30k_task2(df, set):
    """
    Stores subsets of the dataframe sets according to the Multi30k's task2 splits.

    The split file lists one image per line. Rows of `df` are matched to it on the
    shared "imageFile" column and, for each sentence number from 1 to 5, the raw
    sentences and the lower-cased, space-joined tokens are written in the order of
    the split file. All five subsets are validated before any file is written, so a
    failed check leaves no partial output. Files are read and written as UTF-8 and
    the output directories are created when missing.

    Args:
    df (pd.DataFrame): The dataframe containing the release data. Needs the
        columns "imageFile", "sentenceNumber", "sentence" and "tokens".
    set (str): The subset being stored. Can be 'train', 'val', 'test_2016'.

    Raises:
    AssertionError: If some sentence number does not have exactly one row per
        image of the split.
    """
    split_path = os.path.join("multi30k-dataset", "data", "task2", "image_splits", f"{set}_images.txt")
    output_base = os.path.join("release", "multi30k", "task2")
    raw_dir = os.path.join(output_base, "raw")
    tok_dir = os.path.join(output_base, "tok")

    with open(split_path, encoding="utf-8") as fp:
        images = [line.rstrip('\n') for line in fp]

    # "index" holds each image's position in the split file.
    merged = pd.DataFrame(pd.Series(images, name="imageFile")).reset_index().merge(df)
    merged = merged.sort_values("index", kind="stable")

    # Validate every subset up front.
    subsets = {}
    for i in range(1, 6):
        subset = merged[merged["sentenceNumber"] == i]
        assert len(subset) == len(images), (
            f"sentence {i} of split '{set}': {len(subset)} rows for {len(images)} images"
        )
        subsets[i] = subset

    os.makedirs(raw_dir, exist_ok=True)
    os.makedirs(tok_dir, exist_ok=True)

    for i, subset in subsets.items():
        # save raw
        with open(os.path.join(raw_dir, f"{set}.{i}.pt"), "w", encoding="utf-8") as fp:
            fp.writelines(sent + "\n" for sent in subset["sentence"])

        # save tok
        with open(os.path.join(tok_dir, f"{set}.lc.norm.tok.{i}.pt"), "w", encoding="utf-8") as fp:
            fp.writelines(' '.join(tokens).lower() + "\n" for tokens in subset["tokens"])

## PTT sentences

In [5]:
import pymysql
import pandas as pd
from sqlalchemy import create_engine

# Load the 'vision' annotations whose referenced annotation (idRefAnnotation) is also
# referenced by a 'multi30k-german' or 'multi30k-german-2k-sample' annotation.
# The image file and sentence number come from the referenced annotation (aliased `eno`).
ptt = pd.read_sql('''
    select ptt.idAnnotation, image.imageFile, eno.sentenceNumber, ptt.sentence
    from annotation ptt
    join annotation eno on eno.idAnnotation = ptt.idRefAnnotation 
    join image on image.idImage = eno.idImage 
    where ptt.source = 'vision'
    	and exists (
    		select 1
    		from annotation det
    		where source in ('multi30k-german', 'multi30k-german-2k-sample')
    			and det.idRefAnnotation = ptt.idRefAnnotation 
    	);''', engine_vision)

# Shift the stored sentence numbers up by one
# (presumably 0-based in the database and 1-based in the release — TODO confirm).
ptt["sentenceNumber"] += 1

Filter out duplicates for each imageFile#sentenceNumber pair, keeping only the longest sentence.

We expect exactly 31014 records after that operation (this number comes from Multi-Flickr30k)

In [6]:
# For every (imageFile, sentenceNumber) pair keep the row label of its longest sentence.
# groupby visits the pairs in sorted key order, so `idxs` follows that order as well.
idxs = [
    sentences.str.len().idxmax()
    for _, sentences in ptt.groupby(["imageFile", "sentenceNumber"])["sentence"]
]

assert len(idxs) == 31014

ptt = ptt.loc[idxs]

Write to text file to be consumed by LOME.

In [7]:
# One PTT sentence per line, in the current row order of `ptt`.
write_to_txt(file_path="data/ptt.txt", series=ptt["sentence"])

Build the release files, both in the annotated format and in the raw-text formats.

In [8]:
# Order the rows by image and sentence number before the LOME output is attached.
ptt = ptt.sort_values(["imageFile", "sentenceNumber"])

In [9]:
lome_output = read_jsonl("data/ptt.jsonl")

assert len(lome_output) == 31014

# LOME records are attached to the rows purely by position.
ptt["tokens"] = [record["tokens"] for record in lome_output]
ptt["frames"] = [lome_to_release(record["annotation"]) for record in lome_output]

# Export under the column names used by the release.
ptt_release = ptt.rename(columns={
    "idAnnotation": "sentenceId",
    "imageFile": "flickr30kImageId",
    "sentenceNumber": "flickr30kSentenceNumber",
})
ptt_release.to_json("release/framed30k/PTT.jsonl", orient="records", lines=True, index=False)

In [10]:
# Per-sentence label counts: number of frames, and number of frame elements over all frames.
ptt_frames = ptt["frames"]
frm_ptt_labels = ptt_frames.map(len)
fe_ptt_labels = ptt_frames.map(
    lambda frames: sum(len(frame["frameElements"]) for frame in frames)
)
all_ptt_labels = frm_ptt_labels + fe_ptt_labels

for caption, value in [
    ('Frame labels (PTT): ', frm_ptt_labels.sum()),
    ('FE labels (PTT): ', fe_ptt_labels.sum()),
    ('Total labels (PTT): ', all_ptt_labels.sum()),
    ('Avg p/ sent (PTT): ', all_ptt_labels.mean()),
]:
    print(caption, value)

Frame labels (PTT):  193143
FE labels (PTT):  179829
Total labels (PTT):  372972
Avg p/ sent (PTT):  12.025923776359063


In [11]:
# Export PTT for each Multi30k task1 split.
for split_name in ("test_2016_flickr", "train", "val"):
    to_multi30k_task1(ptt, split_name)

## PTO sentences

In [12]:
# Load the 'vision' annotations that do not reference another annotation
# (idRefAnnotation is null), together with the file name of their image.
pto = pd.read_sql('''
    select pto.idAnnotation, image.imageFile, pto.sentence
    from annotation pto
    join image on image.idImage = pto.idImage 
    where pto.source = 'vision' and idRefAnnotation is null;''', engine_vision)

Select the five longest sentences of each image as its reference sentences.

Final dataset must have 158915 records (5 x 31783).

In [13]:
# Row labels of the five longest sentences of every image, longest first within each image.
idxs = [
    row_label
    for _, sentences in pto.groupby("imageFile")["sentence"]
    for row_label in sentences.str.len().nlargest(5).index
]

assert len(idxs) == 158915

pto = pto.loc[idxs]

Randomly assign a sentence number (1–5) to each of the new sentences (PTO).

In [14]:
# The five sentence numbers handed out within each image.
numbers = list(range(1, 6))
# Placeholder value; overwritten for every row by the loop below.
pto["sentenceNumber"] = -1

# NOTE(review): random_state=1234 re-seeds the generator on every call, so every 5-row
# group presumably receives the same positional permutation. As the rows of each image
# were selected with nlargest (longest first), the assigned number would then be a fixed
# function of the length rank rather than independent per image — confirm this is intended.
for _, group in pto.groupby("imageFile"):
    # Shuffle the group's rows and assign 1..5 following the shuffled order
    # (relies on every image having exactly five rows).
    pto.loc[group.sample(frac=1, random_state=1234).index, "sentenceNumber"] = numbers

In [15]:
# Order the rows by image and sentence number; the text export and the LOME output rely on this order.
pto = pto.sort_values(["imageFile", "sentenceNumber"])

Write to text file to be consumed by LOME.

In [16]:
# One PTO sentence per line, in the current row order of `pto`.
write_to_txt(file_path="data/pto.txt", series=pto["sentence"])

Build the release files, both in the annotated format and in the raw-text formats.

In [17]:
lome_output = read_jsonl("data/pto.jsonl")

assert len(lome_output) == 158915

# LOME records are attached to the rows purely by position.
pto["tokens"] = [record["tokens"] for record in lome_output]
pto["frames"] = [lome_to_release(record["annotation"]) for record in lome_output]

# Export under the column names used by the release.
pto_release = pto.rename(columns={
    "idAnnotation": "sentenceId",
    "imageFile": "flickr30kImageId",
    "sentenceNumber": "flickr30kSentenceNumber",
})
pto_release.to_json("release/framed30k/PTO.jsonl", orient="records", lines=True, index=False)

In [18]:
# Per-sentence label counts: number of frames, and number of frame elements over all frames.
pto_frames = pto["frames"]
frm_pto_labels = pto_frames.map(len)
fe_pto_labels = pto_frames.map(
    lambda frames: sum(len(frame["frameElements"]) for frame in frames)
)
all_pto_labels = frm_pto_labels + fe_pto_labels

for caption, value in [
    ('Frame labels (PTO): ', frm_pto_labels.sum()),
    ('FE labels (PTO): ', fe_pto_labels.sum()),
    ('Total labels (PTO): ', all_pto_labels.sum()),
    ('Avg p/ sent (PTO): ', all_pto_labels.mean()),
]:
    print(caption, value)

Frame labels (PTO):  1106580
FE labels (PTO):  1024456
Total labels (PTO):  2131036
Avg p/ sent (PTO):  13.409910958688608


In [19]:
# Export PTO for each Multi30k task2 split.
for split_name in ("test_2016", "train", "val"):
    to_multi30k_task2(pto, split_name)

In [20]:
# One line per sentence: "<imageFile>#<sentenceNumber>\t<sentence>".
raw_output = pto["imageFile"] + "#" + pto["sentenceNumber"].astype(str) + "\t" + pto["sentence"] + "\n"

# Always write UTF-8 so the captions do not depend on the platform's default locale encoding.
with open("release/flickr30k/captions.txt", "w", encoding="utf-8") as fp:
    fp.writelines(raw_output)

## ENO sentences

In [21]:
# Load the annotations whose source is 'flickr30k', together with the file name of their image.
eno = pd.read_sql('''
    select eno.idAnnotation, image.imageFile, eno.sentenceNumber, eno.sentence
    from annotation eno
    join image on image.idImage = eno.idImage 
    where eno.source = 'flickr30k';''', engine_vision)

# Shift the stored sentence numbers up by one
# (presumably 0-based in the database and 1-based in the release — TODO confirm).
eno["sentenceNumber"] += 1

In [22]:
# Order the rows by image and sentence number; the text export and the LOME output rely on this order.
eno = eno.sort_values(["imageFile", "sentenceNumber"])

Write to text file to be consumed by LOME.

In [23]:
# One ENO sentence per line, in the current row order of `eno`.
write_to_txt(file_path="data/eno.txt", series=eno["sentence"])

Build release file for annotation.

In [24]:
lome_output = read_jsonl("data/eno.jsonl")

assert len(lome_output) == 158915

# LOME records are attached to the rows purely by position.
eno["tokens"] = [record["tokens"] for record in lome_output]
eno["frames"] = [lome_to_release(record["annotation"]) for record in lome_output]

# Export under the release column names, leaving the sentence text out of the file.
eno_release = eno.rename(columns={
    "idAnnotation": "sentenceId",
    "imageFile": "flickr30kImageId",
    "sentenceNumber": "flickr30kSentenceNumber",
})
eno_release = eno_release.drop(columns=["sentence"])
eno_release.to_json("release/framed30k/ENO.jsonl", orient="records", lines=True, index=False)

In [25]:
# Per-sentence label counts: number of frames, and number of frame elements over all frames.
eno_frames = eno["frames"]
frm_eno_labels = eno_frames.map(len)
fe_eno_labels = eno_frames.map(
    lambda frames: sum(len(frame["frameElements"]) for frame in frames)
)
all_eno_labels = frm_eno_labels + fe_eno_labels

for caption, value in [
    ('Frame labels (ENO): ', frm_eno_labels.sum()),
    ('FE labels (ENO): ', fe_eno_labels.sum()),
    ('Total labels (ENO): ', all_eno_labels.sum()),
    ('Avg p/ sent (ENO): ', all_eno_labels.mean()),
]:
    print(caption, value)

Frame labels (ENO):  764860
FE labels (ENO):  1308254
Total labels (ENO):  2073114
Avg p/ sent (ENO):  13.045426800490828


In [26]:
# ENO reference sentences for translation: the ENO rows whose
# (imageFile, sentenceNumber) pair also occurs in PTT.
ptt_keys = ptt[["imageFile", "sentenceNumber"]]
enot = eno.merge(ptt_keys, on=["imageFile", "sentenceNumber"])

write_to_txt(file_path="data/enot.txt", series=enot["sentence"])

## Image annotations

In [27]:
# Load one row per object/frame-element annotation (objectsentencemm) for the listed
# corpora, with the names (idLanguage = 2) of the frame element and of its frame.
# The three corpora at the end of the list are commented out inside the SQL itself.
img = pd.read_sql('''
    select
    	sentencemm.idSentenceMM,
    	corpus.entry,
    	imagemm.name as 'flickr30kImageId',
    	sentencemm.idFlickr30k+1 as 'flickr30kSentenceNumber',
    	frme.name as 'frameName',
    	fee.name as 'feName',
    	omm.idFlickr30k as 'flickr30kEntitiesObjectId'
    from corpus
    join document on document.idCorpus = corpus.idCorpus
    join documentmm on documentmm.idDocument = document.idDocument
    join sentencemm on sentencemm.idDocumentMM = documentmm.idDocumentMM
    join imagemm on imagemm.idImageMM = sentencemm.idImageMM
    -- annotation part
    join objectsentencemm osmm on osmm.idSentenceMM = sentencemm.idSentenceMM
    join objectmm omm on omm.idObjectMM = osmm.idObjectMM 
    join frameelement fe on fe.idFrameElement = osmm.idFrameElement 
    join frame frm on frm.idFrame = fe.idFrame 
    join entry fee on fee.entry = fe.entry and fee.idLanguage = 2
    join entry frme on frme.entry = frm.entry and frme.idLanguage = 2
    where corpus.entry  in (
    	'crp_oficina_com_sentenca_1',
    	'crp_oficina_com_sentenca_2',
    	'crp_oficina_sem_sentenca_1',
    	'crp_oficina_sem_sentenca_2',
    	'crp_oficina_com_sentenca_3',
    	'crp_oficina_com_sentenca_4',
    	'crp_oficina_sem_sentenca_3',
    	'crp_oficina_sem_sentenca_4'
    	-- 'crp_nlperspectives-2k',
    	-- 'crp_flickr30k-1k-1',
    	-- 'crp_flickr30k-1k-2'
    )
    order by flickr30kImageId, flickr30kSentenceNumber;
''', engine_fnbrdb)


# Derive the annotation condition from the corpus entry name. Patterns are applied
# in order, so a later match overrides an earlier one.
img["annotationCondition"] = ""
for entry_pattern, condition in [
    ("flickr30k-1k", "VWC"),
    ("oficina_com_sentenca", "VWC"),
    ("crp_nlperspectives-2k", "VNC"),
    ("oficina_sem_sentenca", "VNC"),
]:
    img.loc[img["entry"].str.contains(entry_pattern), "annotationCondition"] = condition

In [28]:
# Build one record per (annotationCondition, image, sentence number), nesting the
# annotated frame elements under the frame they belong to.
img_records = []
grouping_keys = ["annotationCondition", "flickr30kImageId", "flickr30kSentenceNumber"]

for (condition, image_id, sentence_number), img_group in img.groupby(grouping_keys):
    frames = []
    for frame_name, frm_group in img_group.groupby("frameName"):
        frame_elements = [
            {"id": row["feName"], "flickr30kEntitiesObjectId": row["flickr30kEntitiesObjectId"]}
            for _, row in frm_group.iterrows()
        ]
        frames.append({"id": frame_name, "frameElements": frame_elements})

    img_records.append({
        "flickr30kImageId": image_id,
        "flickr30kSentenceNumber": sentence_number,
        "annotationCondition": condition,
        "frames": frames,
    })

img_grouped = pd.DataFrame(img_records)

In [29]:
# Export the grouped image annotations, one JSON record per line.
img_grouped.to_json(
    path_or_buf="release/framed30k/IMG.jsonl",
    orient="records",
    lines=True,
    index=False,
)

In [30]:
# NOTE(review): scratch arithmetic; what the factors 187, 40 and 4 stand for is not
# stated anywhere in the notebook — document the intent or remove this cell.
187*40*4

29920

In [33]:
# Distinct (image, sentence number) pairs present in the image annotations.
img.loc[:, ["flickr30kImageId", "flickr30kSentenceNumber"]].drop_duplicates()

Unnamed: 0,flickr30kImageId,flickr30kSentenceNumber
0,1000092795.jpg,2
4,10002456.jpg,1
10,1000268201.jpg,4
14,1000344755.jpg,2
22,1000366164.jpg,4
...,...,...
169535,969655512.jpg,4
169539,96973080.jpg,2
169544,96973080.jpg,5
169548,969779007.jpg,4
