In [1]:
import pandas as pd

# Load the G1 news CSV into a DataFrame and preview its first rows.
g1_csv_path = '../data/csvs/g1_news_10000.csv'
g1 = pd.read_csv(g1_csv_path)
g1.head()


Unnamed: 0,title,section,date,url,content
0,nova regra fiscal: governo vai arrumar a casa ...,política,2023-04-13,https://g1.globo.com/politica/noticia/2023/04/...,"a ministra do planejamento e orçamento, simon..."
1,pl aciona conselho de ética para acusar deputa...,política,2023-04-13,https://g1.globo.com/politica/noticia/2023/04/...,o pl apresentou nesta quinta–feira (13) ao co...
2,sites chineses usam 3 estratégias para burlar ...,economia,2023-04-13,https://g1.globo.com/economia/noticia/2023/04/...,o anúncio do fim da isenção de imposto para en...
3,"subsídio do governo no minha casa, minha vida ...",economia,2023-04-13,https://g1.globo.com/economia/noticia/2023/04/...,o governo publicou uma portaria que estabelec...
4,dólar em queda: saiba se é um bom momento para...,turismo e viagem,2023-04-13,https://g1.globo.com/turismo-e-viagem/noticia/...,"na quarta–feira, a moeda estava a r$ 4,9421. ..."


In [2]:
import tempfile
import os
import subprocess

def run_udpipe2_client(text: str):
    """Run the UDPipe2 client on a text and return the result as a string.

    The text is written to a temporary file, ``udpipe2_client.py`` is invoked
    on that file with the ``portuguese-gsd-ud-2.10-220711`` model, and the file
    the client writes (``<temp name>.out``) is read back. Both temporary files
    are removed afterwards, even when the client or the read fails.

    Args:
        text: Text to send through the client.

    Returns:
        The full contents of the client's output file.

    Raises:
        subprocess.CalledProcessError: If the client exits with a non-zero status.
        OSError: If a temporary file cannot be created, read or removed.
    """
    # delete=False keeps the file on disk after the `with` closes it, so the
    # client process can open it by name.
    with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False) as inputFile:
        inputFile.write(text)

    inputFileName = inputFile.name
    outputFileName = inputFileName + ".out"

    # An argument list (no shell) means the file names are never re-parsed by a shell.
    command = [
        'python3', 'udpipe2_client.py',
        '--model', 'portuguese-gsd-ud-2.10-220711',
        '--input', 'generic_tokenizer',
        '--tokenizer', 'ranges',
        '--tagger', '1',
        '--parser', '1',
        '--outfile', outputFileName,
        inputFileName,
    ]

    try:
        # check=True surfaces a failed client run instead of continuing silently.
        subprocess.run(command, check=True)

        with open(outputFileName, "r", encoding="utf-8") as outputFile:
            return outputFile.read()
    finally:
        # The output file may not exist if the client failed before writing it.
        for path in (inputFileName, outputFileName):
            if os.path.exists(path):
                os.remove(path)

In [23]:
def get_lemma(word, lemma):
    """Return ``lemma``, falling back to ``word`` when the lemma is the '_' placeholder."""
    if lemma == '_':
        return word
    return lemma

def joinSubjectOrObject(wordAdjList: list, subjectOrObjectIndex: int, subjectOrObjectWord: str, subjectOrObjectLemma: str, subjectOrObjectXpos):
    """Build the phrase for a subject/object head together with its modifiers.

    Starting at the head token, descendants are followed through edges whose
    relation is amod, nmod, nummod, case, appos or flat. The lemma of every
    token reached (or its word form when the lemma is '_') is collected, and
    the pieces are joined with spaces in ascending token-index order.

    Args:
        wordAdjList: wordAdjList[i] lists (index, word, lemma, xpos, relation)
            for every token whose head is i.
        subjectOrObjectIndex: Index of the head token.
        subjectOrObjectWord: Word form of the head token.
        subjectOrObjectLemma: Lemma of the head token.
        subjectOrObjectXpos: POS tag of the head token.

    Returns:
        The joined phrase, or None when the head is tagged 'PRON' or its lemma
        is one of the banned punctuation/symbol tokens.
    """
    #   nsubj/obj
    #    /      \
    # rel1     rel2
    if subjectOrObjectXpos == 'PRON':
        return None  # pronouns are skipped

    ignoredTokens = {'-', '––', '—', '‘', '’', '“', '”', '♪'}

    headText = get_lemma(subjectOrObjectWord, subjectOrObjectLemma)
    if headText in ignoredTokens:
        return None

    modifierRelations = {'amod', 'nmod', 'nummod', 'case', 'appos', 'flat'}
    collected = [(subjectOrObjectIndex, headText)]

    def collect(parentIndex: int):
        # Depth-first walk that only descends through accepted modifier edges.
        for childIndex, childWord, childLemma, _, childRelation in wordAdjList[parentIndex]:
            if childRelation not in modifierRelations:
                continue
            childText = get_lemma(childWord, childLemma)
            if childText in ignoredTokens:
                continue
            collected.append((childIndex, childText))
            collect(childIndex)

    collect(subjectOrObjectIndex)

    # Restore surface order before joining.
    collected.sort(key=lambda pair: pair[0])
    return ' '.join(text for _, text in collected)

def joinVerb(wordAdjList: list, verbIndex: int, verbWord: str, verbLemma: str):
    """Build the verb phrase rooted at a verb token.

    The chain of dependent tokens tagged 'VERB' and attached by xcomp or conj
    is followed recursively from the starting verb. Non-verb dependents
    attached by advmod to any verb in that chain are then added. Lemmas (or
    word forms when the lemma is '_') are joined with spaces in ascending
    token-index order.

    Args:
        wordAdjList: wordAdjList[i] lists (index, word, lemma, xpos, relation)
            for every token whose head is i.
        verbIndex: Index of the starting verb token.
        verbWord: Word form of the starting verb.
        verbLemma: Lemma of the starting verb.

    Returns:
        A tuple (phrase, lowest verb index, highest verb index); the two
        indexes consider only the verbs of the chain, not the advmod tokens.
    """
    #    verb
    #    /   \
    #  rel1  verb
    #          \
    #         rel2
    adverbialRelations = {'advmod'}
    chainRelations = {'xcomp', 'conj'}

    pieces = [(verbIndex, get_lemma(verbWord, verbLemma))]

    def collectChain(parentIndex: int):
        # Follow verb -> verb edges only.
        for childIndex, childWord, childLemma, childXpos, childRelation in wordAdjList[parentIndex]:
            if childXpos != 'VERB' or childRelation not in chainRelations:
                continue
            pieces.append((childIndex, get_lemma(childWord, childLemma)))
            collectChain(childIndex)

    collectChain(verbIndex)

    # Snapshot the verb indexes before modifiers are mixed into `pieces`.
    verbIndexes = [index for index, _ in pieces]
    for chainIndex in verbIndexes:
        pieces.extend(
            (childIndex, get_lemma(childWord, childLemma))
            for childIndex, childWord, childLemma, childXpos, childRelation in wordAdjList[chainIndex]
            if childXpos != 'VERB' and childRelation in adverbialRelations
        )

    pieces.sort(key=lambda pair: pair[0])
    phrase = ' '.join(text for _, text in pieces)

    return phrase, min(verbIndexes), max(verbIndexes)

def parse_udpipe2_output(text: str):
    """Parse the UDPipe2 output and return a list of (subject, verb, object) tuples.

    Sentences are the blank-line-separated blocks of ``text``. Within a block,
    only tab-separated lines whose first column is a plain integer are used
    (comment lines and range indexes are dropped). Columns 0, 1, 2, 4, 6 and 7
    are read as index, word, lemma, xpos, head and relation; a head of '_' is
    treated as 0.

    For every token tagged 'VERB', the verb phrase is built with joinVerb.
    Each 'nsubj' dependent of the first verb of that phrase is then paired with
    each 'obj' dependent of its last verb, using joinSubjectOrObject to build
    both phrases. Pairs with an empty/None subject or object are skipped.

    Args:
        text: Raw output of the UDPipe2 client.

    Returns:
        A list of (subject, verb, object) string tuples.
    """
    tuples = []
    for block in text.rstrip().split('\n\n'):
        # Keep token lines only: they start with a digit and have a purely numeric index.
        tokens = []
        for line in block.splitlines():
            if not line or not line[0].isdigit():
                continue
            columns = line.split('\t')
            if not columns[0].isdigit():
                continue
            head = 0 if columns[6] == '_' else int(columns[6])
            tokens.append((int(columns[0]), columns[1], columns[2], columns[4], head, columns[7]))

        # childrenOf[i] = every token whose head is i (slot 0 is the root).
        childrenOf = [[] for _ in range(len(tokens) + 1)]
        for index, word, lemma, xpos, head, relation in tokens:
            childrenOf[head].append((index, word, lemma, xpos, relation))

        # Form tuples that follow this pattern
        #     verb
        #    /    \
        #  nsubj   obj
        for tokenIndex, tokenWord, tokenLemma, tokenXpos, _, _ in tokens:
            if tokenXpos != 'VERB':
                continue

            verbPhrase, firstVerbIndex, lastVerbIndex = joinVerb(childrenOf, tokenIndex, tokenWord, tokenLemma)
            if not verbPhrase:
                continue

            for subjIndex, subjWord, subjLemma, subjXpos, subjRelation in childrenOf[firstVerbIndex]:
                if subjRelation != 'nsubj':
                    continue
                subjectPhrase = joinSubjectOrObject(childrenOf, subjIndex, subjWord, subjLemma, subjXpos)
                if not subjectPhrase:
                    continue

                for objIndex, objWord, objLemma, objXpos, objRelation in childrenOf[lastVerbIndex]:
                    if objRelation != 'obj':
                        continue
                    objectPhrase = joinSubjectOrObject(childrenOf, objIndex, objWord, objLemma, objXpos)
                    if objectPhrase:
                        tuples.append((subjectPhrase, verbPhrase, objectPhrase))

    return tuples

In [24]:
# Smoke test: extract triples from the rows returned by g1.head().
results = []
for title, content in g1.head()[['title', 'content']].itertuples(index=False, name=None):
    # The title is sent as its own sentence, followed by the article body.
    results += parse_udpipe2_output(run_udpipe2_client(title + '.\n' + content))

print(results)

[('governo', 'arrumar cobrar', 'queda de juro'), ('setor produtivo', 'não conseguir mais pegar', 'dinheiro'), ('banco central brasileiro', 'iniciar', 'processo de corte de taxa básico de juro'), ('proposta', 'substituir', 'teto de gasto'), ('pl', 'apresentar', 'pedido de abertura de processo contra deputado federal márcio jerry pcdob– ma'), ('zanatta', 'respeitar', 'deputado'), ('ato de jerry', 'configurar', 'claro prática de importunação sexual'), ('g1', 'procurar', 'deputado'), ('jurídico de legenda', 'também estudar', 'possibilidade jurídico'), ('mesa diretor de câmara', 'ter', 'prazo de três sessão encaminhar'), ('órgão', 'analisar', 'representação contra deputado envolvido em suposto ato de quebra de decoro parlamentar'), ('site chinês', 'usar', '3 estratégia'), ('anúncio de fim de isenção de imposto para encomenda de exterior de us $ 50', 'buscar conter', 'esquema de site de e commerce'), ('cliente', 'pedir', 'produto'), ('site', 'dividir', 'compra'), ('cobrança', 'sempre existir

In [5]:
# Run the UDPipe2 client on all text using threads
import threading
from threading import Lock

number_of_threads = 24
number_of_texts = g1.shape[0]
texts_per_thread = number_of_texts // number_of_threads

threads = []
results = []
# One (start, end, exception) entry per worker that failed; appended under `lock`.
thread_errors = []

lock = Lock()
def process_texts_thread(lock, start, end):
    """Extract triples from rows [start, end) of `g1` and add them to `results`.

    Each row's title and content are sent through the UDPipe2 client and the
    output is parsed into triples. The chunk's triples are appended to the
    shared `results` list in one step while holding `lock`.

    If any row fails, the chunk contributes nothing to `results` and the
    failure is recorded in `thread_errors`, so it can be reported after all
    threads have been joined rather than being lost with the thread.

    Args:
        lock: Lock guarding `results` and `thread_errors`.
        start: First row position (inclusive) handled by this worker.
        end: Last row position (exclusive) handled by this worker.
    """
    partial_results = []
    try:
        for _, (title, content) in g1.iloc[start:end][['title', 'content']].iterrows():
            udpipe2_output = run_udpipe2_client(title + '.\n' + content)

            # Parse the UDPipe2 output.
            partial_results.extend(parse_udpipe2_output(udpipe2_output))
    except Exception as exc:
        with lock:
            thread_errors.append((start, end, exc))
        return

    with lock:
        results.extend(partial_results)

for i in range(number_of_threads):
    start = i * texts_per_thread
    # The last worker also takes the remainder left by the integer division.
    end = (i + 1) * texts_per_thread if i < number_of_threads - 1 else number_of_texts
    thread = threading.Thread(target=process_texts_thread, args=(lock, start, end))
    threads.append(thread)
    thread.start()

for thread in threads:
    thread.join()

# Fail loudly if any chunk is missing from `results`.
if thread_errors:
    failed_start, failed_end, failed_exc = thread_errors[0]
    raise RuntimeError(
        f'{len(thread_errors)} worker thread(s) failed; '
        f'first failure while processing rows [{failed_start}, {failed_end})'
    ) from failed_exc

In [16]:
# Write the unique triples to results.csv, ordered by subject, verb and object.
triple_columns = ['subject', 'verb', 'object']
df = (
    pd.DataFrame(results, columns=triple_columns)
    .drop_duplicates()
    .sort_values(by=triple_columns)
)
df.to_csv('results.csv', index=False)

