In [None]:
%%capture
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install conllu

In [None]:
import nltk
import spacy
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from conllu import parse

In [None]:
nlp = spacy.load("en_core_web_sm")
nltk.download('wordnet')

In [194]:
def identify_predicates_and_arguments_from_conllu(conllu_file):
    sentence_dict = {}
    predicates_and_arguments_by_sentence = []
    
    with open(conllu_file, 'r', encoding='utf-8') as file:
        for sentence in file.read().strip().split('\n\n'):
            sentence_predicates = []
            sentence_arguments = []
            sent_id = None  
            for line in sentence.split('\n'):
                if line.startswith("# sent_id"):
                    sent_id = line.split("=")[1].strip()  
                    sentence_dict[sent_id] = []  
                if line.startswith("#"):
                    continue
                parts = line.split("\t")
                token_id = parts[0]
                if '.' in token_id:
                    continue
                if len(parts) < 11:
                    predicate_sense = ''
                    arguments = []
                else:
                    predicate_sense = parts[10]
                    arguments = parts[11:]

                if predicate_sense != '_':
                    predicate_info = [int(token_id), predicate_sense] 
                    sentence_predicates.append(predicate_info)
                    if sent_id:  
                        sentence_dict[sent_id].append([predicate_info, []])  
                        
                for arg_token_id, arg_labels in zip(range(1, len(arguments) + 1), arguments):
                    if arg_labels != '_':
                        argument_info = [int(token_id), arg_labels] 
                        sentence_arguments.append(argument_info)
                        if sent_id and sentence_dict.get(sent_id):  
                            
                            sentence_dict[sent_id][-1][1].append(argument_info)
                    
            predicates_and_arguments_by_sentence.append((sentence_predicates, sentence_arguments))
            
    for i, (predicates, arguments) in enumerate(predicates_and_arguments_by_sentence):
        predicate_ids = set(token_id for token_id, _ in predicates)
        predicates_and_arguments_by_sentence[i] = (predicates, [(token_id, label) for token_id, label in arguments if token_id not in predicate_ids])

    return predicates_and_arguments_by_sentence, sentence_dict

def filter_arguments(sentence_dict1):
    for predicates in sentence_dict1.values():
        for predicate in predicates:
            pred_id = predicate[0][0]
            predicate[1] = [argument for argument in predicate[1] if argument[0] != pred_id]


In [195]:
result, sentence_dict1 = identify_predicates_and_arguments_from_conllu("data/en_ewt-up-train.conllu")
filter_arguments(sentence_dict1)

print(sentence_dict1['weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0001'][0])
print(sentence_dict1['weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0002'][0])
print(sentence_dict1['weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0002'][1])
print(sentence_dict1['weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0002'][2])
print(sentence_dict1['weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0002'][3])
print('\n')
print(sentence_dict1['weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0002'])

[[7, 'kill.01'], [[8, 'ARG1'], [18, 'ARGM-LOC']]]
[[3, 'kill.01'], [[7, 'ARG1'], [8, 'ARGM-MOD']]]
[[9, 'be.03'], []]
[[10, 'cause.01'], [[11, 'ARGM-GOL'], [12, 'ARG1'], [14, 'ARGM-TMP'], [14, 'ARG1']]]
[[16, 'come.01'], []]


[[[3, 'kill.01'], [[7, 'ARG1'], [8, 'ARGM-MOD']]], [[9, 'be.03'], []], [[10, 'cause.01'], [[11, 'ARGM-GOL'], [12, 'ARG1'], [14, 'ARGM-TMP'], [14, 'ARG1']]], [[16, 'come.01'], []]]
