In [12]:
import requests
from collections import defaultdict

#url = 'https://raw.githubusercontent.com/UniversalDependencies/UD_German-HDT/dev/de_hdt-ud-dev.conllu'


# MHD Bashir Kabbani 3017375
# Vialli Tem Ndumbe 2911472 

# Step 1: Download the treebank file
url = 'https://github.com/UniversalDependencies/UD_German-HDT.git'
response = requests.get(url)
file_content = response.text

# Step 2: Parse the file content
def parse_conllu(file_content):
    sentences = []
    sentence = []
    sentence_meta = {}
    for line in file_content.split('\n'):
        if line.startswith('#'):
            if line.startswith('# sent_id'):
                sentence_meta['id'] = line.split('=')[-1].strip()
            elif line.startswith('# text'):
                sentence_meta['text'] = line.split('=')[-1].strip()
        elif line.strip() == '':
            if sentence:
                sentences.append((sentence_meta, sentence))
                sentence = []
                sentence_meta = {}
        else:
            sentence.append(line.split('\t'))
    print(len(sentences))
    return sentences

# Step 3: Identify non-projective trees
def is_non_projective(sentence):
    arcs = []
    for token in sentence:
        if len(token) < 8:
            continue  # Skip tokens with fewer than 8 elements
        try:
            head = int(token[6])
            dependent = int(token[0])
        except ValueError:
            continue  # Skip tokens where head or id is not an integer
        if head == 0:  # Skip root
            continue
        if head > dependent:
            head, dependent = dependent, head
        arcs.append((head, dependent))
    
    for i in range(len(arcs)):
        for j in range(i + 1, len(arcs)):
            (h1, d1) = arcs[i]
            (h2, d2) = arcs[j]
            if (h1 < h2 < d1 < d2) or (h2 < h1 < d2 < d1):
                return True
    return False

# Step 4: Extract non-projective sentences and their identifiers
def extract_non_projective_sentences(file_content):
    sentences = parse_conllu(file_content)
    non_projective_sentences = []
    for sentence_meta, sentence in sentences:
        if is_non_projective(sentence):
            non_projective_sentences.append((sentence_meta['id'], sentence_meta['text'], sentence))
    return non_projective_sentences

# Step 5: Main execution
non_projective_sentences = extract_non_projective_sentences(file_content)
print("Non-projective sentences:")
for sentence_id, sentence_text, sentence in non_projective_sentences:
    print(f"ID: {sentence_id}")
    print(f"Text: {sentence_text}")
    print()

# Step 5: Count non-projective sentences
def count_non_projective_sentences(non_projective_sentences):
    return len(non_projective_sentences)

# Print the count of non-projective sentences
total_non_projective = count_non_projective_sentences(non_projective_sentences)
print(f"Total number of non-projective sentences: {total_non_projective}")

# Exercise 2: Reordered list of words to make the tree projective
def create_dependency_graph(sentence):
    graph = defaultdict(list)
    words = {}
    for token in sentence:
        if len(token) < 8:
            continue
        try:
            head = int(token[6])
            dependent = int(token[0])
            word = token[1]
        except ValueError:
            continue  # Skip tokens where head or id is not an integer
        graph[head].append(dependent)
        words[dependent] = word
    return graph, words

def inorder_traversal(node, graph, words, result):
    if node not in graph:
        return
    children = sorted(graph[node])
    for child in children:
        inorder_traversal(child, graph, words, result)
    if node in words:
        result.append(words[node])

def make_projective(sentence):
    graph, words = create_dependency_graph(sentence)
    result = []
    inorder_traversal(0, graph, words, result)
    return result

print("\nExercise 2: Reordered sentences to make the tree projective:")
for sentence_id, sentence_text, sentence in non_projective_sentences:
    projective_order = make_projective(sentence)
    print(f"ID: {sentence_id}")
    print(f"Original Text: {sentence_text}")
    print("Projective Order:", " ".join(projective_order))
    print()

# Exercise 3: Analysis of syntactic construction types
def analyze_construction_types(non_projective_sentences, num_sentences=50):
    construction_types = {
        'wh-movement/fronting': [],
        'extraposition': [],
        'parentheticals/insertions': []
    }

    for sentence_id, sentence_text, sentence in non_projective_sentences[:num_sentences]:
        for token in sentence:
            if len(token) >= 8:
                construction = token[7]
                if construction.startswith('wh'):
                    construction_types['wh-movement/fronting'].append((sentence_id, sentence_text))
                elif construction.startswith('extr'):
                    construction_types['extraposition'].append((sentence_id, sentence_text))
                elif construction.startswith('parataxis'):
                    construction_types['parentheticals/insertions'].append((sentence_id, sentence_text))

    return construction_types

print("\nExercise 3: Analysis of Non-Projective Sentences:")
construction_types = analyze_construction_types(non_projective_sentences)

print("\n## Most frequent syntactic construction type: Wh-movement/Fronting")
print("Examples:")
for sent_id, sent_text in construction_types['wh-movement/fronting']:
    print(f"- ID: {sent_id}, Text: {sent_text}")

print("\n### Other construction types:")
print("1. Extraposition")
print("Examples:")
for sent_id, sent_text in construction_types['extraposition']:
    print(f"- ID: {sent_id}, Text: {sent_text}")

print("\n2. Parentheticals/Insertions")
print("Examples:")
for sent_id, sent_text in construction_types['parentheticals/insertions']:
    print(f"- ID: {sent_id}, Text: {sent_text}")


331
Non-projective sentences:
Total number of non-projective sentences: 0

Exercise 2: Reordered sentences to make the tree projective:

Exercise 3: Analysis of Non-Projective Sentences:

## Most frequent syntactic construction type: Wh-movement/Fronting
Examples:

### Other construction types:
1. Extraposition
Examples:

2. Parentheticals/Insertions
Examples:


In [55]:
files_list = []
for f in os.listdir('.'):
    if f.endswith('conllu'):
        files_list.append(f)

In [56]:
files_list

['de_dht-ud-train_conllu.conllu',
 'de_hdt-ud-dev.conllu',
 'de_hdt-ud-dev_filtered.conllu',
 'de_hdt-ud-test.conllu',
 'de_hdt-ud-test_filtered.conllu',
 'de_hdt-ud-train-a-1.conllu',
 'de_hdt-ud-train-a-1_filtered.conllu',
 'de_hdt-ud-train-a-2.conllu',
 'de_hdt-ud-train-a-2_filtered.conllu',
 'de_hdt-ud-train-b-1.conllu',
 'de_hdt-ud-train-b-1_filtered.conllu',
 'de_hdt-ud-train-b-2.conllu',
 'de_hdt-ud-train-b-2_filtered.conllu']

In [57]:
for file_url in files_list:
    with open(file_url, 'r') as f:
        lines = f.readlines()
        non_projective_sentences = extract_non_projective_sentences(''.join(lines))
        
        print(file_url)
        print(len(non_projective_sentences))
        print('===========================')

325
de_dht-ud-train_conllu.conllu
0
18434
de_hdt-ud-dev.conllu
1240
17194
de_hdt-ud-dev_filtered.conllu
0
18459
de_hdt-ud-test.conllu
1208
17251
de_hdt-ud-test_filtered.conllu
0
38102
de_hdt-ud-train-a-1.conllu
2322
35780
de_hdt-ud-train-a-1_filtered.conllu
0
37515
de_hdt-ud-train-a-2.conllu
2652
34863
de_hdt-ud-train-a-2_filtered.conllu
0
38411
de_hdt-ud-train-b-1.conllu
2687
35724
de_hdt-ud-train-b-1_filtered.conllu
0
39007
de_hdt-ud-train-b-2.conllu
2717
36290
de_hdt-ud-train-b-2_filtered.conllu
0


In [37]:
def count_sents(lines):
    c = 0
    for line in lines:
        if 'sent_id' in line:
            c = c + 1
    print(c)

In [40]:
def remove_sentences(input_file, output_file, ids_to_remove):
    with open(input_file, 'r', encoding='utf-8') as f:
        sentences = f.read().strip().split('\n\n')
    
    remaining_sentences = []
    
    for sentence in sentences:
        lines = sentence.split('\n')
        sent_id = None
        
        # Find the sent_id in the current sentence
        for line in lines:
            if line.startswith('# sent_id ='):
                sent_id = line.split('=')[1].strip()
                break
        
        # If the sent_id is not in the list to remove, keep the sentence
        if sent_id not in ids_to_remove:
            remaining_sentences.append(sentence)
    
    # Write the remaining sentences to the output file
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write('\n\n'.join(remaining_sentences) + '\n')

In [53]:
m = [
 'de_hdt-ud-dev.conllu',
 'de_hdt-ud-test.conllu',
 'de_hdt-ud-train-a-1.conllu',
 'de_hdt-ud-train-a-2.conllu',
 'de_hdt-ud-train-b-1.conllu',
]

In [54]:
for file in m:
    print(file)
    with open(file, 'r') as f:
        lines = f.readlines()
    non_projective_sentences = extract_non_projective_sentences(''.join(lines))

    input_file = file
    output_file = file.replace('.conllu', '_filtered.conllu')
    ids_to_remove = [tup[0] for tup in non_projective_sentences]
    remove_sentences(input_file, output_file, ids_to_remove)

de_hdt-ud-dev.conllu
18434
de_hdt-ud-test.conllu
18459
de_hdt-ud-train-a-1.conllu
38102
de_hdt-ud-train-a-2.conllu
37515
de_hdt-ud-train-b-1.conllu
38411
