In [1]:
ls

README.md       config.yaml     run.py          test.txt        utils/
Untitled.ipynb  models/         runpol.py       train.txt


In [None]:
import numpy as np


# Function to preprocess the dataset and extract valid sentences
def preprocess_dataset(dataset):
    """Group raw token lines into sentences of token tuples.

    Each non-blank, non-comment line is split on single spaces. Column 1 is
    used as the word, column 3 as the POS tag, column 6 as the head index
    (``'_'`` becomes ``None``) and column 7, when present, as the dependency
    relation. Lines with fewer than seven columns are skipped because the
    head column would be missing.

    Side effect: the module-level global ``L1`` is set to the most recently
    read raw line (kept as an interactive debugging aid).

    Args:
        dataset: Iterable of text lines; a whitespace-only line ends the
            current sentence and lines starting with ``#`` are ignored.

    Returns:
        A list of sentences, each a list of
        ``(word, pos_tag, head, dependency_relation)`` tuples.

    Raises:
        ValueError: If the head column is neither ``'_'`` nor an integer.
    """
    global L1
    valid_sentences = []
    current_sentence = []
    for line in dataset:
        L1 = line  # last line seen, exposed for debugging
        if line.startswith('#'):  # Ignore comments
            continue
        if not line.strip():  # End of sentence
            if current_sentence:
                valid_sentences.append(current_sentence)
                current_sentence = []
            continue
        # Drop the line terminator so it cannot leak into the last column.
        parts = line.rstrip('\r\n').split(' ')
        # The head lives in column 6, so at least seven columns are required.
        if len(parts) < 7:
            continue
        head = None if parts[6] == '_' else int(parts[6])
        dependency_relation = parts[7] if len(parts) > 7 else None
        current_sentence.append((parts[1], parts[3], head, dependency_relation))
    # Input that does not end with a blank line still has a pending sentence.
    if current_sentence:
        valid_sentences.append(current_sentence)
    return valid_sentences

# Function to extract vocabulary, POS tags, and dependency relations from the dataset
def extract_vocab_pos_dep(dataset):
    """Collect the distinct words, POS tags and dependency relations.

    Args:
        dataset: Iterable of sentences, each an iterable of tokens that are
            indexable at positions 0 (word), 1 (POS tag) and 3 (relation).

    Returns:
        A ``(vocabulary, pos_tags, dependency_relations)`` tuple of sets.
    """
    # Flatten once so a one-shot iterable is consumed only a single time.
    tokens = [tok for sent in dataset for tok in sent]
    words = {tok[0] for tok in tokens}
    tags = {tok[1] for tok in tokens}
    relations = {tok[3] for tok in tokens}
    return words, tags, relations

# Function to generate binary feature vector for a given configuration and transition
def generate_feature_vector(configuration, transition, vocabulary, pos_tags, dependency_relations):
    """Allocate the feature vector for a configuration/transition pair.

    Feature extraction is not implemented yet, so the returned vector is
    always all zeros. Its length is
    ``4 * (2 * |vocabulary| + 3 * |pos_tags| + 4 * |dependency_relations|)``.

    Args:
        configuration: A ``(stack, buffer, arcs)`` triple.
        transition: Transition name; ``'LA'``, ``'RA'``, ``'RE'`` and ``'SH'``
            map to offsets 0-3, anything else to 0.
        vocabulary: Sized collection of words.
        pos_tags: Sized collection of POS tags.
        dependency_relations: Sized collection of dependency relations.

    Returns:
        A one-dimensional NumPy array of zeros.
    """
    # Unpacking also checks that the configuration really is a triple.
    stack, buffer, arcs = configuration

    block_size = 2 * len(vocabulary) + 3 * len(pos_tags) + 4 * len(dependency_relations)
    feature_vector = np.zeros(4 * block_size)

    # Writer intended for the feature extraction below; not called yet.
    def set_feature_value(condition, value):
        feature_vector[condition] = value

    # Position of the transition in this tuple is its offset (default 0).
    known_transitions = ('LA', 'RA', 'RE', 'SH')
    offset = known_transitions.index(transition) if transition in known_transitions else 0

    # TO DO: Implement feature extraction based on configuration

    return feature_vector

# Function to apply transitions on the parser configuration
def apply_transition(stack, buffer, arcs, transition):
    """Apply one arc-eager transition to the parser configuration in place.

    Supported transitions (short and long names are both accepted for
    reduce and shift, since callers in this file use ``'RE'``/``'SH'``):

    * ``'LA'``: add the arc ``(buffer[0], stack[-1])`` and pop the stack;
      the front of the buffer is the head of the stack top.
    * ``'RA'``: add the arc ``(stack[-1], buffer[0])`` and move the front of
      the buffer onto the stack so it can still receive dependents.
    * ``'RE'`` / ``'REDUCE'``: pop the stack.
    * ``'SH'`` / ``'SHIFT'``: move the front of the buffer onto the stack.

    A transition whose precondition is not met (empty stack or buffer), or
    an unknown transition name, leaves the configuration untouched.

    Args:
        stack: Mutable list used as the parser stack (top at the end).
        buffer: Mutable list of remaining input (front at index 0).
        arcs: Mutable list receiving ``(head, dependent)`` pairs.
        transition: Name of the transition to apply.

    Returns:
        None. ``stack``, ``buffer`` and ``arcs`` are modified in place.
    """
    has_stack = len(stack) >= 1
    has_buffer = len(buffer) >= 1

    if transition == 'LA':
        if has_stack and has_buffer:
            # Left-arc: the buffer front governs the stack top.
            arcs.append((buffer[0], stack.pop()))
    elif transition == 'RA':
        if has_stack and has_buffer:
            # Right-arc: the stack top governs the buffer front, which is
            # then pushed so later words can attach to it.
            dependent = buffer.pop(0)
            arcs.append((stack[-1], dependent))
            stack.append(dependent)
    elif transition in ('RE', 'REDUCE'):
        if has_stack:
            stack.pop()
    elif transition in ('SH', 'SHIFT'):
        if has_buffer:
            stack.append(buffer.pop(0))

# Function to evaluate predictions using Unlabeled Attachment Score (UAS) metric
def evaluate(predictions, gold_standard):
    """Compute the Unlabeled Attachment Score (UAS) of predicted arcs.

    Sentences are paired positionally, and within each pair the arcs are
    paired positionally; an arc counts as correct when its head equals the
    gold head. Pairing stops at the shorter of the two sequences.

    Args:
        predictions: Iterable of sentences, each an iterable of
            ``(head, dependent)`` pairs.
        gold_standard: Iterable of sentences in the same format.

    Returns:
        The fraction of compared arcs whose heads match, or ``0.0`` when
        there is nothing to compare.
    """
    total_correct = 0
    total_tokens = 0
    for pred_sent, gold_sent in zip(predictions, gold_standard):
        for (gold_head, _gold_dep), (pred_head, _pred_dep) in zip(gold_sent, pred_sent):
            if gold_head == pred_head:
                total_correct += 1
            total_tokens += 1
    # Empty input would otherwise divide by zero.
    if total_tokens == 0:
        return 0.0
    return total_correct / total_tokens

# Function to read data using the provided function
def read_data(filename):
    """Read a space-separated token file into a list of sentences.

    Blank lines separate sentences and lines starting with ``#`` (after
    stripping) are ignored. A token line needs at least five columns:
    column 1 is the word, column 3 the POS tag, column 4 the head index
    (``'_'`` becomes ``None``) and column 5, when present, the dependency
    relation.

    Args:
        filename: Path of the UTF-8 text file to read.

    Returns:
        A list of sentences, each a list of
        ``(word, pos_tag, head, dependency_relation)`` tuples.
    """
    sentences = []
    tokens = []
    with open(filename, 'r', encoding='utf-8') as handle:
        for raw in handle:
            text = raw.strip()
            if text.startswith('#'):
                continue
            if text:
                fields = text.split(' ')
                if len(fields) < 5:
                    # Too short to carry a head column; skip it.
                    continue
                head = None if fields[4] == '_' else int(fields[4])
                relation = fields[5] if len(fields) > 5 else None
                tokens.append((fields[1], fields[3], head, relation))
            elif tokens:
                # Blank line closes the sentence collected so far.
                sentences.append(tokens)
                tokens = []
        # The file may end without a trailing blank line.
        if tokens:
            sentences.append(tokens)
    return sentences


# Driver script: load the data, build training instances, fit a linear
# scorer online, parse the test sentences and report UAS.

# Read train and test data
train_data = read_data('train.txt')
test_data = read_data('test.txt')

# Preprocess train and test data
# (disabled: read_data already returns parsed sentences)
#train_data = preprocess_dataset(train_data)
#test_data = preprocess_dataset(test_data)

# Extract vocabulary, POS tags, and dependency relations
vocabulary, pos_tags, dependency_relations = extract_vocab_pos_dep(train_data)

# Generate feature vectors for training instances
# For every configuration, all four transitions are tried in a fixed order;
# each one is recorded as an instance and then applied. No gold heads are
# consulted here, so the recorded transition is not an oracle decision.
# NOTE(review): confirm apply_transition recognises the short names
# 'RE'/'SH' used here; if it does not, the buffer may never shrink and the
# while loop will not terminate.
train_instances = []
for sentence in train_data:
    stack = []
    buffer = sentence[:]  # shallow copy so the sentence itself is not consumed
    arcs = []
    while len(buffer) > 0:
        for transition in ['LA', 'RA', 'RE', 'SH']:
            feature_vector = generate_feature_vector((stack, buffer, arcs), transition, vocabulary, pos_tags, dependency_relations)
            train_instances.append((feature_vector, transition))
            apply_transition(stack, buffer, arcs, transition)
# Debug output: dumps every (feature_vector, transition) pair.
print("-----",train_instances)
# Initialize classifier weights
# The weight vector is sized from the first instance, so this raises
# IndexError when train_instances is empty.
weights = np.random.rand(len(train_instances[0][0]))

# Train classifier using online learning
learning_rate = 0.1
for feature_vector, transition in train_instances:
    score = np.dot(weights, feature_vector)
    # NOTE(review): gold_transition is not defined anywhere in the code
    # visible here, so this line raises NameError unless it is defined
    # elsewhere - confirm and supply the oracle transition.
    gold_standard = 1 if transition == gold_transition else 0
    # Update proportional to the error between the 0/1 target and the score.
    weights += learning_rate * (gold_standard - score) * feature_vector

# Make predictions on test data
test_predictions = []
for sentence in test_data:
    stack = []
    buffer = sentence[:]
    arcs = []
    while len(buffer) > 0:
        for transition in ['LA', 'RA', 'RE', 'SH']:
            feature_vector = generate_feature_vector((stack, buffer, arcs), transition, vocabulary, pos_tags, dependency_relations)
            # NOTE(review): score is computed but never used below; the
            # applied transition depends only on the stack-size checks.
            score = np.dot(weights, feature_vector)
            predicted_transition = 'SH'  # Default transition in case of invalid predictions
            if transition == 'LA' and len(stack) >= 2:
                predicted_transition = 'LA'
            elif transition == 'RA' and len(stack) >= 2:
                predicted_transition = 'RA'
            elif transition == 'RE' and len(stack) >= 1:
                predicted_transition = 'RE'
            elif transition == 'SH':
                predicted_transition = 'SH'
            apply_transition(stack, buffer, arcs, predicted_transition)
    test_predictions.append(arcs)

# Evaluate predictions using UAS metric
# NOTE(review): gold_standard here is the 0/1 scalar left over from the
# training loop above, not the gold arcs of test_data - evaluate() is
# presumably meant to receive gold (head, dependent) pairs; confirm.
uas = evaluate(test_predictions, gold_standard)
print("Unlabeled Attachment Score (UAS):", uas)
