In [1]:
print("Telugu@NLP")

Telugu@NLP


In [2]:
import conllu

# Function to load and parse a CoNLL-U file
def load_conllu_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = conllu.parse(f.read())
    return data

# Load the train, dev, and test files
train_data = load_conllu_data('UD_Telugu_English-TECT-master/qte_tect-ud-train.conllu')
test_data = load_conllu_data('UD_Telugu_English-TECT-master/qte_tect-ud-train.conllu')

# Example: Print words and POS tags for the first sentence in the train data
for sentence in train_data[:1]:  # Iterate over the first sentence
    words = [token['form'] for token in sentence]
    pos_tags = [token['upostag'] for token in sentence]
    print("Words:", words)
    print("POS Tags:", pos_tags)


Words: ['hello', 'lights', 'dim', 'cēyaṇḍi']
POS Tags: ['NOUN', 'NOUN', 'VERB', 'VERB']


In [3]:
from collections import defaultdict

# Function to extract sentences and POS tags
def extract_sentences_and_tags(data):
    sentences = []
    pos_tags = []
    for sentence in data:
        words = [token['form'].lower() for token in sentence]  # Convert to lowercase
        tags = [token['upostag'] for token in sentence]
        sentences.append(words)
        pos_tags.append(tags)
    return sentences, pos_tags

# Extract words and POS tags from each dataset
train_sentences, train_pos_tags = extract_sentences_and_tags(train_data)
test_sentences, test_pos_tags = extract_sentences_and_tags(test_data)


In [4]:

# Build word vocabulary with an <UNK> token
word_vocab = defaultdict(lambda: len(word_vocab))  # Assigns unique IDs
word_vocab["<PAD>"] = 0  # Padding token
word_vocab["<UNK>"] = 1  # Unknown words

# Build POS vocabulary
pos_vocab = defaultdict(lambda: len(pos_vocab))  # Assigns unique IDs
pos_vocab["<PAD>"] = 0  # Padding token

# Populate vocabularies using training data only
for sentence, tags in zip(train_sentences, train_pos_tags):
    for word in sentence:
        _ = word_vocab[word]  # Assign ID if not already present
    for tag in tags:
        _ = pos_vocab[tag]  # Assign ID if not already present

# Convert words and POS tags to IDs
def convert_to_ids(sentences, pos_tags, word_vocab, pos_vocab):
    encoded_sentences = [[word_vocab.get(word, word_vocab["<UNK>"]) for word in sent] for sent in sentences]
    encoded_pos_tags = [[pos_vocab[tag] for tag in tags] for tags in pos_tags]
    return encoded_sentences, encoded_pos_tags

# Convert datasets to numerical form
train_sentences_ids, train_pos_tags_ids = convert_to_ids(train_sentences, train_pos_tags, word_vocab, pos_vocab)
# dev_sentences_ids, dev_pos_tags_ids = convert_to_ids(dev_sentences, dev_pos_tags, word_vocab, pos_vocab)
test_sentences_ids, test_pos_tags_ids = convert_to_ids(test_sentences, test_pos_tags, word_vocab, pos_vocab)

# Print sample output
print("Sample Encoded Sentence:", train_sentences_ids[1])
print("Sample Encoded POS Tags:", train_pos_tags_ids[1])


Sample Encoded Sentence: [6, 7, 5]
Sample Encoded POS Tags: [1, 2, 2]


In [5]:

# Reverse mapping from ID to word/tag
id_to_word = {idx: word for word, idx in word_vocab.items()}
id_to_pos = {idx: tag for tag, idx in pos_vocab.items()}

# Decode the first sentence
decoded_words = [id_to_word[idx] for idx in train_sentences_ids[0]]
decoded_tags = [id_to_pos[idx] for idx in train_pos_tags_ids[0]]

print("Decoded Sentence:", decoded_words)
print("Decoded POS Tags:", decoded_tags)


Decoded Sentence: ['hello', 'lights', 'dim', 'cēyaṇḍi']
Decoded POS Tags: ['NOUN', 'NOUN', 'VERB', 'VERB']


In [6]:

from collections import Counter, defaultdict

# Step 1: Create a dictionary mapping words to their most frequent POS tag
word_to_pos = defaultdict(lambda: "NOUN")  # Default to NOUN for unknown words
pos_counts = defaultdict(Counter)  # Stores POS counts for each word

# Populate the dictionary using training data
for sentence, tags in zip(train_sentences, train_pos_tags):
    for word, tag in zip(sentence, tags):
        pos_counts[word][tag] += 1

# Assign the most frequent POS tag for each word
for word, counter in pos_counts.items():
    word_to_pos[word] = counter.most_common(1)[0][0]

# Function to predict POS tags for a given sentence
def rule_based_tagger(sentence):
    return [word_to_pos[word] if word in word_to_pos else "NOUN" for word in sentence]  # Default to NOUN

# Example: Predict on a sentence from the test set
sample_sentence = test_sentences[0]  # Take first test sentence
predicted_tags = rule_based_tagger(sample_sentence)
print("\nSample Sentence:", sample_sentence)
print("Predicted POS Tags:", predicted_tags)



Sample Sentence: ['hello', 'lights', 'dim', 'cēyaṇḍi']
Predicted POS Tags: ['NOUN', 'NOUN', 'VERB', 'VERB']


In [7]:
def evaluate_rule_based_tagger(sentences, true_pos_tags):
    correct = 0
    total = 0

    for sentence, true_tags in zip(sentences, true_pos_tags):
        predicted_tags = rule_based_tagger(sentence)

        # Compare predictions with ground truth
        for pred, true in zip(predicted_tags, true_tags):
            if pred == true:
                correct += 1
            total += 1

    accuracy = correct / total if total > 0 else 0
    print(f"Rule-Based POS Tagger Accuracy: {accuracy:.2%}")

# Evaluate on dev set
evaluate_rule_based_tagger(test_sentences, test_pos_tags)


Rule-Based POS Tagger Accuracy: 97.93%


In [8]:
from collections import Counter

def analyze_errors(sentences, true_pos_tags):
    total_errors = 0
    error_details = []
    pos_error_counts = Counter()
    
    for sentence, true_tags in zip(sentences, true_pos_tags):
        predicted_tags = rule_based_tagger(sentence)
        
        for word, pred, true in zip(sentence, predicted_tags, true_tags):
            if pred != true:  # Error detected
                total_errors += 1
                pos_error_counts[true] += 1
                error_details.append((word, pred, true))
    
    # Display overall error count
    print(f"Total Errors: {total_errors}")
    
    # Display most common misclassified words
    print("\nTop 10 Most Misclassified Words:")
    for word, pred in Counter(error_details).most_common(10):
        print(f"Word: {word[0]} | Predicted: {word[1]} | True: {word[2]} | No. of times: {pred}")

    # Display which POS tags are misclassified the most
    print("\nMost Commonly Misclassified POS Tags:")
    for pos, count in pos_error_counts.most_common(10):
        print(f"POS: {pos} | Errors: {count}")

# Run analysis on dev set
analyze_errors(test_sentences, test_pos_tags)

Total Errors: 6

Top 10 Most Misclassified Words:
Word: enta | Predicted: ADV | True: ADJ | No. of times: 1
Word: small | Predicted: ADJ | True: NOUN | No. of times: 1
Word: kamala | Predicted: PROPN | True: NOUN | No. of times: 1
Word: rāmu | Predicted: PROPN | True: NOUN | No. of times: 1
Word: ikkaḍiki | Predicted: ADV | True: NOUN | No. of times: 1
Word: telugu | Predicted: PROPN | True: NOUN | No. of times: 1

Most Commonly Misclassified POS Tags:
POS: NOUN | Errors: 5
POS: ADJ | Errors: 1
