Import dependencies

In [None]:
import pyconll
import numpy as np
import tensorflow as tf
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, TimeDistributed
from tensorflow.keras.utils import to_categorical


In [None]:
#!pip install pyconll
# Run the install above in every new runtime: pyconll is needed to read CoNLL-U files and is not preinstalled.

Collecting pyconll
  Downloading pyconll-3.3.1-py3-none-any.whl.metadata (7.9 kB)
Downloading pyconll-3.3.1-py3-none-any.whl (27 kB)
Installing collected packages: pyconll
Successfully installed pyconll-3.3.1


In [None]:


# --- CONFIGURATION AND PATHS ---
# CoNLL-U input files, one per data split (train / dev / test).
TRAIN_PATH = 'PosData/fa_perdt-ud-train.conllu'
DEV_PATH = 'PosData/fa_perdt-ud-dev.conllu'
TEST_PATH = 'PosData/fa_perdt-ud-test.conllu'

# Hyperparameters
MAX_LEN = 100         # Every sentence is padded or truncated to this many tokens
EMBEDDING_DIM = 100   # Dimensionality of each learned word-embedding vector
LSTM_UNITS = 128      # Hidden-state size of the LSTM in each direction of the Bi-LSTM
EPOCHS = 2           # Number of training passes; reduced from 5 because training took too long

# --- STEP 1: DATA LOADING ---
def load_conllu(path):
    """Read a CoNLL-U file into a list of ``(words, tags)`` sentence pairs.

    Args:
        path: Location of the CoNLL-U file to read.

    Returns:
        A list with one ``(words, tags)`` tuple per sentence, where ``words``
        holds the token forms and ``tags`` the matching UPOS tags. Tokens
        lacking either a form or a UPOS tag are dropped, and sentences left
        with no tokens are omitted. An empty list is returned (after printing
        an error message) when the file does not exist.
    """
    try:
        corpus = pyconll.load_from_file(path)
    except FileNotFoundError:
        print(f"Error: File not found at {path}. Returning empty data.")
        return []

    sentences = []
    for sentence in corpus:
        # Collect form/tag pairs in one pass so both lists stay aligned.
        pairs = [
            (token.form, token.upos)
            for token in sentence
            if token.form and token.upos
        ]
        if not pairs:
            # Nothing usable in this sentence.
            continue

        words = [form for form, _ in pairs]
        tags = [upos for _, upos in pairs]
        sentences.append((words, tags))

    return sentences

print("Loading data...")
# Load the three splits in train / dev / test order.
train_data, dev_data, test_data = (
    load_conllu(split_path) for split_path in (TRAIN_PATH, DEV_PATH, TEST_PATH)
)
print(f"Train sentences: {len(train_data)}, Dev sentences: {len(dev_data)}, Test sentences: {len(test_data)}")

# --- STEP 2: PREPROCESSING (Tokenization and Padding) ---

def preprocess_data(data, max_len=MAX_LEN, word_tokenizer=None, tag_tokenizer=None):
    """Turn ``(words, tags)`` sentences into padded index arrays and one-hot tags.

    Args:
        data: Sequence of ``(words, tags)`` pairs, one per sentence.
        max_len: Length every sequence is padded (at the end) or truncated to.
        word_tokenizer: Already-fitted word ``Tokenizer`` to reuse. When
            ``None`` a new one (lower-casing, with an ``<UNK>`` out-of-vocabulary
            token) is fitted on ``data``; pass the training tokenizer for
            dev/test data so vocabularies match.
        tag_tokenizer: Already-fitted tag ``Tokenizer`` to reuse. When ``None``
            a new case-preserving one is fitted on ``data``.

    Returns:
        A 5-tuple ``(X_padded, y_categorical, y_padded, word_tokenizer,
        tag_tokenizer)``: padded word indices, one-hot encoded tags, padded
        tag indices, and the two tokenizers that were used.
    """
    sentences = [pair[0] for pair in data]
    tag_sequences = [pair[1] for pair in data]

    # Fit tokenizers only when the caller did not supply fitted ones.
    if word_tokenizer is None:
        word_tokenizer = Tokenizer(oov_token='<UNK>', lower=True)
        word_tokenizer.fit_on_texts(sentences)

    if tag_tokenizer is None:
        tag_tokenizer = Tokenizer(lower=False)
        tag_tokenizer.fit_on_texts(tag_sequences)

    def _to_padded_indices(tokenizer, sequences):
        # Map tokens to integer ids, then pad at the end with 0 up to max_len.
        indexed = tokenizer.texts_to_sequences(sequences)
        return pad_sequences(indexed, maxlen=max_len, padding='post')

    X_padded = _to_padded_indices(word_tokenizer, sentences)
    y_padded = _to_padded_indices(tag_tokenizer, tag_sequences)

    # One-hot targets for the softmax loss. The extra class (+1) is index 0,
    # the padding value; to_categorical encodes it as a one-hot vector on
    # class 0, so padded positions are treated as a class of their own.
    num_classes = len(tag_tokenizer.word_index) + 1
    y_categorical = to_categorical(y_padded, num_classes=num_classes)

    return X_padded, y_categorical, y_padded, word_tokenizer, tag_tokenizer

# The training split fits both tokenizers, which are reused for dev and test.
(X_train, y_train_cat, y_train_idx,
 word_tokenizer, tag_tokenizer) = preprocess_data(train_data)

# Vocabulary sizes, with one extra slot for index 0 (the padding value).
WORD_VOCAB_SIZE = len(word_tokenizer.word_index) + 1
TAG_VOCAB_SIZE = len(tag_tokenizer.word_index) + 1

# Dev and test are encoded with the training tokenizers. The *_cat arrays are
# one-hot targets (used for validation); the *_idx arrays keep integer tag ids
# for the final metric calculation.
X_dev, y_dev_cat, y_dev_idx, _, _ = preprocess_data(
    dev_data,
    word_tokenizer=word_tokenizer,
    tag_tokenizer=tag_tokenizer,
)
X_test, y_test_cat, y_test_idx, _, _ = preprocess_data(
    test_data,
    word_tokenizer=word_tokenizer,
    tag_tokenizer=tag_tokenizer,
)

# --- STEP 3: MODEL DEFINITION ---

def create_bilstm_softmax_model(word_vocab_size, tag_vocab_size, max_len, embedding_dim, lstm_units):
    """Build and compile a Bi-LSTM tagger with a per-token softmax output.

    Args:
        word_vocab_size: Number of distinct word indices (embedding rows).
        tag_vocab_size: Number of output classes predicted at each position.
        max_len: Fixed length of every input sequence.
        embedding_dim: Size of each word-embedding vector.
        lstm_units: Hidden-state size of the LSTM in each direction.

    Returns:
        The compiled Keras ``Model``. Its layer summary is printed as a side
        effect.
    """
    input_layer = Input(shape=(max_len,))

    # 1. Word embedding layer. The sequence length is already fixed by the
    #    Input layer, so the redundant (and, in recent Keras, deprecated)
    #    ``input_length`` argument is not passed.
    # NOTE(review): padding (index 0) is not masked here, so padded positions
    # take part in the loss and in the 'accuracy' metric reported by fit();
    # consider mask_zero=True if that is not intended.
    embedding_layer = Embedding(
        input_dim=word_vocab_size,
        output_dim=embedding_dim,
    )(input_layer)

    # 2. Bi-LSTM layer; return_sequences=True keeps one output per time step.
    bilstm_layer = Bidirectional(
        LSTM(lstm_units, return_sequences=True, dropout=0.5, recurrent_dropout=0.5)
    )(embedding_layer)

    # 3. Softmax classifier applied independently at every time step (word).
    output_layer = TimeDistributed(Dense(tag_vocab_size, activation='softmax'))(bilstm_layer)

    model = Model(inputs=input_layer, outputs=output_layer)

    # categorical_crossentropy matches the one-hot encoded targets.
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # summary() prints the table itself and returns None, so wrapping it in
    # print() would emit a stray "None" line.
    model.summary()
    return model

# Build the compiled tagger from the vocabulary sizes and hyperparameters.
model = create_bilstm_softmax_model(
    word_vocab_size=WORD_VOCAB_SIZE,
    tag_vocab_size=TAG_VOCAB_SIZE,
    max_len=MAX_LEN,
    embedding_dim=EMBEDDING_DIM,
    lstm_units=LSTM_UNITS,
)

# --- STEP 4: TRAINING ---

print(f"\nTraining Bi-LSTM (Softmax) model for {EPOCHS} epochs...")
# Fit on the training split; the dev split is only used for validation.
history = model.fit(
    x=X_train,
    y=y_train_cat,
    batch_size=32,
    epochs=EPOCHS,
    validation_data=(X_dev, y_dev_cat),
    verbose=1,
)

# --- STEP 5: EVALUATION ---

def evaluate_model(model, X_data, y_true_padded_idx, tag_tokenizer, dataset_name='set'):
    """Predict tags for ``X_data`` and print accuracy plus a per-tag report.

    Args:
        model: Object whose ``predict(X_data, verbose=0)`` returns per-position
            class scores; the highest-scoring class is taken as the prediction.
        X_data: Padded word-index sequences to tag.
        y_true_padded_idx: Padded gold tag indices; positions equal to 0 are
            padding and are excluded from the metrics.
        tag_tokenizer: Tokenizer whose ``word_index`` maps tag strings to
            indices.
        dataset_name: Label shown in the printed header.

    Returns:
        None. Results are printed; if no non-padding gold tag exists, a notice
        is printed instead of the metrics.
    """
    # Reverse lookup: tag index -> tag string.
    tag_by_index = {index: tag for tag, index in tag_tokenizer.word_index.items()}

    # Scores have one class distribution per position; keep the best class.
    scores = model.predict(X_data, verbose=0)
    predicted_indices = np.argmax(scores, axis=-1)

    y_true, y_pred = [], []
    for gold_row, predicted_row in zip(y_true_padded_idx, predicted_indices):
        for gold_index, predicted_index in zip(gold_row, predicted_row):
            if gold_index == 0:
                # Padding position: not part of the sentence.
                continue

            gold_tag = tag_by_index.get(gold_index, '<PAD>')
            if gold_tag == '<PAD>':
                continue

            y_true.append(gold_tag)
            # An index absent from the tag vocabulary (such as 0) is reported
            # as '<PAD>', which counts as a wrong prediction.
            y_pred.append(tag_by_index.get(predicted_index, '<PAD>'))

    print(f'\n{"="*10} Evaluation on {dataset_name} Set {"="*10}')

    if not y_true:
        print("No valid data for evaluation.")
        return

    # Token-level accuracy over non-padding positions.
    print('Overall Tagging Accuracy:', accuracy_score(y_true, y_pred))

    print('\nDetailed Metrics (Precision, Recall, F1-Score) per UPOS Tag:')

    # Restrict the report to tags that actually occur in the gold labels.
    target_names = sorted(set(y_true))
    report = classification_report(
        y_true,
        y_pred,
        labels=target_names,
        target_names=target_names,
        digits=4,
        zero_division=0,
    )
    print(report)
    print('='*50)


# ===============================================
#              MAIN EXECUTION
# ===============================================

# Development split first: these are the scores to use while tuning.
evaluate_model(
    model, X_dev, y_dev_idx, tag_tokenizer,
    dataset_name='Development (Dev)',
)

# Held-out test split: the final reported score.
evaluate_model(
    model, X_test, y_test_idx, tag_tokenizer,
    dataset_name='Final Test',
)


Loading data...
Train sentences: 26196, Dev sentences: 1456, Test sentences: 1455




None

Training Bi-LSTM (Softmax) model for 2 epochs...
Epoch 1/2
[1m819/819[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m483s[0m 573ms/step - accuracy: 0.9191 - loss: 0.3197 - val_accuracy: 0.9894 - val_loss: 0.0341
Epoch 2/2
[1m819/819[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m468s[0m 571ms/step - accuracy: 0.9916 - loss: 0.0282 - val_accuracy: 0.9914 - val_loss: 0.0265

Overall Tagging Accuracy: 0.9499461142378158

Detailed Metrics (Precision, Recall, F1-Score) per UPOS Tag:
              precision    recall  f1-score   support

         ADJ     0.8628    0.8665    0.8646      1872
         ADP     0.9770    0.9913    0.9841      3553
         ADV     0.9388    0.8825    0.9098       417
         AUX     0.9889    0.9770    0.9829       912
       CCONJ     0.9964    0.9982    0.9973      1106
         DET     0.9218    0.9701    0.9454       535
        INTJ     0.9444    0.8293    0.8831        41
        NOUN     0.9399    0.9432    0.9415      8289
         NUM     0.9