# Agenda

1. Reading the data into a pandas dataframe
2. Exploring the dataset
3. Training a basic model and checking its accuracy

In [3]:
!pip install pyconll
!pip install nltk



In [4]:
import pyconll
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re

## Reading the data into a pandas dataframe

In [5]:
# Locations of the three UD Arabic-PADT splits in the Kaggle input directory.
file_path = '/kaggle/input/ud-arabic-padt/ar_padt-ud-train.conllu'
dev_path = '/kaggle/input/ud-arabic-padt/ar_padt-ud-dev.conllu'
test_path = '/kaggle/input/ud-arabic-padt/ar_padt-ud-test.conllu'

# Parse every CoNLL-U file; the training split keeps the plain `dataset` name.
dataset = pyconll.load_from_file(file_path)
dev_dataset = pyconll.load_from_file(dev_path)
test_dataset = pyconll.load_from_file(test_path)

In [6]:
def get_df(dataset, skip_untagged=False):
    """Collect a parsed CoNLL-U corpus into a sentence-level DataFrame.

    Parameters
    ----------
    dataset : iterable
        Iterable of sentences; each sentence is an iterable of tokens that
        expose ``form`` and ``upos`` attributes.
    skip_untagged : bool, default False
        When True, tokens whose ``upos`` is ``None`` are left out.  Such rows
        are presumably the multiword-token range lines of the CoNLL-U file
        (a surface form immediately followed by its syntactic words) -- confirm
        against the corpus.  Keeping them (the default, for backward
        compatibility) turns ``None`` into an extra tag class downstream.

    Returns
    -------
    pandas.DataFrame
        One row per sentence with two list-valued columns: ``sent`` (token
        forms) and ``pos_tags`` (the UPOS tag of each token, same order).
    """
    samples = []
    for sentence in dataset:
        kept = [
            token for token in sentence
            if not (skip_untagged and token.upos is None)
        ]
        # One (forms, tags) pair per sentence; both lists stay index-aligned.
        samples.append(([token.form for token in kept],
                        [token.upos for token in kept]))

    return pd.DataFrame(samples, columns=['sent', 'pos_tags'])

# Build the sentence-level training frame and preview its first rows
df = get_df(dataset)
df.head()

Unnamed: 0,sent,pos_tags
0,"[برلين, ترفض, حصول, شركة, اميركية, على, رخصة, ...","[X, VERB, NOUN, NOUN, ADJ, ADP, NOUN, NOUN, NO..."
1,"[برلين, 15, -, 7, (, اف, ب, ), -, افادت, صحيفة...","[X, NUM, PUNCT, NUM, PUNCT, X, X, PUNCT, PUNCT..."
2,"[وفي, و, في, نيسان, /, ابريل, الماضي, ،, تخلت,...","[None, CCONJ, ADP, NOUN, PUNCT, NOUN, ADJ, PUN..."
3,"[وكانت, و, كانت, خسائر, المجموعة, الاسبانية, ا...","[None, CCONJ, VERB, NOUN, NOUN, ADJ, ADJ, VERB..."
4,"[واشارت, و, أشارت, صحيفة, الاحد, الى, ان, المس...","[None, CCONJ, VERB, NOUN, NOUN, ADP, SCONJ, NO..."


In [None]:
# (number of training sentences, 2): get_df emits one row per sentence
df.shape

In [None]:
# Token forms of the first training sentence
df["sent"][0]

In [None]:
# POS tags of the first training sentence, index-aligned with df["sent"][0]
df["pos_tags"][0]

## Building the dev and test dataframes

In [7]:
# Same sentence-level layout as `df`, one frame per held-out split.
dev_df, test_df = get_df(dev_dataset), get_df(test_dataset)

## Exploring the dataset

In [None]:
# Flatten the training corpus to token level: one (form, tag) pair per token,
# in corpus order and ignoring sentence boundaries.
token_tag_pairs = [
    (token.form, token.upos)
    for sentence in dataset
    for token in sentence
]
tokens = [form for form, _ in token_tag_pairs]
pos_tags = [tag for _, tag in token_tag_pairs]

# Two equally long columns -> one DataFrame row per token.
samples = {"tokens": tokens, "pos_tags": pos_tags}
df2 = pd.DataFrame(samples)

# Preview the first rows
df2.head()

In [None]:
# (number of training tokens, 2): df2 holds one row per token
df2.shape

In [None]:
# Summary statistics for the `tokens` and `pos_tags` columns
df2.describe()

In [None]:
# Occurrences of each POS tag, most frequent first.
# value_counts() drops missing values by default, so rows tagged None are not listed.
df2.pos_tags.value_counts()

In [None]:
# Bar chart of the tag distribution; title and axis labels let the figure
# stand on its own, and the trailing `;` hides the Axes repr in the output.
df2['pos_tags'].value_counts().plot.bar(
    title='POS tag frequency (training split)',
    xlabel='POS tag',
    ylabel='Number of tokens',
);

In [None]:
# Total number of token rows (same value as df2.shape[0])
print(len(df2))

### Can a token have 2 different POS tags?

Short answer: **Yes!**

#### Long answer:

In [None]:
# Map every distinct token form to the set of POS tags it was seen with.
t_d = {}
for token, tag in zip(df2["tokens"], df2["pos_tags"]):
    # setdefault also records the tag of the FIRST occurrence; the earlier
    # version created an empty list for a new token and lost that tag, which
    # undercounted ambiguous tokens.
    t_d.setdefault(token, set()).add(tag)

counter = 20          # number of example tokens still to print
ambiguous_total = 0   # tokens with more than one tag (avoids shadowing built-in sum)
for token, tags in t_d.items():
    if len(tags) > 1:
        ambiguous_total += 1
        if counter > 0:
            counter -= 1
            print(token, tags)
print("total", ambiguous_total)

## Modelling

In [8]:
from keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, LSTM, Dense, TimeDistributed, SimpleRNN
from tensorflow.keras.models import Sequential

In [9]:
def get_numbers(df, vocab=None, pos_tag_set=None):
    """Encode the tokens and POS tags of a sentence-level frame as integers.

    Parameters
    ----------
    df : mapping / DataFrame
        Must provide ``df["sent"]`` (lists of tokens) and ``df["pos_tags"]``
        (lists of tags of the same lengths).
    vocab : collection of tokens, optional
        Vocabulary to index with.  Defaults to every token found in ``df``.
        Pass the vocabulary returned for the training split to encode the
        dev/test splits with the SAME word indices.
    pos_tag_set : collection of tags, optional
        Tag inventory to index with.  Defaults to every tag found in ``df``.

    Returns
    -------
    tuple
        ``(X, Y, vocab, pos_tag_set)`` where ``X``/``Y`` are lists of integer
        lists.  Word indices start at 1; index 0 is used for tokens missing
        from ``vocab`` (only possible when ``vocab`` is supplied).  Tag indices
        start at 0.

    Raises
    ------
    KeyError
        If ``df`` contains a tag that is not in ``pos_tag_set``.
    """
    if vocab is None:
        vocab = {word for sentence in df["sent"] for word in sentence}
    if pos_tag_set is None:
        pos_tag_set = {tag for tags in df["pos_tags"] for tag in tags}

    # Set iteration order is arbitrary, so sort to get the same indices on
    # every call/run.  key=str keeps None (seen among the tags) sortable.
    word2idx = {w: i + 1 for i, w in enumerate(sorted(vocab, key=str))}
    pos2idx = {t: i for i, t in enumerate(sorted(pos_tag_set, key=str))}

    encoded_sents = [[word2idx.get(token, 0) for token in sentence]
                     for sentence in df["sent"]]
    encoded_tags = [[pos2idx[tag] for tag in tags] for tags in df["pos_tags"]]
    return encoded_sents, encoded_tags, vocab, pos_tag_set

In [12]:
# Integer-encode each split.
# NOTE(review): get_numbers derives its word/tag index mappings from the frame
# it is given, so dev_X/dev_Y and test_X/test_Y are encoded with their own
# mappings rather than the training ones -- the same integer need not mean the
# same word or tag across splits.  Confirm this is intended before trusting
# dev/test scores.
X, Y, vocab, pos_tag_set = get_numbers(df)
dev_X, dev_Y, _, _ = get_numbers(dev_df)
test_X, test_Y, _, _ = get_numbers(test_df)

### Pre-Processing

In [None]:
# Encoded word indices and tag indices of the first training sentence
X[0], Y[0]

#### Determine the length of the longest sentence

In [13]:
# Pad sequences to have same length
max_len=478

X_padded =pad_sequences(X, maxlen=max_len, padding='post')
Y_padded =pad_sequences(Y, maxlen=max_len, padding='post')

dev_X_padded=pad_sequences(dev_X, maxlen=max_len, padding='post')
dev_Y_padded = pad_sequences(dev_Y, maxlen=max_len, padding='post')

test_X_padded=pad_sequences(test_X, maxlen=max_len, padding='post') 
test_Y_padded=pad_sequences(test_Y, maxlen=max_len, padding='post')

In [None]:
def evaluate_model(model, X_test, y_test):
    """Run ``model.evaluate`` on one split and print the loss and accuracy.

    ``model.evaluate`` must return exactly two values (loss, accuracy).
    Nothing is returned; the two numbers are only printed.
    """
    results = model.evaluate(X_test, y_test)
    loss, accuracy = results
    for label, value in (("Test Loss:", loss), ("Test Accuracy:", accuracy)):
        print(label, value)

### RNN

In [None]:
# Build RNN model
model = Sequential([
        Embedding(input_dim=len(vocab) + 1, output_dim=8),
        SimpleRNN(16, return_sequences=True),
        Dense(len(pos_tag_set), activation='softmax')
    ])
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train model
history = model.fit(X_padded, Y_padded, epochs=2, batch_size=64)

In [None]:
# Score the RNN on the dev split first, then on the test split
# (evaluate_model labels both printouts "Test ...").
for split_X, split_Y in ((dev_X_padded, dev_Y_padded),
                         (test_X_padded, test_Y_padded)):
    evaluate_model(model, split_X, split_Y)

## NetworkX

In [6]:
!pip install spacy networkx



In [8]:
import networkx as nx
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, SimpleRNN, Dense, Embedding, Flatten, Concatenate, TimeDistributed, Reshape

2024-08-03 22:00:39.697851: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-03 22:00:39.697950: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-03 22:00:39.813861: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [19]:
# Define data preprocessing and feature extraction functions
def preprocess_data(df, vocab=None, unique_pos_tags=None, max_seq_length=478):
    """Turn a sentence-level frame into padded model inputs and one-hot labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs list-valued columns ``sent`` and ``pos_tags``.  Side effect (kept
        from the original implementation): the columns ``sent_idx`` and
        ``pos_tag_idx`` are written to ``df``.
    vocab : dict, optional
        Word -> index mapping.  Defaults to a mapping built from ``df``.  Pass
        the mapping returned for the training split to encode dev/test data
        with the same indices; words missing from it are encoded as 0.
    unique_pos_tags : list, optional
        Tag inventory; a tag's position is its class index.  Defaults to the
        tags found in ``df``.
    max_seq_length : int, default 478
        Length every sequence is padded to.

    Returns
    -------
    tuple
        ``(X, y, vocab, unique_pos_tags)`` with ``X`` of shape
        ``(n_sentences, max_seq_length)`` and ``y`` of shape
        ``(n_sentences, max_seq_length, len(unique_pos_tags))``.

    Raises
    ------
    KeyError
        If ``df`` contains a tag that is not in ``unique_pos_tags``.
    """
    if vocab is None:
        words = {word for sublist in df['sent'] for word in sublist}
        # Start at 1: index 0 is what pad_sequences fills with, so no real word
        # may use it (build_model already sizes the embedding as vocab_size + 1).
        # Sorting makes the indices reproducible across runs.
        vocab = {word: idx for idx, word in enumerate(sorted(words, key=str), start=1)}
    if unique_pos_tags is None:
        # key=str keeps None (present among the tags) sortable.
        unique_pos_tags = sorted({tag for sublist in df['pos_tags'] for tag in sublist}, key=str)
    pos_to_index = {tag: idx for idx, tag in enumerate(unique_pos_tags)}

    df['sent_idx'] = df['sent'].apply(lambda tokens: [vocab.get(token, 0) for token in tokens])
    df['pos_tag_idx'] = df['pos_tags'].apply(lambda tags: [pos_to_index[tag] for tag in tags])

    X = pad_sequences(df['sent_idx'], maxlen=max_seq_length, padding='post')
    y = pad_sequences(df['pos_tag_idx'], maxlen=max_seq_length, padding='post')

    # One-hot encode sentence by sentence.
    # NOTE(review): padded label positions hold 0 and are therefore encoded as
    # the class of unique_pos_tags[0].
    y = np.array([to_categorical(row, num_classes=len(unique_pos_tags)) for row in y])

    return X, y, vocab, unique_pos_tags

def create_graphs(df):
    """Build one undirected graph per row of ``df`` from its ``sent`` tokens.

    Every token string becomes a node and every pair of neighbouring tokens is
    joined by an edge.  Nodes are keyed by the token text, so a token that
    occurs several times in a sentence is a single node.

    Returns a list of ``networkx.Graph`` objects in row order.
    """
    graphs = []
    for tokens in df['sent']:
        G = nx.Graph()
        G.add_nodes_from(tokens)
        # Link each token with its predecessor in the sentence.
        G.add_edges_from(zip(tokens, tokens[1:]))
        graphs.append(G)
    return graphs

def extract_graph_features(graphs, max_seq_length, dtype=np.float64):
    """Stack the zero-padded adjacency matrix of every graph.

    Parameters
    ----------
    graphs : iterable of networkx.Graph
    max_seq_length : int
        Side length each adjacency matrix is padded to.
    dtype : numpy dtype, default np.float64
        Element type of the result.  The default matches the previous
        behaviour; a smaller type such as ``np.float32`` shrinks the
        ``n * max_seq_length**2`` array accordingly.

    Returns
    -------
    numpy.ndarray
        Shape ``(len(graphs), max_seq_length, max_seq_length)``; the adjacency
        matrix of graph ``i`` sits in the top-left corner of ``result[i]``.
        (An empty ``graphs`` now yields shape ``(0, L, L)`` instead of ``(0,)``.)

    Raises
    ------
    ValueError
        If a graph has more nodes than ``max_seq_length``.
    """
    graphs = list(graphs)
    # Allocate the output once instead of collecting per-graph matrices in a
    # list and copying them with np.array, which held two full copies in memory.
    features = np.zeros((len(graphs), max_seq_length, max_seq_length), dtype=dtype)
    for row, G in enumerate(graphs):
        n_nodes = G.number_of_nodes()
        if n_nodes == 0:
            # Nothing to copy for an empty sentence; leave the all-zero block.
            continue
        if n_nodes > max_seq_length:
            raise ValueError(
                f"graph {row} has {n_nodes} nodes, more than max_seq_length={max_seq_length}"
            )
        features[row, :n_nodes, :n_nodes] = nx.adjacency_matrix(G).toarray()
    return features

# Define the model
def build_model(vocab_size, max_seq_length, num_pos_tags, embedding_dim=4, rnn_units=8):
    """Build and compile the two-input (token sequence + graph) tagger.

    Parameters
    ----------
    vocab_size : int
        Number of distinct words; the embedding table has ``vocab_size + 1`` rows.
    max_seq_length : int
        Padded sentence length; also the side length of the graph input.
    num_pos_tags : int
        Number of output classes per position.
    embedding_dim : int, default 4
        Size of the word embeddings.
    rnn_units : int, default 8
        Number of SimpleRNN units.

    Returns
    -------
    keras Model
        Inputs ``[sequence (L,), graph (L, L)]``, output ``(L, num_pos_tags)``
        softmax, compiled with categorical cross-entropy and Adam.
    """
    seq_input = Input(shape=(max_seq_length,))
    graph_input = Input(shape=(max_seq_length, max_seq_length))

    # The sequence length is already fixed by seq_input, so the deprecated
    # `input_length` argument is not passed to Embedding.
    embedding = Embedding(input_dim=vocab_size + 1, output_dim=embedding_dim)(seq_input)

    # One hidden state per position
    rnn_output = SimpleRNN(rnn_units, return_sequences=True)(embedding)

    # Squash the whole adjacency matrix into one value per position.
    # NOTE(review): this Dense layer alone holds about max_seq_length**3 weights.
    flatten_graph = Flatten()(graph_input)
    graph_dense = Dense(max_seq_length, activation='relu')(flatten_graph)
    graph_dense_reshaped = Reshape((max_seq_length, -1))(graph_dense)

    # Append the graph value to the RNN state of every position
    combined = Concatenate(axis=-1)([rnn_output, graph_dense_reshaped])

    # Per-position tag distribution
    output = TimeDistributed(Dense(num_pos_tags, activation='softmax'))(combined)

    model = Model(inputs=[seq_input, graph_input], outputs=output)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

def generate_batches(X, graph_features, y, batch_size, drop_remainder=True):
    """Yield aligned ``(X, graph_features, y)`` slices of ``batch_size`` rows.

    Parameters
    ----------
    X, graph_features, y : sliceable sequences of equal length
    batch_size : int
    drop_remainder : bool, default True
        True keeps the original behaviour: a final batch with fewer than
        ``batch_size`` rows is not yielded.  Pass False to also receive that
        shorter last batch so no sample is skipped.

    Yields
    ------
    tuple
        ``(X[start:end], graph_features[start:end], y[start:end])``
    """
    n_samples = len(X)
    # With drop_remainder the range stops at the last full batch boundary.
    stop = n_samples - n_samples % batch_size if drop_remainder else n_samples
    for start in range(0, stop, batch_size):
        end = start + batch_size
        yield (X[start:end], graph_features[start:end], y[start:end])

In [None]:
# Padded length shared by the sequence input and the graph input.
max_seq_length = 478

# Encode the training split; this rebinds X and vocab from the RNN section.
X, y, vocab, unique_pos_tags = preprocess_data(df)

# One token-adjacency graph per sentence, then its padded adjacency matrix.
graphs = create_graphs(df)
graph_features = extract_graph_features(graphs, max_seq_length)

In [14]:
# Shapes of the padded training inputs and their one-hot labels
print(X.shape, y.shape)

(6075, 478) (6075, 478, 18)


In [11]:
model = build_model(len(vocab), max_seq_length, len(unique_pos_tags))
batch_size = 16

# Train the model batch by batch, because calling model.fit on the whole dataset crashes the Kaggle notebook.
# NOTE(review): model.fit is called once per 16-sample batch, so epochs=2 repeats
# each batch twice in a row, validation_split=0.2 holds back part of every batch
# from training, and `history` only keeps the result of the last batch.
# generate_batches also skips the final partial batch.
for X_batch, graph_batch, y_batch in generate_batches(X, graph_features, y, batch_size):
    history = model.fit([X_batch, graph_batch], y_batch, epochs=2, batch_size=batch_size, validation_split=0.2)



Epoch 1/2
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 122s/step - accuracy: 0.0134 - loss: 2.8727

I0000 00:00:1722722599.376826   31488 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m180s[0m 180s/step - accuracy: 0.0134 - loss: 2.8727 - val_accuracy: 0.0042 - val_loss: 2.8137
Epoch 2/2
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step - accuracy: 0.0056 - loss: 2.8149 - val_accuracy: 0.0031 - val_loss: 2.7684
Epoch 1/2
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 167ms/step - accuracy: 0.0047 - loss: 2.7676 - val_accuracy: 0.0042 - val_loss: 2.7186
Epoch 2/2
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 122ms/step - accuracy: 0.0038 - loss: 2.7187 - val_accuracy: 0.0031 - val_loss: 2.6680
Epoch 1/2
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 168ms/step - accuracy: 0.0042 - loss: 2.6707 - val_accuracy: 0.0047 - val_loss: 2.6234
Epoch 2/2
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 122ms/step - accuracy: 0.0051 - loss: 2.6199 - val_accuracy: 0.0052 - val_loss: 2.5739
Epoch 1/2
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

In [12]:
def evaluate_in_batches(model, X, graph_features, y, batch_size):
    """Evaluate ``model`` in slices and return the mean loss and accuracy.

    Every sample is evaluated, including a final batch shorter than
    ``batch_size``; each batch's metrics are weighted by its number of samples
    so the result is the per-sample mean.  (Previously the trailing samples
    were skipped and fewer than ``batch_size`` samples caused a division by
    zero.)

    Parameters
    ----------
    model : object with ``evaluate([X_batch, graph_batch], y_batch, verbose=0)``
        returning ``(loss, accuracy)``.
    X, graph_features, y : sliceable sequences of equal length
    batch_size : int

    Returns
    -------
    tuple
        ``(mean_loss, mean_accuracy)``

    Raises
    ------
    ZeroDivisionError
        If ``X`` is empty.
    """
    n_samples = len(X)
    total_loss = 0
    total_accuracy = 0
    for start in range(0, n_samples, batch_size):
        end = start + batch_size
        X_batch = X[start:end]
        loss, accuracy = model.evaluate(
            [X_batch, graph_features[start:end]], y[start:end], verbose=0
        )
        # Weight by the batch size so a short last batch counts proportionally.
        total_loss += loss * len(X_batch)
        total_accuracy += accuracy * len(X_batch)
    return total_loss / n_samples, total_accuracy / n_samples

In [20]:
# Encode the dev split, build its graph features and score the trained model.
# NOTE(review): preprocess_data builds a new vocabulary and tag index from
# dev_df, so these indices are not the ones the model was trained with, and
# the global unique_pos_tags is overwritten with the dev tag list here.
X_dev, y_dev, dev_vocab, unique_pos_tags= preprocess_data(dev_df)
dev_graphs = create_graphs(dev_df)
dev_graph_features = extract_graph_features(dev_graphs, max_seq_length)

# Batches of 8 samples
loss, accuracy = evaluate_in_batches(model, X_dev, dev_graph_features, y_dev, 8)
print(f'Loss: {loss}, Accuracy: {accuracy}')

Loss: 0.24035578293610463, Accuracy: 0.9311034874578493


In [23]:
# Shapes of the padded dev inputs and their one-hot labels
print(X_dev.shape, y_dev.shape)

(909, 478) (909, 478, 18)


In [None]:
# Encode the test split, build its graph features and score the trained model.
# NOTE(review): preprocess_data builds a new vocabulary and tag index from
# test_df, so these indices are not the ones the model was trained with, and
# the global unique_pos_tags is overwritten with the test tag list here.
X_test, y_test, test_vocab, unique_pos_tags = preprocess_data(test_df)
test_graphs = create_graphs(test_df)
test_graph_features = extract_graph_features(test_graphs, max_seq_length)

# Batches of 8 samples
loss, accuracy = evaluate_in_batches(model, X_test, test_graph_features, y_test, 8)
print(f'Loss: {loss}, Accuracy: {accuracy}')

## Acknowledgements:
1. Original dataset: https://github.com/UniversalDependencies/UD_Arabic-PADT
2. Nada Alswah https://www.kaggle.com/code/nadaalswah/arabic-pos-with-lstm