## Feature extraction

In [8]:
# Imports
from gensim.models import FastText
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from transformers import BertModel, BertTokenizer
from collections import OrderedDict
import torch
import csv
import pandas as pd

In [9]:
%run preprocessing.ipynb

15
Char [['m', 's', '>', 'l', 'p'], ['w', 'm', 'n'], ['H', 'n', 'v'], ['w', 'h', 'w'], ['q', 'A', 'd', 'r'], ['E', 'l', 'Y'], ['A', 'l', '<', 'T', 'E', 'A', 'm'], ['>', 'w'], ['A', 'l', 'k', 's', 'w', 'p'], ['>', 'w'], ['A', 'l', 'E', 't', 'q'], ['v', 'm'], ['A', 'f', 't', 'q', 'r'], ['f', 'E', 'j', 'z'], ['E', 'n'], ['k', 'l'], ['*', 'l', 'k'], ['l', 'm'], ['y', 'j', 'z', 'h'], ['A', 'l', 'S', 'w', 'm'], ['>', 'S', 'l', 'A']]
Diac [['a', 'o', 'a', 'a', 'N'], ['a', 'a', 'o'], ['a', 'i', 'a'], ['a', 'u', 'a'], ['a', ' ', 'i', 'N'], ['a', 'a', ' '], [' ', ' ', 'i', 'o', 'a', ' ', 'i'], ['a', 'o'], [' ', 'o', 'i', 'o', 'a', 'i'], ['a', 'o'], [' ', 'o', 'i', 'o', 'i'], ['u', '~a'], [' ', 'o', 'a', 'a', 'a'], ['a', 'a', 'a', 'a'], ['a', 'o'], ['u', '~i'], ['a', 'i', 'a'], ['a', 'o'], ['u', 'o', 'i', 'i'], [' ', ' ', '~a', 'o', 'u'], ['a', 'o', 'F', ' ']]


### Read the corpus

In [4]:
def readFile(path):
	sentences = []
	with open(path, 'r', encoding='utf-8') as file:
		for line in file:
			sentences.append(line.strip())

	return sentences

PATH = "../dataset/train.txt"
corpus = readFile(PATH)

In [8]:
data = []
for sentence in corpus:
    sentence = run_buckwalter(sentence)
    char_list, _ = extract_labels(sentence)

    char_list = ["".join(sen) for sen in char_list]
    data.append(char_list)

### FastText

In [59]:
# Defining values for parameters
embedding_size = 100
window_size = 20
min_word = 5
down_sampling = 1e-2

fast_Text_model = FastText(
                            vector_size=embedding_size,
                            window=window_size,
                            min_count=min_word,
                            sample=down_sampling,
                            workers=4,
                            epochs=50,
                            seed=42,
                            sg=1)
fast_Text_model.build_vocab(data, progress_per=10000)
fast_Text_model.train(data, total_examples=fast_Text_model.corpus_count, epochs=50, report_delay=1)

(98820800, 105103400)

In [74]:
fast_Text_model.save("./models/ft_model")
print(fast_Text_model.wv[buckwalter.transliterate("ياكل")])
print(fast_Text_model.wv.similarity(buckwalter.transliterate("احمد"), buckwalter.transliterate("محمد")))

[-0.17587607  0.02877168  0.04709973  0.34052846 -0.10787471  0.03396543
 -0.11186929  0.03204577  0.02601715 -0.04619154 -0.64464664  0.23255761
 -0.20373139 -0.03959081  0.08147166  0.05366697  0.05857076 -0.03167843
 -0.06125082  0.17182784 -0.14936416  0.07375959 -0.27469558 -0.34764272
  0.29371625 -0.10324     0.08420175 -0.05359089  0.08896402 -0.04694974
  0.3940764   0.08949303 -0.5643163  -0.2209296   0.33949855  0.14446706
  0.13378167 -0.15082327  0.15100697 -0.05039954  0.28573117  0.02301891
  0.01095174 -0.22188954 -0.2218215  -0.28840852 -0.01482417 -0.0590201
  0.36846104 -0.03828136 -0.39380917  0.06536474  0.321336    0.25974402
  0.27468032 -0.1842511  -0.28447118  0.0251733  -0.09457348  0.19323996
  0.25880393 -0.04231093 -0.12811872  0.352846    0.5898262  -0.15524375
  0.19650614 -0.0900159  -0.19504753 -0.3754987   0.2750499  -0.06968006
  0.26242155  0.08634424  0.2556117  -0.35728014 -0.35281286  0.2248184
  0.05599839  0.34907785  0.10232218  0.3731012  -0.0

In [10]:
data = []
word_count = 0
for sentence in corpus:
    sentence = run_buckwalter(sentence)
    char_list, _ = extract_labels(sentence)

    char_list = ["".join(sen) for sen in char_list]
    joined_with_space = " ".join(char_list)
    word_count += len(joined_with_space)
    data.append(joined_with_space)

In [18]:
print(len(data), word_count)

50000 10403546


### TF_IDF

In [None]:
tr_idf_model = TfidfVectorizer(lowercase=False)
tf_idf_vector = tr_idf_model.fit_transform(data)
words_set = tr_idf_model.get_feature_names_out()
df_tf_idf = pd.DataFrame(columns=words_set)
chunk_size=1000
for i in range(0, tf_idf_vector.shape[0], chunk_size):
        end_idx = min(i + chunk_size, tf_idf_vector.shape[0])
        tf_idf_chunk = tf_idf_vector[i:end_idx].toarray()
        chunk_df = pd.DataFrame(tf_idf_chunk, columns=words_set)
        df_tf_idf = pd.concat([df_tf_idf, chunk_df], ignore_index=True)
df_tf_idf.to_csv('models/tf_idf.csv', index=False)

In [None]:
# Read the DataFrame from the CSV file
df_from_csv = pd.read_csv('your_dataframe.csv')

# Accessing TF-IDF values for the word 'example'
tf_idf_for_example = df_from_csv['example']

### Bag Of Words

In [None]:
vectorizer = CountVectorizer()

X = vectorizer.fit_transform(data)

feature_names = vectorizer.get_feature_names_out()

X_array = X.toarray()

df = pd.DataFrame(data=X_array, columns=feature_names, index=data)

df.to_csv('models/bag_of_words.csv', index=False)

### Contextual Embeddings

In [38]:
def bert_text_preparation(text, tokenizer):
    """
    Preprocesses text input in a way that BERT can interpret.
    """
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.encode(marked_text, max_length=512, truncation=True, padding=True)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1]*len(indexed_tokens)
    # convert inputs to tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensor = torch.tensor([segments_ids])
    return tokenized_text, tokens_tensor, segments_tensor

def get_bert_embeddings(tokens_tensor, segments_tensor, model):
    """
    Obtains BERT embeddings for tokens.
    """
    # gradient calculation id disabled
    with torch.no_grad():
        # obtain hidden states
        outputs = model(tokens_tensor, segments_tensor)
        hidden_states = outputs[2]
    # concatenate the tensors for all layers
    # use "stack" to create new dimension in tensor
    token_embeddings = torch.stack(hidden_states, dim=0)
    # remove dimension 1, the "batches"
    token_embeddings = torch.squeeze(token_embeddings, dim=1)
    # swap dimensions 0 and 1 so we can loop over tokens
    token_embeddings = token_embeddings.permute(1, 0, 2)
    # intialized list to store embeddings
    token_vecs_sum = []
    # "token_embeddings" is a [Y x 12 x 768] tensor
    # where Y is the number of tokens in the sentence
    # loop over tokens in sentence
    for token in token_embeddings:
        # "token" is a [12 x 768] tensor
        # sum the vectors from the last four layers
        sum_vec = torch.sum(token[-4:], dim=0)
        token_vecs_sum.append(sum_vec)
    return token_vecs_sum

def visualize_embeddings(context_tokens, context_embeddings):
    filepath = "models/embeddings.tsv"
    with open(filepath, 'w+') as file_metadata:
        for i, token in enumerate(context_tokens):
            file_metadata.write(token + '\n')
    with open(filepath, 'w+') as tsvfile:
        writer = csv.writer(tsvfile, delimiter='\t')
        for embedding in context_embeddings:
            writer.writerow(embedding.numpy())

In [11]:
model_name = "bert-base-multilingual-cased"
model = BertModel.from_pretrained(model_name, output_hidden_states=True)
tokenizer = BertTokenizer.from_pretrained(model_name)

context_embeddings = []
context_tokens = []
for sentence in data:
    tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(
        sentence, tokenizer)
    list_token_embeddings = get_bert_embeddings(
        tokens_tensor, segments_tensors, model)
    # make ordered dictionary to keep track of the position of each   word
    tokens = OrderedDict()
    # loop over tokens in sensitive sentence
    for token in tokenized_text[1:-1]:
        # keep track of position of word and whether it occurs multiple times
        if token in tokens:
            tokens[token] += 1
        else:
            tokens[token] = 1
        # compute the position of the current token
        token_indices = [i for i, t in enumerate(
            tokenized_text) if t == token]
        current_index = token_indices[tokens[token]-1]
        # get the corresponding embedding
        token_vec = list_token_embeddings[current_index]
        # save values
        context_tokens.append(token)
        context_embeddings.append(token_vec)

visualize_embeddings(context_tokens, context_embeddings)

KeyboardInterrupt: 

### Main

In [78]:
# word_embeddings_fasttext()
loaded_model = FastText.load("./models/ft_model")
print(loaded_model.wv.similarity(buckwalter.transliterate("احمد"), buckwalter.transliterate("محمد")))

0.56691873
