# Setting up files, packages, and libraries

Install Required Libraries

In [None]:
!pip install hazm
!pip install evaluate
!pip install jiwer
!pip install mega.py

Restart the session (needed for the newly installed packages to take effect)

In [None]:
# Restart Session
# Sends signal 9 to this kernel's own process, which terminates it immediately;
# nothing placed after the os.kill call in this cell would run.
# NOTE(review): presumably done so the runtime restarts and picks up the
# packages installed above — confirm this is still required.
import os
os.kill(os.getpid(), 9)

Import libraries

In [87]:
import hazm
import evaluate
from mega import Mega
from collections import Counter
import jiwer
from jiwer import wer

Upload the input files, then list the working directory to confirm they are present

In [106]:
# List the current working directory to check which files are available.
%ls

 fasttext_model.zip
 fasttext_skipgram_300.bin
 fasttext_skipgram_300.vec
 HarfOutp.txt
'[Persian] آلن تورینگ پدر هوش مصنوعی و بزرگترین کد شکن تاریخ [DownSub.com].txt'
 pos_tagger.model
 [0m[01;34msample_data[0m/


# Q4

Initialize normalizer

In [100]:
# Create one hazm Normalizer; read_and_normalize below reads it as a module-level name.
normalizer = hazm.Normalizer()
normalizer  # bare expression so the cell displays the object's repr

<hazm.normalizer.Normalizer at 0x79a816febeb0>

Function to read and normalize text from a file

In [101]:
def read_and_normalize(file_path):
    """Read a UTF-8 text file and return its content after normalization.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The whole file content passed through the module-level
        ``normalizer.normalize``.
    """
    with open(file_path, encoding='utf-8') as handle:
        raw_text = handle.read()
    return normalizer.normalize(raw_text)

Read and normalize the prediction and reference texts

In [102]:
# Load both transcripts through the same read-and-normalize step.
prediction_text = read_and_normalize('HarfOutp.txt')
reference_text = read_and_normalize('[Persian] آلن تورینگ پدر هوش مصنوعی و بزرگترین کد شکن تاریخ [DownSub.com].txt')

# Preview the first 85 characters of each text as a quick sanity check.
print(">>> prediction_text[:85]:")
print(f"\t {prediction_text[:85]}")
print(">>> reference_text[:85]:")
print(f"\t {reference_text[:85]}")

>>> prediction_text[:85]:
	 . در دوران جنگ جهانی دوم، یکی از برگ برنده‌های متفقین رمزگشایی از پیام‌های سری نازی‌ه
>>> reference_text[:85]:
	 در دوران جنگ جهانی دوم یکی از برگه
برنده‌های متفقین رمزگشایی از پیام‌های

سری نازی‌ها


Clean both texts with the same jiwer transformations (punctuation, whitespace, case) before scoring

In [103]:
# Clean-up pipeline applied identically to the prediction and the reference:
# drop punctuation, collapse repeated spaces, trim the ends, lowercase, and
# discard empty strings.
transformation = jiwer.Compose([
    jiwer.RemovePunctuation(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.Strip(),
    jiwer.ToLowerCase(),
    jiwer.RemoveEmptyStrings()
])

# Turn line breaks into spaces in BOTH texts so the two inputs are
# preprocessed symmetrically (previously only the reference was handled).
# The replacement is idempotent, so re-running this cell is safe.
prediction_text = prediction_text.replace("\n", " ")
reference_text = reference_text.replace("\n", " ")

# Transform the text
transformed_prediction = transformation(prediction_text)
transformed_reference = transformation(reference_text)

# Preview the first 85 characters of each cleaned text.
print(">>> transformed_prediction[:85]:")
print(f"\t {transformed_prediction[:85]}")
print(">>> transformed_reference[:85]:")
print(f"\t {transformed_reference[:85]}")

>>> transformed_prediction[:85]:
	 در دوران جنگ جهانی دوم یکی از برگ برنده‌های متفقین رمزگشایی از پیام‌های سری نازی‌ها ب
>>> transformed_reference[:85]:
	 در دوران جنگ جهانی دوم یکی از برگه برنده‌های متفقین رمزگشایی از پیام‌های سری نازی‌ها 


In [104]:
# Calculate WER using the evaluate library
wer_metric = evaluate.load("wer")

# The result is stored as `wer_score` so that the `wer` function imported from
# jiwer in the import cell is not overwritten by a float.
wer_score = wer_metric.compute(
    predictions=[transformed_prediction],
    references=[transformed_reference],
)

# Print the WER
print(f">>> Word Error Rate (WER): {wer_score:.6f}")

>>> Word Error Rate (WER): 0.227290


In [105]:
# Character-level counterpart of the WER cell: load the "cer" metric from the
# evaluate library and score the cleaned prediction against the cleaned reference.
cer_metric = evaluate.load("cer")
cer = cer_metric.compute(
    references=[transformed_reference],
    predictions=[transformed_prediction],
)

# Report the CER with six decimal places.
print(f">>> Character Error Rate (CER): {cer:.6f}")

>>> Character Error Rate (CER): 0.100094


# Q5

Download the POS tagger model

In [46]:
# Fetch the file with this Google Drive id via gdown
# (the recorded output shows it being saved as pos_tagger.model).
!gdown 1Q3JK4NVUC2t5QT63aDiVrCRBV225E_B3

Downloading...
From: https://drive.google.com/uc?id=1Q3JK4NVUC2t5QT63aDiVrCRBV225E_B3
To: /content/pos_tagger.model
100% 19.2M/19.2M [00:00<00:00, 89.2MB/s]


Initialize normalizer and POS tagger

In [48]:
# Normalizer read by read_normalize_and_tag below as a module-level name.
normalizer = hazm.Normalizer()

# Load the POS tagger from the model file in the working directory.
tagger = hazm.POSTagger(model='pos_tagger.model')
# Alternative left disabled by the author, who notes that the download helper
# would have to be provided by the library's publishers:
# tagger = hazm.POSTagger(model=hazm.utils.download_model('postagger'))

Function to read, normalize, and POS tag text from a file

In [49]:
def read_normalize_and_tag(file_path):
    """Read a UTF-8 text file and return its POS-tagged tokens.

    The file content is normalized with the module-level ``normalizer``,
    split with ``hazm.word_tokenize`` and tagged with the module-level
    ``tagger``.

    Args:
        file_path: Path of the text file to process.

    Returns:
        Whatever ``tagger.tag`` returns for the token list; the rest of this
        notebook unpacks each item as a ``(word, tag)`` pair.
    """
    with open(file_path, encoding='utf-8') as source:
        raw_text = source.read()
    tokens = hazm.word_tokenize(normalizer.normalize(raw_text))
    return tagger.tag(tokens)

Function to count verbs and adverbs in POS tagged text

In [50]:
def count_verbs_and_adverbs(pos_tags):
    """Count verb and adverb tokens in a POS-tagged sequence.

    A token counts as a verb when its tag starts with ``'V'`` and as an
    adverb when its tag is exactly ``'ADV'``.

    The input is traversed only once, so one-shot iterables (generators,
    iterators) are counted correctly as well as lists.

    Args:
        pos_tags: Iterable of ``(word, tag)`` pairs.

    Returns:
        A ``(verbs, adverbs)`` tuple of integer counts.
    """
    verbs = 0
    adverbs = 0
    for _word, tag in pos_tags:
        if tag.startswith('V'):
            verbs += 1
        elif tag == 'ADV':  # 'ADV' never starts with 'V', so the branches are disjoint
            adverbs += 1
    return verbs, adverbs

Process the prediction and reference files

In [51]:
# POS-tag both transcripts with the same read/normalize/tokenize/tag pipeline.
prediction_pos_tags = read_normalize_and_tag('HarfOutp.txt')
reference_pos_tags = read_normalize_and_tag('[Persian] آلن تورینگ پدر هوش مصنوعی و بزرگترین کد شکن تاریخ [DownSub.com].txt')

Count verbs and adverbs

In [52]:
# Each call returns a (verbs, adverbs) pair of counts.
prediction_verbs, prediction_adverbs = count_verbs_and_adverbs(prediction_pos_tags)
reference_verbs, reference_adverbs = count_verbs_and_adverbs(reference_pos_tags)

# Print the counts for the two texts one after the other for comparison.
print(f">>> Prediction \n\t Verbs: {prediction_verbs}, Adverbs: {prediction_adverbs}")
print(f">>> Reference \n\t Verbs: {reference_verbs}, Adverbs: {reference_adverbs}")

>>> Prediction 
	 Verbs: 294, Adverbs: 78
>>> Reference 
	 Verbs: 290, Adverbs: 74


# Q6

Initialize normalizer, POS tagger, and lemmatizer

In [69]:
# The model file was already downloaded in the Q5 section, so the gdown call stays disabled:
# !gdown 1Q3JK4NVUC2t5QT63aDiVrCRBV225E_B3

# Re-create the normalizer and POS tagger, and add a lemmatizer for this question.
normalizer = hazm.Normalizer()
tagger = hazm.POSTagger(model='pos_tagger.model')
# The stemmer alternative is left disabled; extract_and_count_verbs below uses the lemmatizer.
# stemmer = hazm.Stemmer()
lemmatizer = hazm.Lemmatizer()

Function to read, normalize, and POS tag text from a file

In [70]:
def read_normalize_and_tag(file_path):
    """Read a UTF-8 text file, normalize it, tokenize it and POS-tag the tokens.

    Relies on the module-level ``normalizer`` and ``tagger`` objects and on
    ``hazm.word_tokenize``.

    NOTE(review): a function with this name and the same steps is already
    defined in the Q5 section; consider keeping a single definition.

    Args:
        file_path: Path of the text file to process.

    Returns:
        The result of ``tagger.tag`` applied to the token list.
    """
    with open(file_path, encoding='utf-8') as text_file:
        content = text_file.read()
    cleaned = normalizer.normalize(content)
    return tagger.tag(hazm.word_tokenize(cleaned))

Function to lemmatize the verbs and return the most common verb lemma with its count

In [71]:
def extract_and_count_verbs(pos_tags):
    """Return the most frequent verb lemma in a POS-tagged sequence.

    Tokens whose tag starts with ``'V'`` are treated as verbs. Each verb is
    lemmatized with the module-level ``lemmatizer`` and the lemmas are counted.

    Args:
        pos_tags: Iterable of ``(word, tag)`` pairs.

    Returns:
        A ``(lemma, count)`` tuple for the most common verb lemma, or
        ``(None, 0)`` when the input contains no verb at all (previously this
        case raised ``IndexError``).
    """
    # The lemmatizer is used rather than the stemmer: per the author's notes,
    # stemming رفتم gives رف whereas lemmatizing it gives رفت#رو.
    verb_counts = Counter(
        lemmatizer.lemmatize(word)
        for word, tag in pos_tags
        if tag.startswith('V')
    )

    # most_common(1) is an empty list when nothing was counted.
    if not verb_counts:
        return None, 0
    return verb_counts.most_common(1)[0]

Process the prediction and reference files

In [72]:
# POS-tag both transcripts again, this time with the objects created in this section.
prediction_pos_tags = read_normalize_and_tag('HarfOutp.txt')
reference_pos_tags = read_normalize_and_tag('[Persian] آلن تورینگ پدر هوش مصنوعی و بزرگترین کد شکن تاریخ [DownSub.com].txt')

Extract and count verbs

In [73]:
# Each result is the (lemma, count) entry of the most frequent verb lemma.
prediction_most_common_verb = extract_and_count_verbs(prediction_pos_tags)
reference_most_common_verb = extract_and_count_verbs(reference_pos_tags)

Print the most common verbs

In [74]:
# Index 0 of each result is the lemma, index 1 is how many times it occurred.
print(f">>> Prediction \n\t Most common verb root: '{prediction_most_common_verb[0]}', Count: {prediction_most_common_verb[1]}")
print(f">>> Reference.txt \n\t Most common verb root: '{reference_most_common_verb[0]}', Count: {reference_most_common_verb[1]}")

>>> Prediction 
	 Most common verb root: 'کرد#کن', Count: 45
>>> Reference.txt 
	 Most common verb root: 'کرد#کن', Count: 36


# Q7

Download the fastText model archive (it contains fasttext_skipgram_300.bin)

In [27]:
mega = Mega()
# Log in without credentials, i.e. with a temporary anonymous account
# (a credentialed login would be: m = mega.login(email, password)).
m = mega.login()
# Download the file behind this MEGA link; as the last expression of the cell,
# its return value is displayed (the recorded output shows fasttext_model.zip).
m.download_url('https://mega.nz/file/GqZUlbpS#XRYP5FHbPK2LnLZ8IExrhrw3ZQ-jclNSVCz59uEhrxY')

PosixPath('fasttext_model.zip')

Unzip the model archive

In [28]:
!unzip fasttext_model.zip

Archive:  fasttext_model.zip
  inflating: fasttext_skipgram_300.bin  
  inflating: fasttext_skipgram_300.vec  


Test on a single example

In [39]:
# Load the fastText binary extracted above through hazm's WordEmbedding class.
word_embedding = hazm.WordEmbedding(model_type = 'fasttext', model_path = 'fasttext_skipgram_300.bin')
# Smoke test: ask which word of this small list does not match the others.
word_embedding.doesnt_match(['سلام' ,'درود' ,'خداحافظ' ,'پنجره'])

'پنجره'

Write the function requested in the question

In [40]:
def find_unrelated_words(_words_lists, _word_embedding):
    """Pick the odd-one-out word from each list of words.

    Args:
        _words_lists: Iterable of word lists.
        _word_embedding: Object exposing ``doesnt_match(words)``, which is
            called once per list, in order.

    Returns:
        A list with one ``doesnt_match`` result per input list, in the same
        order as the input.
    """
    return [_word_embedding.doesnt_match(group) for group in _words_lists]

Test the function

In [43]:
# Sample input: four word lists to run through find_unrelated_words.
list_of_word_lists = [
    ['کتاب', 'مدرسه', 'ماشین', 'دانشگاه', 'مداد'],
    ['گل', 'درخت', 'چمن', 'صندلی', 'بوته'],
    ['سلام', 'درود', 'خداحافظ', 'پنجره', 'بدرورد'],
    ['ساعت', 'پلنگ', 'شیر', 'ببر', 'سگ'],
]

# One result per input list, in the same order.
unrelated_words = find_unrelated_words(list_of_word_lists, word_embedding)

# Echo the inputs first, then the word picked from each list.
print(">>> Words lists:")
for word_list in list_of_word_lists:
    print(f"\t {word_list}")
print()
print(">>> Unrelated words in each list:")
for picked_word in unrelated_words:
    print(f"\t {picked_word}")
print()

>>> Words lists:
	 ['کتاب', 'مدرسه', 'ماشین', 'دانشگاه', 'مداد']
	 ['گل', 'درخت', 'چمن', 'صندلی', 'بوته']
	 ['سلام', 'درود', 'خداحافظ', 'پنجره', 'بدرورد']
	 ['ساعت', 'پلنگ', 'شیر', 'ببر', 'سگ']

>>> Unrelated words in each list:
	 ماشین
	 صندلی
	 پنجره
	 ساعت

