In [6]:
# Imported in this cell so it runs on a fresh kernel: the notebook's shared
# import cell only appears further down.
from google.colab import drive

# Mount Google Drive so the corpus directory below is reachable.
drive.mount('/content/drive')
# Root of the training split; the loading loop walks its category subfolders.
dataset_path = '/content/drive/My Drive/NLP_2/lab 1/ilur-news-corpus/ilur-news-corpus/train'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
import os
import re
import string
import gensim
import numpy as np
from google.colab import drive
from collections import Counter
from gensim.models import Word2Vec

# Armenian-specific character normalization
def normalize_armenian_text(text):
    """Map Armenian-specific symbols in ``text`` to plain equivalents.

    The dram sign becomes the abbreviation "դր.", guillemets become straight
    double quotes, "՝" becomes a comma, "․" becomes a full stop and the long
    dash becomes a hyphen. Every other character is returned unchanged.
    """
    # Every key is a single character and no substitution output contains
    # another key, so one translate() pass matches chained replacements.
    symbol_table = str.maketrans({
        "֏": "դր.",
        "«": '"',
        "»": '"',
        "՝": ",",
        "․": ".",
        "—": "-",
    })
    return text.translate(symbol_table)

def preprocess_text(text):
    """Turn raw text into a list of lowercase word tokens.

    Steps, in order: lowercase, apply ``normalize_armenian_text``, delete
    digit runs, delete every character that is neither a word character nor
    whitespace (plus underscores), then split on whitespace.

    Stripped characters are deleted rather than replaced by a space, so a
    word containing an in-word punctuation mark stays a single token.

    Returns:
        list[str]: the tokens; empty when nothing survives the cleaning.
    """
    text = normalize_armenian_text(text.lower())
    text = re.sub(r'\d+', '', text)  # Remove numbers
    # \w matches "_" as well, so the underscore has to be listed explicitly
    # for it to be removed along with the rest of the punctuation.
    text = re.sub(r'[^\w\s]|_', '', text)  # Remove punctuation
    return text.split()

# Load and preprocess: each .txt file contributes one token list, and
# word_freq accumulates token counts over the whole corpus.
all_sentences = []
word_freq = Counter()

# os.listdir returns entries in arbitrary order; sorting keeps the document
# order (and therefore the training input) stable from run to run.
for category in sorted(os.listdir(dataset_path)):
    category_path = os.path.join(dataset_path, category)
    if not os.path.isdir(category_path):
        continue  # only directories are treated as categories
    for filename in sorted(os.listdir(category_path)):
        if not filename.endswith('.txt'):
            continue
        file_path = os.path.join(category_path, filename)
        # Read the whole file, then release the handle before tokenising.
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        words = preprocess_text(text)
        word_freq.update(words)
        all_sentences.append(words)

# Remove rare words: keep only tokens counted at least 5 times in word_freq.
frequent_words = {token for token, count in word_freq.items() if count >= 5}
filtered_sentences = [
    [token for token in tokens if token in frequent_words]
    for tokens in all_sentences
]

# Train: hyperparameters are gathered in one mapping so they can be read
# (and tuned) in a single place; anything not listed uses gensim's default.
w2v_params = {
    "vector_size": 300,
    "window": 5,
    "sg": 1,        # Skip-gram
    "negative": 4,  # Negative sampling
    "epochs": 20,
}
model = Word2Vec(sentences=filtered_sentences, **w2v_params)

# Save the trained model to Drive and report where it went.
model_path = "/content/drive/My Drive/armenian_word2vec.model"
model.save(model_path)
print(f"Model saved at {model_path}")

Model saved at /content/drive/My Drive/armenian_word2vec.model


In [9]:
# Sanity check: print the nearest neighbours of one query word, or a notice
# when the word is not in the model's vocabulary.
word = "մենք"
if word not in model.wv:
    print(f"'{word}' not in vocabulary")
else:
    neighbours = model.wv.most_similar(word)
    print(f"Words similar to '{word}':", neighbours)

Words similar to 'մենք': [('չենք', 0.6284009218215942), ('ենք', 0.6123251914978027), ('լինենք', 0.5881370902061462), ('չունենք', 0.579660177230835), ('ունենք', 0.5709298849105835), ('բոլորս', 0.5578805804252625), ('գործել', 0.5464447140693665), ('էինք', 0.5415245294570923), ('հաստատ', 0.5328810811042786), ('տղաներն', 0.5326166749000549)]


In [16]:
from gensim.models import Word2Vec

# Reload the saved model from Drive and repeat the similarity query on it.
model_path = "/content/drive/My Drive/armenian_word2vec.model"
model = Word2Vec.load(model_path)

word = "մենք"

# `vector` holds the word's embedding, or None when the word is unknown.
if word in model.wv:
    print(f"Words similar to '{word}':", model.wv.most_similar(word))
    vector = model.wv[word]
else:
    print(f"'{word}' not in vocabulary")
    vector = None

Words similar to 'մենք': [('չենք', 0.5803602933883667), ('ենք', 0.5495970249176025), ('բոլորս', 0.49090391397476196), ('ունենք', 0.4745944142341614), ('չէինք', 0.4368734061717987), ('անելիքներ', 0.4304770231246948), ('կորոշեք', 0.4225435256958008), ('հասկանանք', 0.41318249702453613), ('չունենք', 0.4104703366756439), ('կկարողանանք', 0.4080561101436615)]
