In [6]:
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from collections import Counter


nltk.download('wordnet')
nltk.download('stopwords')

def load_quran_texts(base_path):
    translations = [
        'en.ahmedali', 'en.ahmedraza', 'en.arberry', 'en.daryabadi',
        'en.hilali', 'en.itani', 'en.maududi', 'en.mubarakpuri',
        'en.pickthall', 'en.qarai', 'en.qaribullah', 'en.sahih',
        'en.sarwar', 'en.shakir', 'en.transliteration', 'en.wahiduddin',
        'en.yusufali'
    ]
    all_texts = []
    for translation in translations:
        file_path = os.path.join(base_path, f"{translation}.txt")
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
                all_texts.append(text.lower())
            print(f"Loaded: {translation}")
        except FileNotFoundError:
            print(f"Warning: File not found - {file_path}")
    return all_texts

def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = text.split()
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    return words

def lemmatize_text(words):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in words]

def train_word2vec(texts, size=200, window=10, min_count=5, workers=4):
    model = Word2Vec(sentences=texts, vector_size=size, window=window, min_count=min_count, workers=workers, sg=0)  # sg=0 for CBOW
    return model
def get_vocabulary(texts):
    all_words = [word for text in texts for word in text]
    vocab = Counter(all_words)
    return vocab

def print_vocabulary_stats(vocab):
    print(f"Total unique words: {len(vocab)}")
    print(f"Top 20 most common words:")
    for word, count in vocab.most_common(20):
        print(f"{word}: {count}")

def main():
    print("Loading and preprocessing Quran texts...")
    base_path = r'C:\Users\Alamsyah\Skripsi 6\Quran'  # Update this path
    quran_texts = load_quran_texts(base_path)
    
    preprocessed_texts = [preprocess_text(text) for text in quran_texts]
    lemmatized_texts = [lemmatize_text(text) for text in preprocessed_texts]
    
    # Get vocabulary after preprocessing
    vocab = get_vocabulary(lemmatized_texts)
    print_vocabulary_stats(vocab)
    
    print("\nTraining word2vecLL model...")
    word2vecLL = train_word2vec(lemmatized_texts)
    
    # Print vocabulary size in the trained model
    print(f"\nVocabulary size in trained Word2Vec model: {len(word2vecLL.wv.key_to_index)}")
    
    print("\nSaving word2vecLL model...")
    if not os.path.exists('models'):
        os.makedirs('models')
    word2vecLL.save("models/word2vecLL_quran.model")
    
    print("word2vecLL model training and saving completed.")

if __name__ == "__main__":
    main()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Alamsyah\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alamsyah\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Loading and preprocessing Quran texts...
Loaded: en.ahmedali
Loaded: en.ahmedraza
Loaded: en.arberry
Loaded: en.daryabadi
Loaded: en.hilali
Loaded: en.itani
Loaded: en.maududi
Loaded: en.mubarakpuri
Loaded: en.pickthall
Loaded: en.qarai
Loaded: en.qaribullah
Loaded: en.sahih
Loaded: en.sarwar
Loaded: en.shakir
Loaded: en.transliteration
Loaded: en.wahiduddin
Loaded: en.yusufali
Total unique words: 36254
Top 20 most common words:
allah: 32922
god: 16415
lord: 15867
shall: 12084
say: 11725
said: 11444
people: 9788
day: 8585
one: 7962
indeed: 7399
u: 7279
know: 6291
earth: 6199
believe: 5882
ye: 5727
surely: 5602
may: 5552
come: 5467
upon: 5430
would: 5364

Training word2vecLL model...

Vocabulary size in trained Word2Vec model: 9998

Saving word2vecLL model...
word2vecLL model training and saving completed.


In [9]:
import numpy as np
from gensim.models import Word2Vec
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
import joblib

# Load the information content
brown_ic = wordnet_ic.ic('ic-brown.dat')

def safe_similarity(func):
    def wrapper(word1, word2):
        try:
            return func(word1, word2)
        except:
            return 0
    return wrapper

@safe_similarity
def wu_palmer_similarity(word1, word2):
    synsets1 = wn.synsets(word1)
    synsets2 = wn.synsets(word2)
    if synsets1 and synsets2:
        max_sim = max((s1.wup_similarity(s2) or 0) for s1 in synsets1 for s2 in synsets2)
        return max_sim if max_sim > 0 else 0
    return 0

@safe_similarity
def jiang_conrath_similarity(word1, word2):
    synsets1 = wn.synsets(word1)
    synsets2 = wn.synsets(word2)
    if synsets1 and synsets2:
        max_sim = max((s1.jcn_similarity(s2, brown_ic) or 0) for s1 in synsets1 for s2 in synsets2)
        return max_sim if max_sim > 0 else 0
    return 0

@safe_similarity
def hirst_st_onge_similarity(word1, word2):
    synsets1 = wn.synsets(word1)
    synsets2 = wn.synsets(word2)
    if synsets1 and synsets2:
        max_sim = max((s1.hso_similarity(s2) or 0) for s1 in synsets1 for s2 in synsets2)
        return max_sim if max_sim > 0 else 0
    return 0

def create_similarity_matrix(word, similar_words):
    M = np.zeros((4, len(similar_words)))
    for i, similar_word in enumerate(similar_words):
        M[0, i] = wu_palmer_similarity(word, similar_word)
        M[1, i] = jiang_conrath_similarity(word, similar_word)
        M[2, i] = hirst_st_onge_similarity(word, similar_word)
        M[3, i] = 1  # bias
    return M
    print(f"Similarity matrix for '{word}':")
    print(M)
    return M

def normalize_matrix(M):
    min_vals = M.min(axis=1, keepdims=True)
    max_vals = M.max(axis=1, keepdims=True)
    return (M - min_vals) / (max_vals - min_vals + 1e-10)
    print("Normalized similarity matrix:")
    print(normalized_M)
    return normalized_M

def create_value_matrix(D, SM):
    return np.vstack((D, SM)).T
    print("Value matrix:")
    print(V)
    return V

def seek(w, P):
    try:
        return np.where(P[0] == w)[0][0]
    except IndexError:
        return P.shape[1]

def calculate_error(y_true, y_pred):
    return mean_squared_error(y_true, y_pred)

def load_golden_standard(file_path):
    golden_standard = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split(':')
            if len(parts) == 2:
                concept = parts[0].strip()
                words = [w.strip() for w in parts[1].split(',')][:5]
                golden_standard[concept] = words
    return golden_standard

def train_word2vecLLS(word2vecLL, golden_standard, max_iter=100):
    nn_model = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=1, warm_start=True)
    all_words = list(word2vecLL.wv.key_to_index.keys())
    
    X_train = []
    y_train = []
    
    for concept, related_words in golden_standard.items():
        if concept not in word2vecLL.wv:
            continue
        similar_words = [w for w, _ in word2vecLL.wv.most_similar(concept, topn=20)]
        M = create_similarity_matrix(concept, similar_words)
        SM = normalize_matrix(M)
        D = word2vecLL.wv.distances(concept, similar_words)
        V = create_value_matrix(D, SM)
        X_train.extend(V)
        y_train.extend([1 if word in related_words else 0 for word in similar_words])
    
    X_train = np.array(X_train)
    y_train = np.array(y_train)
    
    for i in range(max_iter):
        nn_model.fit(X_train, y_train)
        y_pred = nn_model.predict(X_train)
        error = mean_squared_error(y_train, y_pred)
        print(f"Iteration {i+1}, Mean Error: {error}")
        
        if error < 0.01:
            break
    
    return nn_model


def main():
    print("Loading word2vecLL model...")
    word2vecLL = Word2Vec.load("models/word2vecLL_quran.model")
    
    print("Loading Golden Standard from 5w100c...")
    golden_standard = load_golden_standard("5w100c.txt")
    
    print("Training word2vecLLS model...")
    word2vecLLS = train_word2vecLLS(word2vecLL, golden_standard, max_iter=100)
    
    print("Saving word2vecLLS model...")
    joblib.dump(word2vecLLS, "models/word2vecLLS_quran.model")
    print("word2vecLLS model training and saving completed.")
    print(f"\nVocabulary size in trained Word2Vec model: {len(word2vecLL.wv.key_to_index)}")

if __name__ == "__main__":
    main()

Loading word2vecLL model...
Loading Golden Standard from 5w100c...
Training word2vecLLS model...




Iteration 1, Mean Error: 0.012833860099951577
Iteration 2, Mean Error: 0.012260762488352318
Iteration 3, Mean Error: 0.012172185608659184
Iteration 4, Mean Error: 0.012808869199219088
Iteration 5, Mean Error: 0.012208642212833416
Iteration 6, Mean Error: 0.01224649458319283
Iteration 7, Mean Error: 0.012215524807038415
Iteration 8, Mean Error: 0.012163781271080852
Iteration 9, Mean Error: 0.012484550814821008
Iteration 10, Mean Error: 0.012004132075545014
Iteration 11, Mean Error: 0.011907030389243515
Iteration 12, Mean Error: 0.01194123295653125
Iteration 13, Mean Error: 0.011969077840810517
Iteration 14, Mean Error: 0.011994461576880468
Iteration 15, Mean Error: 0.012171330486600324
Iteration 16, Mean Error: 0.011892232632189498
Iteration 17, Mean Error: 0.012412023262820765
Iteration 18, Mean Error: 0.012389229309364147
Iteration 19, Mean Error: 0.011902741999840165
Iteration 20, Mean Error: 0.012002346861300802
Iteration 21, Mean Error: 0.012629826774130564
Iteration 22, Mean Error

In [5]:
from gensim.models import Word2Vec
import joblib
import numpy as np

def load_5w100c(file_path):
    word_groups = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split(':')
            if len(parts) == 2:
                concept = parts[0].strip()
                words = [w.strip() for w in parts[1].split(',')][:5]  # Take only the first 5 words
                word_groups[concept] = words
    return word_groups

def evaluate_model(model_func, golden_standard, k_values=[20, 50, 100, 200]):
    results = {}
    
    for k in k_values:
        true_positives = 0
        false_positives = 0
        false_negatives = 0
        skipped_concepts = []
        
        for concept, related_words in golden_standard.items():
            try:
                predicted_similar = [w for w, _ in model_func(concept, topn=k)]
            except KeyError:
                skipped_concepts.append(concept)
                continue
            
            true_positives += len(set(related_words) & set(predicted_similar))
            false_positives += len(predicted_similar) - len(set(related_words) & set(predicted_similar))
            false_negatives += len(set(related_words) - set(predicted_similar))
        
        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        
        results[k] = {
            'precision': precision,
            'recall': recall,
            'f1': f1
        }
    
    print(f"Skipped concepts: {skipped_concepts}")
    print(f"Number of skipped concepts: {len(skipped_concepts)}")
    return results

# In your main function, update the evaluation part:
def main():
    print("Loading models...")
    word2vecLL = Word2Vec.load("models/word2vecLL_quran.model")
    word2vecLLS = joblib.load("models/word2vecLLS_quran.model")
    
    print("Loading 5w100c dataset...")
    golden_standard = load_golden_standard('5w100c.txt')
    
    print("Evaluating word2vecLL model...")
    results_LL = evaluate_model(word2vecLL.wv.most_similar, golden_standard)
    
    print("Evaluating word2vecLLS model...")
    def word2vecLLS_most_similar(word, topn):
        similar_words = [w for w, _ in word2vecLL.wv.most_similar(word, topn=topn)]
        M = create_similarity_matrix(word, similar_words)
        SM = normalize_matrix(M)
        D = word2vecLL.wv.distances(word, similar_words)
        V = create_value_matrix(D, SM)
        similarities = word2vecLLS.predict(V)
        return sorted(zip(similar_words, similarities), key=lambda x: x[1], reverse=True)[:topn]
    
    results_LLS = evaluate_model(word2vecLLS_most_similar, golden_standard)
    
    # Print results
    for k in [20, 50, 100, 200]:
        print(f"\nResults for k={k}:")
        print(f"word2vecLL - Precision: {results_LL[k]['precision']:.4f}, Recall: {results_LL[k]['recall']:.4f}, F1: {results_LL[k]['f1']:.4f}")
        print(f"word2vecLLS - Precision: {results_LLS[k]['precision']:.4f}, Recall: {results_LLS[k]['recall']:.4f}, F1: {results_LLS[k]['f1']:.4f}")

if __name__ == "__main__":
    main()

Loading models...
Loading 5w100c dataset...
Evaluating word2vecLL model...
Skipped concepts: ['children', 'orphans', 'neighbors', 'travelers', 'strangers', 'oppressors', 'unity', 'disunity']
Number of skipped concepts: 8
Evaluating word2vecLLS model...
Skipped concepts: ['children', 'orphans', 'neighbors', 'travelers', 'strangers', 'oppressors', 'unity', 'disunity']
Number of skipped concepts: 8

Results for k=20:
word2vecLL - Precision: 0.0118, Recall: 0.0471, F1: 0.0188
word2vecLLS - Precision: 0.0118, Recall: 0.0471, F1: 0.0188

Results for k=50:
word2vecLL - Precision: 0.0054, Recall: 0.0541, F1: 0.0098
word2vecLLS - Precision: 0.0054, Recall: 0.0541, F1: 0.0098

Results for k=100:
word2vecLL - Precision: 0.0047, Recall: 0.0941, F1: 0.0090
word2vecLLS - Precision: 0.0047, Recall: 0.0941, F1: 0.0090

Results for k=200:
word2vecLL - Precision: 0.0031, Recall: 0.1247, F1: 0.0061
word2vecLLS - Precision: 0.0031, Recall: 0.1247, F1: 0.0061
