In [None]:
import numpy as np
import requests
from collections import Counter
import re
import pandas as pd

def compute_similarity(word, model, word2idx):
    """Rank every other vocabulary word by cosine similarity to ``word``.

    Args:
        word: Query word.
        model: Object exposing ``W1``; ``model.W1[i]`` is used as the
            embedding vector of the word whose index is ``i``.
        word2idx: Mapping from word to its index into ``model.W1``.

    Returns:
        A list of ``(other_word, similarity)`` tuples sorted by similarity in
        descending order, excluding ``word`` itself. An empty list is
        returned when ``word`` is not a key of ``word2idx``. When either
        vector has zero norm the cosine is undefined and the pair is assigned
        a similarity of ``0.0`` instead of NaN.
    """
    if word not in word2idx:
        return []

    word_vector = model.W1[word2idx[word]]
    # The query norm does not change across iterations, so compute it once.
    word_norm = np.linalg.norm(word_vector)
    similarities = {}

    for other_word, idx in word2idx.items():
        if other_word == word:
            continue
        other_vector = model.W1[idx]
        denominator = word_norm * np.linalg.norm(other_vector)
        if denominator == 0:
            # Dividing by zero would yield NaN, which makes the sort order
            # below unreliable and turns any average over the result into NaN.
            similarities[other_word] = 0.0
        else:
            similarities[other_word] = np.dot(word_vector, other_vector) / denominator

    return sorted(similarities.items(), key=lambda x: x[1], reverse=True)

if __name__ == "__main__":
    API_KEY = "xxxxx"  # Replace with your NewsAPI key

    # Words whose nearest neighbours are inspected for every configuration.
    sample_words = ["los", "angeles", "tribune"]  # Example words for similarity computation
    models, word2idxs = {}, {}
    results, metrics = [], []

    # Every (window size, embedding dim) pair, window size varying slowest.
    configs = [(w, d) for w in [1, 2, 3] for d in [50, 100, 200]]

    for config in configs:
        window_size, embedding_dim = config
        print(f"Training model with window size {window_size} and embedding dim {embedding_dim}...")
        model, word2idx = train_model(API_KEY, window_size, embedding_dim)
        models[config] = model
        word2idxs[config] = word2idx

        similarity_details = []
        per_word_means = []
        for sample_word in sample_words:
            if sample_word not in word2idx:
                continue
            ranked = compute_similarity(sample_word, model, word2idx)
            if not ranked:
                continue
            # Mean similarity between this sample word and all other words.
            per_word_means.append(sum(score for _, score in ranked) / len(ranked))
            similarity_details.append((sample_word, ranked))

        # Average of the per-word means; 0 when no sample word was usable.
        avg_similarity = sum(per_word_means) / len(per_word_means) if per_word_means else 0

        results.append((window_size, embedding_dim, avg_similarity))
        metrics.append({
            "Window Size": window_size,
            "Embedding Dim": embedding_dim,
            "Avg Similarity": avg_similarity,
            "Similarity Details": similarity_details,
        })

    # Pick the configuration with the highest average similarity.
    best_config = max(results, key=lambda entry: entry[2])
    print(f"\nBest configuration: Window Size {best_config[0]}, Embedding Dim {best_config[1]} with Avg Similarity {best_config[2]:.4f}")

    # Show the collected metrics as a table.
    metrics_df = pd.DataFrame(metrics)
    print("\nSimilarity Metrics:")
    print(metrics_df)

    # Print the full similarity ranking of each sample word for every configuration.
    for metric in metrics:
        print(f"\nWindow Size: {metric['Window Size']}, Embedding Dim: {metric['Embedding Dim']}")
        for word, similarities in metric["Similarity Details"]:
            print(f"Similar words for '{word}':")
            for similar_word, similarity in similarities:
                print(f"  {similar_word}: {similarity:.4f}")


Training model with window size 1 and embedding dim 50...
Training model with window size 1 and embedding dim 100...
Training model with window size 1 and embedding dim 200...
Training model with window size 2 and embedding dim 50...
Training model with window size 2 and embedding dim 100...
Training model with window size 2 and embedding dim 200...
Training model with window size 3 and embedding dim 50...
Training model with window size 3 and embedding dim 100...
Training model with window size 3 and embedding dim 200...

Best configuration: Window Size 3, Embedding Dim 50 with Avg Similarity 0.2964

Similarity Metrics:
   Window Size  Embedding Dim  Avg Similarity  \
0            1             50       -0.067213   
1            1            100       -0.075282   
2            1            200       -0.074420   
3            2             50        0.102038   
4            2            100        0.123060   
5            2            200        0.103807   
6            3             5