## TP1 - PLN

**Nome:** Antônio Caetano Neves Neto

**Matrícula:** 2022043698

### Importações

In [20]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

In [3]:
with open("files/questions-words.txt", "r") as src:
    lines = src.read().split("\n")

    section = lines[0].split(" ")[1]

    test_per_section = {section: []}

    for line in lines[1:]:
        if ":" in line:
            section = line.split(" ")[1]
            test_per_section[section] = []
        else:
            test_per_section[section].append(line)

    words_for_test = set(np.concatenate([tests for tests in test_per_section.values()]).tolist())
print(f"Número de palavras para teste: {len(words_for_test)}")

Número de palavras para teste: 19374


## Teste de Embedding

In [25]:
embedding_file = "files/embedding.txt"

In [16]:
def get_embedding():
    embedding = {}
    with open(embedding_file, "r") as src:
        lines = src.read().split("\n")

        n_lines, k = [int(w) for w in lines[0].split(" ")]

        for line in lines[1:]:
            w = line.strip().split(" ")
            embedding[w[0]] = np.array([float(n) for n in w[1:]])
    
    return embedding

In [15]:
def get_embedding_of_word(word, embedding):
  if word not in embedding:
    return embedding["</s>"]
  return embedding[word]

In [14]:
def cosine_similarity(v1, v2):
  dot_product = np.dot(v1, v2)
  norm_v1 = np.linalg.norm(v1)
  norm_v2 = np.linalg.norm(v2)
  if norm_v1 == 0 or norm_v2 == 0:
    return 0
  return dot_product / (norm_v1 * norm_v2)

def cosine_distance(v1, v2):
  return 1 - cosine_similarity(v1, v2)

In [26]:
def validate(embedding):
    results = {}
    for section, tests in test_per_section.items():
        cosine_distances = []

        for test in tests:
            if len(test.lower().split(" ")) != 4: continue

            word1, word2, word3, word4 = test.lower().split(" ")

            v = get_embedding_of_word(word1, embedding) + get_embedding_of_word(word2, embedding) - get_embedding_of_word(word3, embedding)

            cosine_distances.append(
                cosine_distance(v, get_embedding_of_word(word4, embedding))
            )

        results[section] = (np.mean(cosine_distances), np.std(cosine_distances))

    return results

In [27]:
results = {}

for strategy in ["CBOW", "Skip-gram"]:
    for window_size in [3, 5, 10, 20]:
        for embedding_size in [50, 100, 150]:
            for n_iter in [3, 5, 10]:
                configuration = f"{strategy}-{str(window_size)}-{str(embedding_size)}-{str(n_iter)}"
                print(f"Start {configuration} configuration..", end="\n\n")

                os.system(
                    f"./word2vec/word2vec -train files/context.txt -output files/embedding.txt -size {embedding_size} -window {window_size} -iter {n_iter} -cbow {int(strategy == 'CBOW')} -threads 4"
                )
                
                embedding = get_embedding()
                results[configuration] = validate(embedding)
                print()
                break
            break
        break
    break

Start CBOW-3-50-3 configuration..

Starting training using file files/context.txt
Vocab size: 71291
Words in train file: 16718843
Alpha: 0.000005  Progress: 100.01%  Words/thread/sec: 354.40k  


In [29]:
pd.DataFrame(results)

Unnamed: 0,CBOW-3-50-3
capital-common-countries,"(0.6579326547912443, 0.21777780325830695)"
capital-world,"(0.5098990135979198, 0.28979038516735267)"
city-in-state,"(0.4255234578439494, 0.21047971117336192)"
currency,"(0.9435725391048057, 0.1449417705893463)"
family,"(1.0250782276835824, 0.4670104677947348)"
gram1-adjective-to-adverb,"(1.044039981024375, 0.24828086566864788)"
gram2-opposite,"(1.0525152264409972, 0.2827792842439299)"
gram3-comparative,"(0.8807827041723958, 0.24474101837219994)"
gram4-superlative,"(1.0533373626265128, 0.24164052646143327)"
gram5-present-participle,"(1.1410245329992625, 0.2841799705798426)"
