In [None]:
from pymongo import MongoClient
from konlpy.tag import Okt

In [None]:
import os

# The connection string (including any username/password) is read from the
# MONGODB_URI environment variable so that credentials are never stored in the
# notebook. A missing variable raises KeyError here instead of failing later.
client = MongoClient(os.environ["MONGODB_URI"])
db = client['news']
documents = db['contents']

In [3]:
tokenizer = Okt()

# Tokens that are removed from every document after noun extraction.
stop_word = ['기자', '뉴시스', '[', ']', '(', ')', '했다', '에서', '.[', '하는']

# Only the 'content' field is used, so ask MongoDB for just that field and skip
# documents where it is missing or not a string (those would otherwise raise
# KeyError on docu['content'] or be passed to the tokenizer as a non-string).
content_cursor = documents.find(
    {'content': {'$type': 'string'}},
    {'content': 1, '_id': 0},
)
contents_list = [tokenizer.nouns(docu['content']) for docu in content_cursor]

# Per document: keep nouns longer than one character that are not stop words.
tokens = [[t for t in tok if t not in stop_word and len(t) > 1] for tok in contents_list]

In [4]:
# Loop-based construction of the same filtered token lists as `tokens`:
# one list per document, keeping nouns longer than one character that are
# not in `stop_word`.
tokens_tmp = []
for doc_nouns in contents_list:
    kept_nouns = [noun for noun in doc_nouns if len(noun) > 1 and noun not in stop_word]
    tokens_tmp.append(kept_nouns)

In [5]:
from collections import Counter

In [6]:
def build_vocab(corpus, n_vocab, special_tokens):
    """Build a vocabulary list: special tokens first, then frequent corpus tokens.

    Args:
        corpus: Iterable of token sequences; every token is counted.
        n_vocab: Maximum number of corpus tokens to append, taken in
            ``Counter.most_common`` order (most frequent first).
        special_tokens: Tokens placed at the front of the vocabulary, in the
            order given.

    Returns:
        A new list. ``special_tokens`` itself is left unmodified, so the same
        list can safely be passed to several calls.
    """
    counter = Counter()
    for tokens in corpus:
        counter.update(tokens)
    # Copy first: appending to the caller's list would silently grow it on
    # every call.
    vocab = list(special_tokens)
    vocab.extend(token for token, _count in counter.most_common(n_vocab))
    return vocab


In [7]:
# Vocabulary: the "<unk>" placeholder followed by at most 5000 of the most
# frequent tokens in the filtered corpus.
vocab = build_vocab(tokens, 5000, ["<unk>"])

In [8]:
# Token -> integer id, where the id is the token's position in `vocab`.
token_to_id = dict(zip(vocab, range(len(vocab))))

In [9]:
# Spot check: show the integer id assigned to the token '서울'.
token_to_id['서울']

30

In [10]:
# Integer id -> token: the inverse lookup, keyed by position in `vocab`.
id_to_token = dict(enumerate(vocab))

In [11]:
def get_word_pairs(tokens, window_size):
    """Collect [center, context] pairs for skip-gram training.

    Args:
        tokens: Iterable of sentences, each an indexable, sliceable sequence
            of words (e.g. a list of strings).
        window_size: Number of positions looked at on each side of the
            center word; the window is clipped at the sentence boundaries.

    Returns:
        A flat list of two-element lists ``[center_word, context_word]``,
        ordered by sentence, then center position, then context position
        (left neighbours before right neighbours).
    """
    pairs = []
    for sentence in tokens:
        n_words = len(sentence)
        for position in range(n_words):
            left = max(0, position - window_size)
            right = min(n_words, position + window_size + 1)
            center = sentence[position]
            # Neighbours on both sides, leaving out the center word itself.
            neighbours = sentence[left:position] + sentence[position + 1:right]
            pairs.extend([center, neighbour] for neighbour in neighbours)
    return pairs

In [12]:
# Preview the first ten filtered tokens of the first document.
tokens[0][:10]

['서울', '지난', '방송', '예능', '우리', '새끼', '이상민', '신혼집', '공개', '사진']

In [13]:
# [center, context] word pairs using a window of 2 words on each side.
word_pairs = get_word_pairs(tokens, 2)

In [14]:
def get_index_pairs(word_pairs, token_to_id):
    """Translate [center, context] word pairs into [center_id, context_id] pairs.

    Args:
        word_pairs: Iterable of two-element (center_word, context_word) items.
        token_to_id: Mapping from token to integer id. It must contain the
            key '<unk>'; otherwise a KeyError is raised.

    Returns:
        A list of two-element lists of ids. Words missing from
        ``token_to_id`` are replaced by the id of '<unk>'.
    """
    fallback_id = token_to_id['<unk>']
    lookup = token_to_id.get
    return [
        [lookup(center, fallback_id), lookup(context, fallback_id)]
        for center, context in word_pairs
    ]

In [15]:
# Convert the word pairs to id pairs; out-of-vocabulary words map to '<unk>'.
index_pairs = get_index_pairs(word_pairs=word_pairs, token_to_id=token_to_id)

In [16]:
import torch
# Print the PyTorch, CUDA and cuDNN versions reported by torch.
for label, version in (
    ('Torch :', torch.__version__),
    ('CUDA :', torch.version.cuda),
    ('cuDNN :', torch.backends.cudnn.version()),
):
    print(label, version)

# Describe the first GPU when CUDA is usable; otherwise report that it is not.
if not torch.cuda.is_available():
    print("GPU를 사용할 수 없습니다.")
else:
    print("GPU 사용 가능!")
    print(f"사용 가능한 GPU 개수: {torch.cuda.device_count()}")
    print(f"첫 번째 GPU 이름: {torch.cuda.get_device_name(0)}")

Torch : 2.7.1+cu128
CUDA : 12.8
cuDNN : 90701
GPU 사용 가능!
사용 가능한 GPU 개수: 1
첫 번째 GPU 이름: NVIDIA GeForce RTX 4060 Laptop GPU


In [17]:
from torch import nn

class SkipGram(nn.Module):
    """Skip-gram model: an embedding lookup followed by a linear layer over the vocabulary."""

    def __init__(self, vocab_size, embedding_dim):
        """Create the layers.

        Args:
            vocab_size: Number of embedding rows and number of output logits.
            embedding_dim: Width of each embedding vector.
        """
        super().__init__()
        # Order matters for parameter initialisation: embedding first, then linear.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, input_ids):
        """Return one logit per vocabulary entry for each id in ``input_ids``."""
        return self.linear(self.embedding(input_ids))

In [18]:
import torch 
from torch.utils.data import TensorDataset, DataLoader

# `index_pairs` is rebound to its own tensor form, so this cell may run more
# than once. as_tensor returns an existing int64 tensor unchanged, whereas
# torch.tensor() would copy-construct from a tensor and emit a warning.
index_pairs = torch.as_tensor(index_pairs, dtype=torch.long)
center_indexs = index_pairs[:, 0]   # column 0: center-word ids
context_indexs = index_pairs[:, 1]  # column 1: context-word ids

# Each sample is one (center id, context id) pair; batches are reshuffled
# on every pass over the loader.
dataset = TensorDataset(center_indexs, context_indexs)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [19]:
from torch import optim 
# Use the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Skip-gram model with one row/logit per vocabulary id and 128-dim embeddings,
# trained with cross-entropy loss and plain SGD.
word2vec = SkipGram(len(token_to_id), 128).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.SGD(word2vec.parameters(), lr=0.1)

In [20]:
# Show which device string ("cuda" or "cpu") was selected.
device

'cuda'

In [21]:
# Train for 10 epochs and print the mean batch loss after each one.
for epoch in range(10):
    cost = 0.0
    for input_ids, target_ids in dataloader:
        input_ids = input_ids.to(device)
        target_ids = target_ids.to(device)
        logits = word2vec(input_ids)
        loss = criterion(logits, target_ids)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Accumulate a plain Python float. Adding the loss tensor itself would
        # turn `cost` into an autograd tensor and keep every batch's history
        # alive until the end of the epoch.
        cost += loss.item()
    cost = cost / len(dataloader)
    print (f"Epoch {epoch+1:4d} Cost : {cost:.3f}")


Epoch    1 Cost : 7.057
Epoch    2 Cost : 6.300
Epoch    3 Cost : 6.072
Epoch    4 Cost : 5.941
Epoch    5 Cost : 5.847
Epoch    6 Cost : 5.774
Epoch    7 Cost : 5.712
Epoch    8 Cost : 5.660
Epoch    9 Cost : 5.613
Epoch   10 Cost : 5.571


In [25]:
import numpy as np 
from numpy.linalg import norm
def cosine_sim(a, b):
    """Return the cosine similarity of two vectors: dot(a, b) / (|a| * |b|)."""
    magnitude = norm(b) * norm(a)
    return np.dot(a, b) / magnitude

# Copy the trained embedding table to a NumPy array and pick the rows of the
# two tokens being compared.
embedding_table = word2vec.embedding.weight.detach().cpu().numpy()
김지민 = embedding_table[token_to_id['김지민']]
김준호 = embedding_table[token_to_id['김준호']]

cosine_sim(김지민, 김준호)

0.28108668

In [26]:
# Pair each vocabulary entry with an embedding row, in order.
# Assumes row i of the embedding matrix belongs to vocab[i] — TODO confirm
# that `vocab` has no duplicate tokens.
embedding_matrix = word2vec.embedding.weight.detach().cpu().numpy()
token_to_embedding = dict(zip(vocab, embedding_matrix))

# Query token and its embedding vector.
token = "김지민"
token_embedding = token_to_embedding[token]

import numpy as np
from numpy.linalg import norm


def cosine_similarity(a, b):
    """Cosine similarity between vector ``a`` and every row of matrix ``b``.

    Args:
        a: 1-D vector.
        b: 2-D array; row norms are taken along ``axis=1``.

    Returns:
        1-D array with one similarity value per row of ``b``.
    """
    dot_products = np.dot(b, a)
    magnitudes = norm(a) * norm(b, axis=1)
    return dot_products / magnitudes

def top_n_index(cosine_matrix, n):
    """Return the indexes ranked 2nd through (n+1)-th by descending value.

    The highest-ranked index is dropped; presumably it is the query token
    itself, which has similarity 1 to itself — verify against the caller.

    Args:
        cosine_matrix: 1-D NumPy array of similarity scores.
        n: Maximum number of indexes to return.
    """
    ranked_desc = np.flip(cosine_matrix.argsort())
    return ranked_desc[1:1 + n]

# Similarity of the query embedding to every embedding row, then the 20
# best-ranked neighbours (top_n_index drops the single highest-ranked entry).
cosine_matrix = cosine_similarity(token_embedding, embedding_matrix)
top_n = top_n_index(cosine_matrix, n=20)

# The heading count comes from `top_n`, so it always matches the number of
# rows printed below.
print(f"{token}와 가장 유사한 {len(top_n)} 개 단어")
for index in top_n:
    print(f"{id_to_token[index]} - 유사도 : {cosine_matrix[index]:.4f}")


김지민와 가장 유사한 5 개 단어
결혼식 - 유사도 : 0.2964
김대희 - 유사도 : 0.2946
김준호 - 유사도 : 0.2811
자료 - 유사도 : 0.2803
백지영 - 유사도 : 0.2753
이티 - 유사도 : 0.2601
뮤직뱅크 - 유사도 : 0.2597
면모 - 유사도 : 0.2532
최강 - 유사도 : 0.2516
엑스포츠 - 유사도 : 0.2499
플래닛 - 유사도 : 0.2487
단절 - 유사도 : 0.2472
예상 - 유사도 : 0.2470
열애 - 유사도 : 0.2458
경악 - 유사도 : 0.2448
개월 - 유사도 : 0.2446
연하 - 유사도 : 0.2408
냉부 - 유사도 : 0.2390
홈즈 - 유사도 : 0.2375
방문 - 유사도 : 0.2320


In [4]:
from gensim.models import Word2Vec
# Train a gensim skip-gram (sg=1) Word2Vec model on the same token lists.
# NOTE: this rebinds `word2vec`, replacing the PyTorch SkipGram model of the
# same name. `tokens` must already exist in the kernel; the recorded NameError
# came from running this cell without it.
w2v_params = dict(
    vector_size=128,
    window=5,
    min_count=3,
    sg=1,
    epochs=10,
    max_final_vocab=5000,
)
word2vec = Word2Vec(sentences=tokens, **w2v_params)
# word2vec.wv['김지민']
# word2vec.wv.most_similar("김지민", topn=10)
word2vec.wv.similarity(w1='김준호', w2='김지민')

NameError: name 'tokens' is not defined

In [3]:
from gensim.models import Word2Vec