# Common

In [75]:
import re
import string

def clean(inp: str) -> str:
	"""Normalize text to lowercase words separated by single spaces.

	Every character listed in ``string.punctuation`` is replaced by a space,
	the text is lowercased, and each run of whitespace is collapsed into a
	single space. Leading and trailing whitespace is collapsed to one space
	but not stripped.
	"""
	# Translation table: each punctuation code point maps to a space.
	punctuation_to_space = dict.fromkeys(map(ord, string.punctuation), " ")
	without_punctuation = inp.translate(punctuation_to_space)
	return re.sub(r'\s+', ' ', without_punctuation.lower())

# Skip-Gram

In [76]:
# Read the corpus file line by line and join the lines with single spaces.
with open('../data/text.txt', 'r') as f:
	test_text = f.readlines()
	test_text = ' '.join(test_text)

# NOTE(review): this literal overwrites the text just read from the file, so
# the file content is never used below - confirm that this is intentional.
test_text = """Empathy for the poor may not come easily to people who never experienced it. They may blame the victims and insist their predicament can be overcome through determination and hard work"""
clear_test_text = clean(test_text)
# Bare expression so the notebook displays the cleaned text.
clear_test_text

'empathy for the poor may not come easily to people who never experienced it they may blame the victims and insist their predicament can be overcome through determination and hard work'

In [77]:
import re

import numpy as np
import torch
from torch import nn
from collections import Counter
from torch.utils.data import DataLoader, Dataset

# Build the vocabulary and prepare the skip-gram training pairs
def prepare_data_skip_gram(text: str, window_size=2):
	"""Tokenize ``text`` and build (center, context) skip-gram pairs.

	Args:
		text: Raw input text. It is lowercased and every character other
			than ``a-z``, ``@``, ``#`` and whitespace is removed.
		window_size: Number of neighbours taken on each side of a center
			word.

	Returns:
		A tuple ``(data, word_to_ix, vocab_size)`` where ``data`` is a list
		of ``(center_word, context_word)`` string pairs, ``word_to_ix`` maps
		each distinct token to an integer index, and ``vocab_size`` is the
		number of distinct tokens.
	"""
	# Keep all whitespace (not only ' ') so that words separated by newlines
	# or tabs are not fused into a single token before splitting.
	text = re.sub(r'[^a-z@#\s]', '', text.lower())
	# Split on whitespace
	tokens = text.split()

	# Sort the vocabulary: iterating a set of strings directly yields a
	# different order on every interpreter run (hash randomization), which
	# would make the word indices non-reproducible.
	vocab = sorted(set(tokens))
	word_to_ix = {word: i for i, word in enumerate(vocab)}

	data = []
	n_tokens = len(tokens)
	for i, center in enumerate(tokens):
		# Clamp the window to the bounds of the token list.
		first = max(0, i - window_size)
		last = min(n_tokens, i + window_size + 1)
		for j in range(first, last):
			if i != j:
				data.append((center, tokens[j]))
	return data, word_to_ix, len(vocab)
 
class SkipGramDataset(Dataset):
	"""Dataset of (center index, context index) pairs for skip-gram training."""

	def __init__(self, data, word_to_ix):
		# Convert every (center, context) word pair to its index pair up front.
		index_pairs = []
		for center, context in data:
			index_pairs.append((word_to_ix[center], word_to_ix[context]))
		self.data = index_pairs

	def __len__(self):
		return len(self.data)

	def __getitem__(self, idx):
		# Each element is returned as a pair of int64 scalar tensors.
		pair = self.data[idx]
		center_ix = torch.tensor(pair[0], dtype=torch.long)
		context_ix = torch.tensor(pair[1], dtype=torch.long)
		return center_ix, context_ix
	

class Word2VecSkipGramModel(nn.Module):
	"""Skip-gram model: embedding lookup, linear layer, log-softmax over the vocabulary."""

	def __init__(self, vocab_size, embedding_dim):
		super().__init__()
		self.embeddings = nn.Embedding(vocab_size, embedding_dim)
		self.out_layer = nn.Linear(embedding_dim, vocab_size)
		self.activation_function = nn.LogSoftmax(dim=-1)

	def forward(self, center_word_idx):
		"""Return log-probabilities over the vocabulary for each center word index."""
		scores = self.out_layer(self.embeddings(center_word_idx))
		return self.activation_function(scores)

# Model training function (skip-gram)
def train_model(data, word_to_ix, vocab_size, embedding_dim=50, epochs=10, batch_size=1, lr=0.05):
	"""Train a ``Word2VecSkipGramModel`` with SGD and NLL loss.

	Args:
		data: (center_word, context_word) string pairs.
		word_to_ix: Mapping from word to integer index.
		vocab_size: Number of distinct words; sizes the embedding and output layers.
		embedding_dim: Dimensionality of the word embeddings.
		epochs: Number of passes over the data.
		batch_size: Number of pairs per optimization step.
		lr: SGD learning rate (previously hard-coded to 0.05).

	Returns:
		The trained model.
	"""
	dataset = SkipGramDataset(data, word_to_ix)
	dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

	model = Word2VecSkipGramModel(vocab_size, embedding_dim)
	loss_function = nn.NLLLoss()
	optimizer = torch.optim.SGD(model.parameters(), lr=lr)
	print('start')
	for epoch in range(epochs):
		# Sum of the per-batch loss values over the epoch (not an average).
		total_loss = 0
		for center_word, context_word in dataloader:
			optimizer.zero_grad()
			log_probs = model(center_word)
			loss = loss_function(log_probs, context_word)
			loss.backward()
			optimizer.step()
			total_loss += loss.item()

		print(f'Epoch {epoch + 1}, Loss: {total_loss}')
	return model

# Main entry point (skip-gram)
def train(data: str):
	"""Train a skip-gram model on ``data`` and return its word vectors.

	Hyperparameters are fixed: window size 2, embedding size 10, 5 epochs,
	batch size 2. The result maps every vocabulary word to its row of the
	trained embedding matrix (a NumPy array).
	"""
	pairs, word_to_ix, vocab_size = prepare_data_skip_gram(data, window_size=2)
	model = train_model(
		pairs,
		word_to_ix,
		vocab_size,
		embedding_dim=10,
		epochs=5,
		batch_size=2,
	)

	# Extract the word vectors from the model
	embeddings = model.embeddings.weight.data.numpy()
	# Invert the word -> index mapping so rows can be looked up by index.
	ix_to_word = dict(zip(word_to_ix.values(), word_to_ix.keys()))
	return {ix_to_word[ix]: embeddings[ix] for ix in range(vocab_size)}
# Test data: sample text used to train the skip-gram embeddings.
test_text = 'Captures Semantic Relationships: The skip-gram model effectively captures semantic relationships between words. It learns word embeddings that encode similar meanings and associations, allowing for tasks like word analogies and similarity calculations. Handles Rare Words: The skip-gram model performs well even with rare words or words with limited occurrences in the training data. It can generate meaningful representations for such words by leveraging the context in which they appear. Contextual Flexibility: The skip-gram model allows for flexible context definitions by using a window around each target word. This flexibility captures local and global word associations, resulting in richer semantic representations. Scalability: The skip-gram model can be trained efficiently on large-scale datasets due to its simplicity and parallelization potential. It can process vast amounts of text data to generate high-quality word embeddings.'

# Train on the sample text; the result maps each word to its embedding vector.
w2v_dict = train(test_text)
# print(w2v_dict)


start
Epoch 1, Loss: 1169.1363234519958
Epoch 2, Loss: 1101.0393064022064
Epoch 3, Loss: 1057.7352521419525
Epoch 4, Loss: 1027.1564388275146
Epoch 5, Loss: 1003.3897905349731


# CBOW

In [78]:
import torch.nn.functional as F

def prepare_data_cbow(text: str, window_size=2):
	"""Tokenize ``text`` and build (context, target) CBOW examples.

	Args:
		text: Raw input text. It is lowercased and every character other
			than ``a-z``, ``@``, ``#`` and whitespace is removed.
		window_size: Number of neighbours taken on each side of the target.

	Returns:
		A tuple ``(data, word_to_ix, vocab_size)``. ``data`` is a list of
		``(context_words, target_word)`` where ``context_words`` holds the
		``window_size`` words to the left (nearest first) followed by the
		``window_size`` words to the right (nearest first). Only positions
		with a full window on both sides produce an example.
	"""
	# Keep all whitespace (not only ' ') so that words separated by newlines
	# or tabs are not fused into a single token before splitting.
	text = re.sub(r'[^a-z@#\s]', '', text.lower())
	tokens = text.split()

	# Sort the vocabulary: iterating a set of strings directly yields a
	# different order on every interpreter run (hash randomization), which
	# would make the word indices non-reproducible.
	vocab = sorted(set(tokens))
	word_to_ix = {word: i for i, word in enumerate(vocab)}

	data = []
	for i in range(window_size, len(tokens) - window_size):
		left = [tokens[i - j - 1] for j in range(window_size)]
		right = [tokens[i + j + 1] for j in range(window_size)]
		data.append((left + right, tokens[i]))
	return data, word_to_ix, len(vocab)

class CBOWDataset(Dataset):
	"""Dataset of (context indices, target index) examples for CBOW training."""

	def __init__(self, data, word_to_ix):
		# Index every example in a single pass, then split into parallel lists.
		indexed = [
			([word_to_ix[word] for word in context], word_to_ix[target])
			for context, target in data
		]
		self.contexts = [context_ixs for context_ixs, _ in indexed]
		self.targets = [target_ix for _, target_ix in indexed]

	def __len__(self):
		return len(self.targets)

	def __getitem__(self, idx):
		# Return the context and the center word as a pair of int64 tensors
		context_tensor = torch.tensor(self.contexts[idx], dtype=torch.long)
		target_tensor = torch.tensor(self.targets[idx], dtype=torch.long)
		return context_tensor, target_tensor


class Word2VecCBOWModel(nn.Module):
	"""CBOW model: mean of context embeddings, linear layer, log-softmax over the vocabulary."""

	def __init__(self, vocab_size, embedding_dim):
		super().__init__()
		self.embeddings = nn.Embedding(vocab_size, embedding_dim)
		self.out_layer = nn.Linear(embedding_dim, vocab_size)
		self.activation_function = nn.LogSoftmax(dim=1)

	def forward(self, center_word_idx):
		"""Return log-probabilities over the vocabulary for each row of indices.

		The embeddings looked up for ``center_word_idx`` are averaged along
		dimension 1 before the output layer is applied.
		"""
		averaged = self.embeddings(center_word_idx).mean(dim=1)
		scores = self.out_layer(averaged)
		return self.activation_function(scores)

# Model training function (CBOW)
def train_model_cbow(data, word_to_ix, vocab_size, embedding_dim=50, epochs=10, batch_size=1, lr=0.001):
	"""Train a ``Word2VecCBOWModel`` with SGD and NLL loss.

	Args:
		data: (context_words, target_word) examples.
		word_to_ix: Mapping from word to integer index.
		vocab_size: Number of distinct words; sizes the embedding and output layers.
		embedding_dim: Dimensionality of the word embeddings.
		epochs: Number of passes over the data.
		batch_size: Number of examples per optimization step.
		lr: SGD learning rate (previously hard-coded to 0.001).

	Returns:
		The trained model.
	"""
	dataset = CBOWDataset(data, word_to_ix)
	dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

	model = Word2VecCBOWModel(vocab_size, embedding_dim)
	loss_function = nn.NLLLoss()
	optimizer = torch.optim.SGD(model.parameters(), lr=lr)

	for epoch in range(epochs):
		# Sum of the per-batch loss values over the epoch (not an average).
		total_loss = 0
		for context_words, target_word in dataloader:
			optimizer.zero_grad()
			log_probs = model(context_words)
			loss = loss_function(log_probs, target_word)
			loss.backward()
			optimizer.step()
			total_loss += loss.item()
		print(f'Epoch {epoch + 1}, Loss: {total_loss}')
	return model


# Main entry point (CBOW)
def train(data: str):
	"""Train a CBOW model on ``data`` and return its word vectors.

	Hyperparameters are fixed: window size 2, embedding size 10, 5 epochs,
	batch size 2. The result maps every vocabulary word to its row of the
	trained embedding matrix (a NumPy array).
	"""
	examples, word_to_ix, vocab_size = prepare_data_cbow(data, window_size=2)
	model = train_model_cbow(
		examples,
		word_to_ix,
		vocab_size,
		embedding_dim=10,
		epochs=5,
		batch_size=2,
	)

	# Extract the word vectors from the model
	embeddings = model.embeddings.weight.data.numpy()
	# Invert the word -> index mapping so rows can be looked up by index.
	ix_to_word = dict(zip(word_to_ix.values(), word_to_ix.keys()))
	return {ix_to_word[ix]: embeddings[ix] for ix in range(vocab_size)}
# Test data: sample text used to train the CBOW embeddings.
test_text = 'Captures Semantic Relationships: The skip-gram model effectively captures semantic relationships between words. It learns word embeddings that encode similar meanings and associations, allowing for tasks like word analogies and similarity calculations. Handles Rare Words: The skip-gram model performs well even with rare words or words with limited occurrences in the training data. It can generate meaningful representations for such words by leveraging the context in which they appear. Contextual Flexibility: The skip-gram model allows for flexible context definitions by using a window around each target word. This flexibility captures local and global word associations, resulting in richer semantic representations. Scalability: The skip-gram model can be trained efficiently on large-scale datasets due to its simplicity and parallelization potential. It can process vast amounts of text data to generate high-quality word embeddings.'

# Train on the sample text; the result maps each word to its embedding vector.
w2v_dict = train(test_text)
# print(w2v_dict)

Epoch 1, Loss: 283.4514169692993
Epoch 2, Loss: 283.341778755188
Epoch 3, Loss: 283.232702255249
Epoch 4, Loss: 283.1228437423706
Epoch 5, Loss: 283.01355934143066


: 