# Text preprocessing

In [126]:
# Data locations. DATA already ends with a path separator, so file names are
# appended directly; adding another "/" would produce "data//train.csv".
DATA = "data/"

# Raw CSV inputs.
TRAIN = f"{DATA}train.csv"
TEST = f"{DATA}test.csv"

# Per-column text files split out of the CSVs.
TRAIN_LABEL = f"{DATA}train_label.txt"
TRAIN_TITLE = f"{DATA}train_title.txt"
TRAIN_TEXT = f"{DATA}train_text.txt"
TEST_LABEL = f"{DATA}test_label.txt"
TEST_TITLE = f"{DATA}test_title.txt"
TEST_TEXT = f"{DATA}test_text.txt"

# Normalised text written by the normalisation step.
TRAIN_NORM = f"{DATA}train_norm.txt"
TEST_NORM = f"{DATA}test_norm.txt"

In [127]:
# # Split the csv into label, title, text
# import pandas as pd

# train = pd.read_csv(TRAIN)
# train_col = train.columns
# train[train_col[0]].to_csv(TRAIN_LABEL, index=False)
# train[train_col[1]].to_csv(TRAIN_TITLE, index=False)
# train[train_col[2]].to_csv(TRAIN_TEXT, index=False)

# test = pd.read_csv(TEST)
# test_col = test.columns
# test[test_col[0]].to_csv(TEST_LABEL, index=False)
# test[test_col[1]].to_csv(TEST_TITLE, index=False)
# test[test_col[2]].to_csv(TEST_TEXT, index=False)
# del train, test, pd

In [128]:
# from src.Normalizer import normalize_data
# normalize_data(TRAIN_TEXT, TRAIN_NORM)
# normalize_data(TEST_TEXT, TEST_NORM)

In [129]:
import numpy as np
from typing import List, Union, Generator, Dict
import torch
import numpy as np
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from torch import optim
from collections import Counter
from datetime import time

In [130]:
class Corpus:
	"""Skip-gram training data built from a whitespace-tokenised text file.

	Construction runs four steps in order: count words, subsample the
	vocabulary, collect unique (center, context) id pairs inside the window,
	and pre-draw ``k`` negative samples for every pair.

	Args:
		file_name: Path of the text file; tokens are split on whitespace and
			every line is treated as one window-bounded sequence.
		window: Number of neighbours taken on each side of the center word.
		k: Number of negative samples drawn per positive pair.
		t: Subsampling threshold; a word with relative frequency ``f`` is kept
			with probability ``sqrt(t / f)`` (always kept once that is >= 1).

	Attributes:
		word2id / id2word: Vocabulary mappings for the words that survived
			subsampling, ids assigned in descending count order.
		word_count: Counts of the surviving words.
		vocab_size: Number of surviving words.
		total: Sum of the surviving counts.
		pairs: ``int64`` array of shape ``(N, 2, 1)`` holding unique
			(center id, context id) pairs.
		neg: ``int64`` array of shape ``(N, k)`` with negative sample ids.
		noise_dist: Torch tensor with the unigram**(3/4) noise distribution,
			indexed by word id.
	"""

	def __init__(self, file_name: str, window: int = 1, k: int = 3, t: float = 1e-5):
		self.file_name = file_name
		self.word2id: Dict[str, int] = {}
		self.id2word: Dict[int, str] = {}
		self.word_count = {}
		self.k = k
		self.t = t
		self.vocab_size = 0
		self.total = 0
		self.window = window
		self.pairs = []
		self.noise_dist = np.array([])
		self.__init_data()
		self.__subsample()
		self.__make_pairs()
		self.__noise_dist()

	def __init_data(self):
		"""Count every whitespace-separated token in the file."""
		with open(self.file_name, "r") as file:
			tokens = file.read().split()
		self.word_count = Counter(tokens)
		self.total = sum(self.word_count.values())
		# Must read the instance attribute; a bare `word_count` is undefined here.
		self.vocab_size = len(self.word_count)

	def __subsample(self):
		"""Randomly drop frequent words and assign ids to the survivors.

		The keep/drop decision is made once per distinct word (not per
		occurrence), visiting words from most to least frequent.
		"""
		sorted_count = sorted(self.word_count, key=self.word_count.get, reverse=True)  # type: ignore
		kept_counts = {}
		next_id = 0
		for word in sorted_count:
			freq = self.word_count[word] / self.total
			keep_prob = np.sqrt(self.t / freq)
			if np.random.random() < keep_prob:
				self.id2word[next_id] = word
				self.word2id[word] = next_id
				kept_counts[word] = self.word_count[word]
				next_id += 1
		self.word_count = kept_counts
		self.vocab_size = len(self.word_count)
		self.total = sum(self.word_count.values())

	def __get_pairs(self, text: List[str]):
		"""Append (center id, context id) tuples for one token sequence.

		Words missing from the vocabulary are skipped both as center and as
		context; they still occupy a position in the window.
		"""
		num_words = len(text)
		for i, word in enumerate(text):
			if word not in self.word2id:
				continue
			center = self.word2id[word]
			context = text[max(0, i - self.window) : i]
			context += text[i + 1 : min(num_words, i + self.window + 1)]
			self.pairs.extend(
				(center, self.word2id[cnt]) for cnt in context if cnt in self.word2id
			)

	def __noise_dist(self):
		"""Build the noise distribution and draw negatives for every pair."""
		# word2id preserves insertion order, which is id order, so position i
		# of the distribution corresponds to word id i.
		counts = np.array([self.word_count[word] for word in self.word2id], dtype=np.float64)
		unigram = (counts / self.total) ** (3 / 4)
		self.noise_dist = torch.from_numpy(unigram / unigram.sum())

		negatives = []
		for _, context_id in self.pairs:
			weights = self.noise_dist.clone()
			# Never draw the true context word as its own negative.
			weights[int(context_id)] = 0.0
			negatives.append(
				torch.multinomial(input=weights, num_samples=self.k, replacement=True)
			)
		if negatives:
			self.neg = torch.stack(negatives).numpy()
		else:
			# No pairs survived: keep a well-formed empty (0, k) array.
			self.neg = np.empty((0, self.k), dtype=np.int64)
		# Trailing singleton axis so each sampled index keeps a length-1 dim.
		self.pairs = self.pairs.reshape(len(self.pairs), 2, 1)

	def __make_pairs(self):
		"""Collect the unique (center, context) pairs over all lines of the file."""
		with open(self.file_name) as data:
			for text in data:
				self.__get_pairs(text.strip().split())
		unique_pairs = set(self.pairs)
		# reshape(-1, 2) keeps a 2-D shape even when no pair was produced.
		self.pairs = np.array(list(unique_pairs), dtype=np.int64).reshape(-1, 2)

In [138]:
class SGNS(nn.Module):
    """Skip-gram model with separate center ("v") and context ("u") embeddings.

    Args:
        vocab_size: Number of rows in each embedding table.
        emb_dim: Embedding width.
    """

    def __init__(self, vocab_size, emb_dim) -> None:
        super().__init__()
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        # Center-word table first, context-word table second.
        self.vEmbedding = nn.Embedding(vocab_size, emb_dim)
        self.uEmbedding = nn.Embedding(vocab_size, emb_dim)

    def forward(self, c, o, neg):
        """Score positive and negative pairs.

        The einsum subscripts require rank-2 index tensors for ``c`` and ``o``
        (shape ``(B, S)``) and a rank-3 index tensor for ``neg``
        (shape ``(B, K, S)``).

        Returns:
            A tuple ``(pos, neg)``: ``sigmoid(u_o . v_c)`` of shape ``(B, S)``
            and ``sigmoid(-u_neg . v_c)`` of shape ``(B, K, S)``.
        """
        center_t = self.vEmbedding(c).mT          # (B, H, S)
        context = self.uEmbedding(o)              # (B, S, H)
        noise = self.uEmbedding(neg)              # (B, K, S, H)
        pos_score = torch.einsum("bki,bik->bk", context, center_t)
        neg_score = torch.einsum("bjkl,blk->bjk", noise, center_t)
        return torch.sigmoid(pos_score), torch.sigmoid(neg_score.neg())


class SGNSLoss(nn.Module):
    """Negative log-likelihood for skip-gram with negative sampling.

    Expects the probabilities already passed through a sigmoid.
    """

    def __init__(self):
        super().__init__()

    def forward(self, positive, negatives):
        """Return the batch-mean of ``-log(positive) - sum_k log(negatives)``.

        Args:
            positive: Probabilities for the true pairs.
            negatives: Probabilities for the negative samples; they are summed
                over dimension 1.

        Returns:
            Scalar tensor with the mean loss.
        """
        # A saturated sigmoid can be exactly 0, and log(0) = -inf would turn
        # the whole batch loss into inf. Clamp to the smallest positive normal
        # value of the dtype so ordinary probabilities are unaffected.
        eps = torch.finfo(positive.dtype).tiny
        pos_term = torch.log(positive.clamp_min(eps)).neg()
        neg_term = torch.log(negatives.clamp_min(eps)).neg().sum(1)
        return torch.mean(pos_term + neg_term)


class CorpusData(Dataset):
    """Dataset view over a corpus' positive pairs and pre-drawn negatives.

    Reads ``corpus.pairs``, ``corpus.neg`` (both NumPy arrays) and
    ``corpus.vocab_size``.
    """

    def __init__(self, corpus):
        self.V = corpus.vocab_size
        self.pairs = torch.from_numpy(corpus.pairs)
        self.negative = torch.from_numpy(corpus.neg)

    def __len__(self):
        """Number of positive pairs."""
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return ``(center, context, negatives)`` for pair ``idx``.

        The negatives get an extra axis inserted at position 1.
        """
        center, context = self.pairs[idx]
        noise = self.negative[idx]
        return center, context, noise[:, None]

In [133]:
# Build the training corpus: context window of 3, 10 negatives per pair.
corpus = Corpus(TRAIN_NORM, 3, 10)
corpusdata = CorpusData(corpus)
V = corpus.vocab_size  # vocabulary size after subsampling
H = 50  # embedding dimension

# Prefer Apple MPS, then CUDA, and fall back to CPU so the notebook also runs
# on machines without an Apple GPU.
if torch.backends.mps.is_available():
	device = "mps"
elif torch.cuda.is_available():
	device = "cuda"
else:
	device = "cpu"

In [140]:
# Instantiate the skip-gram model and move its parameters to the chosen device.
model = SGNS(V, H).to(device)

In [148]:
# Loss, optimiser (Adam with its default hyper-parameters) and batched loader.
criterion = SGNSLoss()
optimizer = optim.Adam(params=model.parameters())
N = len(corpusdata)  # number of training pairs, used to average the loss
dataloader = DataLoader(dataset=corpusdata, batch_size=16_384, shuffle=True)

In [149]:
# Per-epoch average loss. Created once, before the loop, so the history of
# every epoch is kept instead of being reset at the start of each one.
log = []
for i in range(100):
	total_loss = 0
	model.train()
	pbar = tqdm(dataloader, total=len(dataloader), desc=f"Epoch {i+1}")
	for c, o, n in pbar:
		c, o, n = c.to(device), o.to(device), n.to(device)
		optimizer.zero_grad()
		pos, neg = model(c, o, n)
		loss = criterion(pos, neg)
		loss.backward()
		optimizer.step()
		# loss is a batch mean; weight it by batch size to accumulate a sum.
		total_loss += loss.item() * c.size(0)
		pbar.set_postfix(loss=f"{total_loss/N:.2f}")

	avg_loss = total_loss / N
	log.append(avg_loss)


Epoch 1: 100%|██████████| 52/52 [00:06<00:00,  7.49it/s, loss=1.81]
Epoch 2: 100%|██████████| 52/52 [00:06<00:00,  7.84it/s, loss=1.80]
Epoch 3: 100%|██████████| 52/52 [00:06<00:00,  8.16it/s, loss=1.79]
Epoch 4: 100%|██████████| 52/52 [00:06<00:00,  7.81it/s, loss=1.78]
Epoch 5: 100%|██████████| 52/52 [00:06<00:00,  8.12it/s, loss=1.77]
Epoch 6: 100%|██████████| 52/52 [00:06<00:00,  8.10it/s, loss=1.77]
Epoch 7: 100%|██████████| 52/52 [00:06<00:00,  8.06it/s, loss=1.76]
Epoch 8: 100%|██████████| 52/52 [00:06<00:00,  8.05it/s, loss=1.75]
Epoch 9: 100%|██████████| 52/52 [00:06<00:00,  8.04it/s, loss=1.74]
Epoch 10: 100%|██████████| 52/52 [00:06<00:00,  7.92it/s, loss=1.73]
Epoch 11: 100%|██████████| 52/52 [00:06<00:00,  8.09it/s, loss=1.73]
Epoch 12: 100%|██████████| 52/52 [00:06<00:00,  8.06it/s, loss=1.72]
Epoch 13: 100%|██████████| 52/52 [00:06<00:00,  7.99it/s, loss=1.71]
Epoch 14: 100%|██████████| 52/52 [00:06<00:00,  8.06it/s, loss=1.70]
Epoch 15: 100%|██████████| 52/52 [00:06<00:

In [151]:
# Shape of the center-word embedding table: (vocab_size, emb_dim).
model.vEmbedding.weight.size()

torch.Size([55085, 50])

In [162]:

from torch.utils.tensorboard.writer import SummaryWriter

from datetime import datetime

In [159]:
# Display the current local timestamp; log_embeddings uses the same call to
# name its run directory.
datetime.now()

datetime.datetime(2024, 7, 6, 9, 38, 7, 609074)

In [161]:
def log_embeddings(model, corpus):
	"""Write the center-word embeddings to a timestamped TensorBoard run.

	Args:
		model: Object exposing ``vEmbedding.weight`` (the embedding matrix).
		corpus: Object exposing ``id2word``, a mapping from word id to word.

	Returns:
		The log directory that was written, ``runs/negativesampling_<timestamp>``.
	"""
	now = datetime.now()
	log_dir = f'runs/negativesampling_{now.strftime("%Y%m%d-%H%M%S")}'
	# Detach from autograd and move to CPU before handing the matrix over.
	embeddings = model.vEmbedding.weight.detach().cpu()
	# Order labels by word id so row i of the matrix is labelled with word i,
	# independent of the dict's insertion order.
	labels = [word for _, word in sorted(corpus.id2word.items())]
	# The context manager closes the writer even if add_embedding raises.
	with SummaryWriter(log_dir) as writer:
		writer.add_embedding(embeddings, metadata=labels)
		writer.flush()
	return log_dir
# Export the trained embeddings; the cell displays the run directory.
log_embeddings(model=model, corpus=corpus)

'runs/negativesampling_20240706-093845'