# Text preprocessing

In [1]:
DATA = "data/"
TRAIN = f"{DATA}/train.csv"
TEST = f"{DATA}/test.csv"
TRAIN_LABEL = f"{DATA}/train_label.txt"
TRAIN_TITLE = f"{DATA}/train_title.txt"
TRAIN_TEXT = f"{DATA}/train_text.txt"
TEST_LABEL = f"{DATA}/test_label.txt"
TEST_TITLE= f"{DATA}/test_title.txt"
TEST_TEXT = f"{DATA}/test_text.txt"

TRAIN_NORM = f"{DATA}/train_norm.txt"
TEST_NORM = f"{DATA}/test_norm.txt"

In [2]:
# # Splt the csv into label, title, text
# import pandas as pd

# train = pd.read_csv(TRAIN)
# train_col = train.columns
# train[train_col[0]].to_csv(TRAIN_LABEL, index=False)
# train[train_col[1]].to_csv(TRAIN_TITLE, index=False)
# train[train_col[2]].to_csv(TRAIN_TEXT, index=False)

# test = pd.read_csv(TEST)
# test_col = test.columns
# test[test_col[0]].to_csv(TEST_LABEL, index=False)
# test[test_col[1]].to_csv(TEST_TITLE, index=False)
# test[test_col[2]].to_csv(TEST_TEXT, index=False)
# del train, test, pd

In [3]:
# from src.Normalizer import normalize_data
# normalize_data(TRAIN_TEXT, TRAIN_NORM)
# normalize_data(TEST_TEXT, TEST_NORM)

In [4]:
import numpy as np
from typing import List, Union, Generator
import torch
import numpy as np
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from torch import optim

In [5]:
class Corpus:
	def __init__(self, file_name, window_size=1):

		self.file_name = file_name
		self.window_size = window_size
		self.word2id = {}
		self.id2word = {}
		self.word_count = {}
		self.word_prob = {}
		self.total_count= 0
		self.vocab_size = 0
		self.data: Union[List, np.ndarray] = []
		self.__build_data()

	def __iter__(self):
		with open(self.file_name, "r") as file:
			yield from file

	def __update_map(self, text):
		for word in text:
			if word not in self.word2id.keys():
				self.word2id[word] = self.vocab_size
				self.id2word[self.vocab_size] = word
				self.word_count[word] = 0
				self.vocab_size += 1
			self.word_count[word]+=1

	def __write_pairs(self, text):
		num_words = len(text)
		for i, word in enumerate(text):
			center = self.word2id[word]
			num_words = len(text)
			context = (
				text[max(0, i - self.window_size) : i]
				+ text[i + 1 : min(num_words, i + self.window_size + 1)]
			)
			self.data.extend(((center, self.word2id[cnt]) for cnt in context))  # type: ignore

	def __build_data(self) -> None:
		for line in self:
			text = line.strip().split()
			text = [word.strip() for word in text]
			self.__update_map(text)
			self.__write_pairs(text)
		self.data = np.array(list(set(self.data)), dtype="int32")
		self.total_count = np.sum(list(self.word_count.values()))
		_tmp = np.array(list(self.word_count.values()))**(0.75)
		_tmp = np.sum(_tmp)
		self.word_prob = {self.word2id[k]:v**0.75/_tmp for k, v in self.word_count.items()}



In [6]:
TESTINGDATA = f"{DATA}/testing_data.txt"
testing_corpus = Corpus(TRAIN_NORM, 3)

In [7]:
class SGNS(nn.Module):
	def __init__(self, num_words, emb_dim):
		super(SGNS, self).__init__()
		self.num_words = num_words
		self.emb_dim = emb_dim
		self.center = nn.Embedding(num_words, emb_dim)
		self.context = nn.Embedding(num_words, emb_dim)

	def forward(self, center, context):
		center = self.center(center)
		context = self.context(context)
		output = torch.matmul(context, center.mT).squeeze(1)
		return output

	def get_similarity(self, idx):
		with torch.no_grad():
			center_emb = self.center.weight[idx]
			similarities = torch.cosine_similarity(
				center_emb.unsqueeze(0), self.center.weight, dim=1
			)
		return similarities

	def get_cosine_distance(self, idx):
		similarities = self.get_similarity(idx)
		return 1 - similarities


class CorpusData(Dataset):
	def __init__(self, corpus, k):
		self.data = torch.from_numpy(corpus.data)
		self.V = corpus.vocab_size
		self.noise = torch.from_numpy(np.array(list(corpus.word_prob.values())))
		self.k = k
	def __getitem__(self, idx):
		c_positive, o_positive = self.data[idx]
		# _tmp = self.noise.clone()
		# _tmp[o_positive] = 0.0
		o_negative = torch.multinomial(input = self.noise, num_samples = self.k, replacement = True)
		c_negative = c_positive * torch.ones_like(o_negative, dtype=torch.long)
		return (c_positive, o_positive), (c_negative, o_negative)

	def __len__(self):
		return len(self.data)

class SGNSLoss(nn.Module):
	def __init__(self):
		super(SGNSLoss, self).__init__()

	def forward(self, positive, negatives):
		a = positive.sigmoid().log().squeeze()
		b = (-negatives).sigmoid().log()
		return torch.mean(-a -b)

def train(model,criterion, optimizer, dataloader, epochs):
	N= len(dataloader.dataset)
	for i in range(epochs):
		log = []
		total_loss = 0
		model.train()
		pbar = tqdm(dataloader, total=len(dataloader), desc=f"Epoch {i+1}")
		for p, n in pbar:
			pos_out = model(p[0].unsqueeze(1), p[1].unsqueeze(1))
			neg_out = model(n[0].unsqueeze(2), n[1].unsqueeze(2))
			optimizer.zero_grad()
			loss = criterion(pos_out, neg_out)
			loss.backward()
			optimizer.step()
			total_loss += loss.item() * p[0].size(0)

			pbar.set_postfix(loss=f"{total_loss/N:.2f}")

		avg_loss = total_loss / N
		log.append(avg_loss)
	return model, log

In [8]:
class SGNSLoss(nn.Module):
	def __init__(self):
		super(SGNSLoss, self).__init__()

	def forward(self, positive, negatives):
		a = positive.sigmoid().log().squeeze()
		b = (-negatives).sigmoid().log()
		return torch.mean(-a -b)

In [9]:
corpus_data =  CorpusData(testing_corpus, 3)
dataloader = DataLoader(corpus_data, batch_size=4096, shuffle=True)

In [10]:
device = "mps"

In [11]:
N = testing_corpus.vocab_size
model = SGNS(N, 25)
model = model.to(device)
# data = next(iter(corpus_loader))
criterion = SGNSLoss()

In [12]:
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [13]:
for i in range(100, 200):
	log = []
	total_loss = 0
	model.train()
	pbar = tqdm(dataloader, total=len(dataloader), desc=f"Epoch {i+1}")
	for p, n in pbar:
		pos_out = model(p[0].unsqueeze(1).to(device), p[1].unsqueeze(1).to(device))
		neg_out = model(n[0].unsqueeze(2).to(device), n[1].unsqueeze(2).to(device))
		optimizer.zero_grad()
		loss = criterion(pos_out, neg_out)
		loss.backward()
		optimizer.step()
		total_loss += loss.item() * p[0].size(0)

		pbar.set_postfix(loss=f"{total_loss/N:.2f}")

	avg_loss = total_loss / N
	log.append(avg_loss)


Epoch 101:  12%|█▏        | 141/1180 [00:45<05:35,  3.10it/s, loss=40.99]


KeyboardInterrupt: 

In [662]:
corpus = testing_corpus

In [663]:
corpus.id2word

{0: 'reuters',
 1: 'short',
 2: 'sellers',
 3: 'wall',
 4: 'street',
 5: 's',
 6: 'dwindling',
 7: 'band',
 8: 'of',
 9: 'ultra',
 10: 'cynics',
 11: 'are',
 12: 'seeing',
 13: 'green',
 14: 'again',
 15: 'private',
 16: 'investment',
 17: 'firm',
 18: 'carlyle',
 19: 'group',
 20: 'which',
 21: 'has',
 22: 'a',
 23: 'reputation',
 24: 'for',
 25: 'making',
 26: 'well',
 27: 'timed',
 28: 'and',
 29: 'occasionally',
 30: 'controversial',
 31: 'plays',
 32: 'in',
 33: 'the',
 34: 'defense',
 35: 'industry',
 36: 'quietly',
 37: 'placed',
 38: 'its',
 39: 'bets',
 40: 'on',
 41: 'another',
 42: 'part',
 43: 'market',
 44: 'soaring',
 45: 'crude',
 46: 'prices',
 47: 'plus',
 48: 'worries',
 49: 'about',
 50: 'economy',
 51: 'outlook',
 52: 'earnings',
 53: 'expected',
 54: 'to',
 55: 'hang',
 56: 'over',
 57: 'stock',
 58: 'next',
 59: 'week',
 60: 'during',
 61: 'depth',
 62: 'summer',
 63: 'doldrums',
 64: 'authorities',
 65: 'have',
 66: 'halted',
 67: 'oil',
 68: 'export',
 69: 'flow

In [664]:
# def log_embeddings(model, corpus):
# 	now = datetime.now()
# 	log_dir = f'runs/negativesampling_{now.strftime("%Y%m%d-%H%M%S")}'
# 	writer = SummaryWriter(log_dir)
# 	embeddings = model.center.weight
# 	labels = list(corpus.id2word.values())
# 	writer.add_embedding(embeddings, metadata=labels)
# 	writer.flush()
# 	writer.close()
# 	return log_dir
# log_embeddings(model, corpus)

In [665]:
# log_embeddings(model, corpus)

'runs/negativesampling_20240705-171458'

In [None]:
|