# WKJ

## NLP: Word2Vec

### Encoding + Classification = Embedding

https://medium.com/@patrykmwieczorek/mastering-nlp-with-pytorch-word2vec-60a54030c720

https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html

# TODO: Add Images

In [None]:
# Download the Dickinson corpus into ./data/text (-q: quiet, -P: target directory).
!wget -q -P ./data/text https://github.com/DM-GY-9103-2024F-H/9103-utils/raw/main/datasets/text/dickinson.txt
# Stream the rappers archive to stdout and unpack it relative to the current directory.
# NOTE(review): a later cell reads ./data/text/rappers.csv — confirm the archive contains that path.
!wget -qO- https://github.com/DM-GY-9103-2024F-H/9103-utils/raw/main/datasets/text/rappers.tar.gz | tar xz

In [None]:
import pandas as pd
import requests
import string
import torch

from collections import defaultdict

from torch import nn, Tensor
from torch.utils.data import DataLoader, Dataset
from torchtext.data import get_tokenizer

In [None]:
import nltk
# Fetch the NLTK "punkt_tab" resource.
# NOTE(review): nothing visible in this part of the notebook uses nltk — confirm it is still needed.
nltk.download('punkt_tab')

In [None]:
class TextUtils():
	"""Helpers for turning raw text into a token stream and a vocabulary.

	The stopword list is downloaded once, when this class body is executed,
	and exposed through the ``stopwords`` class attribute.
	"""
	stop_1000_url = "https://gist.githubusercontent.com/rg089/35e00abf8941d72d419224cfd5b5925d/raw/12d899b70156fd0041fa9778d657330b024b959c/stopwords.txt"
	stop_100_url  = "https://gist.githubusercontent.com/sebleier/554280/raw/7e0e4a1ce04c2bb7bd41089c9821dbcf6d0c786c/NLTK's%2520list%2520of%2520english%2520stopwords"

	# The timeout keeps a stalled connection from hanging the notebook forever.
	stopwords_list = requests.get(stop_100_url, timeout=30).content
	# One stopword per line, de-duplicated. Kept as a list because callers
	# concatenate it with other lists.
	stopwords = list(set(stopwords_list.decode().splitlines()))

	@staticmethod
	def create_vocab(text, max_words=200_000):
		"""Tokenize ``text`` and build its vocabulary.

		Args:
			text: a single string, or any iterable of strings (list, tuple,
				NumPy array, ...) whose items are joined with spaces.
			max_words: maximum number of tokens kept from the start of the text.

		Returns:
			A ``(words, vocab)`` tuple: ``words`` is the token list in reading
			order (truncated to ``max_words``); ``vocab`` is the sorted list of
			unique tokens, so the ordering is reproducible between runs.
		"""
		# create one big string (anything that is not already a string is
		# treated as a sequence of strings)
		if not isinstance(text, str):
			text = " ".join(text)

		# remove punctuation, whitespaces and convert to lowercase
		text = text.translate(str.maketrans('', '', string.punctuation)).lower().strip()

		# tokenize words
		tokenizer = get_tokenizer("basic_english")
		words = tokenizer(text)[:max_words]

		# remove repeated words; sorted so word indices are deterministic
		vocab = sorted(set(words))

		return words, vocab

In [None]:
class TextSequenceDataset(Dataset):
	"""Base dataset that tokenizes text and maps words to integer ids and back.

	Index 0 is reserved for the ``"<UNK>"`` token; unknown words encode to 0
	and unknown indices decode to ``"<UNK>"``.

	Args:
		text: raw text passed on to ``TextUtils.create_vocab``.
		max_words: maximum number of tokens kept from the text.
		window: context size, stored for subclasses.
		symmetric_context: stored for subclasses.
	"""
	def __init__(self, text, max_words=200_000, window=2, symmetric_context=True):
		super().__init__()
		self.device = "cuda" if torch.cuda.is_available() else "cpu"
		self.window = window
		self.symmetric_context = symmetric_context
		self.words, self.vocab = TextUtils.create_vocab(text, max_words=max_words)
		wtoi = {word: i for i, word in enumerate(["<UNK>"] + self.vocab)}
		# exact inverse of wtoi
		itow = {i: word for word, i in wtoi.items()}

		# Kept as defaultdicts so direct indexing by existing callers still
		# falls back to <UNK>; the methods below use .get() so that a lookup
		# never inserts a new key and len(self.wtoi) stays the vocab size.
		self.wtoi = defaultdict(int, wtoi)
		self.itow = defaultdict(lambda: "<UNK>", itow)
		print(f"{len(self.wtoi)} words in vocab")
		print(f"{len(self.words)} words in text")

	def encode_word(self, word, return_tensors=False):
		"""Return the index of ``word`` (0 if unknown).

		With ``return_tensors=True`` the index is wrapped in a 1-element long
		tensor on ``self.device``.
		"""
		widx = self.wtoi.get(word, 0)
		if not return_tensors:
			return widx
		return torch.tensor([widx], dtype=torch.long, device=self.device)

	def encode(self, words):
		"""Return a long tensor on ``self.device`` with one index per word."""
		widx = [self.wtoi.get(w, 0) for w in words]
		# build the integer tensor directly instead of going through float32
		return torch.tensor(widx, dtype=torch.long, device=self.device)

	def decode_word(self, idx):
		"""Return the word for ``idx``, given as an int or a one-element tensor."""
		if not isinstance(idx, int):
			idx = idx.item()
		return self.itow.get(idx, "<UNK>")

	def decode(self, idxs_t):
		"""Return the list of words for a tensor of indices."""
		return [self.itow.get(i, "<UNK>") for i in idxs_t.tolist()]

class SkipGramDataset(TextSequenceDataset):
	"""Dataset of (center word, context word) index pairs for skip-gram training.

	Stopwords and a few punctuation tokens are never used as center or
	context words.
	"""
	def __init__(self, text, max_words=200_000, window=2, symmetric_context=True):
		super().__init__(text, max_words, window, symmetric_context)
		self.X, self.Y = self.create_dataset(self.words, self.wtoi, self.window, self.symmetric_context)
		assert len(self.X) == len(self.Y)

	def create_dataset(self, words, wtoi, window, symmetric_context):
		"""Build the center/context index tensors.

		Args:
			words: token list in reading order.
			wtoi: mapping from word to integer index.
			window: how many neighbours to pair with each center word.
			symmetric_context: if True, neighbours are taken from both sides
				of the center word; otherwise only from the words after it.

		Returns:
			Two long tensors of equal length on ``self.device``: center
			indices and the matching context indices.
		"""
		# a set gives constant-time membership tests inside the nested loop
		stopwords = set(TextUtils.stopwords) | {"=", ":", ",", "(", ")", "{", "}", "[", "]"}
		xs, ys = [], []
		last = len(words) - 1

		for i, word in enumerate(words):
			if word in stopwords:
				continue
			center_word = wtoi[word]
			# clamp the window to the text boundaries
			minj = max(0, i - window) if symmetric_context else i + 1
			maxj = min(last, i + window)
			for j in range(minj, maxj + 1):
				if j == i or words[j] in stopwords:
					continue
				xs.append(center_word)
				ys.append(wtoi[words[j]])

		# build integer tensors directly instead of going through float32
		X = torch.tensor(xs, dtype=torch.long, device=self.device)
		Y = torch.tensor(ys, dtype=torch.long, device=self.device)
		return X, Y

	def __getitem__(self, idx):
		"""Return one (center, context) pair, or a list of pairs for a slice."""
		if isinstance(idx, slice):
			return list(zip(self.X[idx], self.Y[idx]))
		return (self.X[idx], self.Y[idx])

	def __len__(self):
		"""Number of (center, context) pairs."""
		return len(self.X)

In [None]:
class SkipGram(nn.Module):
	"""Skip-gram model with separate center-word and context-word embeddings.

	Args:
		vocab_size: number of rows in each embedding table.
		embed_dim: size of each embedding vector.
	"""
	def __init__(self, vocab_size, embed_dim=128):
		super().__init__()
		self.center_embeds = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
		self.context_embeds = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)

	def forward(self, x):
		"""Score every vocabulary entry as a context word for the center indices ``x``.

		Returns the dot product of each center embedding with all context
		embeddings, i.e. one score per vocabulary entry.
		"""
		center_word = self.center_embeds(x)
		scores = torch.matmul(center_word, self.context_embeds.weight.t())
		return scores

	def get_N_closest(self, x, N=5, metric="lnorm"):
		"""Return the indices of the ``N`` center embeddings closest to word ``x``.

		The query is compared against every row of ``center_embeds``,
		including its own.

		Args:
			x: index tensor of the query word.
				NOTE(review): assumed to hold a single index, shape (1,) — confirm.
			N: number of neighbours to return.
			metric: ``"lnorm"`` for distance via ``torch.cdist`` (smallest
				first), or ``"sine"`` / ``"cosine"`` for cosine similarity
				(largest first).

		Raises:
			ValueError: if ``metric`` is not one of the supported names.
		"""
		# get word vector
		x = self.center_embeds(x)

		# calculate similarity between x and all center vectors
		if metric in ("sine", "cosine"):
			cos_sim = nn.CosineSimilarity()
			similarities = cos_sim(x, self.center_embeds.weight).squeeze()
			largest = True
		elif metric == "lnorm":
			similarities = torch.cdist(x, self.center_embeds.weight).squeeze()
			largest = False
		else:
			raise ValueError(f"unknown metric {metric!r}; expected 'sine', 'cosine' or 'lnorm'")

		# return top-N similar words by embeddings
		_, indices = torch.topk(similarities, k=N, largest=largest)
		return indices

In [None]:
# Read the corpus as a list of lines. The explicit encoding keeps decoding
# independent of the platform's default locale.
with open("./data/text/dickinson.txt", "r", encoding="utf-8") as f:
  dickinson_text = f.read().split("\n")

In [None]:
# Load the lyrics table from disk.
lyrics_df = pd.read_csv("./data/text/rappers.csv")
# Keep only the "lyric" column; `.values` yields a NumPy array, not a Python list.
rapper_text = lyrics_df["lyric"].values

In [None]:
# Build skip-gram pairs from the Dickinson lines. With symmetric_context=False
# each center word is paired only with words that follow it (up to 3 positions ahead).
dataset = SkipGramDataset(text=dickinson_text, max_words=500_000, window=3, symmetric_context=False)
# Shuffled mini-batches of (center, context) index pairs.
train_dl = DataLoader(dataset, batch_size=4096, shuffle=True)

In [None]:
# Use the GPU when one is available.
mdevice = "cuda" if torch.cuda.is_available() else "cpu"

# The model emits one score per vocabulary entry, so the embedding tables are
# sized from the dataset's word-to-index mapping.
model = SkipGram(vocab_size=len(dataset.wtoi), embed_dim=64).to(mdevice)
optim = torch.optim.Adam(model.parameters(), lr=5e-3)
loss_fn = nn.CrossEntropyLoss()

# Sanity check: pull one batch and print the input shapes ...
ctr,ctx = next(iter(train_dl))
print(ctr.shape, ctx.shape)

# ... and the shape of the model output for that batch.
ctx_pred = model(ctr)
print(ctx_pred.shape)

In [None]:
# Train for 32 epochs: predict the context word from the center word.
for e in range(32):
  model.train()
  for center, context in train_dl:
    optim.zero_grad()
    context_pred = model(center)
    loss = loss_fn(context_pred, context)
    loss.backward()
    optim.step()

  # Report every 4th epoch. `loss` is the loss of the last mini-batch of the
  # epoch, not an average over the epoch.
  if e % 4 == 3:
    print(f"Epoch: {e} loss: {loss.item():.4f}")

In [None]:
# Encode the query word as a 1-element index tensor.
query = dataset.encode_word("wild", return_tensors=True)

# Nearest neighbours in center-embedding space under both metrics:
# "sine" selects the cosine-similarity branch, "lnorm" the cdist branch.
# The query is compared against every embedding row, including its own.
top5s = model.get_N_closest(query, N=5, metric="sine")
top5l = model.get_N_closest(query, N=5, metric="lnorm")

# Map the returned indices back to words.
print(dataset.decode(top5s))
print(dataset.decode(top5l))

# TODO: Translate ...

Take a Dickinson phrase and turn it into a list of embeddings.

Get a start word from the rappers text, follow the directions given by the embeddings, and take the nearest word(s)...

## RNNs

### Classification + Classification + Classification + ...

- The output of the network becomes an input for the next prediction

# TODO: Images

https://machinelearningmastery.com/an-introduction-to-recurrent-neural-networks-and-the-math-that-powers-them/

https://machinelearningmastery.com/models-sequence-prediction-recurrent-neural-networks/

https://medium.com/@prudhviraju.srivatsavaya/lstm-vs-gru-c1209b8ecb5a

https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html

https://machinelearningmastery.com/lstm-for-time-series-prediction-in-pytorch/

In [None]:
class NGramDataset(TextSequenceDataset):
	"""Dataset of (context window, next word) samples for next-word prediction.

	Each sample is ``window`` consecutive word indices together with the
	index of the word that follows them.
	"""
	def __init__(self, text, max_words=200_000, window=2):
		super().__init__(text, max_words, window, symmetric_context=False)
		# encode the whole text once; samples are slices of this tensor
		self.words_t = self.encode(self.words)

	def __len__(self):
		"""Number of full windows that still have a following target word."""
		# clamp at 0: len() raises if __len__ returns a negative number, which
		# would happen when the text has fewer tokens than the window
		return max(0, len(self.words) - self.window)

	def __getitem__(self, idx):
		"""Return ``(context, target)`` for the window starting at ``idx``."""
		target = self.words_t[idx + self.window]
		context = self.words_t[idx : idx + self.window]
		return context, target

In [None]:
class NextWordGRU(nn.Module):
  """Next-word predictor: embedding -> stacked GRU -> linear layer over the vocabulary."""

  def __init__(self, vocab_size, embedding_dim=64, hidden_dim=256, num_layers=2):
    super().__init__()
    self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
    self.gru = nn.GRU(
      input_size=embedding_dim,
      hidden_size=hidden_dim,
      num_layers=num_layers,
      batch_first=True,
    )
    self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

  def forward(self, x, hidden):
    """Return vocabulary scores for the word after sequence ``x``, plus the new hidden state.

    ``hidden`` is handed to the GRU unchanged (``None`` is accepted).
    """
    embedded = self.embedding(x)
    sequence_out, next_hidden = self.gru(embedded, hidden)
    # only the last time step is used to predict the next word
    last_step = sequence_out[:, -1, :]
    return self.fc(last_step), next_hidden

In [None]:
# Rebuild `dataset` / `train_dl` for next-word prediction: each sample is
# 3 consecutive word indices plus the index of the word that follows.
dataset = NGramDataset(text=dickinson_text, max_words=500_000, window=3)
# Shuffled mini-batches of (context, target) samples.
train_dl = DataLoader(dataset, batch_size=4096, shuffle=True)

In [None]:
# Use the GPU when one is available.
mdevice = "cuda" if torch.cuda.is_available() else "cpu"

model = NextWordGRU(vocab_size=len(dataset.wtoi), embedding_dim=64, hidden_dim=256, num_layers=2).to(mdevice)
optim = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

# Sanity check: pull one batch and print the shapes. The batch is named
# `inputs` so the builtin `input()` is not shadowed for the rest of the session.
inputs, target = next(iter(train_dl))
print(inputs.shape, target.shape)

output, hidden = model(inputs, None)
print(output.shape, hidden.shape)

In [None]:
# Train for 32 epochs: predict the next word from the preceding window.
for e in range(32):
  model.train()
  # the batch is named `inputs` so the builtin `input()` is not shadowed
  for inputs, target in train_dl:
    optim.zero_grad()
    # every batch starts from a fresh (None) hidden state
    output, hidden = model(inputs, None)
    loss = loss_fn(output, target)
    loss.backward()
    optim.step()

  # Report every 4th epoch. `loss` is the loss of the last mini-batch of the
  # epoch, not an average over the epoch.
  if e % 4 == 3:
    print(f"Epoch: {e} loss: {loss.item():.4f}")

In [None]:
# Encode a 3-word prompt and add a leading batch dimension.
query = dataset.encode(["not","one","more"]).unsqueeze(0)

model.eval()
with torch.no_grad():
  # start from a fresh (None) hidden state; only the scores are needed
  output, _ = model(query, None)
  output = output.squeeze()
  # highest-scoring next word, and the five highest-scoring candidates
  top1 = output.argmax()
  top5 = output.argsort(descending=True)[:5]
  print(dataset.decode_word(top1))
  print(dataset.decode(top5))