# WKJ

## NLP: Word2Vec

### Encoding + Classification = Embedding

https://medium.com/@patrykmwieczorek/mastering-nlp-with-pytorch-word2vec-60a54030c720

https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html

# TODO: Add Images

In [None]:
# Quietly download the Dickinson text file into ./data/text (-P sets the target directory).
!wget -q -P ./data/text https://github.com/DM-GY-9103-2024F-H/9103-utils/raw/main/datasets/text/dickinson.txt
# Stream the rappers archive to stdout and unpack it with tar into the current directory.
# NOTE(review): presumably the archive expands to ./data/text/rappers.csv, which a later cell reads — confirm.
!wget -qO- https://github.com/DM-GY-9103-2024F-H/9103-utils/raw/main/datasets/text/rappers.tar.gz | tar xz

In [None]:
import pandas as pd
import requests
import string
import torch

from collections import defaultdict

from torch import nn, Tensor
from torch.utils.data import DataLoader, Dataset
from torchtext.data import get_tokenizer

In [None]:
import nltk
# Download NLTK's "punkt_tab" resource.
# NOTE(review): no cell visible here uses nltk (tokenizing goes through torchtext's
# get_tokenizer) — confirm this download is still needed.
nltk.download('punkt_tab')

In [None]:
# Raw-gist URLs of two plain-text stopword lists (one word per line once downloaded).
# NOTE(review): the 1000/100 in the names presumably reflect approximate list sizes — not verified here.
stop_1000_url = "https://gist.githubusercontent.com/rg089/35e00abf8941d72d419224cfd5b5925d/raw/12d899b70156fd0041fa9778d657330b024b959c/stopwords.txt"
# Per its file name, this one is NLTK's list of English stopwords.
stop_100_url  = "https://gist.githubusercontent.com/sebleier/554280/raw/7e0e4a1ce04c2bb7bd41089c9821dbcf6d0c786c/NLTK's%2520list%2520of%2520english%2520stopwords"

In [None]:
def _fetch_bytes(url, timeout=30):
	"""Return the raw response body for `url`.

	Raises `requests.HTTPError` on a 4xx/5xx status and a `requests` timeout
	error when the server does not answer within `timeout` seconds.
	"""
	response = requests.get(url, timeout=timeout)
	# Without this check an HTTP error page would be split into bogus "stopwords".
	response.raise_for_status()
	return response.content


# Raw downloads (bytes), kept under their original names.
stopwords_100_list = _fetch_bytes(stop_100_url)
stopwords_1000_list = _fetch_bytes(stop_1000_url)

# One stopword per line; set() drops duplicates, so the resulting list order is arbitrary.
stopwords_100 = list(set(stopwords_100_list.decode().splitlines()))
stopwords_1000 = list(set(stopwords_1000_list.decode().splitlines()))

In [None]:
def get_words(text, max_words=200_000):
	"""Turn an iterable of strings into a flat list of lowercase word tokens.

	The pieces of `text` are joined with single spaces, every character in
	`string.punctuation` is deleted, the result is lowercased and stripped,
	and torchtext's "basic_english" tokenizer splits it into words.

	Returns at most the first `max_words` tokens.
	"""
	# translation table that deletes all ASCII punctuation in one pass
	drop_punctuation = str.maketrans('', '', string.punctuation)
	corpus = " ".join(text).translate(drop_punctuation).lower().strip()

	tokenize = get_tokenizer("basic_english")
	return tokenize(corpus)[:max_words]

def create_vocab(text, max_words=200_000):
	"""Tokenize `text` and derive its vocabulary.

	Returns a pair `(words, vocab)`: `words` is the token list produced by
	`get_words` (at most `max_words` tokens) and `vocab` is a list of the
	distinct tokens in order of first appearance.
	"""
	words = get_words(text, max_words=max_words)
	# dict.fromkeys de-duplicates while keeping first-seen order. A plain set()
	# orders strings by their randomized hash, so word ids would differ between
	# interpreter runs and saved embeddings could not be matched back to words.
	return words, list(dict.fromkeys(words))

In [None]:
# Read the Dickinson corpus as a list of lines.
# The encoding is explicit so the result does not depend on the platform default
# (assumes the file is UTF-8 — TODO confirm).
with open("./data/text/dickinson.txt", "r", encoding="utf-8") as f:
  txt = f.read().split("\n")

In [None]:
# Load the lyrics table and use its "lyric" column as the corpus.
# This rebinds `txt`, replacing whatever text was loaded before.
lyrics_df = pd.read_csv("./data/text/rappers.csv")
lyric_column = lyrics_df["lyric"]
txt = lyric_column.values

In [None]:
class _FallbackDict(dict):
	"""dict whose missing-key lookups return a fixed value without storing it.

	A `defaultdict` inserts every missing key it is asked for, so looking up an
	out-of-vocabulary word would silently grow the mapping and make `len()`
	drift away from the real vocabulary size.
	"""

	def __init__(self, fallback, *args, **kwargs):
		super().__init__(*args, **kwargs)
		self.fallback = fallback

	def __missing__(self, key):
		return self.fallback


words, vocab = create_vocab(txt, max_words=500_000)

# word -> index; index 0 is reserved for "<UNK>" and is returned for unknown words
wtoi = _FallbackDict(0, {word: i for i, word in enumerate(["<UNK>"] + vocab)})

# index -> word, built by inverting wtoi so both mappings always agree;
# unknown indices map back to "<UNK>"
itow = _FallbackDict("<UNK>", {i: word for word, i in wtoi.items()})

In [None]:
# Notebook display: (number of entries in wtoi, which includes "<UNK>", number of corpus tokens).
len(wtoi), len(words)

In [None]:
class SkipGramDataset(Dataset):
	"""(center word, context word) index pairs for skip-gram training.

	Every token of `data` that is not a stopword is paired with each
	non-stopword token at most `window` positions away. With
	`symmetric_context=True` neighbours on both sides are used; otherwise only
	the tokens that follow the center word. Words are mapped to indices through
	the module-level `wtoi`, and the stopwords come from the module-level
	`stopwords_100` plus a few punctuation tokens.

	The pairs are stored as two equally long int64 tensors, `X` (centers) and
	`Y` (contexts), placed on "cuda" when available and on "cpu" otherwise.
	"""

	def __init__(self, data, window=2, symmetric_context=True):
		super().__init__()
		self.device = "cuda" if torch.cuda.is_available() else "cpu"
		self.dataset = []
		self.window = window
		self.symmetric_context = symmetric_context
		self.X, self.Y = self.create_dataset(data)
		assert len(self.X) == len(self.Y)

	def create_dataset(self, data):
		"""Build the center/context index tensors for the token sequence `data`."""
		# A set gives O(1) membership tests; this check runs once per
		# (token, neighbour) pair, so scanning a list here dominates the runtime.
		stopwords = set(stopwords_100) | {"=", ":", ",", "(", ")", "{", "}", "[", "]"}
		window = self.window
		last = len(data) - 1
		xs, ys = [], []

		for i, word in enumerate(data):
			if word in stopwords:
				continue
			center_word = wtoi[word]
			# clamp the context range to the sequence instead of testing each j
			first_j = max(i - window, 0) if self.symmetric_context else i + 1
			last_j = min(i + window, last)
			for j in range(first_j, last_j + 1):
				if j == i or data[j] in stopwords:
					continue
				xs.append(center_word)
				ys.append(wtoi[data[j]])

		# Build int64 tensors directly: going through the float32 default of
		# Tensor(...) is only exact for indices up to 2**24.
		X = torch.tensor(xs, dtype=torch.long, device=self.device)
		Y = torch.tensor(ys, dtype=torch.long, device=self.device)
		return X, Y

	def __getitem__(self, val):
		"""Return one (center, context) pair, or a list of pairs for a slice."""
		if isinstance(val, slice):
			return list(zip(self.X[val], self.Y[val]))
		return (self.X[val], self.Y[val])

	def __len__(self):
		"""Number of (center, context) pairs."""
		return len(self.X)

In [None]:
class SkipGram(nn.Module):
	"""Skip-gram word2vec model with separate center and context embeddings.

	`forward` scores every vocabulary word as a context of the given center
	words; the scores are unnormalized (logits).
	"""

	def __init__(self, vocab_size, embed_dim=128):
		super().__init__()
		self.center_embeds = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
		self.context_embeds = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)

	def forward(self, x):
		"""Return the dot product of each center embedding with all context embeddings.

		`x` holds word indices; the last dimension of the result has one
		score per vocabulary entry.
		"""
		center_word = self.center_embeds(x)
		return center_word @ self.context_embeds.weight.t()

	def get_N_closest(self, x, N=5, metric="lnorm"):
		"""Return the indices of the `N` words whose center embedding is closest to `x`.

		`x` is an index tensor holding a single word (as used by the callers in
		this notebook). `metric` selects the comparison:

		- "sine" or "cosine": cosine similarity, largest first ("sine" is the
		  historical spelling and is kept for existing callers);
		- "lnorm": `torch.cdist` distance, smallest first.

		The query word itself is part of the candidates, so it normally comes
		back first. Raises `ValueError` for any other `metric`.
		"""
		# Reject unknown metrics up front; previously they surfaced as an
		# UnboundLocalError on `similarities`.
		if metric not in ("sine", "cosine", "lnorm"):
			raise ValueError(
				f"unknown metric {metric!r}; expected 'sine', 'cosine' or 'lnorm'"
			)

		# A nearest-neighbour lookup needs no gradients.
		with torch.no_grad():
			# get word vector
			x = self.center_embeds(x)
			all_words = self.center_embeds.weight

			# compare x against every center vector
			if metric == "lnorm":
				similarities = torch.cdist(x, all_words).squeeze()
				largest = False
			else:
				cos_sim = nn.CosineSimilarity()
				similarities = cos_sim(x, all_words).squeeze()
				largest = True

			# top-N most similar words by embedding
			_, indices = torch.topk(similarities, k=N, largest=largest)
		return indices

In [None]:
# Pair each word with up to 3 of the words that follow it (one-sided context),
# then serve the pairs in shuffled batches of 4096.
dataset = SkipGramDataset(words, window=3, symmetric_context=False)
train_dl = DataLoader(dataset, shuffle=True, batch_size=4096)

In [None]:
# Use the GPU when one is available.
if torch.cuda.is_available():
  mdevice = "cuda"
else:
  mdevice = "cpu"

# One embedding row per entry of wtoi; cross-entropy treats the model's
# scores as logits over the vocabulary.
model = SkipGram(vocab_size=len(wtoi), embed_dim=64)
model = model.to(mdevice)
optim = torch.optim.Adam(params=model.parameters(), lr=5e-3)
loss_fn = nn.CrossEntropyLoss()

# Smoke test: push one batch of center words through the model and show the
# shapes of the input and of the resulting scores.
ctr, _ = next(iter(train_dl))
ctx = model(ctr)

print(ctr.shape, ctx.shape)

In [None]:
# Train for 32 epochs; every 4th epoch, print the loss of that epoch's final batch.
for e in range(32):
  model.train()

  for center_ids, context_ids in train_dl:
    optim.zero_grad()
    scores = model(center_ids)
    loss = loss_fn(scores, context_ids)
    loss.backward()
    optim.step()

  if (e + 1) % 4 == 0:
    print(f"Epoch: {e} loss: {loss.item():.4f}")

In [None]:
# Look up the index of the query word and move it to the model's device.
query_id = wtoi["wild"]
query = Tensor([query_id]).long().to(mdevice)

# Nearest neighbours of the query under both supported metrics.
top5s = model.get_N_closest(query, metric="sine")
top5l = model.get_N_closest(query, metric="lnorm")

# Translate the returned indices back into words.
print([itow[idx] for idx in top5s.tolist()])
print([itow[idx] for idx in top5l.tolist()])

## RNNs

### Classification + Classification + Classification + ...

- The output of the network at one step becomes an input for the next prediction

# TODO: Images

https://machinelearningmastery.com/lstm-for-time-series-prediction-in-pytorch/