# Imports

In [2]:
import torch
import torch.nn as nn
import torch.optim as opt
import pandas as pd
from torch.nn import functional as F
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

import os
import random

# Hyperparameters

In [43]:
# Number of (target, context) pairs per batch handed to the DataLoader.
BATCH_SIZE = 128
# Number of words taken on each side of the target word when building pairs.
CONTEXT_SIZE = 5
# Root directory; its immediate subfolders hold the .txt articles to load.
DATA_PATH = "../data/Articles"

# Load Data

In [44]:
def get_data(data_path):
    """Load and clean every ``.txt`` article found one level below *data_path*.

    Each immediate subfolder of ``data_path`` is scanned for ``.txt`` files,
    which are read as ISO-8859-1. The first line of every file (the title) is
    dropped, then the remaining text is normalised: newlines, punctuation and
    digits are removed, runs of whitespace are collapsed to a single space and
    everything is lower-cased.

    Args:
        data_path: Directory whose subfolders contain the article files.

    Returns:
        A ``pandas.DataFrame`` with a single ``'Text'`` column, one row per
        article, ordered by folder name and then file name. Files that
        consist of a title line only yield an empty string.
    """
    texts = []

    # Sort both levels so the row order is reproducible; os.listdir returns
    # entries in arbitrary order.
    for folder_name in sorted(os.listdir(data_path)):
        folder_path = os.path.join(data_path, folder_name)
        if not os.path.isdir(folder_path):
            continue
        for file_name in sorted(os.listdir(folder_path)):
            if not file_name.endswith('.txt'):
                continue
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, 'r', encoding='ISO-8859-1') as file:
                texts.append(file.read())

    # dtype=object keeps the .str accessor usable even when no file was found.
    text = pd.Series(texts, dtype=object, name='Text')

    # Remove the first line (title). `n` must be passed by keyword: pandas 2.x
    # made it keyword-only. Files without a newline have no body and would
    # produce NaN, so they are mapped to an empty string instead.
    text = text.str.split('\n', n=1).str[1].fillna('')

    # Replace newlines and every non-word, non-space character with a space.
    text = text.str.replace(r'\n', ' ', regex=True)
    text = text.str.replace(r'[^\w\s]', ' ', regex=True)

    # Remove numbers before collapsing whitespace, otherwise deleting a number
    # would leave two adjacent spaces behind.
    text = text.str.replace(r'\d+', '', regex=True)
    text = text.str.replace(r'\s+', ' ', regex=True)

    # Lowercase all words
    text = text.str.lower()

    return pd.DataFrame(text)

In [45]:
# Load and clean every article under DATA_PATH, then preview the first rows.
data = get_data(DATA_PATH)
data.head()

Unnamed: 0,Text
0,musicians groups are to tackle us visa regula...
1,u who have won three prestigious grammy award...
2,rock singer pete doherty has been involved in...
3,the film adaptation of lemony snicket novels ...
4,ocean s twelve the crime caper sequel starrin...


In [46]:
def create_pairs(data, context_size):
    """Build skip-gram ``[target, context_word]`` pairs from ``data["Text"]``.

    Every text is split on whitespace. A word becomes a target only when it
    has ``context_size`` words on both its left and its right; it is then
    paired with each of those ``2 * context_size`` neighbours (left neighbours
    first, then right neighbours). Texts shorter than
    ``2 * context_size + 1`` words contribute no pairs.

    Args:
        data: DataFrame with a ``"Text"`` column of strings.
        context_size: Number of neighbouring words taken on each side.

    Returns:
        A list of two-element lists ``[target, context_word]``.
    """
    span = 2 * context_size + 1  # full window width, target included
    pairs = []

    for text in data["Text"]:
        tokens = text.split()

        # `start` is the index of the left-most word of the window.
        for start in range(len(tokens) - span + 1):
            centre = start + context_size
            target = tokens[centre]
            neighbours = tokens[start:centre] + tokens[centre + 1:start + span]
            pairs.extend([target, neighbour] for neighbour in neighbours)

    return pairs

In [47]:
# Build [target, context] word pairs for every article and preview the first five.
pairs = create_pairs(data, CONTEXT_SIZE)
pairs[:5]

[['us', 'musicians'],
 ['us', 'groups'],
 ['us', 'are'],
 ['us', 'to'],
 ['us', 'tackle']]

In [48]:
# Collect the cleaned article texts from the `data` DataFrame.
text_data = data['Text'].tolist()

# Learn the vocabulary from those texts.
# NOTE: CountVectorizer's default token pattern only keeps tokens of two or
# more characters, so one-letter words are not part of the vocabulary and
# resolve to "UNK" when looked up later.
tokenizer = CountVectorizer(lowercase=True).fit(text_data)

# Vocabulary in index order, with an extra "UNK" entry appended at the end for
# out-of-vocabulary words.
vocabulary = np.append(tokenizer.get_feature_names_out(), "UNK")

# Lookup tables in both directions (index -> word, word -> index).
idx_to_word = dict(enumerate(vocabulary))
word_to_idx = {word: idx for idx, word in idx_to_word.items()}

# Example: bag-of-words count vector for a single text.
text_vector = tokenizer.transform(['example']).toarray()

In [49]:
# Index used for any word that is missing from word_to_idx.
unk_token = word_to_idx["UNK"]

# Creating a Text Dataset

In [59]:
class TextDataset(Dataset):
    """Dataset of (target, context) word pairs encoded as vocabulary indices.

    Args:
        pairs: Sequence of ``[target_word, context_word]`` items.
        word_to_idx: Mapping from word to integer index.
        unk_token: Index substituted for words missing from ``word_to_idx``.
    """

    def __init__(self, pairs, word_to_idx, unk_token):
        self.pairs = pairs
        self.word_to_idx = word_to_idx
        self.unk_token = unk_token

    def __len__(self):
        """Return the number of word pairs."""
        return len(self.pairs)

    def _encode(self, word):
        """Return a one-element tensor holding the index of *word* (or UNK)."""
        return torch.tensor([self.word_to_idx.get(word, self.unk_token)])

    def __getitem__(self, index):
        """Return ``{"target": tensor, "context": tensor}`` for pair *index*.

        Each tensor has shape ``(1,)``.
        """
        pair = self.pairs[index]
        return {
            "target": self._encode(pair[0]),
            "context": self._encode(pair[1]),
        }

In [60]:
# Wrap the word pairs in a Dataset and serve them in shuffled batches of BATCH_SIZE.
train_ds = TextDataset(pairs, word_to_idx, unk_token)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)

# Define the model architecture

In [61]:
class SkipGram(nn.Module):
    """Skip-gram network: an embedding table followed by a linear projection.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output scores per input index.
        d_model: Size of each embedding vector.
    """

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.out = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Map word indices to unnormalised scores over the vocabulary.

        The result has the shape of ``x`` with a trailing dimension of size
        ``vocab_size`` appended.
        """
        return self.out(self.embedding(x))

# Training the model

In [62]:
# Number of entries in the vocabulary (this count includes the appended "UNK" entry).
VOCAB_SIZE = len(vocabulary)
# Size of each embedding vector.
D_MODEL = 512
# Number of training epochs (presumably the value passed to train(); confirm at the call site).
N_EPOCHS = 10
# Prefer CUDA, then Apple MPS, and fall back to the CPU when neither is available.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")

In [63]:
# Instantiate the skip-gram network and move its parameters to DEVICE.
model = SkipGram(VOCAB_SIZE, D_MODEL).to(DEVICE)

In [64]:
# Define loss function and optimizer: cross-entropy over the vocabulary scores,
# optimised with Adam at a learning rate of 1e-4.
criterion = nn.CrossEntropyLoss()
optimizer = opt.Adam(model.parameters(), lr=0.0001)

In [67]:
# Training loop
def train(n_epochs, model=model, train_loader=train_loader, tokenizer=tokenizer, criterion=criterion, optimizer=optimizer):
    """Train *model* to predict a context word from its target word.

    After every epoch the mean batch loss is printed, followed by one randomly
    chosen example from the last batch (target word, true context word and the
    model's highest-scoring prediction).

    The default arguments are bound to the notebook's module-level objects at
    the moment this function is defined. The function also reads the
    module-level ``DEVICE`` and ``idx_to_word``.

    Args:
        n_epochs: Number of passes over ``train_loader``.
        model: Network mapping target indices to scores over the vocabulary.
        train_loader: Iterable of batches with ``"target"`` and ``"context"``
            index tensors; must support ``len()``.
        tokenizer: Unused; kept so existing calls keep working.
        criterion: Loss applied to ``(scores, context_indices)``.
        optimizer: Optimizer updating ``model``'s parameters.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    for epoch in range(n_epochs):
        num_batches = len(train_loader)
        if num_batches == 0:
            # Without this guard the average below divides by zero.
            raise ValueError("train_loader is empty; nothing to train on.")

        total_loss = 0.0

        model.train()
        for batch in tqdm(train_loader, desc=f"EPOCH: {epoch+1} / {n_epochs}", leave=False):
            # Get data from the loader and put it on GPU if available.
            target = batch["target"].to(DEVICE)
            context = batch["context"].to(DEVICE)

            optimizer.zero_grad()
            outputs = model(target)

            # Flatten to (num_examples, vocab) using the model's own output
            # width, so a model with a different vocabulary size also works.
            loss = criterion(outputs.view(-1, outputs.size(-1)), context.view(-1))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / num_batches
        print(f"Epoch {epoch+1}/{n_epochs} | Loss: {avg_loss:.4f}")

        # Show one random example from the last batch. The index is drawn from
        # the batch size; len(batch) would be the number of dict keys (2).
        rand_idx = torch.randint(0, target.size(0), (1, )).item()
        target_word = idx_to_word.get(target[rand_idx].item())
        context_word = idx_to_word.get(context[rand_idx].item())

        # argmax over raw scores equals argmax over softmax probabilities.
        pred_token = torch.argmax(outputs[rand_idx], dim=-1)
        predicted_context = idx_to_word.get(pred_token.item())

        print(f"TARGET: {target_word} | CONTEXT: {context_word} | PREDICTED: {predicted_context}")