# Data

The model was trained on the BBC News Articles dataset.

You can download the dataset here:

https://www.kaggle.com/datasets/pariza/bbc-news-summary

# Imports

In [1]:
import torch
import torch.nn as nn
import torch.optim as opt
import pandas as pd
from torch.nn import functional as F
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

import os
import random

  from .autonotebook import tqdm as notebook_tqdm


# Hyperparameters

In [20]:
BATCH_SIZE = 64
CONTEXT_SIZE = 8
DATA_PATH = "../data/Articles"

# Load data

In [24]:
def get_data(data_path):
    data = {'File Name': [], 'Text': []}

    for folder_name in os.listdir(data_path):
        folder_path = os.path.join(data_path, folder_name)
        if os.path.isdir(folder_path):
            for file_name in os.listdir(folder_path):
                if file_name.endswith('.txt'):
                    file_path = os.path.join(folder_path, file_name)
                    with open(file_path, 'r', encoding='ISO-8859-1') as file:
                        content = file.read()
                        data['File Name'].append(file_name)
                        data['Text'].append(content)

    df = pd.DataFrame(data)

    # Remove the first line (title) from the 'Text' column
    df['Text'] = df['Text'].str.split('\n', 1).str[1]

    # Remove extra spaces and symbols
    df['Text'] = df['Text'].str.replace(r'\n', ' ', regex=True)
    df['Text'] = df['Text'].str.replace(r'[^\w\s]', ' ', regex=True)  # Remove non-alphanumeric characters
    df['Text'] = df['Text'].str.replace(r'\s+', ' ', regex=True)  # Remove extra spaces

    # Remove numbers
    df['Text'] = df['Text'].str.replace(r'\d+', '', regex=True)

    # Lowercase all words
    df['Text'] = df['Text'].str.lower()

    return pd.DataFrame(df["Text"])

In [25]:
data = get_data(DATA_PATH)
data.head()

Unnamed: 0,Text
0,musicians groups are to tackle us visa regula...
1,u who have won three prestigious grammy award...
2,rock singer pete doherty has been involved in...
3,the film adaptation of lemony snicket novels ...
4,ocean s twelve the crime caper sequel starrin...


In [26]:
# Create a CountVectorizer instance
tokenizer = CountVectorizer(lowercase=True)

# Fit the vectorizer on your text data
text_data = data['Text'].tolist()  # Assuming your DataFrame is named 'df'
tokenizer.fit(text_data)

# Get the vocabulary (list of words) and its corresponding indices
vocabulary = tokenizer.get_feature_names_out()
vocabulary = np.append(vocabulary, "UNK")
word_to_idx = {word: idx for idx, word in enumerate(vocabulary)}
idx_to_word = {idx: word for idx, word in enumerate(vocabulary)}

# Example: Transform a text into a vector representation
text_vector = tokenizer.transform(['example']).toarray()

In [27]:
def create_windows_dataframe(data, context_size):
    all_windows_str = []  # Modified to store windows as strings
    all_labels = []

    for index, row in data.iterrows():
        input_sequence = row["Text"].split()

        num_windows = len(input_sequence) - 2 * context_size

        for i in range(num_windows):
            window = input_sequence[i: i + context_size] + input_sequence[i + context_size + 1: i + 2 * context_size + 1]
            window_str = " ".join(window)  # Convert the window list to a string
            label = input_sequence[i + context_size]
            all_windows_str.append(window_str)  # Append the window string
            all_labels.append(label)

    # Create a pandas DataFrame from the lists
    windows_df = pd.DataFrame({
        'windows': all_windows_str,  # Use the modified list containing window strings
        'labels': all_labels
    })

    return windows_df

# Assuming you have a DataFrame named 'test_df' with a column named 'Text'
windows_dataframe = create_windows_dataframe(data, CONTEXT_SIZE)
windows_dataframe.head()

Unnamed: 0,windows,labels
0,musicians groups are to tackle us visa regulat...,which
1,groups are to tackle us visa regulations which...,are
2,are to tackle us visa regulations which are fo...,blamed
3,to tackle us visa regulations which are blamed...,for
4,tackle us visa regulations which are blamed fo...,hindering


In [113]:
# Split the data into train and val
X, y = windows_dataframe["windows"], windows_dataframe["labels"]
# X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8)

# X_train = X_train.to_list()
# X_val = X_val.to_list()
# y_train = y_train.to_list()
# y_val = y_val.to_list()

# Creating a Text Dataset

In [114]:
class TextDataset(Dataset):
    def __init__(self, windows, labels, word_to_idx, unk_token):
        self.windows = windows
        self.labels = labels
        self.word_to_idx = word_to_idx
        self.unk_token = unk_token

    def __len__(self):
        return len(self.windows)

    def __getitem__(self, index):
        window = self.windows[index]
        label = self.labels[index]

        # Tokenize the text
        tokenized_window = torch.tensor([self.word_to_idx.get(word, self.unk_token) for word in window.split()])
        tokenized_label = torch.tensor([self.word_to_idx.get(label, self.unk_token)])


        return {
            "windows": tokenized_window,
            "labels": tokenized_label
        }

In [115]:
unk_token = word_to_idx["UNK"]

In [116]:
train_ds = TextDataset(X, y, word_to_idx, unk_token)
# val_ds = TextDataset(X_val, y_val, word_to_idx, unk_token)

In [117]:
# Create data loaders
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
# val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

# Define the model architecture

In [118]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, d_model, hidden_dim):
        super(CBOW, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.fc1 = nn.Linear(d_model, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = torch.mean(self.embedding(x), dim=1) # (batch_size, d_model)
        x = self.fc1(x) # (batch_size, hidden_dim)
        x = self.fc2(x) # (batch_size, vocab_size)
        return x

# Traning the model


In [119]:
VOCAB_SIZE = len(vocabulary)
D_MODEL = 512
HIDDEN_DIM = 128
N_EPOCHS = 10
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")

In [120]:
model = CBOW(VOCAB_SIZE, D_MODEL, HIDDEN_DIM).to(DEVICE)

In [56]:
# Define loss fucntion and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = opt.Adam(model.parameters(), lr=0.000001)

In [57]:
# Training loop
def train(n_epochs, model=model, train_loader=train_loader, tokenizer=tokenizer, criterion=criterion, optimizer=optimizer):
    for epoch in range(n_epochs):
        total_loss = 0

        model.train()
        for batch in tqdm(train_loader, desc=f"EPOCH: {epoch+1} / {n_epochs}", leave=False):
            # Get data from the loader and put it on GPU if available.
            windows = batch["windows"].to(DEVICE)
            labels = batch["labels"].to(DEVICE)

            optimizer.zero_grad()
            outputs = model(windows)

            loss = criterion(outputs, labels.view(-1))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{n_epochs} | Loss: {avg_loss:.4f}")

In [None]:
train(N_EPOCHS)

In [44]:
def get_analogy(word1, word2, word3, model=model, word_to_idx=word_to_idx, idx_to_word=idx_to_word, unk_token=word_to_idx["UNK"], n=5):
    word1_index = word_to_idx.get(word1, unk_token)
    word2_index = word_to_idx.get(word2, unk_token)
    word3_index = word_to_idx.get(word3, unk_token)

    # Access the embedding layer of your model
    embedding_layer = model.embedding  # Replace with the actual name of your embedding layer

    # Get the embedding vector for the word
    word1_emb = embedding_layer.weight[word1_index]
    word2_emb = embedding_layer.weight[word2_index]
    word3_emb = embedding_layer.weight[word3_index]

    analogy_vector = word1_emb - word2_emb + word3_emb
    analogy_vector_cpu = analogy_vector.cpu().detach()
    word_embeddings_cpu = embedding_layer.weight.cpu().detach()

    # Calculate cosine similarity between the analogy vector and all word embeddings
    similarity_scores = cosine_similarity(analogy_vector_cpu.reshape(1, -1), word_embeddings_cpu)
    
    # Find the indices of the n most similar words
    most_similar_indices = np.argsort(similarity_scores[0])[-n:][::-1]

    # Get the words associated with the most similar indices and their similarity scores
    similar_words = [idx_to_word[idx] for idx in most_similar_indices]
    similar_scores = [similarity_scores[0][idx] for idx in most_similar_indices]

    for word, score in zip(similar_words, similar_scores):
        print(f"Word: {word}, Cosine Similarity: {score:.4f}")

In [45]:
get_analogy("king", "man", "woman") # Doesn't really understand 

Word: king, Cosine Similarity: 0.6175
Word: woman, Cosine Similarity: 0.5853
Word: academically, Cosine Similarity: 0.2030
Word: arabia, Cosine Similarity: 0.1982
Word: burns, Cosine Similarity: 0.1971


In [122]:
def check_similarity(word1, word2, model=model, word_to_idx=word_to_idx, idx_to_word=idx_to_word, unk_token=word_to_idx["UNK"]):
    word1_index = word_to_idx.get(word1, unk_token)
    word2_index = word_to_idx.get(word2, unk_token)

    # Access the embedding layer of your model
    embedding_layer = model.embedding

    # Get the embedding vectors for the words
    word1_emb = embedding_layer.weight[word1_index]
    word2_emb = embedding_layer.weight[word2_index]

    # Calculate cosine similarity between the embedding vectors
    similarity_score = cosine_similarity(word1_emb.cpu().detach().reshape(1, -1), word2_emb.cpu().detach().reshape(1, -1))

    print(f"Are '{word1}' and '{word2}' similar?")
    print(f"Cosine Similarity: {similarity_score[0][0]:.4f}")

In [55]:
check_similarity("tall", "short") # Kind of understands that these are opposite (negative score)

Are 'tall' and 'short' similar?
Cosine Similarity: -0.0156


In [None]:
def get_most_similar_words(input_word, model=model, word_to_idx=word_to_idx, idx_to_word=idx_to_word, unk_token=word_to_idx["UNK"], n=5):
    word_index = word_to_idx.get(input_word, unk_token)

    # Access the embedding layer of your model
    embedding_layer = model.embedding

    # Get the embedding vector for the input word
    input_word_emb = embedding_layer.weight[word_index]

    # Calculate cosine similarity between the embedding vector of the input word and all word embeddings
    similarity_scores = cosine_similarity(input_word_emb.cpu().detach().reshape(1, -1), embedding_layer.weight.cpu().detach())

    # Find the indices of the n most similar words
    most_similar_indices = np.argsort(similarity_scores[0])[-n:][::-1]

    # Get the words associated with the most similar indices and their similarity scores
    similar_words = [idx_to_word[idx] for idx in most_similar_indices]
    similar_scores = [similarity_scores[0][idx] for idx in most_similar_indices]

    print(f"Most Similar Words to '{input_word}':")
    for word, score in zip(similar_words, similar_scores):
        print(f"Word: {word}, Cosine Similarity: {score:.4f}")