In [1]:
import os
import pickle
import itertools
import datetime

import numpy as np
import pandas as pd
from typing import Any
from pathlib import Path
from tqdm import tqdm
from collections import Counter
from typing import Dict, List, Tuple

In [2]:
# Debugging switch: CUDA_LAUNCH_BLOCKING=1 asks CUDA to run kernel launches synchronously,
# so a failing kernel is reported at the call that caused it. It is set in its own cell,
# ahead of the `import torch` cell, so the variable is already in the environment when
# torch is loaded.
os.environ.update(CUDA_LAUNCH_BLOCKING="1")

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [1]:
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OrdinalEncoder

In [5]:
# One seed for every random number generator the notebook has imported, so that
# Restart Kernel -> Run All reproduces the same run. NumPy is seeded as well as torch
# because `np` is imported above and has its own, independent global RNG.
SEED = 100
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f3e61860630>

In [6]:
print(torch.__version__) # Get PyTorch and CUDA version
print(f"{torch.cuda.is_available() = }") # Check that CUDA works
print(f"{torch.cuda.device_count() = }") # Check how many CUDA capable devices you have
# Print the human readable name of device 1 (the GPU selected with set_device below).
# Guarded so the cell also runs on hosts with fewer than two GPUs, where
# get_device_name(1) would raise instead of printing.
if torch.cuda.device_count() > 1:
    print(f"{torch.cuda.get_device_name(1) = }")
# Add more guarded lines like get_device_name(2), get_device_name(3) to list other devices.

2.0.1
torch.cuda.is_available() = True
torch.cuda.device_count() = 7
torch.cuda.get_device_name(1) = 'NVIDIA GeForce RTX 2080 Ti'


In [7]:
# Index of the GPU used for training on the multi-GPU development machine.
GPU_INDEX = 1

# 'cuda' without an index means "the current CUDA device", i.e. whatever set_device picks below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Only switch GPUs when that GPU actually exists: an unconditional set_device(1) fails on
# CPU-only and single-GPU hosts, which would defeat the CPU fallback chosen above.
if torch.cuda.is_available() and GPU_INDEX < torch.cuda.device_count():
    torch.cuda.set_device(GPU_INDEX)

In [8]:
def save_model(model: Any, model_path: str) -> None:
    """
    Pickle an object into a gzip-compressed file and report the destination.

    Args:
        model: Object to serialize; it is handed to `pickle.dump` unchanged
        model_path: Destination file, opened with `gzip.open` in binary write mode

    Returns:
        (None)
    """
    import gzip

    with gzip.open(model_path, "wb") as compressed_file:
        pickle.dump(model, compressed_file)

    print(f'Model saved to {model_path}')

## Create the Skipgram Model

In [9]:
class SkipGram(nn.Module):
    """
    Skip-gram model with negative sampling over multi-feature tokens.

    Every token is described by one integer id per feature listed in `emb_sizes`.
    Each feature owns a center and a context embedding table, and a token's vector is
    the mean of its per-feature embeddings.

    Args:
        emb_sizes: Maps feature name -> number of rows in that feature's embedding tables.
            The iteration order of this dict fixes the column order of the id tensors
            passed to `forward` and `get_center_emb`.
        emb_dim: Embedding dimension shared by every table.
    """

    def __init__(self, emb_sizes: dict, emb_dim: int):
        super().__init__()
        self.emb_sizes = emb_sizes
        self.emb_dim = emb_dim

        # One table per feature. sparse=True makes the weight gradients sparse tensors,
        # so training needs an optimizer that accepts sparse gradients.
        self.center_embeddings = nn.ModuleList(
            [nn.Embedding(n_ids, emb_dim, sparse=True) for n_ids in self.emb_sizes.values()]
        )
        self.context_embeddings = nn.ModuleList(
            [nn.Embedding(n_ids, emb_dim, sparse=True) for n_ids in self.emb_sizes.values()]
        )

        self.init_emb()

    def init_emb(self):
        """
        Init embeddings like word2vec

        Center embeddings have a uniform distribution ~ [-0.5/emb_dim, 0.5/emb_dim]
        Context embeddings are initialized with 0

        Returns:
            (None)
        """
        # Initializing embeddings:
        # https://stackoverflow.com/questions/55276504/different-methods-for-initializing-embedding-layer-weights-in-pytorch
        bound = 0.5 / self.emb_dim
        with torch.no_grad():
            for emb in self.context_embeddings:
                emb.weight.zero_()

            for emb in self.center_embeddings:
                emb.weight.uniform_(-bound, bound)

    @staticmethod
    def _mean_embedding(tables, ids):
        """
        Look up column `i` of `ids` in `tables[i]` and average over the features.

        Args:
            tables: Embedding tables, one per feature
            ids: Integer tensor of shape (n_rows, n_features)

        Returns:
            Tensor of shape (n_rows, emb_dim)
        """
        per_feature = [table(ids[:, col]) for col, table in enumerate(tables)]
        # Stacking gives (n_features, n_rows, emb_dim); dim 0 is the feature axis.
        return torch.stack(per_feature).mean(dim=0)

    def forward(self, centers, contexts, neg_contexts):
        """
        Negative-sampling loss for a batch of (center, context) pairs.

        Args:
            centers: Integer tensor (batch, n_features) of center tokens
            contexts: Integer tensor (batch, n_features) of true context tokens
            neg_contexts: Integer tensor of negative samples that reshapes to
                (batch * n_negatives, n_features), e.g. (batch, n_negatives, n_features)

        Returns:
            Scalar tensor: mean over the batch of
            -log sigmoid(center . context) - sum_neg log sigmoid(-center . negative)
        """
        n_features = len(self.context_embeddings)

        emb_center = self._mean_embedding(self.center_embeddings, centers)
        emb_context = self._mean_embedding(self.context_embeddings, contexts)
        emb_neg_context = self._mean_embedding(
            self.context_embeddings, neg_contexts.reshape(-1, n_features)
        )

        # Positive score; logits are clamped to [-10, 10] before the log-sigmoid
        score = torch.sum(emb_center * emb_context, dim=1)
        score = torch.clamp(score, min=-10, max=10)
        score = -F.logsigmoid(score)

        # Negative score: (batch, n_neg, emb_dim) @ (batch, emb_dim, 1) -> (batch, n_neg, 1).
        # Squeeze only the trailing dim so batch == 1 or n_neg == 1 keep their axes.
        batch_size, emb_dim = emb_center.shape
        neg_score = torch.bmm(
            emb_neg_context.view(batch_size, -1, emb_dim), emb_center.unsqueeze(2)
        ).squeeze(2)
        neg_score = torch.clamp(neg_score, min=-10, max=10)
        neg_score = -torch.sum(F.logsigmoid(-neg_score), dim=1)

        return torch.mean(score + neg_score)

    def get_center_emb(self, centers):
        """
        Center vectors for a collection of tokens.

        Args:
            centers: Iterable of rows (e.g. an integer tensor of shape (n_rows, n_features));
                each row holds one index tensor per feature

        Returns:
            Tensor (n_rows, emb_dim): per row, the mean of its per-feature center embeddings
        """
        emb_centers = []
        for center in centers:
            per_feature = [
                self.center_embeddings[col_idx](center_)
                for col_idx, center_ in enumerate(center)
            ]
            # (n_features, emb_dim) -> average over the features -> (emb_dim,)
            emb_centers.append(torch.mean(torch.stack(per_feature), dim=0))

        return torch.stack(emb_centers)

    def save_embeddings(self, file_name):
        """
        Save every center embedding table to a single `.npz` archive.

        The tables have a different number of rows per feature, so they cannot be stored
        as one array; each is written under its feature name from `emb_sizes`.
        `np.savez` appends `.npz` to `file_name` when it is a path without that extension.

        Args:
            file_name: Destination path or file object

        Returns:
            (None)
        """
        embeddings = {
            str(feature): table.weight.detach().cpu().numpy()
            for feature, table in zip(self.emb_sizes, self.center_embeddings)
        }
        np.savez(file_name, **embeddings)

## Test Model Class

In [10]:
# Smoke test: a single feature 'a' with 10 ids, embedded in 100 dimensions.
model = SkipGram(emb_sizes={'a': 10}, emb_dim=100)

In [11]:
# Display the module repr to check which embedding tables the constructor created.
model

SkipGram(
  (center_embeddings): ModuleList(
    (0): Embedding(10, 100, sparse=True)
  )
  (context_embeddings): ModuleList(
    (0): Embedding(10, 100, sparse=True)
  )
)

## Create the DataLoader

In [None]:
class Sequences:
    """
    Loads token sequences and builds the token <-> integer id vocabulary.

    The vocabulary is built from the sequences first (ids follow first appearance) and is
    then extended with every product of the validation file that never occurs in a sequence.

    Args:
        sequence_path: Path handed to `np.load`; the loaded array is converted with `.tolist()`
        val_path: CSV file with (at least) the columns 'product1' and 'product2'
        meta_path: Stored as-is; not read by any method implemented so far
        subsample: Stored as-is; presumably the word2vec subsampling threshold - TODO confirm
        power: Stored as-is; presumably the exponent of the negative-sampling distribution
            - TODO confirm
    """

    def __init__(self, sequence_path: str, val_path: str, meta_path: str, subsample: float = 0.001, 
                 power: float = 0.75):

        self.negative_idx = 0
        self.n_unique_tokens = 0
        self.META_COLS = ['category_level_1', 'category_level_2', 'brand', 'price']

        # Kept so the still-unimplemented steps at the bottom of the class can use them
        # instead of the constructor silently dropping its arguments.
        self.meta_path = meta_path
        self.subsample = subsample
        self.power = power

        self.sequences = np.load(sequence_path).tolist()
        self.n_sequences = len(self.sequences)
        print(f'No. of sequences = {self.n_sequences}')

        self.val = pd.read_csv(val_path)
        print(f'Shape of validation data = {self.val.shape}')

        self.word_freq = self.get_word_freq()

        self.word2id, self.id2word = self.get_mapping_dicts()
        self.add_val_product_to_mapping_dict()
        self.n_unique_tokens = len(self.word2id)
        print(f'No. of unique tokens = {self.n_unique_tokens}')

    def get_word_freq(self):
        """
        Count how often each token occurs across all sequences.

        Returns:
            Counter mapping token -> number of occurrences
        """
        return Counter(itertools.chain.from_iterable(self.sequences))

    def get_mapping_dicts(self):
        """
        Assign consecutive ids (starting at 0) to the tokens of `self.word_freq`.

        Returns:
            (word2id, id2word): token -> id and the inverse id -> token
        """
        # Iterating a Counter yields its keys only, in insertion order
        word2id = {w: wid for wid, w in enumerate(self.word_freq)}
        id2word = {wid: w for w, wid in word2id.items()}

        return word2id, id2word

    def add_val_product_to_mapping_dict(self):
        """
        Extend the mapping dicts with validation products missing from the sequences.

        New ids continue after the current maximum. Products are visited in order of first
        appearance ('product1' column, then 'product2') so the ids are the same on every run.
        Afterwards `self.val` is set to None, so this method can only be called once.

        Returns:
            (None)
        """
        # .tolist() yields plain Python scalars, the same key type the sequences use
        val_products = itertools.chain(
            self.val['product1'].tolist(), self.val['product2'].tolist()
        )
        print(f'Original size of the mapping dict = {len(self.word2id)}')

        wid = max(self.word2id.values(), default=-1) + 1
        # dict.fromkeys de-duplicates while keeping a deterministic order (a set would not)
        for w in dict.fromkeys(val_products):
            if w not in self.word2id:
                self.word2id[w] = wid
                self.id2word[wid] = w
                wid += 1

        self.val = None
        print(f'Size of mapping dict after adding val products = {len(self.word2id)}')

    def convert_seq_to_ids(self):
        """
        Map every token of `self.sequences` to its id.

        Returns:
            Array with the layout of `self.sequences`, holding ids instead of tokens
        """
        return np.vectorize(self.word2id.get)(self.sequences)

    def convert_word_freq_to_ids(self):
        """
        Re-key the token frequencies by id.

        Returns:
            Dict mapping token id -> number of occurrences
        """
        return {self.word2id[w]: c for w, c in self.word_freq.items()}

    def get_product_id(self, x):
        """
        Look up the id of a single token.

        Args:
            x: Token to look up

        Returns:
            The token's id, or -1 when the token is unknown
        """
        return self.word2id.get(x, -1)

    def prep_meta(self):
        """Not implemented yet."""
        pass

    def convert_meta_to_dict(self):
        """Not implemented yet."""
        pass

    def get_discard_prob(self):
        """Not implemented yet."""
        pass

    def get_negative_sample_table(self):
        """Not implemented yet."""
        pass

    def get_mata(self):
        """Not implemented yet. NOTE(review): name looks like a typo for `get_meta` - confirm."""
        pass

    



