In [1]:
import os
import pickle
import itertools
import datetime

import numpy as np
import pandas as pd
from typing import Any
from pathlib import Path
from tqdm import tqdm
from collections import Counter
from typing import Dict, List, Tuple

In [2]:
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [3]:
import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [36]:
from sklearn.metrics import roc_auc_score

In [4]:
torch.manual_seed(100)

<torch._C.Generator at 0x7ff390bb8570>

In [5]:
print(torch.__version__) # Get PyTorch and CUDA version
print(f"{torch.cuda.is_available() = }") # Check that CUDA works
print(f"{torch.cuda.device_count() = }") # Check how many CUDA capable devices you have
# Print device human readable names
print(f"{torch.cuda.get_device_name(0) = }")
# Add more lines with +1 like get_device_name(3), get_device_name(4) if you have more devices.

1.12.1+cu113
torch.cuda.is_available() = True
torch.cuda.device_count() = 1
torch.cuda.get_device_name(0) = 'NVIDIA A10G'


In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.cuda.set_device(0)

In [7]:
def save_model(model: Any, model_path: str) -> None:
    """
    Saves model in gzip format

    Args:
        model: Model to be saved
        model_path: Path to save model to
        
    Returns:
        (None)
    """
    import gzip
    with gzip.open(model_path, "wb") as f:
        pickle.dump(model, f)

    print(f'Model saved to {model_path}')

## Create the MF Model

In [8]:
def regularize_l2(array):
    loss = torch.sum(array ** 2)
    return loss

class MF(nn.Module):
    def __init__(self, emb_size, emb_dim, c_vector=1e-6):
        super().__init__()
        self.emb_size = emb_size # size of the dictionary of embeddings
        self.emb_dim = emb_dim # size of each embedding vector
        self.c_vector = c_vector
        
        # layers
        self.embedding = nn.Embedding(emb_size, emb_dim)
        self.sig = nn.Sigmoid()
        
        # loss
        self.bce = nn.BCELoss()
        
        print(f'Model initialized: {self}')
        
    def forward(self, product1, product2):
        emb_product1 = self.embedding(product1)
        emb_product2 = self.embedding(product2)
        interaction = self.sig(torch.sum(emb_product1*emb_product2, dim = 1, dtype = torch.float))
        return interaction
    
    
    def loss(self, pred, label):
        mf_loss = self.bce(pred, label)
        
        # L2 regularization
        product_prior = regularize_l2(self.embedding.weight) * self.c_vector
        
        loss_total  = mf_loss + product_prior # loss + regularization 
        
        return loss_total

## Test Model Class

In [9]:
model = MF(1000, 12).to(device)

Model initialized: MF(
  (embedding): Embedding(1000, 12)
  (sig): Sigmoid()
  (bce): BCELoss()
)


In [10]:
pred = model.forward(torch.LongTensor([0,1]).to(device), torch.LongTensor([13,12]).to(device))
pred

tensor([0.6586, 0.9999], device='cuda:0', grad_fn=<SigmoidBackward0>)

In [11]:
model.loss(pred, torch.FloatTensor([0,1]).to(device))

tensor(0.5495, device='cuda:0', grad_fn=<AddBackward0>)

In [12]:
model.to('cpu')

MF(
  (embedding): Embedding(1000, 12)
  (sig): Sigmoid()
  (bce): BCELoss()
)

## Create the data loader

In [13]:
class Sequences:
    NEGATIVE_SAMPLE_TABLE_SIZE = 1e5
    WINDOW = 5
    
    def __init__(self, sequence_path: str, val_path: str, subsample: float = 0.001, power: float = 0.75):
        """
        Intialize the dataset object
        """
        self.negative_idx = 0
        self.n_unique_tokens = 0
        
        self.sequences = np.load(sequence_path).tolist()
        self.n_sequences = len(self.sequences)
        print(f'# Sequences = {self.n_sequences}')
        
        self.val = pd.read_csv(val_path)
        print(f'# Validation data = {self.val.shape}')
        
        self.word_freq = self.get_word_freq()
        
        self.word2id, self.id2word = self.get_mapping_dicts()
        self.add_val_product_to_mapping_dicts()
        self.n_unique_tokens = len(self.word2id)
        print(f'# Tokens = {self.n_unique_tokens}')
        
        sequence_file_name = Path(sequence_path).resolve().stem
        save_model(self.word2id, f'../data/processed/{sequence_file_name}_word2id')
        save_model(self.id2word, f'../data/processed/{sequence_file_name}_id2word')
        
        self.sequences = self.convert_sequence_to_id()
        self.word_freq = self.convert_word_freq_to_id()
        
        self.discard_probs = self.get_discard_probs(sample = subsample)
        
        self.neg_table = self.get_negative_sample_table(power = power)
        
    def get_word_freq(self) -> Counter:
        """
        Returns a dictionary of word frequencies
        """
        
        seq_flat = list(itertools.chain.from_iterable(self.sequences)) # flatten the array
        
        word_freq = Counter(seq_flat)
        
        return word_freq
    
    def get_mapping_dicts(self):
        word2id = dict()
        id2word = dict()
        
        wid = 0
        for w,c in self.word_freq.items():
            if wid == 0:
                print(f"{w = }, {c = }")
            word2id[w] = wid
            id2word[wid] = w
            wid += 1
        
        return word2id, id2word
    
    def add_val_product_to_mapping_dicts(self):
        val_product_set = set(self.val['product1'].values).union(set(self.val['product2'].values))
        
        print(f'Size of word2id before adding val product : {len(self.word2id)}')
        wid = max(self.word2id.values()) + 1
        for w in val_product_set:
            if w not in self.word2id:
                self.word2id[w] = wid
                self.id2word[wid] = w
                wid +=1
        
        self.val = None # free up space
        print(f'Size of the word2id after adding val product : {len(self.word2id)}')
        
                
    def convert_sequence_to_id(self):
        return np.vectorize(self.word2id.get)(self.sequences)
    
    def get_product_id(self, x):
        return self.word2id.get(x, -1)
    
    def convert_word_freq_to_id(self):
        return {self.word2id[k] : v for k ,v  in self.word_freq.items()}
    
    def get_discard_probs(self, sample = 0.001):
        """
        Returns a dictionary of words and their associated discard probability, 
        word should ne discarded if np.random.rand() < probability
        """
        
        # convert to array
        word_freq = np.array(list(self.word_freq.items()), dtype=np.float64)
        
        # convert to probability
        word_freq[:, 1] = word_freq[:, 1] / word_freq[:, 1].sum()
        
        # perform subsampling 
        # http://mccormickml.com/2017/01/11/word2vec-tutorial-part-2-negative-sampling/
        word_freq[:, 1] = (np.sqrt(word_freq[:, 1]/ sample) + 1) * (sample / word_freq[:, 1]) 
        
        # get dict 
        discard_probs = {int(k) : v for k, v in word_freq.tolist()}
        
        return discard_probs
    
    def get_negative_sample_table(self, power=0.75):
        """
        Returns a table with size = NEGATIVE_SAMPLE_TABLE_SIZE of nagative samples which can be selected via indexing. 
        """
        
        # COnvert to array 
        word_freq = np.array(list(self.word_freq.items()), dtype = np.float)
        
        # adjust the power
        word_freq[:, 1] = word_freq[:, 1] ** power
        
        # Get probabilities
        word_freq_sum = word_freq[:, 1].sum()
        word_freq[:, 1] = word_freq[:, 1] / word_freq_sum
        
        # Multiply probabilities by sample table size
        word_freq[:, 1] = np.round(word_freq[:, 1] * self.NEGATIVE_SAMPLE_TABLE_SIZE)
        
        # Convert to int 
        word_freq = word_freq.astype(int).tolist()
        
        # create the sample table
        sample_table = [[tup[0]]*tup[1] for tup in tqdm(word_freq)] # repeating the index (wrod_id) by proportion of their frequency (more frequent words are more probable for sampling)
        sample_table = np.array(list(itertools.chain.from_iterable(sample_table)))
        np.random.shuffle(sample_table)

        return sample_table
        
    
    def get_pairs(self, idx, window = 5):
        pairs = []
        sequence = self.sequences[idx]
        
        for center_idx, node in enumerate(sequence):
            for i in range(-window, window + 1):
                context_idx = center_idx + i
                if (context_idx > 0) and (context_idx < len(sequence)) and (node != sequence[context_idx]) and (np.random.rand() < self.discard_probs[sequence[context_idx]]):
                    pairs.append((node, sequence[context_idx]))
    
        
        return pairs
    
    def get_all_center_context_pair(self, window = 5) -> List[Tuple[int, int]]:
        """
        Returns a list of tuples (center, context).
        
        Args: 
            window:
            
        Returns:
        
        """
        
        pairs = []
        
        for sequence in self.sequences:
            for center_idx, node in enumerate(sequence):
                context_idx = center_idx + i
                if (0 <= context_idx < len(sequence)) \
                    and node != sequence[context_idx] \
                    and np.random.rand() < self.discard_probs[sequence[context_idx]]:
                    pairs.append((node, sequence[context_idx]))
                        
            
        return pairs
    
    
    def get_negative_samples(self, context, sample_size = 5) -> np.array:
        """
        Returns a list of negative samples, where len = sample_size.
        
        eg. if context is 12345 sample from 
        
        Args:
        
            sample_size:
            
        """
        
        while True:
            neg_sample = self.neg_table[self.negative_idx:self.negative_idx + sample_size]
            
            self.negative_idx = (self.negative_idx + sample_size) % len(self.neg_table)
            
            if len(neg_sample) != sample_size:
                neg_sample = np.concatenate((neg_sample, 
                                             self.neg_table[:self.negative_idx]))
                
            
            if not context in neg_sample:
                return neg_sample    

In [14]:
class SequencesDataset(Dataset):
    def __init__(self, sequences: Sequences, neg_sample_size = 5):
        self.sequences = sequences
        self.neg_sample_size = neg_sample_size
        
    def __len__(self):
        return self.sequences.n_sequences
    
    def __getitem__(self, idx):
        pairs = self.sequences.get_pairs(idx)
        neg_samples = []
        for center, context in pairs:
            neg_samples.append(self.sequences.get_negative_samples(context))
        
        return pairs, neg_samples
    
    @staticmethod
    def collate(batches):
        pairs_batch = [batch[0] for batch in batches]
        neg_contexts_batch = [batch[1] for batch in batches]
        
        pairs_batch = list(itertools.chain.from_iterable(pairs_batch))
        neg_contexts = list(itertools.chain.from_iterable(neg_contexts_batch))
        
        centers = [center for center, _ in pairs_batch]
        contexts = [context for _, context in pairs_batch]
        
        return torch.LongTensor(centers), torch.LongTensor(contexts), torch.LongTensor(neg_contexts)
    
    @staticmethod
    def collate_for_mf(batches):
        batch_list = []
        
        for batch in batches:
            pairs = np.array(batch[0])
            # print(f"{pairs.shape=}")
            negs = np.array(batch[1])
            # print(f"{negs.shape=}")
            negs = np.vstack((pairs[:, 0].repeat(negs.shape[1]), negs.ravel())).T
            # print(f"{negs.shape=}")
            # print(negs)
            pairs_arr = np.ones((pairs.shape[0], pairs.shape[1] + 1), dtype=int) # 2d
            pairs_arr[:, :-1] = pairs
            # print(f"{pairs_arr.shape=}")
            negs_arr = np.zeros((negs.shape[0], negs.shape[1] + 1), dtype=int) # 2d
            negs_arr[:, :-1] = negs
            # print(f"{negs_arr.shape=}")
            all_arr = np.vstack((pairs_arr, negs_arr)) # 2d
            batch_list.append(all_arr)
            
        batch_array = np.vstack(batch_list)
        
        
        return (torch.LongTensor(batch_array[:, 0]),torch.LongTensor(batch_array[:, 1]),
                torch.FloatTensor(batch_array[:, 2]))
    
    

## Testing DataLoader

In [15]:
read_path = '../data/processed/meta_Electronics_random_walks.npy'
val_path = '../data/interim/meta_Electronics_edges_val.csv'

In [16]:
shuffle = True
emb_dim = 128
epochs = 5
initial_lr = 0.01

In [17]:
sequences = Sequences(read_path, val_path)

# Sequences = 4649780
# Validation data = (1440998, 3)
w = 'b00f37z8q6', c = 40
Size of word2id before adding val product : 464978
Size of the word2id after adding val product : 527314
# Tokens = 527314
Model saved to ../data/processed/meta_Electronics_random_walks_word2id
Model saved to ../data/processed/meta_Electronics_random_walks_id2word


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  word_freq = np.array(list(self.word_freq.items()), dtype = np.float)
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 464978/464978 [00:00<00:00, 1015476.98it/s]


In [18]:
sequences.word2id['b00f37z8q6']

0

In [19]:
sequences.word_freq[sequences.word2id['b00f37z8q6']]

40

In [20]:
dataset = SequencesDataset(sequences)

In [21]:
dataloader = DataLoader(dataset, batch_size = 32, shuffle = shuffle, num_workers = 32, collate_fn = dataset.collate_for_mf)

In [94]:
dataset[0] # returns a list of positive pairs and 5 negative sample for each context at idx i

([(0, 1),
  (0, 2),
  (0, 3),
  (0, 4),
  (0, 3),
  (1, 2),
  (1, 3),
  (1, 4),
  (1, 3),
  (1, 5),
  (2, 1),
  (2, 3),
  (2, 4),
  (2, 3),
  (2, 5),
  (2, 6),
  (3, 1),
  (3, 2),
  (3, 4),
  (3, 5),
  (3, 6),
  (3, 5),
  (4, 1),
  (4, 2),
  (4, 3),
  (4, 3),
  (4, 5),
  (4, 6),
  (4, 5),
  (4, 7),
  (3, 1),
  (3, 2),
  (3, 4),
  (3, 5),
  (3, 6),
  (3, 5),
  (3, 7),
  (5, 1),
  (5, 2),
  (5, 3),
  (5, 4),
  (5, 3),
  (5, 6),
  (5, 7),
  (6, 2),
  (6, 3),
  (6, 4),
  (6, 3),
  (6, 5),
  (6, 5),
  (6, 7),
  (5, 3),
  (5, 4),
  (5, 3),
  (5, 6),
  (5, 7),
  (7, 4),
  (7, 3),
  (7, 5),
  (7, 6),
  (7, 5)],
 [array([ 28412,  11156,  20672, 120964,  19066]),
  array([308901, 105115,   2089,  33325,  22207]),
  array([  5619,  35622,  34937,  53992, 138802]),
  array([162353,   7150,  35164,  20126,  84868]),
  array([32175, 45773, 15440, 11024, 97718]),
  array([ 67569,  25303,   3173, 161681,  97372]),
  array([40960, 19710, 65434, 28472,  3900]),
  array([ 23990, 173031,  93102, 125488,  

In [95]:
for i, batches in enumerate(dataloader):
    print(batches)
    break

(tensor([ 49132,  49132,  49132,  49132,  49132,  11948,  11948,  11948,  11948,
         11948,   3090,   3090,   3090,   3090,   3090,   3090,  77342,  77342,
         77342,  77342,  77342,  77342,  77342,  58658,  58658,  58658,  58658,
         58658,  58658,  58658,  58658,  34740,  34740,  34740,  34740,  34740,
         34740,  34740,  34740, 132340, 132340, 132340, 132340, 132340, 132340,
        132340, 132340,   8870,   8870,   8870,   8870,   8870,   8870,   8870,
         51148,  51148,  51148,  51148,  51148,  51148,  18668,  18668,  18668,
         18668,  18668,  49132,  49132,  49132,  49132,  49132,  49132,  49132,
         49132,  49132,  49132,  49132,  49132,  49132,  49132,  49132,  49132,
         49132,  49132,  49132,  49132,  49132,  49132,  49132,  49132,  49132,
         11948,  11948,  11948,  11948,  11948,  11948,  11948,  11948,  11948,
         11948,  11948,  11948,  11948,  11948,  11948,  11948,  11948,  11948,
         11948,  11948,  11948,  11948,

## Train Model

In [22]:
# prep sample val set
val_df = pd.read_csv(val_path)
val_df.head()

Unnamed: 0,product1,product2,edge
0,b00mmzfrhw,b01mzyoj76,1
1,b014lrkdwm,b071hb4bpr,1
2,b005l8vf3w,b00rkbb94s,1
3,b01es8uwfm,b00bd8i3ei,1
4,b00zw80dt8,b01eaiv5h4,1


In [23]:
val_df.shape

(1440998, 3)

In [24]:
SAMPLE_PROP = 0.1
val_samp = val_df.sample(frac=SAMPLE_PROP, replace=False)

In [25]:
val_samp.shape

(144100, 3)

In [26]:
word2id_func = np.vectorize(sequences.get_product_id)
val_samp['product1_id'] = word2id_func(val_samp['product1'].values)
val_samp['product2_id'] = word2id_func(val_samp['product2'].values)
val_samp = val_samp[(val_samp['product1_id'] > -1) & (val_samp['product2_id'] > -1)]  # Keep those with valid ID
print('No. of validation samples: {}'.format(val_samp.shape[0]))

No. of validation samples: 144100


In [27]:
product1_id = val_samp['product1_id'].values
product2_id = val_samp['product2_id'].values

In [28]:
print('Device: {}, emb_dim: {}, epochs: {}, initial_lr: {}'.format(device, emb_dim, epochs, initial_lr))

Device: cuda, emb_dim: 128, epochs: 5, initial_lr: 0.01


In [33]:
# Initialize model
mf = MF(sequences.n_unique_tokens, emb_dim)
mf = mf.to(device)

Model initialized: MF(
  (embedding): Embedding(527314, 128)
  (sig): Sigmoid()
  (bce): BCELoss()
)


In [34]:
optimizer = optim.Adam(mf.parameters(), lr=initial_lr)

In [None]:
results = []
start_time = datetime.datetime.now()
for epoch in range(epochs):
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(dataloader))
    running_loss = 0
    
    for i, batches in enumerate(dataloader):
        product1 = batches[0].to(device)
        product2 = batches[1].to(device)
        label = batches[2].to(device)
        
        optimizer.zero_grad()
        
        pred = mf.forward(product1, product2)
        loss = mf.loss(pred, label)
        loss.backward()
        optimizer.step()
        
        scheduler.step()
        running_loss = running_loss * 0.9 + loss.item() * 0.1
        
        if (i > 0) and (i % 1000 == 0):
            with torch.no_grad():
                pred = mf.forward(torch.LongTensor(product1_id).to(device),
                                  torch.LongTensor(product2_id).to(device))
                
                score = roc_auc_score(val_samp['edge'], pred.detach().cpu().numpy())
       
        
            print("Epoch: {}, Seq: {:,}/{:,}, " \
                  "Loss: {:.4f}, AUC-ROC: {:.4f}, Lr: {:.6f}".format(epoch, i, len(dataloader), running_loss,
                                                                           score, optimizer.param_groups[0]['lr']))
            results.append([epoch, i, running_loss, score])
            running_loss = 0
     
    # save model
    current_datetime = datetime.datetime.now().strftime('%Y-%m-%d-%H%M')
    state_dict_path = ',,/models/mf_epoch_{}_{}.pt'.format(MODEL_PATH, epoch, current_datetime)
    torch.save(mf.state_dict(), state_dict_path)
    logger.info('Model state dict saved to {}'.format(state_dict_path))
    
end_time = datetime.datetime.now()               

Epoch: 0, Seq: 1,000/145,306, Loss: 0.7769, AUC-ROC: 0.5498, Lr: 0.009998
Epoch: 0, Seq: 2,000/145,306, Loss: 0.7175, AUC-ROC: 0.5775, Lr: 0.009994
