# Knowledge Graph

To build a knowledge graph, different relationships between products and stores are extracted from the CTC datasets. The current focus is to create a dense representation of the data as a high-quality knowledge graph. The graph will later be converted into embeddings. 

## Initializing Notebook

In [2]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
import torch
from torchkge.data_structures import KnowledgeGraph




import torch
from ignite.engine import Engine, Events
from ignite.handlers import EarlyStopping
from ignite.metrics import RunningAverage
from torch.optim import Adam

from torchkge.evaluation import LinkPredictionEvaluator
from torchkge.models import TransRModel
from torchkge.models import TransEModel
from torchkge.sampling import BernoulliNegativeSampler
from torchkge.utils import MarginLoss, DataLoader
from torchkge.utils.datasets import load_fb15k
from torchkge.data_structures import KnowledgeGraph
from tqdm.autonotebook import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the cleaned basket data; the 'Unnamed: 0' column is dropped right after reading.
basket_path = "clean_data/cleaned_basket.csv"
df_basket = pd.read_csv(basket_path).drop(columns=['Unnamed: 0'])

# Cast the id columns to int and keep the result. The earlier
# `.values.astype(int, copy=False)` calls returned arrays that were never
# assigned back, so the columns kept whatever dtype read_csv inferred.
# astype raises if an id is missing or not integral.
df_basket = df_basket.astype({"product_num": int, "store_num": int})
df_basket.dtypes

In [18]:
# Product master data; the 'Unnamed: 0' column is removed after reading.
products_path = "clean_data/cleaned_products.csv"
df_products = pd.read_csv(products_path)
df_products = df_products.drop(columns=['Unnamed: 0'])


## Sources and Relevant Links

https://www.kaggle.com/code/nageshsingh/build-knowledge-graph-using-python

https://aws-dglke.readthedocs.io/en/latest/kg.html 

https://arxiv.org/abs/2107.07842 

Idea:
- Nodes: products, stores, and hierarchical categories 
- Relationships (Edges): "bought together" (using basket data), "sold in" (store data), "part of"/"subcategory of" (product hierarchical data) 


Product Knowledge Graph Embedding for E-commerce: https://arxiv.org/pdf/1911.12481.pdf 



# 1. Extract Relationships for the Knowledge Graph

In [4]:
def bought_together_pairs(df,groupby,target_column):
    """Build "bought together" triples from rows grouped by ``groupby``.

    Within each group the distinct values of ``target_column`` are collected
    and every unordered pair of them is emitted once, earlier value first.
    Groups with a single distinct value contribute nothing.

    Returns a list of ``[source, target, "bought together"]`` lists.
    """
    triples = []
    for _, rows in df.groupby(groupby):
        members = rows[target_column].unique()
        triples.extend(
            [members[first], members[second], "bought together"]
            for first in range(len(members))
            for second in range(first + 1, len(members))
        )
    return triples

### Extract relationships from the Basket Data 

The basket data allows us to see which products have been bought together. Thus, we can extract relationships of the form `[product A, product B, "bought together"]`. The triplets are stored in the list called bought_together. Extracting these relationships can take some time, so the data has been pickled and the extraction does not need to run every time the notebook is executed.

In [24]:
#specify "product" in front of product nums and "store" in front of store nums for clarity
# Values that already carry the prefix are returned unchanged, so re-running this
# cell (or running cells out of order) does not produce "product product 123".
df_basket['product_num'] = df_basket['product_num'].apply(
    lambda x: x if str(x).startswith('product ') else "{}{}".format('product ', x))
df_basket['store_num'] = df_basket['store_num'].apply(
    lambda x: x if str(x).startswith('store ') else "{}{}".format('store ', x))


In [None]:
# Preview the first rows only instead of rendering the whole frame.
df_basket.head()

Extracting the "bought together" relationship from the basket data takes a while to run. We created a pickle data file to load instead.

In [7]:

# Disabled cell: the code below sits inside a triple-quoted string, so the cell only
# evaluates a string and nothing in it runs. Removing the quotes rebuilds
# `bought_together` from df_basket instead of relying on the pickle.
'''
bought_together = bought_together_pairs(df_basket,"basket_id","product_num") #source,target
len(bought_together)
'''


'\nbought_together = bought_together_pairs(df_basket,"basket_id","product_num") #source,target\nlen(bought_together)\n'

In [8]:

# Disabled cell (string literal, not executed): would write `bought_together`
# to embeddings/pickle_bought_together.data.
'''
#pickle the data
with open('embeddings/pickle_bought_together.data', 'wb') as f:
        pickle.dump(bought_together, f)

'''


"\n#pickle the data\nwith open('embeddings/pickle_bought_together.data', 'wb') as f:\n        pickle.dump(bought_together, f)\n\n"

In [16]:
#open the pickled data and save
# The context manager closes the file even when unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open('embeddings/pickle_bought_together.data','rb') as infile:
    bought_together = pickle.load(infile)

### Extract relationships from Product Standard Data

The product standard dataset will be filtered down to the products which appear in the basket data. Three relationships will be extracted to capture products that are part of categories, and categories that are subcategories of larger ones.

[ctr_product_num, merch_bus_cat_nm, "part of"]

[merch_bus_cat_nm, merch_lob_nm, "subcategory of"] 

[merch_lob_nm, merch_division_nm, "subcategory of"] 

In [None]:
# Preview the first rows only instead of rendering the whole frame.
df_products.head()

In [19]:
#add the word "product" before product num (differentiate from store nums later on)
# Already-prefixed values are returned unchanged so the cell can be re-run safely.
df_products['ctr_product_num'] = df_products['ctr_product_num'].apply(
    lambda x: x if str(x).startswith('product ') else "{}{}".format('product ', x))

In [None]:
#keep only the products whose id also occurs in the basket data
df_products_basket = df_products.loc[
    df_products["ctr_product_num"].isin(df_basket["product_num"].unique())
]
len(df_products_basket)

In [None]:
# Preview the first rows only instead of rendering the whole frame.
df_products_basket.head()

Modified the get_pairs function to include the relation that is being captured as an input parameter

In [21]:
def get_pairs_relation(df,groupby,target_column,relation):
    """Emit one ``[source, target, relation]`` triple per distinct target in each group.

    ``df`` is grouped by ``groupby``; the group key becomes the source and each
    distinct value of ``target_column`` inside the group becomes a target.

    Returns a list of ``[source, target, relation]`` lists.
    """
    return [
        [source, target, relation]
        for source, rows in df.groupby(groupby)
        for target in rows[target_column].unique()
    ]

In [22]:
#[product num, business category, relation: "part of"]
prod_buscat = get_pairs_relation(df_products_basket, "ctr_product_num", "merch_bus_cat_nm", "part of")
prod_buscat[:10]  # preview only; avoids printing the whole list

[]

In [None]:
#[merch bus cat, merch lob nm, relation: "subcategory of"]
buscat_lob = get_pairs_relation(df_products_basket, "merch_bus_cat_nm", "merch_lob_nm", "subcategory of")
buscat_lob[:10]  # preview only; avoids printing the whole list

In [None]:
#[merch lob nm, merch division nm, relation: "subcategory of"]
lob_division = get_pairs_relation(df_products_basket, "merch_lob_nm", "merch_division_nm", "subcategory of")
lob_division[:10]  # preview only; avoids printing the whole list

### Extract store semantics 

In [18]:
# One "sold at" triple for every distinct store each product appears with in the basket data.
in_store = get_pairs_relation(
    df=df_basket,
    groupby="product_num",
    target_column="store_num",
    relation="sold at",
)

In [None]:
# Show a handful of triples; the bare name would print every element of the list.
in_store[:10]

# 2. Construct the Knowledge Graph

Combine all extracted relationships and put them into a dataframe for the KG. It must have three columns: "from", "to", and "rel" to describe pairs of nodes and their relationships.

Note: currently, the product and business category relation (prod_buscat) causes a strange error, so it is excluded from this version of the knowledge graph.

In [None]:
#Merge the extracted relation lists into a single list of triples.
#prod_buscat is deliberately left out of this version of the graph.
all_rels = [*bought_together, *buscat_lob, *lob_division, *in_store]
len(all_rels)

In [None]:
# Show a handful of triples; the bare name would print every element of the list.
all_rels[:10]

In [22]:
# Split the triples into three parallel lists:
# position 0 -> subject, position 1 -> object, position 2 -> relationship
source, target, relations = (
    [triple[position] for triple in all_rels] for position in range(3)
)

In [None]:
#one row per triple, with the column names 'from', 'to' and 'rel'
kg_df = pd.DataFrame.from_dict({'from': source, 'to': target, 'rel': relations})
kg_df

In [32]:
#split the df into train (60%), val (20%) and test (20%)
# Shuffle once with a fixed seed, then slice by position. np.split on a DataFrame
# goes through DataFrame.swapaxes, which newer pandas releases deprecate; iloc
# slicing yields the same three frames without that dependency.
kg_shuffled = kg_df.sample(frac=1, random_state=42)
train_end = int(.6*len(kg_df))
val_end = int(.8*len(kg_df))
df_train = kg_shuffled.iloc[:train_end]
df_val = kg_shuffled.iloc[train_end:val_end]
df_test = kg_shuffled.iloc[val_end:]

Creating the knowledge graph takes about 32 minutes. The train, val and test KGs have been pickled for easier future use. If you wish to redo the KG generation and pickling, run the graph-building cell below and uncomment the pickling cell that follows it.

In [33]:

# Turn into knowledge graph
# Build ONE graph from all triples and split it, rather than building three
# independent graphs. Each KnowledgeGraph(df=...) call computes its own
# entity/relation index maps when none are passed, so separately built
# train/val/test graphs do not share ids and a model trained on kg_train cannot
# be evaluated consistently on kg_val / kg_test.
# share=0.6 with validation=True asks for a train/val/test split with 60% in train.
# Pickles written from separately built graphs should be regenerated after this change.
torch.manual_seed(42)  # assumes split_kg draws its shuffle from torch's RNG — TODO confirm
kg_full = KnowledgeGraph(df=kg_df)
kg_train, kg_val, kg_test = kg_full.split_kg(share=0.6, validation=True)


In [None]:
# Disabled cell (string literal, not executed): would write kg_train, kg_val and
# kg_test to the three pickle files under embeddings/.
'''
#pickle the train, val, and test KGs
with open('embeddings/pickle_kg_train.data', 'wb') as f:
        pickle.dump(kg_train, f)

with open('embeddings/pickle_kg_val.data', 'wb') as f:
        pickle.dump(kg_val, f)

with open('embeddings/pickle_kg_test.data', 'wb') as f:
        pickle.dump(kg_test, f)

'''

The pickles could take a few minutes to open

In [6]:
#open the pickles
# Each file is opened in a context manager so it is closed even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
# These assignments replace any kg_train / kg_val / kg_test built earlier in the session.
with open('embeddings/pickle_kg_train.data','rb') as infile:
    kg_train = pickle.load(infile)

with open('embeddings/pickle_kg_val.data','rb') as infile:
    kg_val = pickle.load(infile)

with open('embeddings/pickle_kg_test.data','rb') as infile:
    kg_test = pickle.load(infile)

# 3. Create Knowledge Graph Embedding using torchkge

https://torchkge.readthedocs.io/en/latest/

Training with Ignite, following https://torchkge.readthedocs.io/en/latest/tutorials/training.html



https://kge-tutorial-ecai2020.github.io/ECAI-20_KGE_tutorial.pdf


The conversion of knowledge graphs into embeddings is still in progress. The intuition is to choose a translational model and attempt link prediction between the nodes.

In [None]:
# Disabled cell: everything below is inside a triple-quoted string, so neither
# function is defined when the cell runs. Contents:
#   - process_batch: corrupts a batch with `sampler`, runs one optimizer step and
#     returns the loss value.
#   - linkprediction_evaluation: normalizes the model parameters, computes the
#     validation MRR on kg_val every `eval_epoch` epochs (0 otherwise), prints it and
#     tracks the best value on engine.state.best_mrr.
# NOTE(review): `eval_epoch` is not defined in any cell visible here; define it
# before re-enabling this code.
'''
def process_batch(engine, batch):
    h, t, r = batch[0], batch[1], batch[2]
    n_h, n_t = sampler.corrupt_batch(h, t, r)

    optimizer.zero_grad()

    pos, neg = model(h, t, r, n_h, n_t)
    loss = criterion(pos, neg)
    loss.backward()
    optimizer.step()

    return loss.item()


def linkprediction_evaluation(engine):
    model.normalize_parameters()

    loss = engine.state.output

    # validation MRR measure
    if engine.state.epoch % eval_epoch == 0:
        evaluator = LinkPredictionEvaluator(model, kg_val)
        evaluator.evaluate(b_size=256, verbose=False)
        val_mrr = evaluator.mrr()[1]
    else:
        val_mrr = 0

    print('Epoch {} | Train loss: {}, Validation MRR: {}'.format(
        engine.state.epoch, loss, val_mrr))

    try:
        if engine.state.best_mrr < val_mrr:
            engine.state.best_mrr = val_mrr
        return val_mrr

    except AttributeError as e:
        if engine.state.epoch == 1:
            engine.state.best_mrr = val_mrr
            return val_mrr
        else:
            raise e
'''

In [None]:
# Disabled cell (string literal, not executed): rebuilds kg_df and the
# train/val/test frames from the "bought together" pickle alone.
# NOTE(review): if re-enabled as written, `bought_together_pairs = pickle.load(infile)`
# rebinds the name of the bought_together_pairs() function defined earlier in this
# notebook; rename the variable first.
'''
infile = open('embeddings/pickle_bought_together.data','rb')
bought_together_pairs = pickle.load(infile)
infile.close()

# extract subject
source = [i[0] for i in bought_together_pairs]

# extract object
target = [i[1] for i in bought_together_pairs]

# state relationship
relations = ["bought together" for i in bought_together_pairs]

kg_df = pd.DataFrame({'from':source, 'to':target, 'rel':relations})

df_train, df_val, df_test = np.split(kg_df.sample(frac=1, random_state=42), [int(.6*len(kg_df)), int(.8*len(kg_df))])
'''

### Iteration 0: Bought Together KG

In [33]:
# Turn into knowledge graph
#kg_train = KnowledgeGraph(df=df_train)


In [34]:
#kg_val = KnowledgeGraph(df=df_val)
#kg_test = KnowledgeGraph(df=df_test)

In [35]:
#with open('embeddings/pickle_merch_pairs_kg_train.data', 'wb') as f:
#        pickle.dump(kg_train, f)

In [36]:
#with open('embeddings/pickle_merch_pairs_kg_val.data', 'wb') as f:
#        pickle.dump(kg_val, f)

In [37]:
#with open('embeddings/pickle_merch_pairs_kg_test.data', 'wb') as f:
#        pickle.dump(kg_test, f)

In [38]:
# Disabled cell (string literal, not executed): would load kg_train, kg_val and
# kg_test from the pickle_merch_pairs_kg_* files instead of the pickle_kg_* files.
'''
infile = open('embeddings/pickle_merch_pairs_kg_train.data','rb')
kg_train = pickle.load(infile)
infile.close()

infile = open('embeddings/pickle_merch_pairs_kg_val.data','rb')
kg_val = pickle.load(infile)
infile.close()

infile = open('embeddings/pickle_merch_pairs_kg_test.data','rb')
kg_test = pickle.load(infile)
infile.close()
'''

"\ninfile = open('embeddings/pickle_merch_pairs_kg_train.data','rb')\nkg_train = pickle.load(infile)\ninfile.close()\n\ninfile = open('embeddings/pickle_merch_pairs_kg_val.data','rb')\nkg_val = pickle.load(infile)\ninfile.close()\n\ninfile = open('embeddings/pickle_merch_pairs_kg_test.data','rb')\nkg_test = pickle.load(infile)\ninfile.close()\n"

In [38]:
# Disabled cell (string literal, not executed): would set the
# PYTORCH_CUDA_ALLOC_CONF environment variable to 'max_split_size_mb:512'.
'''
import os

# prevent memory issue
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = 'max_split_size_mb:512'
'''

'\nimport os\n\n# prevent memory issue\nos.environ["PYTORCH_CUDA_ALLOC_CONF"] = \'max_split_size_mb:512\'\n'

In [4]:

# Optimisation settings
n_epochs = 100    # number of passes of the training loop
b_size = 32768    # batch_size handed to the DataLoader
lr = 0.0005       # learning rate handed to Adam

# Model / loss settings
ent_emb_dim = 1000  # first dimension argument of TransRModel
rel_emb_dim = 3     # second dimension argument of TransRModel
margin = 0.5        # argument of MarginLoss


In [5]:

# Define the model and criterion
model = TransRModel(ent_emb_dim,rel_emb_dim, kg_train.n_ent, kg_train.n_rel)
criterion = MarginLoss(margin)

# Move everything to CUDA if available
use_gpu = torch.cuda.is_available()
if use_gpu:
    torch.cuda.empty_cache()
    model.cuda()
    criterion.cuda()

# Define the torch optimizer to be used
optimizer = Adam(model.parameters(), lr=lr, weight_decay=1e-5)

sampler = BernoulliNegativeSampler(kg_train)
# Only ask the DataLoader to place the dataset on the GPU when one exists, so the
# batches live on the same device as the model; previously use_cuda='all' was
# passed even on CPU-only machines.
dataloader = DataLoader(kg_train, batch_size=b_size, use_cuda='all' if use_gpu else None)



In [6]:

# Plain training loop: one optimizer step per batch, progress bar updated once per epoch.
iterator = tqdm(range(n_epochs), unit='epoch')
for epoch in iterator:
    epoch_loss = 0.0
    for batch in dataloader:
        heads, tails, rels = batch[0], batch[1], batch[2]
        # corrupted heads/tails produced by the negative sampler for this batch
        neg_heads, neg_tails = sampler.corrupt_batch(heads, tails, rels)

        optimizer.zero_grad()

        # score the true and corrupted triples, then step on the criterion's loss
        pos_scores, neg_scores = model(heads, tails, rels, neg_heads, neg_tails)
        batch_loss = criterion(pos_scores, neg_scores)
        batch_loss.backward()
        optimizer.step()

        epoch_loss += batch_loss.item()

    mean_loss = epoch_loss / len(dataloader)
    iterator.set_description(
        'Epoch {} | mean loss: {:.5f}'.format(epoch + 1, mean_loss))

# parameters are normalized once, after the final epoch
model.normalize_parameters()


Epoch 100 | mean loss: 2436.30601: 100%|██████████| 100/100 [4:38:47<00:00, 167.27s/epoch]


In [7]:
# Persist only the model's state_dict (not the model object) to the embeddings folder.
torch.save(model.state_dict(), "embeddings/transr_state_dict_version_2.pt")

In [8]:
# Display the model's repr.
model

TransRModel(
  (ent_emb): Embedding(143618, 1000)
  (rel_emb): Embedding(3, 3)
  (proj_mat): Embedding(3, 3000)
)

In [7]:
# Rebuild the architecture (it stays on the CPU here) and restore the saved weights.
# map_location="cpu" lets a checkpoint that was saved from a CUDA model load on a
# machine without a GPU, and matches the device of the freshly built model.
model = TransRModel(ent_emb_dim,rel_emb_dim, kg_train.n_ent, kg_train.n_rel)
model.load_state_dict(torch.load("embeddings/transr_state_dict_version_2.pt", map_location="cpu"))
model.eval()

TransRModel(
  (ent_emb): Embedding(143618, 1000)
  (rel_emb): Embedding(3, 3)
  (proj_mat): Embedding(3, 3000)
)

In [13]:
# Unpack the three values returned by get_embeddings(); going by the target names these
# are the entity embeddings, relation embeddings and projection matrices — TODO confirm order.
entity_emb,rel_emb, proj_mat = model.get_embeddings()

In [None]:
# Display the entity embeddings returned by get_embeddings().
entity_emb

In [None]:
# NOTE(review): there are no parentheses, so this only displays the attribute itself.
# If evaluate_dicts is a method, the cell shows the bound method object rather than
# calling it — TODO confirm what was meant to be inspected here.
kg_train.evaluate_dicts