In [7]:
import pandas as pd
from torchtext.data import get_tokenizer
from torchtext.vocab import GloVe
from sklearn.preprocessing import LabelEncoder

from src.model.layers.ArticleEncoder import ArticleEncoder

In [8]:
# Load a small sample (the first 10 rows) of the article catalogue, keeping
# only the identifier, the two categorical columns and the free-text description.
eg_articles = pd.read_csv(
    "data/articles.csv",
    usecols=["article_id", "product_code", "product_type_no", "detail_desc"],
    nrows=10,
)
eg_articles.head()

Unnamed: 0,article_id,product_code,product_type_no,detail_desc
0,108775015,108775,253,Jersey top with narrow shoulder straps.
1,108775044,108775,253,Jersey top with narrow shoulder straps.
2,108775051,108775,253,Jersey top with narrow shoulder straps.
3,110065001,110065,306,"Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,306,"Microfibre T-shirt bra with underwired, moulde..."


In [9]:
# Text preprocessing resources used when building the article encoder:
# pretrained GloVe vectors ("6B" release, 100 dimensions) and torchtext's
# "basic_english" tokenizer.
# NOTE(review): presumably GloVe fetches/caches the vector files on first use — confirm.
global_vectors = GloVe(name="6B", dim=100)
tokenizer = get_tokenizer("basic_english")

In [10]:
# Swap the raw product codes for the integer labels produced by a LabelEncoder
# fitted on this sample. The fitted encoder is kept in `label_encoder`.
label_encoder = LabelEncoder()
eg_articles["product_code"] = label_encoder.fit_transform(eg_articles["product_code"])

In [11]:
# Build the article encoder for the sample frame: one text column
# ("detail_desc") and one categorical column ("product_code").
encoder = ArticleEncoder(
    # --- text inputs ---
    text_cols=["detail_desc"],
    tokenizer=tokenizer,
    pretrained_word_embedding=global_vectors,
    word_embedding_dim=100,
    max_words_per_text=66,
    # --- categorical inputs ---
    category_cols=["product_code"],
    category_embedding_dim=100,
    # One entry per categorical column: number of distinct values in that column.
    num_categories=[eg_articles["product_code"].nunique(dropna=False)],
    # --- remaining encoder hyper-parameters ---
    num_filters=32,
    window_size=2,
    query_vector_dim=16,
    dropout_proba=0.1,
    is_training=True,
)

In [12]:
# Smoke check: run the sample articles through the encoder and display the
# shape of the result.
# NOTE(review): if ArticleEncoder is a torch.nn.Module, calling `encoder(...)`
# instead of `.forward(...)` would also run registered hooks — confirm.
article_vectors = encoder.forward(articles=eg_articles)
article_vectors.shape

torch.Size([10, 32])