In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel, BertConfig
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

  from .autonotebook import tqdm as notebook_tqdm


## Init

In [10]:
device = "cuda" if torch.cuda.is_available() else "cpu"
configuration = BertConfig()
bert_model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True).to(device)
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', use_fast=True)
args = {
        "device" : device,
        "data_dir" : r'../data/filtered_reviews_with_split.pkl',
        "data_chunks_dir" : r'../data/chunks',
        "emb_dim" : 768,
        "max_word" : 25,
        "max_sentence" : 10,
        "max_review_user" : 10,
        "max_review_item" : 30,
        "epoch" : 5,
        "batch_size": 32,
        "bert_configuration" : configuration,
        "bert_model" : bert_model,
        "bert_tokenizer" : bert_tokenizer
    }

print("Device: ",device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Device:  cuda


## Read Pickle

In [11]:
"""
Required dataframe format:
[column name]       [dtype]
AppID               int
UserID              int
Like                int
Review              String
"""
data = pd.read_pickle(r'../data/review_data_casual.pkl')
# Change dtype of columns in df
data.reset_index(drop=True, inplace=True)
data["UserID"] = data["UserID"].astype("int64")
data["AppID"] = data["AppID"].astype(int)
data["Like"] = data["Like"].astype(int)
"""
TODO dataframe format:
[column name]       [dtype]
SplitReview         list
LDA_group           list
SplitReview_emb     np.array
"""
# TODO Columns
data["SplitReview"] = ""
data["SplitReview_emb"]=""
data["LDA_group"]=""

print(data.dtypes)
print("Once you've prepared the data, press {Run All} will do the megic")
data

AppID               int32
UserID              int64
Like                int32
Review             object
SplitReview        object
SplitReview_emb    object
LDA_group          object
dtype: object
Once you've prepared the data, press {Run All} will do the megic


Unnamed: 0,AppID,UserID,Like,Review,SplitReview,SplitReview_emb,LDA_group
0,945360,76561197996720254,1,[h1]We're all sus![/h1]\n\n[b][u]PROS[/u]:[/b]...,,,
1,945360,76561198133726836,1,buy this on steam for free fortnite skin,,,
2,945360,76561198271396832,1,honesty gets you voted off when you're not eve...,,,
3,945360,76561198083568932,0,"An international sensation, yet receives updat...",,,
4,945360,76561198123845513,1,[h1] [b] Sus [/b] [/h1]\n\nOverall Review: Red...,,,
...,...,...,...,...,...,...,...
23873,559610,76561198069159152,1,So this game has been rumbling around a bit on...,,,
23874,559610,76561198304467388,1,Simple story yet heartful and warm love story ...,,,
23875,559610,76561198009282696,1,A little heartwarming story about two sisters....,,,
23876,559610,76561198120348301,1,Razz’s art style just keeps getting better and...,,,


In [12]:
# 確定 threshold 後檢查有多少個 App、User
app_reviews = data['AppID'].value_counts()
user_reviews = data['UserID'].value_counts()
app_reviews, user_reviews

(265890     209
 945360     201
 533300     201
 435400     192
 1794680    191
           ... 
 1106840     18
 606800      18
 761830      18
 1009560     17
 1224160     16
 Name: AppID, Length: 485, dtype: int64,
 76561198027267313    80
 76561198040884867    75
 76561198062813911    68
 76561197987731882    62
 76561198069159152    62
                      ..
 76561198041024658     6
 76561198086933786     6
 76561197985573260     6
 76561198048974498     6
 76561198353593495     6
 Name: UserID, Length: 1679, dtype: int64)

# LDA Grouping

### 1. Split every review to sentences.  

In [13]:
import re

def review_to_sentences(review):
    """
    split review into sentences contained by a list
    param: review (String)
    output: sentences (list of word)
    """
    sentences = review.splitlines()
    sentences = list(filter(None, sentences))
    tmp = []
    for sent in sentences:
        sent = re.split(r' *[\.\?!][\'"\)\]]* *', sent)
        tmp.extend(sent)
    # delete sentence less than specific number of words
    sentences = list(filter(lambda x:len(x.split())>=5, tmp))
    return sentences

In [14]:
list_split_sentences =  [review_to_sentences(review) for review in data["Review"]]
data["SplitReview"] = list_split_sentences
empty = [i for i, x in enumerate(data["SplitReview"]) if x ==[]] # Delete data whose splitReview is empty list
data.drop(empty, axis=0, inplace=True)
data.reset_index(drop=True, inplace=True)
data

Unnamed: 0,AppID,UserID,Like,Review,SplitReview,SplitReview_emb,LDA_group
0,945360,76561197996720254,1,[h1]We're all sus![/h1]\n\n[b][u]PROS[/u]:[/b]...,[[b][u]PROS[/u]:[/b] Among Us is a social dedu...,,
1,945360,76561198133726836,1,buy this on steam for free fortnite skin,[buy this on steam for free fortnite skin],,
2,945360,76561198271396832,1,honesty gets you voted off when you're not eve...,[honesty gets you voted off when you're not ev...,,
3,945360,76561198083568932,0,"An international sensation, yet receives updat...","[An international sensation, yet receives upda...",,
4,945360,76561198123845513,1,[h1] [b] Sus [/b] [/h1]\n\nOverall Review: Red...,"[[h1] [b] Sus [/b] [/h1], Overall Review: Red ...",,
...,...,...,...,...,...,...,...
22930,559610,76561198069159152,1,So this game has been rumbling around a bit on...,[So this game has been rumbling around a bit o...,,
22931,559610,76561198304467388,1,Simple story yet heartful and warm love story ...,[Simple story yet heartful and warm love story...,,
22932,559610,76561198009282696,1,A little heartwarming story about two sisters....,[A little heartwarming story about two sisters...,,
22933,559610,76561198120348301,1,Razz’s art style just keeps getting better and...,[Razz’s art style just keeps getting better an...,,


### 2. LDA Grouping

In [15]:
# Modify 趙儀's LDA part
def stemmer_with_delete_stopword(split_sentences):
    vectorizer = TfidfVectorizer(stop_words = "english")
    stop_list = list(vectorizer.get_stop_words())
    porter_stemmer = PorterStemmer()
    all_stem_sents=[]
    for review in split_sentences:
        review_stem_list = []
        for sent in review:
            sent_stem_list =[]
            for word in sent.split(" "):
                if len(word)>2:
                    if word not in stop_list:
                        sent_stem_list.append(porter_stemmer.stem(word))
            review_stem_list.append(sent_stem_list)
        all_stem_sents.append(review_stem_list) 
    return all_stem_sents

def LDAGrouping(reviews):
    all_sents = []
    for review in reviews:
        for sentence in review:
            all_sents.append(sentence)
    dictionary = corpora.Dictionary(all_sents)
    corpus = [dictionary.doc2bow(sent) for sent in all_sents]
    lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=5)
    group_results = []
    for sents in reviews:
        single_corpus = [dictionary.doc2bow(sent) for sent in sents]
        sents_group_result = []
        for scores in lda.inference(single_corpus)[0]:
            # scores.argmax()+1 --> Retain group:0 for no meaning sentences
            sents_group_result.append(scores.argmax()+1)
        group_results.append(sents_group_result)

    return group_results

def pad_and_trunc(group_results, *, max_sentence):
    #max number of sentences in a review
    result_list = []
    for i, result in enumerate(group_results):
        if len(result) >= max_sentence:
            result = result[:10]
        else:
            result.extend([0]*(max_sentence-len(result)))
        result_list.append(np.array(result).astype(int))
    return result_list

In [16]:
clean_reviews = stemmer_with_delete_stopword(data["SplitReview"].tolist())
group_list = LDAGrouping(clean_reviews) # Training might take a little bit time 
pad_group_list = pad_and_trunc(group_list, max_sentence=args["max_sentence"])
data["LDA_group"] = pad_group_list
data

Unnamed: 0,AppID,UserID,Like,Review,SplitReview,SplitReview_emb,LDA_group
0,945360,76561197996720254,1,[h1]We're all sus![/h1]\n\n[b][u]PROS[/u]:[/b]...,[[b][u]PROS[/u]:[/b] Among Us is a social dedu...,,"[5, 1, 1, 4, 4, 5, 3, 3, 5, 4]"
1,945360,76561198133726836,1,buy this on steam for free fortnite skin,[buy this on steam for free fortnite skin],,"[5, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,945360,76561198271396832,1,honesty gets you voted off when you're not eve...,[honesty gets you voted off when you're not ev...,,"[2, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,945360,76561198083568932,0,"An international sensation, yet receives updat...","[An international sensation, yet receives upda...",,"[1, 5, 1, 1, 3, 3, 0, 0, 0, 0]"
4,945360,76561198123845513,1,[h1] [b] Sus [/b] [/h1]\n\nOverall Review: Red...,"[[h1] [b] Sus [/b] [/h1], Overall Review: Red ...",,"[4, 2, 0, 0, 0, 0, 0, 0, 0, 0]"
...,...,...,...,...,...,...,...
22930,559610,76561198069159152,1,So this game has been rumbling around a bit on...,[So this game has been rumbling around a bit o...,,"[3, 1, 1, 1, 2, 2, 1, 5, 0, 0]"
22931,559610,76561198304467388,1,Simple story yet heartful and warm love story ...,[Simple story yet heartful and warm love story...,,"[2, 4, 0, 0, 0, 0, 0, 0, 0, 0]"
22932,559610,76561198009282696,1,A little heartwarming story about two sisters....,[A little heartwarming story about two sisters...,,"[2, 5, 4, 2, 5, 2, 1, 4, 2, 4]"
22933,559610,76561198120348301,1,Razz’s art style just keeps getting better and...,[Razz’s art style just keeps getting better an...,,"[4, 5, 5, 2, 2, 5, 5, 3, 4, 4]"


In [17]:
# Save the LDA grouping result
data.to_pickle(r"../data/filtered_reviews_group.pkl")

# Split Train Val Test

In [18]:
# Split train/val/test data by user case
train_df, val_df, test_df = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
tain_ratio = 0.7
val_ratio = 0.1
test_ratio = 0.2
for user in set(data["UserID"]):
    single_user_data = data[data["UserID"]==user]
    single_user_data_train = single_user_data[:int(len(single_user_data)*tain_ratio)]
    single_user_data_val = single_user_data[int(len(single_user_data)*tain_ratio):int(len(single_user_data)*(tain_ratio+val_ratio))]
    single_user_data_test = single_user_data[int(len(single_user_data)*(tain_ratio+val_ratio)):]
    train_df = pd.concat([train_df, single_user_data_train], axis=0)
    val_df = pd.concat([val_df, single_user_data_val], axis=0)
    test_df = pd.concat([test_df, single_user_data_test], axis=0)
len(train_df), len(val_df), len(test_df), len(train_df)+len(val_df)+len(test_df)

(15339, 2014, 5582, 22935)

In [20]:
# Saving three types of dataframe
train_df.reset_index(drop=True, inplace=True)
val_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)
train_df.to_pickle(r"../data/train_df.pkl")
val_df.to_pickle(r"../data/val_df.pkl")
test_df.to_pickle(r"../data/test_df.pkl")

# Bert Encode

### 1. Init Bert and encode methods

In [22]:
def padding_to_tagert_dimension(input_tensor, sent_len, word_len, word_dim):
    """
    Set input_tensor to specified dim with zero padding, and flatten it
    ex: [3, 25, 768] -> [10, 25, 768] -> [250, 768]
    """
    target_emb = torch.zeros(sent_len, word_len, word_dim)
    target_emb[:input_tensor.size(dim=0), :, :] = input_tensor
    target_emb = torch.flatten(target_emb, start_dim=0, end_dim=1)
    
    return target_emb

def bert_encode(review_split, args):
    """
    Encode splitted review to bert embedding
    return embedding of review padded with zero
    """
    emb_list = []
    for i, sentence in enumerate(review_split):
        if i == args["max_sentence"]: break
        sentence_encode = args["bert_tokenizer"](
            sentence,
            return_attention_mask = True,
            max_length = args["max_word"],
            truncation = True,
            padding = "max_length",
            return_tensors = 'pt'
            )
        for k,v in sentence_encode.items():
            sentence_encode[k] = v.to(args["device"])
        with torch.no_grad():
            outputs = args["bert_model"](**sentence_encode)
        sentence_emb = outputs[2][-1]
        emb_list.append(sentence_emb)
    review_emb = torch.cat(emb_list, 0)
    pad_review_emb = padding_to_tagert_dimension(review_emb, args["max_sentence"], args["max_word"], args["emb_dim"])
    return pad_review_emb

### 2. Encode splited sentences and save into multiple chunks of H5DF

In [25]:
# Saving emb for each "user"
def save_each_bert_emb(data, *, col_name, target):
    user_set_len = len(set(data[col_name]))
    for i, indie in enumerate(set(data[col_name])):
        print("執行%s進度: %d/%d\r"%(target, i, user_set_len), end="")
        user_data = data[data[col_name]==indie]
        for index, review in zip(user_data.index, user_data["SplitReview"]):
            review_emb = np.asarray(bert_encode(review, args))
            user_data.at[index, "SplitReview_emb"] = review_emb
            user_data[["SplitReview_emb", "LDA_group"]].to_pickle(f'../data/{target}_emb/{indie}.pkl')

In [26]:
# This step require a lot of disk storage. Please make sure that you have sufficient space.
save_each_bert_emb(data, col_name="UserID", target="user")

執行user進度: 1672/1673

In [28]:
# This step require a lot of disk storage. Please make sure that you have sufficient space.
#  Can't store all emb into one df, so it has to be run twice 
save_each_bert_emb(data, col_name="AppID", target="item")

執行item進度: 484/485

### 3. Show Bert Encode Result

In [44]:
# 76561198066129673 garbage reviewer
tmp = pd.read_pickle(r"../data/item_emb/3300.pkl")
torch.from_numpy(np.array(tmp["SplitReview_emb"].tolist())).size()

torch.Size([17, 250, 768])

# Matrix Factorization

In [43]:
mf_df = pd.read_pickle(r"../data/filtered_reviews_group.pkl")
mf_df = mf_df[['AppID', 'UserID', 'Like']]
mf_df['Interacted'] = 1
mf_df

Unnamed: 0,AppID,UserID,Like,Interacted
0,945360,76561197996720254,1,1
1,945360,76561198133726836,1,1
2,945360,76561198271396832,1,1
3,945360,76561198083568932,0,1
4,945360,76561198123845513,1,1
...,...,...,...,...
22930,559610,76561198069159152,1,1
22931,559610,76561198304467388,1,1
22932,559610,76561198009282696,1,1
22933,559610,76561198120348301,1,1


In [44]:
# Modify MF From 裴伯儀
def train_test_random_split(df):
    app_ids = list(df.columns)
    user_ids = list(df.index)
    user_set = {}
    for user_id in user_ids:
        interacted_items = [app_ids[idx] for idx in df.loc[user_id].values.nonzero()[0].tolist()]
        user_set[user_id] = interacted_items
    return user_set

def get_trainVector(df, user_set):
    business_ids = list(df.columns)
    user_ids = list(df.index)
    return [ [1 if business_id in user_set[user_id] else 0 for business_id in business_ids ] for user_id in user_ids]

def matrix_factorization(matrix, trainVector):
    # n_components is embedding dimension, vervose=1 shows the training process
    model = NMF(n_components=128, init='random', random_state=0, verbose=0)
    user_embeddings = model.fit_transform(trainVector*matrix.values)
    encoded_user_embeddings = np.asarray(user_embeddings, dtype=np.float32)
    app_embeddings = model.components_.T
    encoded_app_embeddings = np.asarray(app_embeddings.astype('float32'))
    user_id_emb = dict(zip(matrix.index, encoded_user_embeddings))
    app_id_emb = dict(zip(matrix.columns, encoded_app_embeddings))
    
    return user_id_emb, app_id_emb

### Construct Rating Matrix and Interaction Matrix

In [45]:
interaction_matrix = mf_df.pivot_table(index='UserID', columns='AppID', values='Interacted').fillna(0)
rating_matrix = mf_df.pivot_table(index='UserID', columns='AppID', values='Like').fillna(0)
user_set = train_test_random_split(interaction_matrix)
trainVector = torch.tensor(get_trainVector(interaction_matrix, user_set)).to(torch.float32)

### Train MF Model

In [46]:
user_id_emb, app_id_emb = matrix_factorization(rating_matrix, trainVector)
len(user_id_emb), len(app_id_emb), next(iter(user_id_emb.values())).shape, next(iter(app_id_emb.values())).shape

(1673, 485, (128,), (128,))

### Save MF Embedding

In [47]:
app_mf_df = pd.DataFrame()
app_mf_df["AppID"] = list(app_id_emb.keys())
app_mf_df["MF_emb"] = list(app_id_emb.values())
app_mf_df.to_pickle(r"../data/train_item_mf_emb.pkl")

user_mf_df = pd.DataFrame()
user_mf_df["UserID"] = list(user_id_emb.keys())
user_mf_df["MF_emb"] = list(user_id_emb.values())
user_mf_df.to_pickle(r"../data/train_user_mf_emb.pkl")

In [51]:
torch.reshape(torch.randn((32,50,250,768)), (1600,250,768)).size()

torch.Size([1600, 250, 768])