In [2]:
import argparse
import collections
import gzip
import html
import json
import os
import random
import re
import torch
from tqdm import tqdm
import numpy as np
from utils import set_device, load_json, load_plm, clean_text
from transformers import LlamaForCausalLM, LlamaTokenizer, LlamaConfig, AutoTokenizer, AutoModel,AutoModelForCausalLM
import pickle
from sklearn.decomposition import PCA

def load_data(args):
    """Read the item-to-feature mapping for the configured dataset.

    The file is looked up at ``<args.root>/<args.dataset>.item.json`` and
    parsed with ``load_json``.

    Args:
        args: Namespace-like object providing ``root`` and ``dataset``.

    Returns:
        Whatever ``load_json`` returns for that path.
    """
    json_name = f'{args.dataset}.item.json'
    return load_json(os.path.join(args.root, json_name))

def generate_text(item2feature, features):
    """Collect the cleaned text fields of every item.

    Args:
        item2feature: Mapping from item id (convertible with ``int``) to a
            metadata record that supports ``in`` and ``[]`` lookups.
        features: Metadata keys to extract, in the order they should appear.

    Returns:
        A list of ``[int(item_id), [cleaned_field, ...]]`` entries, one per
        item in iteration order. Keys missing from a record are skipped, so
        the inner list can be shorter than ``features``.
    """
    def _cleaned_fields(record):
        # clean_text() output is additionally stripped of surrounding whitespace.
        return [clean_text(record[key]).strip() for key in features if key in record]

    pairs = []
    for item_id in item2feature:
        fields = _cleaned_fields(item2feature[item_id])
        pairs.append([int(item_id), fields])
    return pairs

def preprocess_text(args):
    """Load the dataset's item metadata and turn it into cleaned text fields.

    Args:
        args: Namespace-like object providing ``root`` and ``dataset``.

    Returns:
        The list produced by ``generate_text``: one
        ``[item_id, [cleaned_title, cleaned_description]]`` entry per item
        (fields absent from an item's metadata are omitted).
    """
    print('Process text data: ')
    print(' Dataset: ', args.dataset)

    # Title and description are both embedded; use ['title'] alone for a
    # title-only variant.
    text_fields = ['title', 'description']
    return generate_text(load_data(args), text_fields)

def generate_item_embedding(args, item_text_list, tokenizer, model, word_drop_ratio=-1):
    """Encode every item's text fields with ``model`` and save them as ``.npy``.

    Each text field of an item is tokenized and passed through the model on
    its own. The token states are mean-pooled under the attention mask, and
    the pooled field vectors are averaged into one embedding per item. Row
    ``i`` of the saved matrix belongs to item id ``i``.

    Pooling is carried out in float32 regardless of the model's dtype, so the
    saved array is always float32. This is required for bfloat16 models,
    whose tensors cannot be converted with ``Tensor.numpy()``.

    Args:
        args: Namespace-like object providing ``root``, ``dataset``,
            ``plm_name``, ``max_sent_len`` and ``device``.
        item_text_list: Sequence of ``(item_id, [field_text, ...])`` pairs.
            Item ids are used as row indices, so they must cover
            ``0 .. len(item_text_list) - 1``.
        tokenizer: Callable with the Hugging Face tokenizer call signature;
            its result must offer ``.to(device)``, ``.input_ids`` and
            ``.attention_mask``.
        model: Callable taking ``input_ids`` and ``attention_mask`` and
            returning an object with ``last_hidden_state`` shaped
            ``(batch, seq_len, hidden)``.
        word_drop_ratio: If positive, each space-separated word is dropped
            independently with this probability before tokenization.

    Returns:
        The float32 NumPy array of shape ``(n_items, hidden)`` that was
        written to ``<args.root>/<dataset>.emb-<plm_name>-td.npy``.

    Raises:
        AssertionError: If some row index has no text assigned.
        ValueError: If an item has no text fields to embed.
    """
    print(f'Generate Text Embedding: ')
    print(' Dataset: ', args.dataset)

    items, texts = zip(*item_text_list)
    # Place each item's fields at the row given by its id.
    order_texts = [None] * len(items)
    for item, text in zip(items, texts):
        order_texts[item] = text
    missing = [idx for idx, text in enumerate(order_texts) if text is None]
    assert not missing, f'No text found for item ids: {missing[:10]}'

    if word_drop_ratio > 0:
        # Announce once instead of once per field of every item.
        print(f'Word drop with p={word_drop_ratio}')

    embeddings = []
    # Items are encoded one at a time; with larger batches zip(*batch) would
    # silently truncate to the item with the fewest fields.
    batch_size = 1
    for start in range(0, len(order_texts), batch_size):
        if (start + 1) % 100 == 0:
            print("==>", start + 1)
        batch = order_texts[start: start + batch_size]

        field_embeddings = []
        # zip(*batch) regroups the batch field-by-field (all titles, then all
        # descriptions, ...).
        for sentences in zip(*batch):
            sentences = list(sentences)
            if word_drop_ratio > 0:
                sentences = [
                    ' '.join(wd for wd in sent.split(' ') if random.random() > word_drop_ratio)
                    for sent in sentences
                ]
            encoded_sentences = tokenizer(sentences, max_length=args.max_sent_len,
                                          truncation=True, return_tensors='pt',
                                          padding="longest").to(args.device)
            with torch.no_grad():
                outputs = model(input_ids=encoded_sentences.input_ids,
                                attention_mask=encoded_sentences.attention_mask)

            # Pool in float32: half-precision sums over long sequences lose
            # accuracy, and bfloat16 has no NumPy equivalent.
            hidden = outputs.last_hidden_state.float()
            mask = encoded_sentences.attention_mask.unsqueeze(-1).to(hidden.dtype)
            # clamp keeps fully masked (e.g. empty) sentences at zero instead
            # of producing 0/0 = NaN.
            token_counts = mask.sum(dim=1).clamp(min=1)
            mean_output = (hidden * mask).sum(dim=1) / token_counts
            field_embeddings.append(mean_output.detach().cpu())

        if not field_embeddings:
            raise ValueError(f'Item {start} has no text fields to embed')
        embeddings.append(torch.stack(field_embeddings, dim=0).mean(dim=0))

    embeddings = torch.cat(embeddings, dim=0).numpy()
    print('Embeddings shape: ', embeddings.shape)

    file = os.path.join(args.root, args.dataset + '.emb-' + args.plm_name + "-td" + ".npy")
    np.save(file, embeddings)
    return embeddings


# def parse_args():
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--dataset', type=str, default='Instruments', help='Instruments / Arts / Games')
#     parser.add_argument('--root', type=str, default="")
#     parser.add_argument('--gpu_id', type=int, default=2, help='ID of running GPU')
#     parser.add_argument('--plm_name', type=str, default='llama')
#     parser.add_argument('--plm_checkpoint', type=str,
#                         default='')
#     parser.add_argument('--max_sent_len', type=int, default=2048)
#     parser.add_argument('--word_drop_ratio', type=float, default=-1, help='word drop ratio, do not drop by default')
#     return parser.parse_args()


# if __name__ == '__main__':
#     args = parse_args()
#     args.root = os.path.join(args.root, args.dataset)

#     device = set_device(args.gpu_id)
#     args.device = device

#     item_text_list = preprocess_text(args)

#     plm_tokenizer, plm_model = load_plm(args.plm_checkpoint)
#     if plm_tokenizer.pad_token_id is None:
#         plm_tokenizer.pad_token_id = 0
#     plm_model = plm_model.to(device)

#     generate_item_embedding(args, item_text_list,plm_tokenizer,
#                             plm_model, word_drop_ratio=args.word_drop_ratio)




In [6]:
# Local checkpoint directory; both loaders need trust_remote_code because the
# checkpoint ships its own tokenizer/model code.
model_path = '/root/autodl-tmp/qwen7b/'
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)

The model is automatically converting to bf16 for faster inference. If you want to disable the automatic precision, please manually add bf16/fp16/fp32=True to "AutoModelForCausalLM.from_pretrained".
Try importing flash-attention for faster inference...
Loading checkpoint shards: 100%|██████████| 8/8 [00:01<00:00,  5.48it/s]


In [7]:
# Reuse the EOS token for padding, then look at how a short prompt is split.
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): the recorded output is a plain token list, so the
# padding/truncation/tensor options do not appear to affect tokenize().
tokenizer.tokenize(
    'who am i',
    max_length=100,
    truncation=True,
    return_tensors='pt',
    padding="longest",
)

[b'who', b' am', b' i']

In [12]:
# NOTE(review): incomplete attribute access (looks like a leftover from tab-completion); this cell is a SyntaxError if executed.
tokenizer.

In [15]:
# NOTE(review): called without arguments this raises TypeError (see the recorded output); pad() requires `encoded_inputs`.
tokenizer.pad()

TypeError: PreTrainedTokenizerBase.pad() missing 1 required positional argument: 'encoded_inputs'

In [10]:
# Check how the literal string '</s>' is split into tokens.
tokenizer.tokenize('</s>')

['</s>']

In [10]:
# numpy is already imported in the first cell; the re-import is harmless.
import numpy as np
# Load precomputed item embeddings from an absolute path. The file name follows the
# '<dataset>.emb-<plm_name>-td.npy' pattern that generate_item_embedding writes.
embeddings = np.load('/root/autodl-tmp/data/fashion/fashion.emb-llama-td.npy')

In [20]:
from sklearn.manifold import LocallyLinearEmbedding
# Reduce the embeddings to 768 components with locally linear embedding (30 neighbours).
# The input is first reshaped to rows of width 4096.
lle = LocallyLinearEmbedding(n_neighbors=30,n_components=768)
lle_item_emb = lle.fit_transform(embeddings.reshape(-1,4096))

In [22]:
# Squared L2 norm of each LLE-reduced row.
(lle_item_emb*lle_item_emb).sum(-1)

array([0.00302897, 0.01810408, 0.01983337, ..., 0.90955505, 0.94262341,
       0.94928422], shape=(22660,))

In [23]:
# Squared L2 norm of each PCA-reduced row.
# NOTE(review): pca_item_emb is assigned in a cell that appears further down, so this cell relies on out-of-order execution.
(pca_item_emb*pca_item_emb).sum(-1)

array([0.00043163, 0.00042918, 0.00042901, ..., 0.00042995, 0.00043029,
       0.00043029], shape=(22660,), dtype=float32)

In [13]:
# Project the embeddings (reshaped to rows of width 4096) onto 768 principal components.
pca = PCA(n_components=768)
pca_item_emb = pca.fit_transform(embeddings.reshape(-1,4096))

In [4]:
# Persist the PCA-reduced embeddings.
# NOTE(review): the target path names the Instruments dataset while the visible load cell reads a fashion file — confirm which `embeddings` this was computed from.
np.save('/root/LETTER/data/Instruments/Instruments.emb-llama-td-pca.npy',pca_item_emb)

In [12]:
# Load the saved tensor, drop size-1 dimensions and view it as a NumPy array for inspection.
torch.load('/root/LETTER/RQ-VAE/ckpt/Instruments-32d-sasrec.pt').squeeze().detach().numpy()

array([[ 0.06546738,  0.07035364,  0.12352964, ..., -0.05386544,
        -0.01489884, -0.05661311],
       [-0.00936208,  0.0402041 , -0.2232153 , ..., -0.02281781,
        -0.0177921 ,  0.060487  ],
       [-0.10918454, -0.11901627, -0.17798027, ..., -0.07504256,
        -0.08569156,  0.06216739],
       ...,
       [ 0.2804811 ,  0.3421658 , -0.07818857, ..., -0.14269552,
         0.03339064,  0.03802712],
       [ 0.18306674,  0.27125368, -0.01843184, ..., -0.02817823,
         0.05040237, -0.00770262],
       [-0.0802355 ,  0.22407892,  0.2701178 , ...,  0.06522095,
         0.33446762, -0.08688183]], dtype=float32)

In [3]:
import subprocess

# Source the proxy script in a bash subshell, list the resulting environment
# entries that mention "proxy", and copy them into this process's environment.
result = subprocess.run(
    'bash -c "source /etc/network_turbo && env | grep proxy"',
    shell=True,
    capture_output=True,
    text=True,
)
output = result.stdout
for line in output.splitlines():
    # Split on the first '=' only; lines without one are ignored.
    var, sep, value = line.partition('=')
    if sep:
        os.environ[var] = value

In [None]:
# NOTE(review): `args` is not defined anywhere in the visible notebook (parse_args is commented out) — it must be created before this cell.
# Re-running this cell appends the dataset name to args.root again.
args.root = os.path.join(args.root, args.dataset)

# NOTE(review): the device is kept in a local name only; generate_item_embedding reads args.device.
device = set_device(0)

# List of [item_id, [cleaned text fields]] entries for the dataset.
item_text_list = preprocess_text(args)


In [None]:
# Smoke test: push one short prompt through the model and request the hidden
# states of every layer.
sentences = '1'
encoded_sentences = tokenizer(sentences, max_length=2048,
                              truncation=True, return_tensors='pt',
                              padding="longest").to(model.device)
with torch.no_grad():
    # The keyword is `output_hidden_states`; the previous spelling
    # `output_last_hidden_states` is not a recognised forward() argument.
    outputs = model(input_ids=encoded_sentences.input_ids,
                    attention_mask=encoded_sentences.attention_mask,
                    output_hidden_states=True)

In [43]:

# Use '<|endoftext|>' as the padding token so tokenizer calls with padding="longest" have a pad token.
# NOTE(review): 151643 is presumably that token's id in this tokenizer's vocabulary — confirm.
tokenizer.pad_token = '<|endoftext|>'
tokenizer.pad_token_id = 151643

In [49]:
# Forward pass without gradient tracking, asking the model to return its hidden states.
with torch.no_grad():
    outputs = model(input_ids=encoded_sentences.input_ids,
                    attention_mask=encoded_sentences.attention_mask,output_hidden_states=True)

In [57]:
# Last entry of hidden_states, first sequence, first token position.
outputs.hidden_states[-1][0,0,:]

tensor([ 0.2188,  3.4844,  3.7500,  ..., -3.7656,  0.6797, -2.2969],
       device='cuda:0', dtype=torch.bfloat16)