In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2022/11/21 10:06
# @Author  : Wang Yujia
# @File    : auction_features_encoding.ipynb
# @Description : 为auction从'desc'列提取contextual features。目前采用的是`SentenceTransformer`


# 0. what for
1. auction的features/ setting info需要encoding
2. 只对`desc`进行encoding

# 1. preparations
## 1.1 全局设置

In [3]:
# Global configuration: embedding size plus the input/output CSV locations.
LENTH = 4      # desired encoding dimension (300 or 4?) -- depends on the downstream use

# small data
settings_small_NN_path = r"../data/small_settings_NN.csv"

# large data
settings_large_NN_path = r'E:\DATA\large_dta\large_settings_NN.csv'

# output path
prod_embedding_small_path = "../data/small_prod_embedding_"+str(LENTH)+".csv"
# Raw string, like the input path above: the Windows path contains backslashes
# (`\D`, `\l`) that are invalid escape sequences in a normal string literal.
prod_embedding_large_path = r"E:\DATA\large_dta\large_prod_embedding_"+str(LENTH)+".csv"
prod_embedding_small_path

'../data/small_prod_embedding_4.csv'

In [21]:
# Number of clusters
num_clusters = 4

# Desired dimensionality of the encoding (taken from the global LENTH setting)
new_dimension = LENTH

In [22]:
import sys
import time
import pandas as pd
import numpy as np
from pprint import pprint
import torch
from transformers import BertModel
from visdom import Visdom
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import gzip
import csv
import os
import random
import logging
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer, util, InputExample,evaluation,models

print(sys.path)
# Choose the device from what is actually available instead of hard-coding
# 'cuda': `device` is later passed to `encode(..., device=device)`, which would
# fail on a machine without a usable GPU.
cuda_available = torch.cuda.is_available()
device = 'cuda' if cuda_available else 'cpu'
print(cuda_available)

['D:\\PyCharm\\plugins\\python\\helpers-pro\\jupyter_debug', 'D:\\PyCharm\\plugins\\python\\helpers\\pydev', 'D:\\Desktop\\PROJ\\PAProj\\data_handler', 'D:\\Desktop\\PROJ\\PAProj', 'D:\\Anaconda\\python39.zip', 'D:\\Anaconda\\DLLs', 'D:\\Anaconda\\lib', 'D:\\Anaconda', '', 'D:\\Anaconda\\Lib\\site-packages', 'D:\\Anaconda\\Lib\\site-packages\\win32', 'D:\\Anaconda\\Lib\\site-packages\\win32\\lib', 'D:\\Anaconda\\Lib\\site-packages\\Pythonwin', 'D:\\Anaconda\\Lib\\site-packages\\IPython\\extensions', 'C:\\Users\\Administrator\\.ipython']
True


## 1.2 读取data
### 1.2.1 读取desc，不去重

In [23]:
# Load the small and large settings tables, then stack them (small first)
# under a fresh 0..n-1 index.
small_data_key = pd.read_csv(settings_small_NN_path, encoding="utf-8")
large_data_key = pd.read_csv(settings_large_NN_path, encoding="utf-8")
data = pd.concat((small_data_key, large_data_key), axis=0, ignore_index=True)

# Only the `desc` column is encoded; duplicates are kept on purpose.
prod = data["desc"]
print(f"一共有 *{prod.shape[0]}* 个商品item")

一共有 *1276* 个商品item


### 1.2.2 SentenceTransformer微调用到的data
1. training：AllNLI（2w个samples）
2. evaluating：STS benchmark

In [24]:
# AllNLI provides the sentences on which the PCA is computed.
nli_dataset_path = 'datasets/AllNLI.tsv.gz'

# The STS benchmark measures how much performance the dimensionality reduction costs.
sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'

# Download whichever archive is not on disk yet (AllNLI first, then STS).
for _local_path, _source_url in (
    (nli_dataset_path, 'https://sbert.net/datasets/AllNLI.tsv.gz'),
    (sts_dataset_path, 'https://sbert.net/datasets/stsbenchmark.tsv.gz'),
):
    if not os.path.exists(_local_path):
        util.http_get(_source_url, _local_path)


In [25]:
# Build the benchmark set: one InputExample per row of the STS 'test' split.
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as sts_file:
    eval_examples = [
        InputExample(
            texts=[record['sentence1'], record['sentence2']],
            label=float(record['score']) / 5.0,  # normalize the score to the range 0 ... 1
        )
        for record in csv.DictReader(sts_file, delimiter='\t', quoting=csv.QUOTE_NONE)
        if record['split'] == 'test'
    ]
len(eval_examples)

1379

In [26]:
# Read the unique sentences from the NLI dataset and shuffle them reproducibly.
#
# A dict is used as an insertion-ordered set: iterating a plain `set` of strings
# yields an order that depends on per-process hash randomization, so the
# "first N sentences" taken after shuffling would differ between kernel restarts
# even with a seeded shuffle.
NLI_SHUFFLE_SEED = 42

unique_sentences = {}
with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as f:
    reader = csv.DictReader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        unique_sentences[row['sentence1']] = None
        unique_sentences[row['sentence2']] = None
nli_sentences = list(unique_sentences)

# A dedicated, seeded generator keeps the shuffle deterministic without
# touching the global `random` state.
random.Random(NLI_SHUFFLE_SEED).shuffle(nli_sentences)
len(nli_sentences)

1196755

## 1.3 读取model
1. 读取的model包括
    - SentenceTransformer
    - PCA：用来降维SentenceTransformer的输出维度
    - 聚类用的kmeans和tSNE

In [27]:
# Pretrained sentence encoder whose output is later reduced to `new_dimension`.
model_2 = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# One shared seed so that every stochastic estimator below is reproducible
# (previously only t-SNE was seeded).
RANDOM_STATE = 23

# `random_state` only matters when PCA uses a randomized/arpack solver; it is
# harmless for the deterministic solvers.
pca = PCA(n_components=new_dimension, random_state=RANDOM_STATE)

tsne_model = TSNE(perplexity=10, n_components=2, init='pca', n_iter=250, random_state=RANDOM_STATE)
# KMeans initialization is random; fix it so cluster assignments are repeatable.
clustering_model = KMeans(n_clusters=num_clusters, random_state=RANDOM_STATE)

# 2. Encoding

## 2.1 全连接层（PCA）降维
1. 计算加这个FC层之前的模型表现

In [28]:
# Baseline: score the original (unreduced) model on the STS benchmark test
# examples, for comparison with the reduced-dimension model.
stsb_evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(
    eval_examples,
    name='sts-benchmark-test',
)

stsb_evaluator(model_2)

0.8203247283076371

2. 用training data计算PCA特征矩阵

In [29]:
# The PCA matrix is estimated from example sentence embeddings: the first 20k
# entries of the shuffled AllNLI sentence list are encoded and used for fitting.

time_start = time.time()  # start of the timed section

pca_train_sentences = nli_sentences[:20000]
train_embeddings = model_2.encode(pca_train_sentences, convert_to_numpy=True)

# Fit the PCA on the training embeddings and keep its component matrix
# (`fit` returns the fitted estimator itself).
pca_comp = np.asarray(pca.fit(train_embeddings).components_)

time_end = time.time()  # end of the timed section

print(pca_comp.shape)
print(time_end - time_start)

(4, 384)
3.08321475982666


3. 用PCA矩阵当做全连接层的权重

In [30]:
# Dense layer (no bias, identity activation) that maps the model's sentence
# embedding down to `new_dimension`, so the model emits reduced embeddings directly.
embedding_dim = model_2.get_sentence_embedding_dimension()
dense = models.Dense(
    in_features=embedding_dim,
    out_features=new_dimension,
    bias=False,
    activation_function=torch.nn.Identity(),
)
# The PCA component matrix becomes the weight of the linear layer.
pca_weight = torch.tensor(pca_comp)
dense.linear.weight = torch.nn.Parameter(pca_weight)


4. 加入现有的pretrained model中，并且计算现在的evaluation

In [31]:
# Append the PCA-initialized dense layer to the pretrained model.
model_2.add_module(name='dense', module=dense)

# Re-run the STS benchmark with the reduced embedding size.
stsb_evaluator(model_2)

0.49181313689977346

## 2.2 Generate and save

In [32]:
### Generate the embeddings for all product descriptions and time the run
time_start = time.time()  # start of the timed section
prod_sentences = list(prod)
prod_embedding = model_2.encode(prod_sentences, convert_to_numpy=True, device=device)
# Sanity check: one embedding row per description.
assert prod_embedding.shape[0] == prod.shape[0], "Wrong!"
time_end = time.time()  # end of the timed section
print("用时：",time_end - time_start)

用时： 0.46291613578796387


- save: 分别保存到两个csv中

In [33]:
# Put the embeddings in a frame and attach the source description as a `desc` column.
prod_embedding_df = pd.DataFrame(prod_embedding)
prod_embedding_df['desc'] = pd.DataFrame(prod)['desc']

# Rows were concatenated small-first, so the first `n_small_rows` rows belong
# to the small dataset and the remainder to the large one.
n_small_rows = small_data_key.shape[0]

small_prod = prod_embedding_df.iloc[:n_small_rows, :]
large_prod = prod_embedding_df.iloc[n_small_rows:, :]

# Write each part to its own CSV.
small_prod.to_csv(prod_embedding_small_path, header=True, index=False, encoding="utf-8")
large_prod.to_csv(prod_embedding_large_path, header=True, index=False, encoding="utf-8")

# 3. Test: clustering
做一下clustering来判断encoding效果，
详见 `auction_features_encoding_demo.ipynb`。