In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2022/11/21 10:06
# @Author  : Wang Yujia
# @File    : auction_features_encoding.ipynb
# @Description : Extract contextual features for auctions from the 'desc' column. Currently uses `SentenceTransformer`.


# 0. what for
1. auction的features/ setting info需要encoding
2. 只对`desc`进行encoding

# 1. preparations
## 1.1 全局设置

In [1]:
# Input CSVs: the auction keys used to select products, and the id -> desc table.
data_key_path, prod_id_path = (
    "../data/target_datakey.csv",
    "../data/prod_id.csv",
)

# Destination CSV for the product embeddings.
prod_embedding_output_path = "../data/prod_embedding.csv"


In [3]:
# Desired embedding dimension after reduction, and the number of clusters
# used by the k-means model.
new_dimension, num_clusters = 300, 4

In [4]:
import sys
import time
import pandas as pd
import numpy as np
from pprint import pprint
import torch
from transformers import BertModel
from visdom import Visdom
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import gzip
import csv
import os
import random
import logging
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer, util, InputExample,evaluation,models

print(sys.path)
# Pick the GPU only when CUDA is actually available; otherwise fall back to
# the CPU so that `encode(..., device=device)` does not fail on GPU-less hosts.
cuda_available = torch.cuda.is_available()
device = 'cuda' if cuda_available else 'cpu'
print(cuda_available)

['C:\\Program Files\\JetBrains\\PyCharm 2022.2.3\\plugins\\python\\helpers-pro\\jupyter_debug', 'C:\\Program Files\\JetBrains\\PyCharm 2022.2.3\\plugins\\python\\helpers\\pydev', 'D:\\Desktop\\PROJ\\PAProj\\data_handler', 'D:\\Desktop\\PROJ\\PAProj', 'D:\\Anaconda\\envs\\py39\\python39.zip', 'D:\\Anaconda\\envs\\py39\\DLLs', 'D:\\Anaconda\\envs\\py39\\lib', 'D:\\Anaconda\\envs\\py39', '', 'D:\\Anaconda\\envs\\py39\\lib\\site-packages', 'D:\\Anaconda\\envs\\py39\\lib\\site-packages\\win32', 'D:\\Anaconda\\envs\\py39\\lib\\site-packages\\win32\\lib', 'D:\\Anaconda\\envs\\py39\\lib\\site-packages\\Pythonwin']
True


## 1.2 读取data
### 1.2.1 读取id和对应的desc

In [5]:
# Load the auction keys and the complete id -> desc table.
data_key = pd.read_csv(data_key_path, encoding="utf-8")
prod_id_all = pd.read_csv(prod_id_path, encoding="utf-8")

# Keep only the products whose id occurs in the auction keys and renumber
# the remaining rows from 0.
in_data_key = prod_id_all['id'].isin(data_key['id'])
prod_id = prod_id_all[in_data_key].reset_index(drop=True)

print(prod_id.head(10))
print(prod_id.shape)

   id                                           desc
0   0  Sony Ericsson S500i Unlocked Mysterious Green
1   1               PSP Slim & Lite Sony Piano Black
2   2     iPod Touch Apple 8GB with Software Upgrade
3   3      Logitech Cordless Wave Keyboard and Mouse
4   4   Apple Macbook Air 1.6GHz Core 2 Duo Notebook
5   5                     SanDisk Cruzer Contour 4GB
6   6           Mario Kart with Wheel (Nintendo Wii)
7   7      PS3 | Playstation 3 Sony Console 40GB HDD
8   8                    DS | Nintendo DS Lite White
9  11            Corsair Voyager Mini 4 GB USB Flash
(907, 2)


### 1.2.2 SentenceTransformer微调用到的data
1. training：AllNLI（取其中 20,000 个句子，用于拟合PCA）
2. evaluating：STS benchmark

In [7]:
# AllNLI is the source of sentences used to compute the PCA.
nli_dataset_path = 'datasets/AllNLI.tsv.gz'

# The STS benchmark shows how much performance the dimensionality reduction costs.
sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'

# Download each archive only when it is not already present locally.
for dataset_url, dataset_path in (
    ('https://sbert.net/datasets/AllNLI.tsv.gz', nli_dataset_path),
    ('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path),
):
    if not os.path.exists(dataset_path):
        util.http_get(dataset_url, dataset_path)


In [8]:
# Build the benchmark pairs from the rows of the STS file marked as 'test'.
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as sts_file:
    eval_examples = [
        InputExample(
            texts=[row['sentence1'], row['sentence2']],
            label=float(row['score']) / 5.0,  # normalise the score to 0 ... 1
        )
        for row in csv.DictReader(sts_file, delimiter='\t', quoting=csv.QUOTE_NONE)
        if row['split'] == 'test'
    ]
len(eval_examples)

1379

In [9]:
# Read the unique sentences of the NLI dataset.
# A dict (insertion-ordered) is used instead of a set: the iteration order of
# a set of strings changes between interpreter runs because of hash
# randomisation, which would make the shuffled list irreproducible even with
# a fixed seed.
unique_sentences = {}
with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as f:
    reader = csv.DictReader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        unique_sentences[row['sentence1']] = None
        unique_sentences[row['sentence2']] = None
nli_sentences = list(unique_sentences)

# Shuffle with a dedicated, seeded generator so that the leading slice of
# this list (used to fit the PCA) is the same on every run.
nli_shuffle_seed = 42
random.Random(nli_shuffle_seed).shuffle(nli_sentences)
len(nli_sentences)

1196755

## 1.3 读取model
1. 读取的model包括
    - SentenceTransformer
    - PCA：用来降维SentenceTransformer的输出维度
    - 聚类用的kmeans和tSNE

In [10]:
# Pretrained sentence encoder.
model_2 = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# PCA that reduces the encoder output to `new_dimension` components.
pca = PCA(n_components=new_dimension)

# Models for the clustering check. KMeans is seeded with the same value as
# TSNE so that cluster assignments are reproducible across runs.
tsne_model = TSNE(perplexity=10, n_components=2, init='pca', n_iter=250, random_state=23)
clustering_model = KMeans(n_clusters=num_clusters, random_state=23)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

# 2. Encoding

## 2.1 全连接层（PCA）降维
1. 计算加这个FC层之前的模型表现

In [11]:
# Baseline: score the original model on the STS benchmark pairs before any
# extra layer is added.
stsb_evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(
    eval_examples,
    name='sts-benchmark-test',
)

baseline_score = stsb_evaluator(model_2)
baseline_score

0.8203247283076371

2. 用training data计算PCA特征矩阵

In [12]:
# The PCA matrix is determined from example sentence embeddings: here the
# first 20k entries of the shuffled AllNLI sentence list.
time_start = time.time()  # start of the timed section

pca_train_sentences = nli_sentences[:20000]
train_embeddings = model_2.encode(pca_train_sentences, convert_to_numpy=True)

# Fit the PCA on the training embeddings and keep its component matrix.
pca.fit(train_embeddings)
pca_comp = np.asarray(pca.components_)

time_end = time.time()  # end of the timed section

# Shape of the component matrix, then the elapsed seconds.
print(pca_comp.shape)
print(time_end - time_start)

(300, 384)
8.884342193603516


3. 用PCA矩阵当做全连接层的权重

In [13]:
# Dense layer that maps the model's sentence embedding to `new_dimension`
# outputs: no bias, identity activation.
dense = models.Dense(
    in_features=model_2.get_sentence_embedding_dimension(),
    out_features=new_dimension,
    bias=False,
    activation_function=torch.nn.Identity(),
)
# Use the PCA component matrix as the weight of the linear layer.
pca_weight = torch.tensor(pca_comp)
dense.linear.weight = torch.nn.Parameter(pca_weight)


4. 加入现有的pretrained model中，并且计算现在的evaluation

In [14]:
# Register the dense layer on the pretrained model under the name 'dense'.
model_2.add_module('dense', dense)

# Score the model again, now producing the reduced embedding size.
reduced_score = stsb_evaluator(model_2)
reduced_score

0.8234648135739334

## 2.2 Generate and save

In [15]:
# Encode every product description and time the call.
time_start = time.time()  # start of the timed section
prod_descriptions = prod_id['desc'].tolist()
prod_embedding = model_2.encode(prod_descriptions, convert_to_numpy=True, device=device)
# One embedding row is expected per product.
assert len(prod_embedding) == len(prod_id), "Wrong!"
time_end = time.time()  # end of the timed section
print(time_end - time_start)

0.49586009979248047


In [16]:
# Wrap the embedding matrix in a DataFrame and append the product id and
# description as two extra columns (pandas matches them by index label).
prod_embedding_df = pd.DataFrame(prod_embedding).assign(
    id=prod_id['id'],
    desc=prod_id['desc'],
)

# Write the table, with a header row and without the index, to the output path.
prod_embedding_df.to_csv(
    prod_embedding_output_path,
    header=True,
    index=False,
    encoding="utf-8",
)
prod_embedding_df.head(10), prod_embedding_df.shape

(          0         1         2         3         4         5         6  \
 0  0.113677  0.015587 -0.017459 -0.083209  0.009277  0.015455 -0.033741   
 1  0.065721  0.028872  0.015985  0.029794 -0.078768  0.032856  0.038300   
 2 -0.050416 -0.074042  0.113132  0.051390 -0.042346  0.152172  0.020443   
 3  0.002531  0.022662  0.008649  0.080328  0.002015 -0.072503  0.164256   
 4  0.055621  0.064637  0.025927 -0.000543 -0.030083 -0.003895  0.078841   
 5  0.010668 -0.015149  0.060915 -0.013527  0.084374  0.049732 -0.058918   
 6  0.089104  0.036358  0.036312  0.129133  0.062260  0.032078  0.037145   
 7 -0.010504  0.005163  0.018896 -0.063805  0.032434  0.026571  0.008288   
 8  0.003754  0.086591  0.012458  0.058575 -0.017852 -0.036281 -0.030097   
 9 -0.028825  0.080478  0.153809  0.014118  0.004908 -0.027036  0.037114   
 
           7         8         9  ...       292       293       294       295  \
 0 -0.033269 -0.056256 -0.050112  ...  0.085058 -0.000890  0.028617  0.019562   


# 3. Test: clustering
做一下clustering来判断encoding效果
see `auction_features_encoding_demo.ipynb`