In [57]:
# 导入 pandas 处理数据
import pandas as pd
# 导入 tiktoken 计算待 embedding 的文本，所需的 token 开销
import tiktoken
# 导入 openai 的模型开发包，获取可用模型
import openai

In [58]:
input_datapath = "data/fine_food_reviews_1k.csv"
df = pd.read_csv(input_datapath, index_col=0)
df = df[["Time", "ProductId", "UserId", "Score", "Summary", "Text"]]
df = df.dropna()

# 合并 Text 和 Summary 到新列 combined
df['combined'] = ("Title: " + df.Summary.str.strip() + "；Content: " + df.Text.str.strip())

df.head(2)

Unnamed: 0,Time,ProductId,UserId,Score,Summary,Text,combined
0,1351123200,B003XPF9BO,A3R7JR3FMEBXQB,5,where does one start...and stop... with a tre...,Wanted to save some to bring to my Chicago fam...,Title: where does one start...and stop... wit...
1,1351123200,B003JK537S,A3JBPC3WFUT5ZP,1,Arrived in pieces,"Not pleased at all. When I opened the box, mos...",Title: Arrived in pieces；Content: Not pleased ...


In [59]:
# 模型名称 —— 该模型支持的输入 token 上限为 8191，向量维度是 1536
embedding_model = "text-embedding_ada_002"
# 使用的分词器名称
embedding_encoding = "cl100k_base"
# 设置过滤值，超过该值的文本丢弃
max_tokens = 8000

In [60]:
# 设置筛选的评论数量基数
top_n = 1000
# 基于 Time 列对 DF 进行排序，获取最新的两千条评论
df = df.sort_values("Time").tail(top_n * 2)
df.drop("Time", axis=1, inplace=True)
encoding = tiktoken.get_encoding(embedding_encoding)

df['n_tokens'] = df.combined.apply(lambda x: len(encoding.encode(x)))
df = df[df.n_tokens <= max_tokens].tail(top_n)
len(df)

1000

In [63]:
# 非必须步骤，可用已生成的向量。
df['embedding'] = df.combined.apply(lambda x: openai..get_embedding(x, engine= embedding_model))
output_datapath = "data/fine_food_receivews_with_embeddings_1k_2146.csv"
df.to_csv(output_datapath)

SyntaxError: invalid syntax (2219769993.py, line 2)

In [93]:
## 读取 embedding 后的数据。
df = pd.read_csv("data/fine_food_reviews_with_embeddings_1k.csv", index_col=0)
df["embedding"]

12     [-0.0005399271612986922, -0.004124758299440145...
13     [0.0068963742814958096, 0.0167608093470335, -0...
14     [-0.0023715533316135406, -0.021357767283916473...
15     [0.00226533692330122, 0.010306870564818382, 0....
16     [-0.027459919452667236, -0.009041198529303074,...
                             ...                        
447    [0.00796585250645876, 0.0017102764686569571, 0...
436    [0.001777207711711526, -0.011673098430037498, ...
437    [-0.005498920567333698, -0.014834611676633358,...
438    [-0.00294404081068933, -0.007058987859636545, ...
439    [-0.006043732166290283, -0.000693734094966203,...
Name: embedding, Length: 1000, dtype: object

In [95]:
import ast
df["embedding_vec"]  = df.embedding.apply(ast.literal_eval)

In [96]:
import numpy as np

In [97]:
import matplotlib as mpb

In [98]:
import sklearn

In [99]:
assert df.embedding_vec.apply(len).nunique() == 1

In [100]:
matrix = np.vstack(df.embedding_vec.values)

tsne = sklearn.manifold.TSNE(n_components=2, perplexity=15, random_state=42, init='random', learning_rate=200)