In [1]:
!pip install openai pandas backoff scikit-learn pyarrow 
!pip install faiss-gpu

[0m

# 让AI生成点实验数据

In [2]:
from openai import OpenAI

client = OpenAI()

COMPLETION_MODEL = "gpt-3.5-turbo"


In [3]:

def generate_data_by_prompt(prompt):
    response = client.chat.completions.create(
        model=COMPLETION_MODEL,
        messages=[
            {"role": "user", "content": prompt},
        ],
        max_tokens=2048,
        temperature=0.5,
        top_p=1,
    )
    return response.choices[0].message.content

prompt = "请你生成50条淘宝网里的商品的标题，每条在30个字左右，品类是3C数码产品，标题里往往也会有一些促销类的信息，每行一条。不要重复。"

data = generate_data_by_prompt(prompt)


In [4]:
import pandas as pd

product_names = data.strip().split('\n')
df = pd.DataFrame({'product_name': product_names})

df.head()

Unnamed: 0,product_name
0,1. 超值特惠！华为P40 Pro 5G全网通手机
1,2. 限时抢购！小米笔记本Air 13.3英寸轻薄本
2,3. 爆款推荐！Apple AirPods Pro降噪蓝牙耳机
3,4. 限时秒杀！戴尔XPS 15.6英寸轻薄笔记本
4,5. 优惠促销！华硕ROG游戏手机3全新上市


In [5]:
df.product_name = df.product_name.apply(lambda x: x.split('.')[1].strip())
df.head()

Unnamed: 0,product_name
0,超值特惠！华为P40 Pro 5G全网通手机
1,限时抢购！小米笔记本Air 13
2,爆款推荐！Apple AirPods Pro降噪蓝牙耳机
3,限时秒杀！戴尔XPS 15
4,优惠促销！华硕ROG游戏手机3全新上市


In [6]:
clothes_prompt = "请你生成50条淘宝网里的商品的标题，每条在30个字左右，品类是女性的服饰箱包等等，标题里往往也会有一些促销类的信息，每行一条。不要重复。"
clothes_data = generate_data_by_prompt(clothes_prompt)
clothes_product_names = clothes_data.strip().split('\n')
clothes_df = pd.DataFrame({'product_name': clothes_product_names})
clothes_df.product_name = clothes_df.product_name.apply(lambda x: x.split('.')[1].strip())
clothes_df.head()

Unnamed: 0,product_name
0,夏季清仓！时尚女士连衣裙特惠款
1,限时折扣！韩版潮流女包精选推荐
2,热销爆款！韩版甜美女士针织衫
3,限时秒杀！复古小清新女士手提包
4,夏日新品！韩版女士百搭短裤特惠


In [7]:
df = pd.concat([df, clothes_df], axis=0)
df = df.reset_index(drop=True)
display(df)

Unnamed: 0,product_name
0,超值特惠！华为P40 Pro 5G全网通手机
1,限时抢购！小米笔记本Air 13
2,爆款推荐！Apple AirPods Pro降噪蓝牙耳机
3,限时秒杀！戴尔XPS 15
4,优惠促销！华硕ROG游戏手机3全新上市
...,...
95,特价促销！欧美风格女士皮带精选
96,限时抢购！日系清新女士休闲裤
97,热销推荐！时尚女士斜挎包特惠款
98,夏季狂欢！韩版女士短款外套清仓


# 通过Embedding进行语义搜索

In [9]:
from openai import OpenAI
import openai, backoff

client = OpenAI()

embedding_model = "text-embedding-ada-002"

def get_embeddings(list_of_text, model="text-embedding-3-small"):
    list_of_text = [text.replace("\n", " ") for text in list_of_text]
    embeddings = []
    for da in client.embeddings.create(input = list_of_text, model=model).data:
        embeddings.append(da.embedding)
        
    return embeddings

batch_size = 100

@backoff.on_exception(backoff.expo, openai.RateLimitError)
def get_embeddings_with_backoff(prompts, engine):
    embeddings = []
    for i in range(0, len(prompts), batch_size):
        batch = prompts[i:i+batch_size]
        embeddings += get_embeddings(list_of_text=batch, model=engine)
    return embeddings

prompts = df.product_name.tolist()
prompt_batches = [prompts[i:i+batch_size] for i in range(0, len(prompts), batch_size)]

embeddings = []
for batch in prompt_batches:
    batch_embeddings = get_embeddings_with_backoff(prompts=batch, engine=embedding_model)
    embeddings += batch_embeddings

df["embedding"] = embeddings
print(df)
df.to_parquet("data/taobao_product_title.parquet", index=False)

                    product_name  \
0         超值特惠！华为P40 Pro 5G全网通手机   
1               限时抢购！小米笔记本Air 13   
2   爆款推荐！Apple AirPods Pro降噪蓝牙耳机   
3                  限时秒杀！戴尔XPS 15   
4            优惠促销！华硕ROG游戏手机3全新上市   
..                           ...   
95               特价促销！欧美风格女士皮带精选   
96                限时抢购！日系清新女士休闲裤   
97               热销推荐！时尚女士斜挎包特惠款   
98               夏季狂欢！韩版女士短款外套清仓   
99               限时特惠！欧美风格女士毛衣推荐   

                                            embedding  
0   [-0.013336816802620888, -0.002172698499634862,...  
1   [-0.01182595081627369, -0.013900082558393478, ...  
2   [-0.011434641666710377, -0.007904254831373692,...  
3   [-0.005350682884454727, -0.0124397287145257, -...  
4   [-0.013714857399463654, -0.004293053410947323,...  
..                                                ...  
95  [-0.024718234315514565, -0.015129383653402328,...  
96  [-0.011915394105017185, -0.006174277979880571,...  
97  [-0.03560192137956619, -0.00539803970605135, -...  
98  [-0

In [11]:
import os, backoff

@backoff.on_exception(backoff.expo, openai.RateLimitError)
def get_embedding(text, model="text-embedding-3-small"):
    text = text.replace("\n", " ")
    return client.embeddings.create(input = [text], model=model).data[0].embedding

# df = pd.read_parquet("data/taobao_product_title.parquet")
# embedding_model = "text-embedding-3-small"
# client = OpenAI()
# df["embedding"] = df.product_name.apply(lambda x: get_embedding(x, embedding_model))
# df.to_parquet("data/taobao_product_title2.parquet", index=False)

In [12]:
from sklearn.metrics.pairwise import cosine_similarity

def search_product(df, query, n=3, pprint=True):
    product_embedding = get_embedding(
        query,
        model=embedding_model
    )
    df["similarity"] = df.embedding.apply(lambda x: cosine_similarity([x], [product_embedding]))
    
    results = (
        df.sort_values("similarity", ascending=False)
        .head(n)
        .product_name
    )
    if pprint:
        for r in results:
            print(r)
    return results

results = search_product(df, "自然淡雅背包", n=3)

特价促销！欧美风格女士斜挎包
热销推荐！时尚女士斜挎包特惠款
热销推荐！时尚女士斜挎包特惠款


# 利用Embedding信息进行商品推荐的冷启动

In [14]:
def recommend_product(df, product_name, n=3, pprint=True):
    product_embdding = df[df['product_name'] == product_name].iloc[0].embedding

    df['similarity'] = df.embedding.apply(lambda x: cosine_similarity([x], [product_embdding]))
    
    results = (
        df.sort_values('similarity', ascending=False)
        .head(n)
        .product_name
    )
    if pprint:
        for r in results:
            print(r)
    return results

results = recommend_product(df, "超值特惠！华为P40 Pro 5G全网通手机", n=3)

超值特惠！华为P40 Pro 5G全网通手机
限时特惠！小米10 Pro 5G全网通手机
超值特惠！华为MatePad Pro 10


# 通过FAISS加速搜索过程

In [15]:
import faiss
import numpy as np

def load_embeddings_to_faiss(df):
    embeddings = np.array(df['embedding'].tolist()).astype('float32')
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index

index = load_embeddings_to_faiss(df)

In [16]:
def search_index(index, df, query, k=5):
    query_vector = np.array(get_embedding(query, model=embedding_model)).reshape(1, -1).astype('float32')
    distances, indexes = index.search(query_vector, k)

    results = []
    for i in range(len(indexes)):
        product_names = df.iloc[indexes[i]]['product_name'].values.tolist()
        results.append((distances[i], product_names))
    return results

products = search_index(index, df, "自然淡雅背包", k=3)

for distances, product_names in products:
    for i in range(len(distances)):
        print(product_names[i], distances[i])

特价促销！欧美风格女士斜挎包 0.2906236
热销推荐！时尚女士斜挎包特惠款 0.30157784
热销推荐！时尚女士斜挎包特惠款 0.301996


In [17]:
!pip list

Package                        Version
------------------------------ --------------
aiofiles                       22.1.0
aiosqlite                      0.20.0
altair                         5.3.0
annotated-types                0.6.0
anyio                          4.3.0
argon2-cffi                    23.1.0
argon2-cffi-bindings           21.2.0
arrow                          1.3.0
astroid                        3.1.0
asttokens                      2.4.1
attrs                          23.2.0
autopep8                       2.0.4
Babel                          2.14.0
backoff                        2.2.1
beautifulsoup4                 4.12.3
bleach                         6.1.0
certifi                        2024.2.2
cffi                           1.16.0
charset-normalizer             3.3.2
click                          8.1.7
comm                           0.2.2
contourpy                      1.2.1
cramjam                        2.8.3
cycler                         0.12.1
debugpy        