In [39]:
from huggingface_hub import login
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

# Load the Chinese RoBERTa checkpoint and its tokenizer.
MODEL_NAME = "hfl/chinese-roberta-wwm-ext"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME, output_hidden_states=True)

def encode_text(text, max_length=512):
    """Encode text into a vector with the Chinese RoBERTa model.

    The embedding is the attention-mask-weighted mean of the last layer's
    token states, so padding positions in a batched input are ignored.
    For a single string (no padding) this equals the plain mean.

    Args:
        text: A string, or a list of strings encoded as one padded batch.
        max_length: Token limit applied when truncating. The tokenizer has
            no predefined limit (it warns and skips truncation otherwise).
            NOTE(review): 512 is the usual position limit of BERT-style
            checkpoints - confirm against the model config.

    Returns:
        A NumPy array of the pooled embedding(s) with singleton axes
        squeezed away (1-D for a single string).
    """
    import torch  # local import: this cell does not import torch at the top

    inputs = tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_length,
    )
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    # Hidden states of the last layer.
    last_layer_hidden_state = outputs.hidden_states[-1]

    # Average over real tokens only; padded positions carry a zero weight.
    mask = inputs["attention_mask"].unsqueeze(-1).to(last_layer_hidden_state.dtype)
    summed = (last_layer_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1)
    return (summed / counts).detach().numpy().squeeze()

# 2. Simulated user behaviour log, one record per interaction.
user_behavior_data = [
    dict(zip(("user_id", "item_id", "action", "item_description", "timestamp"), row))
    for row in (
        ("U001", "P1001", "浏览", "时尚蓝牙耳机", "2025-01-01 10:00"),
        ("U001", "P1002", "浏览", "智能手表", "2025-01-01 10:15"),
        ("U001", "P1003", "加入购物车", "4K电视", "2025-01-01 10:30"),
        ("U002", "P1002", "浏览", "智能手表", "2025-01-01 11:00"),
        ("U003", "P1001", "购买", "时尚蓝牙耳机", "2025-01-01 11:30"),
        ("U002", "P1003", "购买", "4K电视", "2025-01-01 12:00"),
        ("U001", "P1001", "浏览", "时尚蓝牙耳机", "2025-01-01 12:30"),
    )
]

# 3. Build the user's interest vector.
def build_user_vector(user_id, user_behavior_data):
    """
    Return the mean embedding of every item description the user touched.

    Records are matched on their "user_id" key and each matching
    "item_description" is encoded with encode_text. A user without any
    record gets a 768-dimensional zero vector.
    """
    descriptions = []
    for record in user_behavior_data:
        if record["user_id"] == user_id:
            descriptions.append(record["item_description"])

    if len(descriptions) == 0:
        # 768 is the output width the original author states for this model.
        return np.zeros(768)

    # Average the per-item embeddings element-wise.
    encoded = [encode_text(description) for description in descriptions]
    return np.mean(encoded, axis=0)

# 4. Example item descriptions (the candidate catalogue).
item_descriptions = ["airpods", "智能手表", "4K电视", "无线耳机", "超高清电视"]

# 5. Encode every description; rows follow the catalogue order.
item_vectors = np.array(list(map(encode_text, item_descriptions)))

# 6. Recommender.
def recommend(user_id, user_behavior_data, item_descriptions, item_vectors, top_n=3, similarity_threshold=0.5):
    """
    Recommend item descriptions for a user.

    Items are scored by cosine similarity between the user's interest
    vector and each row of item_vectors. Items scoring below
    similarity_threshold are dropped, and at most top_n of the rest are
    returned, best first. Prints a notice and returns [] when nothing
    passes the threshold.
    """
    profile = build_user_vector(user_id, user_behavior_data)

    # cosine_similarity returns a 1 x n_items matrix; keep its only row.
    scores = cosine_similarity([profile], item_vectors)[0]

    candidates = []
    for position, score in enumerate(scores):
        if score >= similarity_threshold:
            candidates.append(position)

    if len(candidates) == 0:
        print("没有符合条件的商品推荐！")
        return []

    # Positions within `candidates`, ordered from highest to lowest score.
    ranking = np.argsort(scores[candidates])[::-1]

    picks = []
    for rank in ranking[:top_n]:
        picks.append(item_descriptions[candidates[rank]])
    return picks

# 7. Recommend items for user U001.
user_id = "U001"
recommended_items = recommend(
    user_id,
    user_behavior_data,
    item_descriptions,
    item_vectors,
    top_n=1,
    similarity_threshold=0.5,
)

# Print the recommendations, numbered from 1.
print(f"为用户 {user_id} 推荐的商品：")
for rank, item in enumerate(recommended_items, start=1):
    print(f"{rank}. {item}")


  return self.fget.__get__(instance, owner)()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


为用户 U001 推荐的商品：
1. 智能手表


In [38]:
from huggingface_hub import login
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

# Load the Qwen2.5-3B checkpoint and its tokenizer.
MODEL_NAME = "Qwen/Qwen2.5-3B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME, output_hidden_states=True)

def encode_text(text):
    """Encode text into a vector with the Qwen2.5-3B model.

    The embedding is the attention-mask-weighted mean of the last layer's
    token states, so padding positions in a batched input are ignored.
    For a single string (no padding) this equals the plain mean.

    Args:
        text: A string, or a list of strings encoded as one padded batch.

    Returns:
        A NumPy array of the pooled embedding(s) with singleton axes
        squeezed away (1-D for a single string).
    """
    import torch  # local import: this cell does not import torch at the top

    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    # Inference only: skip autograd bookkeeping, which matters for a model
    # of this size.
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    # Hidden states of the last layer.
    last_layer_hidden_state = outputs.hidden_states[-1]

    # Average over real tokens only; padded positions carry a zero weight.
    mask = inputs["attention_mask"].unsqueeze(-1).to(last_layer_hidden_state.dtype)
    summed = (last_layer_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1)
    return (summed / counts).detach().numpy().squeeze()

# 2. Simulated user behaviour log, one record per interaction.
user_behavior_data = [
    {"user_id": uid, "item_id": pid, "action": act, "item_description": desc, "timestamp": ts}
    for uid, pid, act, desc, ts in (
        ("U001", "P1001", "浏览", "时尚蓝牙耳机", "2025-01-01 10:00"),
        ("U001", "P1002", "浏览", "智能手表", "2025-01-01 10:15"),
        ("U001", "P1003", "加入购物车", "4K电视", "2025-01-01 10:30"),
        ("U002", "P1002", "浏览", "智能手表", "2025-01-01 11:00"),
        ("U003", "P1001", "购买", "时尚蓝牙耳机", "2025-01-01 11:30"),
        ("U002", "P1003", "购买", "4K电视", "2025-01-01 12:00"),
        ("U001", "P1001", "浏览", "时尚蓝牙耳机", "2025-01-01 12:30"),
    )
]

# 3. Build the user's interest vector.
def build_user_vector(user_id, user_behavior_data, embedding_dim=None):
    """Return the mean embedding of every item description the user touched.

    Args:
        user_id: Identifier matched against each record's "user_id" key.
        user_behavior_data: Iterable of dicts with at least the keys
            "user_id" and "item_description".
        embedding_dim: Length of the zero vector returned for a user with
            no history. Defaults to the loaded model's hidden size.

    Returns:
        A 1-D NumPy array: the element-wise mean of the encoded
        descriptions, or zeros when the user has no records.

    The fallback width used to be a hard-coded 768, an assumption that
    does not follow the loaded model. A mismatch with the item vectors
    makes cosine_similarity fail, so the width is now read from the model.
    NOTE(review): Qwen2.5-3B is expected to be wider than 768 - confirm
    against model.config.hidden_size.
    """
    user_history = [
        entry["item_description"]
        for entry in user_behavior_data
        if entry["user_id"] == user_id
    ]
    if not user_history:
        if embedding_dim is None:
            # Keep the fallback consistent with what encode_text produces.
            embedding_dim = model.config.hidden_size
        return np.zeros(embedding_dim)

    # Average the per-item embeddings element-wise.
    return np.mean([encode_text(desc) for desc in user_history], axis=0)

# 4. Example item descriptions (the candidate catalogue).
item_descriptions = ["headset", "智能手表", "4K电视", "无线耳机", "超高清电视"]

# 5. Encode every description; rows follow the catalogue order.
item_vectors = np.array(
    [encode_text(description) for description in item_descriptions]
)

# 6. Recommender.
def recommend(user_id, user_behavior_data, item_descriptions, item_vectors, top_n=3, similarity_threshold=0.5):
    """
    Recommend item descriptions for a user.

    Scores each row of item_vectors by cosine similarity with the user's
    interest vector and discards items below similarity_threshold. At most
    top_n survivors are returned, most similar first. Prints a notice and
    returns [] when no item reaches the threshold.
    """
    interest = build_user_vector(user_id, user_behavior_data)

    # cosine_similarity returns a 1 x n_items matrix; keep its only row.
    scores = cosine_similarity([interest], item_vectors)[0]

    # Catalogue positions whose score reaches the threshold.
    kept = np.flatnonzero(scores >= similarity_threshold)
    if kept.size == 0:
        print("没有符合条件的商品推荐！")
        return []

    # Positions within `kept`, ordered from highest to lowest score.
    best_first = np.argsort(scores[kept])[::-1]
    chosen = kept[best_first[:top_n]]
    return [item_descriptions[position] for position in chosen]

# 7. Recommend items for user U001.
user_id = "U001"
recommended_items = recommend(
    user_id, user_behavior_data, item_descriptions, item_vectors,
    top_n=1, similarity_threshold=0.5,
)

# Print the recommendations, numbered from 1.
print(f"为用户 {user_id} 推荐的商品：")
for position, item in enumerate(recommended_items, start=1):
    print(f"{position}. {item}")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

为用户 U001 推荐的商品：
1. 智能手表


In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the Chinese RoBERTa checkpoint and its tokenizer.
MODEL_NAME = "hfl/chinese-roberta-wwm-ext"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

def encode_text(text, max_length=512):
    """Encode text into a vector with the RoBERTa model.

    The embedding is the attention-mask-weighted mean of the last hidden
    state, so padding positions in a batched input are ignored. For a
    single string (no padding) this equals the plain mean.

    Args:
        text: A string, or a list of strings encoded as one padded batch.
        max_length: Token limit applied when truncating. The tokenizer has
            no predefined limit (it warns and skips truncation otherwise).
            NOTE(review): 512 is the usual position limit of BERT-style
            checkpoints - confirm against the model config.

    Returns:
        A NumPy array of the pooled embedding(s) with singleton axes
        squeezed away (1-D for a single string).
    """
    inputs = tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_length,
    )
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)

    last_hidden_state = outputs.last_hidden_state

    # Average over real tokens only; padded positions carry a zero weight.
    mask = inputs["attention_mask"].unsqueeze(-1).to(last_hidden_state.dtype)
    summed = (last_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1)
    sentence_embedding = (summed / counts).detach().numpy().squeeze()
    return sentence_embedding

# Words to compare; the first one is the reference word.
words = ["公主", "王子", "青蛙", "苹果", "香蕉", "牛马", "猴子", "国王"]

# One embedding row per word, in list order.
word_vectors = np.array(list(map(encode_text, words)))

# Cosine similarity between the first word and each remaining word.
princess_vector = word_vectors[0]
similarities = cosine_similarity([princess_vector], word_vectors[1:])

# Report each score next to its word.
for word, score in zip(words[1:], similarities[0]):
    print(f"公主和{word}的余弦相似度：{score:.4f}")


  return self.fget.__get__(instance, owner)()
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


公主和王子的余弦相似度：0.8404
公主和青蛙的余弦相似度：0.7107
公主和苹果的余弦相似度：0.5628
公主和香蕉的余弦相似度：0.6400
公主和牛马的余弦相似度：0.6865
公主和猴子的余弦相似度：0.6656
公主和国王的余弦相似度：0.7919


In [8]:
# Connect to Weaviate.
import weaviate

WEAVIATE_HOST = '124.222.113.16'
WEAVIATE_PORT = '8080'
WEAVIATE_URL = f'http://{WEAVIATE_HOST}:{WEAVIATE_PORT}'

# Create the Weaviate client.
client = weaviate.Client(url=WEAVIATE_URL)

# Fetch and display the current schema.
schema = client.schema.get()
print(schema)

{'classes': [{'class': 'LangChain_78085c2ecb764d0393f50db51561c6bc', 'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2}, 'cleanupIntervalSeconds': 60, 'stopwords': {'additions': None, 'preset': 'en', 'removals': None}}, 'multiTenancyConfig': {'enabled': False}, 'properties': [{'dataType': ['text'], 'indexFilterable': True, 'indexSearchable': True, 'name': 'text', 'tokenization': 'word'}], 'replicationConfig': {'factor': 1}, 'shardingConfig': {'virtualPerPhysical': 128, 'desiredCount': 1, 'actualCount': 1, 'desiredVirtualCount': 128, 'actualVirtualCount': 128, 'key': '_id', 'strategy': 'hash', 'function': 'murmur3'}, 'vectorIndexConfig': {'skip': False, 'cleanupIntervalSeconds': 300, 'maxConnections': 64, 'efConstruction': 128, 'ef': -1, 'dynamicEfMin': 100, 'dynamicEfMax': 500, 'dynamicEfFactor': 8, 'vectorCacheMaxObjects': 1000000000000, 'flatSearchCutoff': 40000, 'distance': 'cosine', 'pq': {'enabled': False, 'bitCompression': False, 'segments': 0, 'centroids': 256, 'trainingLimi

In [3]:
import json
import os
from datetime import datetime

import pandas as pd
import pymysql
from sqlalchemy import create_engine

# Database connection settings, passed as keyword arguments to
# pymysql.connect. Each value can be overridden with an environment
# variable so credentials need not live in this file; the literals are
# fallbacks kept for backward compatibility.
# SECURITY NOTE(review): a root password committed to source should be
# rotated, and the fallback removed once the environment is configured.
DB_CONFIG = {
    'host': os.environ.get('MYSQL_HOST', '124.222.113.16'),
    'port': int(os.environ.get('MYSQL_PORT', '3306')),
    'user': os.environ.get('MYSQL_USER', 'root'),
    'password': os.environ.get('MYSQL_PASSWORD', 'WYX&wyx'),
    'database': os.environ.get('MYSQL_DATABASE', 'Zhihuiyuyan'),
    'charset': 'utf8mb4',
}

def load_json_file(file_path):
    """Read a UTF-8 encoded JSON file and return the decoded object."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def create_db_connection():
    """Open and return a new pymysql connection built from DB_CONFIG."""
    connection = pymysql.connect(**DB_CONFIG)
    return connection

def insert_weibo_data(conn, data):
    """Insert Weibo records into the weibo_data table.

    Args:
        conn: Open database connection providing cursor() and commit().
        data: Iterable of dicts with the keys 'id', 'author',
            'publish_date', 'content' and 'platform'.

    A row that fails (missing key, duplicate primary key, ...) is reported
    on stdout and skipped; the remaining rows are still inserted and
    committed together. An error raised by the commit propagates to the
    caller.
    """
    sql = """INSERT INTO weibo_data (id, author, publish_date, content, platform)
             VALUES (%s, %s, %s, %s, %s)"""

    cursor = conn.cursor()
    try:
        for item in data:
            try:
                cursor.execute(sql, (
                    item['id'],
                    item['author'],
                    item['publish_date'],
                    item['content'],
                    item['platform']
                ))
            except Exception as e:
                # Best-effort load: report the bad row and keep going.
                print(f"Error inserting weibo data: {e}")

        conn.commit()
    finally:
        # Release the cursor even when the commit fails.
        cursor.close()

def parse_datetime(date_str, default_year=2024):
    """Parse a date/time string in one of several known formats.

    Args:
        date_str: Text to parse. Surrounding whitespace is ignored.
        default_year: Year assumed for inputs that carry a month and day
            but no year (e.g. '10月5日 12:30'). Defaults to 2024, the value
            that was previously hard-coded.

    Returns:
        A datetime, or None when the input is empty, 'Unknown', not a
        string, or matches none of the supported formats. Time-only text
        such as '12:30' carries no date and yields None, as before.
    """
    if not isinstance(date_str, str):
        return None

    text = date_str.strip()
    if not text or text == 'Unknown':
        return None

    # Formats that already contain a year.
    full_formats = (
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M',
        '%Y-%m-%d',
        '%Y年%m月%d日 %H:%M',
        '%Y年%m月%d日',
    )
    for fmt in full_formats:
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            continue

    # Month/day formats without a year. The year is prepended before
    # parsing (not patched in afterwards) so that '2月29日' is validated
    # against the intended year rather than strptime's default of 1900.
    yearless_formats = (
        '%m月%d日 %H:%M',
        '%m月%d日',
    )
    for fmt in yearless_formats:
        try:
            return datetime.strptime(f'{default_year}年{text}', f'%Y年{fmt}')
        except ValueError:
            continue

    return None

def insert_news_results(conn, data):
    """Insert news records into the news_results table.

    Args:
        conn: Open database connection providing cursor() and commit().
        data: Iterable of dicts with the keys 'id', 'title', 'content',
            'summary', 'publish_date', 'source', 'url' and 'media_url'.
            'publish_date' is converted with parse_datetime before insert.

    A row that fails is reported on stdout together with the offending
    record and skipped; the remaining rows are committed together. An
    error raised by the commit propagates to the caller.
    """
    sql = """INSERT INTO news_results (id, title, content, summary, publish_date,
             source, url, media_url) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"""

    cursor = conn.cursor()
    try:
        for item in data:
            try:
                # Normalise the publication date.
                publish_date = parse_datetime(item['publish_date'])

                cursor.execute(sql, (
                    item['id'],
                    item['title'],
                    item['content'],
                    item['summary'],
                    publish_date,
                    item['source'],
                    item['url'],
                    item['media_url']
                ))
            except Exception as e:
                # Best-effort load: report the bad row and keep going.
                print(f"Error inserting news data: {e}")
                print(f"Problematic record: {item}")

        conn.commit()
    finally:
        # Release the cursor even when the commit fails.
        cursor.close()

def insert_douban_topics(conn, data):
    """Insert Douban topic records into the douban_topics table.

    Args:
        conn: Open database connection providing cursor() and commit().
        data: Iterable of dicts with the keys 'id', 'content', 'author',
            'publish_date', 'platform' and 'url'. 'publish_date' is
            converted with parse_datetime before insert.

    A row that fails is reported on stdout together with the offending
    record and skipped; the remaining rows are committed together. An
    error raised by the commit propagates to the caller.
    """
    sql = """INSERT INTO douban_topics (id, content, author, publish_date,
             platform, url) VALUES (%s, %s, %s, %s, %s, %s)"""

    cursor = conn.cursor()
    try:
        for item in data:
            try:
                # Normalise the publication date.
                publish_date = parse_datetime(item['publish_date'])

                cursor.execute(sql, (
                    item['id'],
                    item['content'],
                    item['author'],
                    publish_date,
                    item['platform'],
                    item['url']
                ))
            except Exception as e:
                # Best-effort load: report the bad row and keep going.
                print(f"Error inserting douban data: {e}")
                print(f"Problematic record: {item}")

        conn.commit()
    finally:
        # Release the cursor even when the commit fails.
        cursor.close()

def insert_conferences(conn, data):
    """Insert conference records into the conferences table.

    Args:
        conn: Open database connection providing cursor() and commit().
        data: Iterable of dicts with the keys 'id', 'name', 'date',
            'location', 'agenda', 'url' and 'abstract'. 'agenda' is stored
            as a JSON string with non-ASCII characters kept as-is.

    A row that fails is reported on stdout and skipped; the remaining rows
    are committed together. An error raised by the commit propagates to
    the caller.
    """
    sql = """INSERT INTO conferences (id, name, date, location, agenda, url, abstract)
             VALUES (%s, %s, %s, %s, %s, %s, %s)"""

    cursor = conn.cursor()
    try:
        for item in data:
            try:
                cursor.execute(sql, (
                    item['id'],
                    item['name'],
                    item['date'],
                    item['location'],
                    json.dumps(item['agenda'], ensure_ascii=False),
                    item['url'],
                    item['abstract']
                ))
            except Exception as e:
                # Best-effort load: report the bad row and keep going.
                print(f"Error inserting conference data: {e}")

        conn.commit()
    finally:
        # Release the cursor even when the commit fails.
        cursor.close()

def insert_white_papers(conn, data):
    """Insert white paper records into the white_papers table.

    Args:
        conn: Open database connection providing cursor() and commit().
        data: Iterable of dicts with the keys 'id', 'title', 'abstract',
            'content', 'publish_date', 'publisher' and 'url'.

    A row that fails is reported on stdout and skipped; the remaining rows
    are committed together. An error raised by the commit propagates to
    the caller.
    """
    sql = """INSERT INTO white_papers (id, title, abstract, content, publish_date,
             publisher, url) VALUES (%s, %s, %s, %s, %s, %s, %s)"""

    cursor = conn.cursor()
    try:
        for item in data:
            try:
                cursor.execute(sql, (
                    item['id'],
                    item['title'],
                    item['abstract'],
                    item['content'],
                    item['publish_date'],
                    item['publisher'],
                    item['url']
                ))
            except Exception as e:
                # Best-effort load: report the bad row and keep going.
                print(f"Error inserting white paper data: {e}")

        conn.commit()
    finally:
        # Release the cursor even when the commit fails.
        cursor.close()

def insert_academic_papers(conn, data):
    """Insert academic paper records into the academic_papers table.

    Args:
        conn: Open database connection providing cursor() and commit().
        data: Iterable of dicts with the keys 'id', 'title', 'abstract',
            'authors', 'publish_date', 'journal', 'doi', 'url' and
            'citations', plus an optional 'keywords' (defaults to an empty
            list). 'authors' and 'keywords' are stored as JSON strings
            with non-ASCII characters kept as-is.

    A row that fails is reported on stdout and skipped; the remaining rows
    are committed together. An error raised by the commit propagates to
    the caller.
    """
    sql = """INSERT INTO academic_papers (id, title, abstract, authors, publish_date,
             journal, keywords, doi, url, citations)
             VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""

    cursor = conn.cursor()
    try:
        for item in data:
            try:
                cursor.execute(sql, (
                    item['id'],
                    item['title'],
                    item['abstract'],
                    json.dumps(item['authors'], ensure_ascii=False),
                    item['publish_date'],
                    item['journal'],
                    json.dumps(item.get('keywords', []), ensure_ascii=False),
                    item['doi'],
                    item['url'],
                    item['citations']
                ))
            except Exception as e:
                # Best-effort load: report the bad row and keep going.
                print(f"Error inserting academic paper data: {e}")

        conn.commit()
    finally:
        # Release the cursor even when the commit fails.
        cursor.close()

def main():
    """Load every JSON data file and insert its records into its table.

    Opens one database connection, then processes the data files in a
    fixed order. A failure while loading or inserting one file is printed
    and does not stop the remaining files. A failure to connect is printed
    and ends the run. The connection, if it was opened, is always closed.
    """
    # Bound before the try so the finally clause can test it even when
    # the connection attempt itself raises.
    conn = None
    try:
        # Open the database connection.
        conn = create_db_connection()

        # (table name, source file, insert function), in load order.
        jobs = [
            ('weibo_data', 'data/weibo_data.json', insert_weibo_data),
            ('news_results', 'data/news_results.json', insert_news_results),
            ('douban_topics', 'data/douban_topics.json', insert_douban_topics),
            ('conferences', 'data/conference.json', insert_conferences),
            ('white_papers', 'data/white_paper.json', insert_white_papers),
            ('academic_papers', 'data/academic_papers.json', insert_academic_papers),
        ]

        for table_name, file_path, insert_rows in jobs:
            print(f"Processing {table_name}...")
            try:
                data = load_json_file(file_path)
                insert_rows(conn, data)
                print(f"Successfully inserted {table_name} data")
            except Exception as e:
                # Keep going: one bad file must not block the others.
                print(f"Error processing {table_name}: {e}")

    except Exception as e:
        print(f"Database connection error: {e}")
    finally:
        if conn is not None:
            conn.close()

# Entry point: run the full load when this code is executed directly
# rather than imported.
if __name__ == "__main__":
    main()

Processing weibo_data...
Error inserting weibo data: (1062, "Duplicate entry '1' for key 'weibo_data.PRIMARY'")
Error inserting weibo data: (1062, "Duplicate entry '2' for key 'weibo_data.PRIMARY'")
Error inserting weibo data: (1062, "Duplicate entry '3' for key 'weibo_data.PRIMARY'")
Error inserting weibo data: (1062, "Duplicate entry '4' for key 'weibo_data.PRIMARY'")
Error inserting weibo data: (1062, "Duplicate entry '5' for key 'weibo_data.PRIMARY'")
Error inserting weibo data: (1062, "Duplicate entry '6' for key 'weibo_data.PRIMARY'")
Error inserting weibo data: (1062, "Duplicate entry '7' for key 'weibo_data.PRIMARY'")
Error inserting weibo data: (1062, "Duplicate entry '8' for key 'weibo_data.PRIMARY'")
Error inserting weibo data: (1062, "Duplicate entry '9' for key 'weibo_data.PRIMARY'")
Error inserting weibo data: (1062, "Duplicate entry '10' for key 'weibo_data.PRIMARY'")
Error inserting weibo data: (1062, "Duplicate entry '11' for key 'weibo_data.PRIMARY'")
Error inserting 