In [None]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import jieba
import re
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel
import torch

ModuleNotFoundError: No module named 'sentence_transformers'

In [None]:
# Configuration: terms counted (case-insensitively) by the keyword-coverage score.
CORE_KEYWORDS = {
    # Brand and category terms
    '鲜芋仙',
    'Meet Fresh',
    'MeetFresh',
    '台湾美食',
    '甜品',
    # Menu items (Chinese and English spellings)
    '芋圆',
    'taro',
    '仙草',
    'grass jelly',
    '奶茶',
    'milk tea',
    '豆花',
    'tofu pudding',
    '奶刨冰',
    'milked shaved ice',
    '红豆汤',
    'purple rice soup',
    '紫米粥',
    'red bean soup',
    # Location / contact strings
    '2001 Coit Rd',
    'Park Pavillion Center',
    '(972) 596-6088',
}

In [None]:
class ContentScorer:
    """Score posts on originality, vertical focus, sentiment and keyword coverage.

    Every dimension is written to its own column of a private copy of the
    input frame (``originality``, ``vertical``, ``sentiment``, ``keyword``)
    and is scaled to the range 0-25.

    The input frame must provide the columns ``user_id``, ``text``,
    ``tag_list`` and ``elapsed_time``.
    """

    # Tags that count towards the target vertical in ``_calc_vertical``.
    TARGET_TAGS = frozenset({'美食', '探店', '餐饮', '餐厅', '美食探店'})

    def __init__(self, df):
        """Copy ``df`` and load every model used by the scoring steps.

        Args:
            df: Frame with one row per post (see the class docstring for the
                required columns). It is copied, never modified in place.
        """
        self.df = df.copy()
        self.users = df['user_id'].unique()
        self._prepare_models()

    def _prepare_models(self):
        """Instantiate the TF-IDF vectorizer and load the pretrained models."""
        # Originality models.
        # The default TfidfVectorizer token pattern does not segment Chinese
        # text, so jieba is used as the tokenizer; token_pattern=None silences
        # the "token_pattern will not be used" warning.
        self.tfidf = TfidfVectorizer(
            tokenizer=jieba.lcut, token_pattern=None, max_features=6000
        )
        # The attribute is named ``simcse`` but the checkpoint loaded is the
        # multilingual paraphrase MiniLM sentence encoder.
        self.simcse = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

        # Model behind the sentiment score.
        self.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
        self.bert_model = BertModel.from_pretrained("bert-base-chinese")
        self.bert_model.eval()

    def calculate_all_scores(self):
        """Run preprocessing and all four scoring steps.

        Returns:
            The internal frame with the score columns added.
        """
        self._preprocess()

        self._calc_originality()
        self._calc_vertical()
        self._calc_sentiment()
        self._calc_keyword()

        return self.df

    def _preprocess(self):
        """Normalise the input columns the scoring steps rely on."""
        # Missing or non-string texts would break slicing and regex matching.
        self.df['text'] = self.df['text'].fillna('').astype(str)

        # ``elapsed_time`` is copied unchanged.
        # NOTE(review): the column name implies the unit is days - confirm
        # against the data producer.
        self.df['elapsed_days'] = self.df['elapsed_time']

        # Mapping user_id -> list of that user's texts.
        self.user_notes = self.df.groupby('user_id')['text'].apply(list)

    def _time_decay(self):
        """Return the per-row decay factor ``exp(-0.05 * elapsed_days / 7)``."""
        return np.exp(-0.05 * self.df['elapsed_days'] / 7)

    # ======================
    # 1. Text originality
    # ======================
    def _calc_originality(self):
        """Write ``originality`` = (1 - highest similarity to any OTHER post) * 25.

        Similarity is the larger of the TF-IDF cosine similarity and the
        sentence-embedding cosine similarity. Both similarity matrices are
        dense n x n arrays, so memory grows quadratically with the row count.
        """
        all_texts = self.df['text'].tolist()
        if not all_texts:
            # fit_transform raises on an empty corpus.
            self.df['originality'] = pd.Series(dtype=float)
            return

        # TF-IDF similarity.
        tfidf_sim = cosine_similarity(self.tfidf.fit_transform(all_texts))

        # Sentence-embedding similarity.
        embeddings = self.simcse.encode(all_texts, show_progress_bar=True)
        simcse_sim = cosine_similarity(embeddings)

        # The diagonal holds each post's similarity with itself (1.0); left in
        # place it would make the row maximum 1 and every score 0.
        np.fill_diagonal(tfidf_sim, 0.0)
        np.fill_diagonal(simcse_sim, 0.0)

        max_sim = np.maximum(tfidf_sim.max(axis=1), simcse_sim.max(axis=1))
        # Clip away floating-point overshoot so the score stays within 0-25.
        max_sim = np.clip(max_sim, 0.0, 1.0)
        self.df['originality'] = (1 - max_sim) * 25

    # ======================
    # 2. Vertical-domain focus (uses tag_list)
    # ======================
    def _calc_vertical(self):
        """Write ``vertical`` = share of ``TARGET_TAGS`` present in ``tag_list`` * 25.

        ``tag_list`` may be a comma-separated string or a list/tuple/set of
        tags; anything else (NaN, None) scores 0.
        """
        target_tags = self.TARGET_TAGS

        def _tag_overlap(tags):
            if isinstance(tags, str):
                parts = tags.split(',')
            elif isinstance(tags, (list, tuple, set, frozenset)):
                parts = tags
            else:
                return 0.0
            # Strip so that "美食, 探店" matches the same as "美食,探店".
            user_tags = {str(tag).strip() for tag in parts}
            return len(user_tags & target_tags) / len(target_tags)

        self.df['vertical'] = self.df['tag_list'].apply(_tag_overlap).astype(float) * 25

    # ======================
    # 3. Sentiment strength
    # ======================
    def _calc_sentiment(self):
        """Write ``sentiment``: decayed model output scaled by its 90th percentile.

        Values are divided by the 90th percentile, multiplied by 25 and
        clipped to 0-25. When the percentile is not a positive finite number
        the division is meaningless, so every row scores 0.
        """
        raw = self.df['text'].apply(self._get_bert_sentiment).astype(float)
        raw = raw * self._time_decay()

        p90 = raw.quantile(0.9)
        if not np.isfinite(p90) or p90 <= 0:
            self.df['sentiment'] = 0.0
        else:
            self.df['sentiment'] = (raw / p90 * 25).clip(0, 25)

    def _get_bert_sentiment(self, text):
        """Return the mean of BERT's last hidden state for ``text`` as a float.

        NOTE(review): ``BertModel`` has no classification head, so this value
        is not a calibrated sentiment polarity. Consider a fine-tuned
        sentiment model if true sentiment is required.
        """
        inputs = self.bert_tokenizer(
            text[:512], return_tensors="pt", truncation=True, max_length=512
        )
        with torch.no_grad():
            outputs = self.bert_model(**inputs)
        return outputs.last_hidden_state.mean().item()

    # ======================
    # 4. Keyword coverage
    # ======================
    def _calc_keyword(self, keywords=None):
        """Write ``kw_count``, ``single_score`` and the per-user ``keyword`` score.

        Args:
            keywords: Iterable of literal keywords matched case-insensitively.
                Defaults to the module-level ``CORE_KEYWORDS``.
        """
        if keywords is None:
            keywords = CORE_KEYWORDS

        # Longest-first, then alphabetical: deterministic, and a longer
        # keyword is preferred over a shorter one that is its prefix.
        ordered = sorted(keywords, key=lambda kw: (-len(kw), kw))
        if ordered:
            pattern = re.compile(
                '|'.join(re.escape(kw) for kw in ordered), flags=re.IGNORECASE
            )
            self.df['kw_count'] = (
                self.df['text'].apply(lambda t: len(pattern.findall(t))).astype(int)
            )
        else:
            # An empty alternation would match at every position.
            self.df['kw_count'] = 0

        # Per-post score saturating at 5 hits, then time-decayed.
        # NOTE(review): the previous code used np.maximum, which floors the
        # score at 1 and makes 0-5 hits indistinguishable; np.minimum is the
        # assumed intent - confirm with the metric owner.
        self.df['single_score'] = (
            np.minimum(1, self.df['kw_count'] / 5) * self._time_decay()
        )

        # Broadcast each user's score back to their rows; unlike a merge this
        # keeps the frame's index and is safe to run more than once.
        per_user = {
            user_id: self._user_keyword_score(group)['keyword']
            for user_id, group in self.df.groupby('user_id')
        }
        self.df['keyword'] = self.df['user_id'].map(per_user)

    def _user_keyword_score(self, group):
        """Return a Series ``{'keyword': score}`` for one user's posts.

        The score is ``(mean / p90 * 0.7 + cover_rate / 0.5 * 0.3) * 25``
        clipped to 0-25, where ``mean`` and ``p90`` are taken over
        ``single_score`` and ``cover_rate`` is the share of posts with at
        least one keyword hit.

        Args:
            group: Rows of one user; needs ``single_score`` and ``kw_count``.
        """
        p90 = group['single_score'].quantile(0.9)
        mean_score = group['single_score'].mean()
        # A zero or NaN percentile means no usable signal; avoid NaN/inf.
        relative = mean_score / p90 if p90 > 0 else 0.0
        cover_rate = (group['kw_count'] > 0).mean()

        score = (relative * 0.7 + cover_rate / 0.5 * 0.3) * 25
        return pd.Series({'keyword': float(np.clip(score, 0, 25))})