# In [4]:  (Jupyter notebook cell prompt, kept as a comment so the file parses as Python)
import csv
import numpy as np
import pandas as pd
import collections
from annoy import AnnoyIndex
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from sentence_transformers import SentenceTransformer as st
# Directory prefix that holds the input file. preprocess_data() appends
# 'data.csv' to this value, so it must end with a path separator and must not
# name the file itself (and must not contain literal quote characters).
data_path = 'E:/Charlotte/CS/MACHINE LEARNING/NATURAL LANGUAGE PROCESSING/'
# Directory prefix for the output. submit() appends '<model_name>.csv' to it.
save_path = 'E:/Charlotte/CS/MACHINE LEARNING/NATURAL LANGUAGE PROCESSING/'
# Location of the pretrained model handed to SentenceTransformer.
bert_model_path = "D:/wk/task/review/nanhai/data/bert_based_chinese"

def preprocess_data(data_path):
    """Load ``data.csv`` and return it as a whitespace-free DataFrame.

    Every cell has spaces, real line-break/tab characters and the literal
    two-character escape sequences ``\\n``, ``\\t`` and ``\\r`` removed.

    Args:
        data_path: Prefix to which ``'data.csv'`` is appended verbatim, so a
            directory must be given with its trailing path separator.

    Returns:
        A ``pandas.DataFrame`` of strings with the columns ``number``,
        ``title``, ``news`` and ``time``; one row per CSV record (a header
        record, if the file has one, is kept as an ordinary row).

    Raises:
        OSError: If the file cannot be opened.
        ValueError: Raised by pandas when a record has more than four fields.
    """
    # Order matters for the first four entries (it mirrors the historical
    # replace chain); the real control characters are removed afterwards.
    unwanted = ('\\n', ' ', '\\t', '\\r', '\n', '\t', '\r')
    records = []
    # newline='' lets the csv module see the raw line endings, which it needs
    # in order to parse quoted fields that contain embedded line breaks.
    with open(data_path + 'data.csv', 'rt', encoding="utf-8", newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in reader:
            cleaned = []
            for cell in row:
                for token in unwanted:
                    cell = cell.replace(token, '')
                cleaned.append(cell)
            records.append(cleaned)
    return pd.DataFrame(records, columns=['number', 'title', 'news', 'time'])

########################################################################################################
### Feature-vector generation module
########################################################################################################
### Build the list of embedding vectors, one per news item
def generate_vectors(data_path,bert_model_path,data):
    """Encode every news item into one sentence-embedding vector.

    Args:
        data_path: Unused; kept so existing callers keep working.
        bert_model_path: Model name or path handed to ``SentenceTransformer``.
        data: Table with ``news`` and ``title`` columns of strings. Rows are
            processed in positional order.

    Returns:
        A list with one embedding (1-D array) per row of ``data``.
    """
    # Load the model.
    model = st(bert_model_path)

    sentences = []
    for news, title in zip(data['news'], data['title']):
        # "/" marks a missing news body; fall back to the title in that case.
        raw = title if news == "/" else news
        # Lower-case the text and drop real line-break/tab characters. The
        # tokens are joined back together so that the WHOLE text is embedded,
        # not only its first token, and so that an empty text still yields a
        # (possibly empty) sentence instead of an IndexError.
        tokens = text_to_word_sequence(raw, filters='\n\t\r')
        sentences.append(' '.join(tokens))

    if sentences:
        # A single batched call; encode() returns one row per input sentence.
        tokenize_corpus = list(model.encode(sentences))
    else:
        tokenize_corpus = []
    print(type(tokenize_corpus))
    return tokenize_corpus

########################################################################################################
### Similarity computation and recommendation module
########################################################################################################
### Use Annoy (Approximate Nearest Neighbors Oh Yeah) for similarity search and keep the top-N neighbours
def annoy_similarity(vector,save_path,top_k=10,n_trees=10):
    """Find the most similar items for every item with an Annoy index.

    The vectors are L2-normalised and indexed with the ``'dot'`` metric, so
    the stored scores are dot products of unit vectors.

    Args:
        vector: Sequence of equally sized embedding vectors, one per item.
        save_path: Unused; kept so existing callers keep working.
        top_k: Number of neighbours to keep per item.
        n_trees: Number of trees built for the Annoy index.

    Returns:
        ``defaultdict(dict)`` mapping item position -> {neighbour position:
        similarity}. Empty when ``vector`` is empty.
    """
    item_sim_dict = collections.defaultdict(dict)

    # Convert the input to a 2-D array (items x dimensions).
    item_emb_np = np.array(vector)
    if item_emb_np.size == 0:
        # Nothing to index; normalising along axis 1 would fail on this input.
        return item_sim_dict

    # Normalise to unit length; all-zero vectors are left untouched instead of
    # being turned into NaN by a division by zero.
    norms = np.linalg.norm(item_emb_np, axis=1, keepdims=True)
    norms[norms == 0] = 1
    item_emb_np = item_emb_np / norms
    print(item_emb_np.shape)

    ann_index = item_emb_np.shape[1]   # Length of item vector that will be indexed
    # metric can be "angular", "euclidean", "manhattan", "hamming", or "dot".
    t = AnnoyIndex(ann_index, 'dot')
    for i, emb in enumerate(item_emb_np):
        t.add_item(i, emb)
    t.build(n_trees)

    for ii in range(item_emb_np.shape[0]):
        # Ask for one extra hit because the query item is normally returned too.
        found_idx, found_sim = t.get_nns_by_item(ii, top_k + 1, include_distances=True)
        # Drop the query item wherever it appears (it is not guaranteed to be
        # ranked first, e.g. with duplicate vectors), then keep the best top_k.
        kept = [(idx, sim) for idx, sim in zip(found_idx, found_sim) if idx != ii][:top_k]
        rele_idx = [idx for idx, _ in kept]
        sim_value = [sim for _, sim in kept]
        print(rele_idx)
        print(sim_value)
        for idx, sim in kept:
            item_sim_dict[ii][idx] = sim
    return item_sim_dict
############################ Build the submission file
def submit(item_sim_dict, save_path, model_name=None):
    """Write the top-10 recommendations per article to a CSV file.

    Args:
        item_sim_dict: Mapping article id -> {recommended article id: score}.
            Every article needs at least 10 recommendations.
        save_path: Prefix of the output file; ``model_name + '.csv'`` is
            appended verbatim, so a directory needs its trailing separator.
        model_name: Base name of the output file. Defaults to ``'submit'``
            when omitted.

    Raises:
        AssertionError: If any article has fewer than 10 recommendations (or
            there are no recommendations at all).
    """
    if model_name is None:
        # The historical default (None) could not be concatenated into a path.
        model_name = 'submit'

    # Flatten the nested dict into (article, recommendation, score) rows.
    user_item_score_list = [
        [clicked_id, rec_id, score]
        for clicked_id, neighbours in item_sim_dict.items()
        for rec_id, score in neighbours.items()
    ]
    recall_df = pd.DataFrame(user_item_score_list, columns=['clicked_article_id', 'rec_article_id', 'pred_score'])
    print(recall_df)
    # Rank 1 is the highest score; ties are broken by order of appearance.
    recall_df['rank'] = recall_df.groupby(['clicked_article_id'])['pred_score'].rank(ascending=False, method='first')

    # Check that every article has at least 10 recommendations. Written as an
    # explicit raise so the check also runs under ``python -O``; the negated
    # comparison makes the NaN of an empty table fail as well.
    fewest = recall_df.groupby('clicked_article_id')['rank'].max().min()
    if not fewest >= 10:
        raise AssertionError('every article needs at least 10 recommendations')

    del recall_df['pred_score']
    # One row per article, one column per rank.
    top10 = recall_df[recall_df['rank'] <= 10].set_index(['clicked_article_id', 'rank']).unstack(-1).reset_index()
    # After dropping the outer level the labels are '' (the id column) and the
    # rank values 1.0 .. 10.0; name them according to the submission format.
    top10.columns = [
        'clicked_article_id' if isinstance(col, str) else 'rec_{}'.format(int(col))
        for col in top10.columns.droplevel(0)
    ]
    print(top10)

    save_name = save_path + model_name + '.csv'
    top10.to_csv(save_name, index=False, header=True)

if __name__ == '__main__':
    # Stage 1 - preprocessing: read the CSV into a cleaned table.
    data = preprocess_data(data_path=data_path)

    # Stage 2 - feature vectors: one embedding per news item.
    tokenize_corpus = generate_vectors(
        data_path=data_path,
        bert_model_path=bert_model_path,
        data=data,
    )

    # Stage 3 - similarity search: nearest neighbours of every item.
    item_sim_dict_bert = annoy_similarity(vector=tokenize_corpus, save_path=save_path)

    # Stage 4 - write the submission file.
    submit(item_sim_dict=item_sim_dict_bert, save_path=save_path, model_name='submit')


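# Captured cell output: ModuleNotFoundError: No module named 'annoy'
# (the `annoy` package must be installed, e.g. `pip install annoy`, before running this script)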