In [1]:
import jieba
from docx import Document
import requests
import os
import pandas as pd
import numpy as np

In [2]:
### 分词

In [3]:
# 读取本地停用词表
stopwords = set(open('C:/pythonProject/train_model/stopwords.txt', 'r', encoding='utf-8').read().splitlines())

In [4]:
# 下载停用词表
# url = "https://raw.githubusercontent.com/goto456/stopwords/master/cn_stopwords.txt"
# stopwords = requests.get(url).text
# stopwords = set(stopwords.splitlines())

In [5]:
# 读取文本文件
def read_text_from_docx(file_path):
    document = Document(file_path)
    text = []
    
    for para in document.paragraphs:
        text.append(para.text)
    
    return "\n".join(text)

In [6]:
# 读取停用词
def preprocess_chinese_text(text):
    # 使用jieba进行分词和词性标注
    words = jieba.cut_for_search(text, HMM=True)
    
    # 过滤停用词和进行词性选择
    filtered_words = [word for word in words if word not in stopwords and word.strip() and not word.isspace()]  

    return filtered_words

In [7]:
def process_documents(directory, output_filename):
    """处理目录下的所有.docx文件，并保存分词结果到CSV文件中。"""
    data = []
    documents_name = []
    for filename in os.listdir(directory):
        if filename.endswith(".docx"):
            file_path = os.path.join(directory, filename)
            text = read_text_from_docx(file_path)
            segmented_text = preprocess_chinese_text(text)
            data.append({'filename': filename, 'text': segmented_text})
            documents_name.append(filename)
    
    # 将数据转换成DataFrame并保存到CSV文件
    df = pd.DataFrame(data)
    df.to_csv(output_filename, index=False, encoding='utf-8')
    
    return documents_name

In [8]:
# 指定包含.docx文件的目录和输出CSV文件的路径
directory = "技术1"
output_filename = "segmented_text.csv"

In [9]:
# 处理指定目录中的所有.docx文件并将结果保存为CSV
document_names = process_documents(directory, output_filename)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\LIUYIN~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.448 seconds.
Prefix dict has been built successfully.


In [10]:
### LDA建模

In [11]:
from gensim import corpora, models
from gensim.models.ldamodel import LdaModel

In [12]:
def load_data(file_path):
    """从CSV文件加载数据。"""
    df = pd.read_csv(file_path)
    # 假设CSV文件中每行包含一个文档的分词文本
    # 将字符串形式的列表转换为真正的列表
    df['text'] = df['text'].apply(eval)
    return df['text'].tolist()

In [13]:
def prepare_corpus(documents):
    """准备语料库和词典，用于LDA模型。"""
    # 创建字典
    dictionary = corpora.Dictionary(documents)
    # 使用字典转换文本数据为词袋模型
    corpus = [dictionary.doc2bow(text) for text in documents]
    return dictionary, corpus

In [14]:
def lda_model(corpus, dictionary, num_topics=5):
    """训练LDA模型并返回。"""
    # 设置训练LDA模型的参数
    lda = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=30, random_state=42)
    return lda


In [15]:
# 加载数据
file_path = 'segmented_text.csv'
documents = load_data(file_path)

In [16]:
# 准备语料库和字典
dictionary, corpus = prepare_corpus(documents)

In [17]:
### 评价LDA模型 ###

In [18]:
#计算coherence score
def coherence(num_topics, dictionary, corpus, documents):
    lda = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=30, random_state=42)
    coherence_model_lda = models.CoherenceModel(model=lda, texts=documents, dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    return coherence_lda


In [19]:
### 储存主题分布 ###

In [20]:
lda = LdaModel.load("C:/pythonProject/train_model/lda_model_35")

In [43]:
lda.show_topics()

[(8,
  '0.099*"姚明" + 0.032*"火箭" + 0.020*"年" + 0.018*"退役" + 0.018*"篮球" + 0.018*"NBA" + 0.011*"生涯" + 0.011*"中国" + 0.011*"火箭队" + 0.010*"职业"'),
 (26,
  '0.022*"中国" + 0.021*"裁判" + 0.016*"姚明" + 0.012*"上海" + 0.012*"篮协" + 0.012*"CBA" + 0.011*"篮球" + 0.009*"男篮" + 0.008*"布里" + 0.007*"邓华德"'),
 (33,
  '0.018*"足球" + 0.013*"教练" + 0.012*"中国" + 0.009*"年" + 0.008*"说" + 0.006*"球员" + 0.006*"俱乐部" + 0.005*"孩子" + 0.005*"国家" + 0.004*"岁"'),
 (3,
  '0.038*"奇才" + 0.017*"阿联" + 0.016*"沃尔" + 0.015*"易建联" + 0.013*"比赛" + 0.011*"中" + 0.010*"赛季" + 0.010*"格里" + 0.009*"格里芬" + 0.008*"步行"'),
 (14,
  '0.030*"说" + 0.020*"记者" + 0.016*"比赛" + 0.016*"球迷" + 0.008*"时" + 0.008*"训练" + 0.007*"做" + 0.007*"想" + 0.006*"采访" + 0.006*"时间"'),
 (17,
  '0.039*"大利" + 0.036*"意大利" + 0.015*"里" + 0.014*"皮" + 0.010*"世界" + 0.008*"意大利队" + 0.008*"新西兰" + 0.008*"西兰" + 0.007*"世界杯" + 0.007*"比赛"'),
 (22,
  '0.056*"法国" + 0.031*"法国队" + 0.014*"世界" + 0.013*"梅内" + 0.013*"内克" + 0.013*"梅内克" + 0.012*"世界杯" + 0.009*"内尔" + 0.008*"阿内" + 0.008*"阿内尔卡"'),
 (18,
  '0.015*"

In [21]:
### 获取每个文档的主题分布
def get_topic_distribution(lda, corpus):
    topic_distribution = []
    for doc in corpus:
        topic_distribution.append(lda.get_document_topics(doc,minimum_probability=0.0))
    return topic_distribution

In [44]:
## 保存主题分布到文件
def save_topic_distribution(lda, corpus, output_filename):
    topic_distribution = get_topic_distribution(lda, corpus)
    df = pd.DataFrame(topic_distribution)
    df.to_csv(output_filename, index=False)

In [45]:
output_filename = 'topic_distribution.csv'
save_topic_distribution(lda, corpus, output_filename)

In [53]:
## 主题可视化 ###

import pyLDAvis.gensim
pyLDAvis.enable_notebook()
data = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
pyLDAvis.save_html(data, 'lda.html')

ValidationError: 
 * Not all rows (distributions) in topic_term_dists sum to 1.

In [25]:
### 计算文档之间的主题距离 ###

In [26]:
# 计算两个两个主题分布之间的曼哈顿距离
def calculate_topic_distance_abs_diff(doc_topics_i, doc_topics_j):
    # 保证每个主题的概率被考虑到，即使某些主题在某文档中的概率为0
    # 将主题分布转换为字典形式
    topic_dist_i = dict(doc_topics_i)
    topic_dist_j = dict(doc_topics_j)
    
    # 获取所有主题的并集
    all_topics = set(topic_dist_i.keys()).union(set(topic_dist_j.keys()))
    distance = sum(abs(topic_dist_i.get(topic, 0) - topic_dist_j.get(topic, 0)) for topic in all_topics)
    return distance

In [27]:
doc_topics = get_topic_distribution(lda, corpus)

In [28]:
# 计算所有文档之间的主题距离 
def calculate_all_topic_distances(doc_topics):
    num_docs = len(doc_topics)
    distances = [[0] * num_docs for _ in range(num_docs)]
    for i in range(num_docs):
        for j in range(i+1, num_docs):
            distance = calculate_topic_distance_abs_diff(doc_topics[i], doc_topics[j])
            distances[i][j] = distances[j][i] = distance
    return distances

In [29]:
# 处理查询并计算查询的主题分布 
def preprocess_query(query, lda, dictionary):
    query_bow = dictionary.doc2bow(jieba.cut_for_search(query, HMM=True))
    query_topics = lda.get_document_topics(query_bow, minimum_probability=0.0)
    return dict(query_topics)

query = '技术'
query_topics = preprocess_query(query, lda, dictionary)

In [30]:
# 将主题距离转换为相似度
# 阈值threshold用于确定两个文档之间是否存在链接，按需调整
def convert_distance_to_similarity(distances, threshold=0.1):
    max_distance = max(max(row) for row in distances if row)
    similarity_matrix = []
    links = []
    for i, row in enumerate(distances):
        new_row = []
        link_row = []
        for j, dist in enumerate(row):
            similarity = 1 - (dist / max_distance) if max_distance else 1
            new_row.append(similarity)
            if similarity > threshold and i != j:
                link_row.append(j)
        similarity_matrix.append(new_row)
        links.append(link_row)
    return similarity_matrix, links

In [31]:
def calculate_query_similarity(query_topics, doc_topics):
    num_docs = len(doc_topics)
    distances = [[0] for _ in range(num_docs)]
    for i in range(num_docs):
        distance = calculate_topic_distance_abs_diff(query_topics, doc_topics[i])
        distances[i] = distance
    max_distance = max(distances) if distances else 0
    query_similarity = [1 - (dist / max_distance) if max_distance else 1 for dist in distances]
    return query_similarity

In [32]:
def adjust_link_weights(doc_topics, query_similarity, links, base_weight=0.01):
    adjusted_weights = []
    for i, topics in enumerate(doc_topics):
        doc_similarity = query_similarity[i]
        # 将基础权重添加到与查询相关的文档上
        row_weights = [base_weight + doc_similarity if j in links[i] else 0 for j in range(len(doc_topics))]
        adjusted_weights.append(row_weights)
    return adjusted_weights


In [33]:
query_similarity = calculate_query_similarity(query_topics, doc_topics)

In [34]:
query_similarity

[0.4601869705915761,
 0.24386147825803794,
 0.15967412767933864,
 0.01820093688090152,
 0.0002984764617403046,
 0.08964853190209754,
 0.3495462898577584,
 0.025152324235296897,
 0.14351344073932715,
 0.0,
 0.20233717625530168,
 0.12719021067869796,
 0.24241894778584272,
 0.2388192418147178,
 0.16795632991711684,
 0.24895084149501012,
 0.020854069671643827,
 0.03811001773482059,
 0.11505013461046687,
 0.014691897979240531]

In [35]:
# 计算PageRank
def page_rank(links, similarities, adjusted_weights, alpha=0.85, convergence_threshold=0.0001):
    N = len(links)
    pr = np.ones(N) / N  # 初始化PR值，总和为1
    # 进行迭代直到收敛
    while True:
        new_pr = np.zeros(N)
        for i in range(N):
            link_contributions = 0
            for j in links[i]:  # 遍历节点i的所有出链节点j
                if len(links[j]) > 0:  # 避免除以零
                    # 结合 adjusted_weights 和 similarities 作为权重
                    link_weight = adjusted_weights[j][i] * similarities[j][i]
                    link_contributions += pr[j] * link_weight / len(links[j])
            new_pr[i] = (1 - alpha) / N + alpha * link_contributions
        # 归一化新的PageRank值，确保它们的总和为1
        new_pr /= np.sum(new_pr)  # 归一化步骤
        
        change = np.linalg.norm(new_pr - pr)
        if change < convergence_threshold:
            break
        pr = new_pr
    return pr

In [36]:
distances = calculate_all_topic_distances(doc_topics)

In [37]:
similarities, links = convert_distance_to_similarity(distances)

In [38]:
adjusted_weights = adjust_link_weights(doc_topics, query_similarity, links)

In [39]:
pr = page_rank(links, similarities, adjusted_weights)

In [40]:
pr

array([0.05211678, 0.05388888, 0.05242332, 0.04033342, 0.05289948,
       0.05356109, 0.05318298, 0.05209765, 0.05222325, 0.05060618,
       0.05266906, 0.05009194, 0.05096044, 0.05037129, 0.04907672,
       0.04955003, 0.05007369, 0.03965215, 0.05395474, 0.04026691])

In [41]:
# 按PageRank值对文档进行排序
pagerank_score = pr
doc_pagerank = list(zip(document_names, pagerank_score))

# 按PageRank分数降序排序
sorted_doc_pagerank = sorted(doc_pagerank, key=lambda x: x[1], reverse=True)


In [42]:
# 打印排序后的结果
for doc_name, pr_score in sorted_doc_pagerank:
    print(f"{doc_name}: {pr_score}")

19_技术与环境的可持续发展.docx: 0.05395474019419387
02_现代技术的发展.docx: 0.05388887738210079
06_技术和社会变革.docx: 0.0535610878609115
07_技术与环境保护.docx: 0.053182978704172865
05_技术进步对经济的推动.docx: 0.05289948463443641
11_技术在航空业的革新.docx: 0.05266906448112079
03_技术与教育.docx: 0.052423319227500834
09_智能技术的未来.docx: 0.05222324828599845
01_技术革命.docx: 0.052116780209683654
08_技术在农业中的应用.docx: 0.052097648914056535
13_数字技术与数据安全.docx: 0.05096043526383481
10_技术在交通领域的变革.docx: 0.05060618368848751
14_技术在金融服务中的应用.docx: 0.050371290933558105
12_技术推动的社会变化.docx: 0.05009194281439715
17_技术与文化的互动.docx: 0.05007369156372026
16_技术对教育的长远影响.docx: 0.049550030748569014
15_技术在建筑行业的应用.docx: 0.04907672411951765
04_技术在医疗中的应用.docx: 0.04033341882045411
20_技术在全球治理中的作用.docx: 0.040266906370827325
18_未来技术的道德挑战.docx: 0.039652145782458355


In [54]:
### 结合点击率（CTR）分析 ###

In [55]:
# 假设 clicks 是上传的点击次数数组
clicks = np.array([5, 10, 15, 20, 25, 5, 10, 15, 20, 25, 5, 10, 15, 20, 25, 5, 10, 15, 20, 25])

In [56]:
# 将点击次数转换为点击率（CTR）
# 这里假设每个文档都展示了相同的次数，CTR = 点击次数 / 最大点击次数
max_clicks = np.max(clicks)
ctr = clicks / max_clicks  # 归一化点击次数作为CTR的简化模型

In [57]:
# 结合PageRank和CTR计算最终排序
def page_rank_with_ctr(links, similarities, ctr, alpha=0.85, beta=0.7, convergence_threshold=0.0001):
    N = len(links)
    pr = np.ones(N) / N  # 初始均等分配PageRank
    change = 1
    while change > convergence_threshold:
        new_pr = np.zeros(N)
        for i in range(N):
            link_contributions = 0
            for j in links[i]:  # 遍历节点i的所有出链节点j
                if len(links[j]) > 0:  # 避免除以零
                    link_contributions += pr[j] * similarities[i][j] / len(links[j])
            new_pr[i] = (1 - alpha) / N + alpha * (beta * link_contributions + (1 - beta) * ctr[i])
        # 归一化新的PageRank值，确保它们的总和为1
        new_pr /= np.sum(new_pr)  # 归一化步骤
        
        change = np.linalg.norm(new_pr - pr)
        pr = new_pr
    return pr


In [58]:
pr_with_ctr = page_rank_with_ctr(links, similarities, ctr)

In [59]:
# 按PageRank值对文档进行排序
pagerank_score = pr_with_ctr
doc_pagerank = list(zip(document_names, pagerank_score))

# 按PageRank分数降序排序
sorted_doc_pagerank = sorted(doc_pagerank, key=lambda x: x[1], reverse=True)

# 打印排序后的结果
for doc_name, pr_score in sorted_doc_pagerank:
    print(f"{doc_name}: {pr_score}")

10_技术在交通领域的变革.docx: 0.08001758953702781
05_技术进步对经济的推动.docx: 0.07955876089571845
15_技术在建筑行业的应用.docx: 0.0786462121886433
20_技术在全球治理中的作用.docx: 0.07771522271598984
19_技术与环境的可持续发展.docx: 0.0658173710786655
09_智能技术的未来.docx: 0.06495895339669519
14_技术在金融服务中的应用.docx: 0.0643529890581417
04_技术在医疗中的应用.docx: 0.062285011410169504
03_技术与教育.docx: 0.05109436418426403
08_技术在农业中的应用.docx: 0.050777050122811855
13_数字技术与数据安全.docx: 0.05039450388727602
18_未来技术的道德挑战.docx: 0.048297693445973185
12_技术推动的社会变化.docx: 0.03616673477649371
02_现代技术的发展.docx: 0.035528903335643014
07_技术与环境保护.docx: 0.03529284166733676
17_技术与文化的互动.docx: 0.03480940285410139
06_技术和社会变革.docx: 0.022021507697512224
11_技术在航空业的革新.docx: 0.021045707157936307
01_技术革命.docx: 0.0207020902725302
16_技术对教育的长远影响.docx: 0.020517090317070028
