***Bert获得文章向量***

In [None]:
import ijson
from cogdl.oag import oagbert
import torch
from tqdm import tqdm
import json

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained OAG-BERT v2 tokenizer/model pair and move the model to the device.
tokenizer, model = oagbert("oagbert-v2")
model.to(device)
# The model is only used for inference here: switch to eval mode so dropout is
# disabled and the embeddings are deterministic (torch.no_grad() alone does not
# turn dropout off).
model.eval()

# Embed a single paper record with OAG-BERT.
def process_paper(paper, model, device):
    """Return the pooled OAG-BERT output for one paper as a NumPy array.

    Args:
        paper: Mapping read with ``.get`` for "Title", "Abstract", "Authors",
            "Journal" and "Affiliations"; missing keys fall back to an empty
            string or an empty list.
        model: Object providing ``build_inputs(...)`` and ``bert.forward(...)``.
        device: Torch device the input tensors are moved to.

    Returns:
        ``pooled_output`` moved to the CPU, detached and converted with ``.numpy()``.
    """
    def as_batch(values):
        # LongTensor with a leading batch dimension of 1, placed on the device.
        return torch.LongTensor(values).unsqueeze(0).to(device)

    # The records carry no concept field, so an empty list is passed.
    built = model.build_inputs(
        title=paper.get("Title", ""),
        abstract=paper.get("Abstract", ""),
        venue=paper.get("Journal", ""),
        authors=paper.get("Authors", []),
        concepts=[],
        affiliations=paper.get("Affiliations", []),
    )
    # Only five of the eight returned values are fed to the encoder.
    (input_ids, input_masks, token_type_ids, _masked_lm_labels,
     position_ids, position_ids_second, _masked_positions, _num_spans) = built

    forward_kwargs = dict(
        input_ids=as_batch(input_ids),
        token_type_ids=as_batch(token_type_ids),
        attention_mask=as_batch(input_masks),
        output_all_encoded_layers=False,
        checkpoint_activations=False,
        position_ids=as_batch(position_ids),
        position_ids_second=as_batch(position_ids_second),
    )

    # Forward pass without gradient tracking.
    with torch.no_grad():
        _sequence_output, pooled_output = model.bert.forward(**forward_kwargs)

    # Bring the result back to the CPU before converting to NumPy.
    return pooled_output.cpu().detach().numpy()

# Load the citation graph; its keys are the indices of the papers to embed.
with open("network_graph.json", "r") as f:
    network_graph = json.load(f)
    target_indices = list(network_graph.keys())  # indices of every node in the graph

# Set copy of the indices: the loop below performs one membership test per paper
# (about two million), which is O(1) on a set but a linear scan on the list.
target_index_set = set(target_indices)

# Open the output file; one JSON object is written per line.
with open("output_vectors.json", "w") as file:
    # Stream the input with ijson instead of loading the whole file into memory.
    with open("Aminer-Paper.json", "r", encoding="utf-8") as infile:
        papers = ijson.items(infile, 'item')

        # tqdm shows progress; total is the hard-coded expected number of papers.
        for paper in tqdm(papers, total=2092356, desc="Processing Papers"):
            index = paper.get("Index")
            if str(index) in target_index_set:
                output = process_paper(paper, model, device)
                # One vector per line.
                file.write(json.dumps({"Index": index, "Vector": output.tolist()}) + "\n")

print("处理完成，输出向量已保存至output_vectors.json")

***筛选数据集***

***首先要找出到底有哪些文章，再对文章对进行随机配对处理，然后求出这些随机引用对的 LOF、Cos、LDA 分数，并将其标记为 flag = 1（异常）***

***再挑选非常合适的正常数据对作为正常样本，标记为 flag = 0***

In [101]:
import json
import random

def load_network_graph(file_name):
    """Load the citation graph stored as JSON in `file_name`.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default encoding.

    Returns:
        The decoded JSON value.
    """
    with open(file_name, "r", encoding="utf-8") as file:
        return json.load(file)

def get_unique_article_ids(network_graph):
    """Return every distinct article id mentioned in the graph as a list of ints.

    Both the graph keys and the entries of each node's 'in' and 'out' lists
    are converted with ``int`` and collected.
    """
    seen = set()
    for node, links in network_graph.items():
        seen.add(int(node))
        for direction in ('in', 'out'):
            seen.update(int(neighbor) for neighbor in links[direction])
    return list(seen)

def generate_random_citations(article_ids, network_graph, count=2000):
    """Sample `count` distinct random (cite, cited) pairs that are not known edges.

    Args:
        article_ids: Sequence of article ids to draw from.
        network_graph: Mapping of article id -> links dict. For every key, the
            pairs ``(int(key), int(x))`` for ``x`` in its 'in' list are treated
            as existing and are never generated.
        count: Number of pairs to generate.

    Returns:
        A list of `count` unique ``(cite, cited)`` tuples with ``cite != cited``.

    Raises:
        ValueError: If fewer than `count` admissible pairs exist. Without this
            check the rejection-sampling loop below would never terminate.
    """
    # NOTE(review): only the 'in' lists are excluded, stored as (key, neighbour);
    # confirm this orientation matches Cite/Cited and whether 'out' links should
    # be excluded as well.
    existing_pairs = {
        (int(article_id), int(cited))
        for article_id, links in network_graph.items()
        for cited in links['in']
    }

    # Upper bound on how many pairs can ever be produced from these ids.
    candidates = set(article_ids)
    blocked = sum(
        1
        for cite, cited in existing_pairs
        if cite != cited and cite in candidates and cited in candidates
    )
    available = len(candidates) * (len(candidates) - 1) - blocked
    if count > available:
        raise ValueError(
            f"cannot generate {count} random citation pairs: only {available} are possible"
        )

    generated_pairs = set()
    while len(generated_pairs) < count:
        cite = random.choice(article_ids)
        cited = random.choice(article_ids)
        # Adding to a set is idempotent, so duplicates need no explicit check.
        if cite != cited and (cite, cited) not in existing_pairs:
            generated_pairs.add((cite, cited))

    return list(generated_pairs)

# Build the pool of article ids from the citation graph and sample random pairs
# (default count of generate_random_citations) that are not existing edges.
network_graph_file = 'network_graph.json'
network_graph = load_network_graph(network_graph_file)
article_ids = get_unique_article_ids(network_graph)
random_citations = generate_random_citations(article_ids, network_graph)

# Every sampled pair is stored with Flag = 1.
json_structure = [{"Cite": pair[0], "Cited": pair[1],"Flag":1,} for pair in random_citations]
with open("random_citations.json", "w") as json_file:
    json.dump(json_structure, json_file, indent=4)



In [103]:
import json

def merge_network_graphs(network_graph_file, random_citations_file, output_file):
    """Append the random citation pairs to the converted citation list.

    Args:
        network_graph_file: JSON file holding a list of entries that each have
            an "index" key.
        random_citations_file: JSON file holding a list of entries with "Cite",
            "Cited" and optionally "Flag".
        output_file: Path the merged list is written to (JSON, indent 4).

    Appended entries are numbered consecutively after the largest existing
    "index" and carry "flag" taken from the entry's "Flag" (default 1).
    """
    with open(network_graph_file, 'r', encoding='utf-8') as file:
        network_graph = json.load(file)

    with open(random_citations_file, 'r', encoding='utf-8') as file:
        random_citations = json.load(file)

    # default=0 keeps an empty graph from raising ValueError; numbering then starts at 1.
    max_index = max((item["index"] for item in network_graph), default=0)

    network_graph.extend(
        {
            "index": i,
            "Cite": citation["Cite"],
            "Cited": citation["Cited"],
            "flag": citation.get("Flag", 1),
        }
        for i, citation in enumerate(random_citations, start=max_index + 1)
    )

    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(network_graph, file, indent=4)

# Input: converted citation list plus the random pairs; output: the merged list.
network_graph_file = 'converted_network_graph.json'
random_citations_file = 'random_citations.json'
output_file = 'merged_network_graph.json'

merge_network_graphs(network_graph_file, random_citations_file, output_file)


***LOF算法***

In [104]:
import json
import numpy as np
from sklearn.neighbors import LocalOutlierFactor

# Collect every distinct article id referenced by the citation pairs.
def get_unique_article_ids(increment_data):  # increment_data is assumed to already contain both the original and the added pairs
    """Return the distinct "Cite" and "Cited" values of `increment_data` as a list."""
    seen = set()
    for pair in increment_data:
        seen.update((pair["Cite"], pair["Cited"]))
    return list(seen)

# Read the vectors of the requested articles from a JSON Lines file.
def get_article_vectors(article_ids, vector_file):
    """Return ``{article_id: vector}`` for the requested ids found in `vector_file`.

    Args:
        article_ids: Iterable of (hashable) article ids to keep.
        vector_file: Path to a file with one JSON object per line, each having
            an "Index" and a "Vector" key.

    Returns:
        Dict mapping each matching "Index" to ``data["Vector"][0]`` (only the
        first vector of the entry is used).
    """
    # Build the lookup once: membership is tested for every line of the file.
    wanted = set(article_ids)
    vectors = {}
    with open(vector_file, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline) instead of crashing.
            if not line.strip():
                continue
            data = json.loads(line)
            article_id = data["Index"]
            if article_id in wanted:
                vectors[article_id] = data["Vector"][0]
    return vectors

def detect_anomalies_with_lof(vectors, n_neighbors=20, contamination='auto'):
    """Run Local Outlier Factor over the article vectors.

    Args:
        vectors: Mapping of article id -> feature vector. The values are
            stacked, in the mapping's iteration order, into the array that is
            fitted.
        n_neighbors: Number of neighbours used by LOF (default 20).
        contamination: Expected proportion of outliers, or 'auto'.

    Returns:
        A tuple ``(is_anomaly, scores)``:
        is_anomaly: Result of ``fit_predict``; 1 marks an inlier, -1 an outlier.
        scores: ``-negative_outlier_factor_``, i.e. positive values where a
            LARGER score means the point is more likely an anomaly.
    """
    data = np.array(list(vectors.values()))
    lof = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=contamination, novelty=False)
    is_anomaly = lof.fit_predict(data)

    # Flip the sign so that higher means more anomalous.
    scores = -lof.negative_outlier_factor_

    # An alternative labelling that was sketched here but never used: mark points
    # whose score exceeds mean(scores) + 2 * std(scores) as -1, all others as 1.
    return is_anomaly, scores

def map_scores_to_citations(increment_data, vectors, scores):
    """Attach a combined LOF score to every citation pair.

    Args:
        increment_data: Iterable of dicts with "Cite" and "Cited" keys.
        vectors: Mapping whose keys are paired, in iteration order, with `scores`.
        scores: Per-article scores aligned with ``vectors.keys()``.

    Returns:
        List of ``{"Cite", "Cited", "LOF_Score"}`` dicts, where the score is
        the mean of the two article scores.
    """
    score_by_article = dict(zip(vectors.keys(), scores))

    def pair_record(pair):
        cite_id, cited_id = pair["Cite"], pair["Cited"]
        # Articles without a vector contribute a score of 0.
        # The plain average is a simple combination strategy and is open to discussion.
        total = score_by_article.get(cite_id, 0) + score_by_article.get(cited_id, 0)
        return {"Cite": cite_id, "Cited": cited_id, "LOF_Score": total / 2}

    return [pair_record(pair) for pair in increment_data]


# ########################################################################################################################################
# Overall inputs of the LOF stage: output_vector.json, converted_network_graph.json, the result left over from the previous iteration, and the well-scoring flag = 0 citation pairs (converted to the result format)
# Output of the LOF stage: the result list (reusable in the next incremental-learning iteration and in the later score fusion)
# #########################################################################################################################################

# Load the incremental data (the merged citation list).
with open("merged_network_graph.json", "r") as f:
    increment_data = json.load(f)

article_ids = get_unique_article_ids(increment_data)
vectors = get_article_vectors(article_ids, "output_vectors.json")  # make sure the file name is correct
is_anomaly, scores = detect_anomalies_with_lof(vectors)
results = map_scores_to_citations(increment_data, vectors, scores)
# Persist the per-pair LOF scores.
with open("LOF.json", 'w', encoding='utf-8') as file:
    json.dump(results, file, indent=4)

***LDA***

In [105]:
import json
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm
import nltk

# Ensure the necessary NLTK downloads
nltk.download('stopwords')

# LDA hyper-parameters used when the model is trained below.
num_topics = 20
passes = 30

# Load the JSON file and extract the abstracts.
input_filename = './abstracts.json'
with open(input_filename, 'r', encoding='utf-8') as file:
    # Load the data.
    data = json.load(file)
    # Keep only the items whose abstract is non-empty.
    filtered_data = [item for item in data if item['Abstract']]
    
# Extract the abstracts from the filtered data.
documents = [item['Abstract'] for item in filtered_data]

# Document preprocessing: tokenise, drop stop words, stem.
def preprocess(documents):
    """Return one list of stemmed, stop-word-free tokens per document.

    Each document is lower-cased, split with the ``\\w+`` regular expression,
    filtered against the English stop-word list and stemmed with the Porter
    stemmer. Progress is reported through tqdm.
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    porter = PorterStemmer()
    english_stop_words = set(stopwords.words('english'))

    def clean(doc):
        words = word_tokenizer.tokenize(doc.lower())
        return [porter.stem(word) for word in words if word not in english_stop_words]

    return [clean(doc) for doc in tqdm(documents, desc="Preprocessing documents")]

# Preprocess the abstracts.
preprocessed_texts = preprocess(documents)

# Build the dictionary and the bag-of-words corpus.
dictionary = corpora.Dictionary(preprocessed_texts)
corpus = [dictionary.doc2bow(text) for text in preprocessed_texts]

# Train the LDA model.
lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=passes, update_every=1, eval_every=None)

# Print and save the topics. print_topics() returns 20 topics by default, so
# num_topics is passed explicitly to keep the listing complete if it is raised.
output_filename = 'lda_topics.txt'
with open(output_filename, 'w', encoding='utf-8') as out_file:
    for i, topic in enumerate(lda_model.print_topics(num_topics=num_topics, num_words=10)):
        print(f"Topic {i+1}: {topic}")
        out_file.write(f"Topic {i+1}: {topic}\n")

print(f'LDA模型的主题已保存到文件: {output_filename}')

# Save the per-document topic distributions as JSON.
def save_document_topics_as_vectors_json(data, corpus, lda_model, num_topics, output_filename):
    """Write one dense topic vector per document to `output_filename`.

    Args:
        data: Sequence of records with an 'Index' key, aligned with `corpus`.
        corpus: Bag-of-words representation of each document.
        lda_model: Model queried through ``get_document_topics``.
        num_topics: Length of every output vector.
        output_filename: Destination JSON file (UTF-8, indent 4).
    """
    records = []
    for record, bow in zip(data, corpus):
        dense = [0.0] * num_topics
        # minimum_probability=0.0 is passed so low-probability topics are not filtered out.
        for topic_id, prob in lda_model.get_document_topics(bow, minimum_probability=0.0):
            dense[topic_id] = float(prob)
        records.append({"Index": record['Index'], "Vector": dense})

    with open(output_filename, 'w', encoding='utf-8') as out_file:
        json.dump(records, out_file, indent=4)

# Only the records with a non-empty abstract (filtered_data) are aligned with corpus.
doc_topics_vectors_filename = 'document_topics_vectors.json'
save_document_topics_as_vectors_json(filtered_data, corpus, lda_model, num_topics, doc_topics_vectors_filename)

print(f'每篇文章的主题分布向量及其索引已保存到文件: {doc_topics_vectors_filename}')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\luzixuan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Preprocessing documents: 100%|██████████| 11999/11999 [00:22<00:00, 540.65it/s]


Topic 1: (0, '0.122*"busi" + 0.089*"process" + 0.063*"e" + 0.036*"event" + 0.036*"tempor" + 0.033*"spatial" + 0.027*"g" + 0.018*"time" + 0.015*"geograph" + 0.012*"organis"')
Topic 2: (1, '0.089*"databas" + 0.085*"data" + 0.067*"system" + 0.037*"manag" + 0.030*"workflow" + 0.023*"secur" + 0.021*"applic" + 0.019*"transact" + 0.016*"mine" + 0.015*"design"')
Topic 3: (2, '0.035*"book" + 0.023*"learn" + 0.018*"use" + 0.015*"chapter" + 0.014*"student" + 0.014*"2" + 0.014*"includ" + 0.013*"1" + 0.012*"publish" + 0.011*"design"')
Topic 4: (3, '0.259*"web" + 0.057*"applic" + 0.044*"site" + 0.035*"page" + 0.034*"warehous" + 0.029*"user" + 0.020*"navig" + 0.017*"internet" + 0.016*"hypertext" + 0.016*"link"')
Topic 5: (4, '0.112*"test" + 0.081*"graph" + 0.056*"visual" + 0.024*"gener" + 0.022*"imag" + 0.018*"graphic" + 0.016*"diagram" + 0.015*"label" + 0.014*"structur" + 0.014*"node"')
Topic 6: (5, '0.055*"algorithm" + 0.041*"time" + 0.036*"problem" + 0.020*"state" + 0.018*"comput" + 0.015*"optim" 

In [106]:
import json
import numpy as np

def load_json(filename):
    """Return the decoded content of the JSON file `filename`.

    The file is read as UTF-8 explicitly, matching how this notebook writes its
    JSON files, instead of relying on the platform's default encoding.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        return json.load(file)

def infer_topic_vector_for_article(article_index, network_graph, lda_vectors):
    """Estimate an article's topic vector as the mean of its neighbours' vectors.

    Args:
        article_index: Article id; looked up in `network_graph` as a string.
        network_graph: Mapping of id -> dict with optional 'in' and 'out' lists.
        lda_vectors: Mapping of string id -> topic vector.

    Returns:
        The element-wise mean (NumPy array) over all 'in' and 'out' neighbours
        that have a vector, or None when no neighbour has one.
    """
    links = network_graph.get(str(article_index), {})
    neighbor_keys = (str(n) for n in links.get('in', []) + links.get('out', []))
    known = [lda_vectors[key] for key in neighbor_keys if key in lda_vectors]

    if not known:
        return None
    return np.mean(known, axis=0)

def main():
    """Infer topic vectors for graph nodes that have none (first round).

    Reads 'document_topics_vectors.json' and 'network_graph.json', infers a
    vector from the neighbours of every node without one, and writes the
    inferred vectors to 'inferred_LDA_vectors.json'.
    """
    # Index the known vectors by the string form of their article index.
    lda_vectors = {
        str(entry["Index"]): entry["Vector"]
        for entry in load_json('document_topics_vectors.json')
    }
    network_graph = load_json('network_graph.json')

    inferred_vectors = {}
    for article_index in network_graph.keys():
        if str(article_index) in lda_vectors:
            continue
        vector = infer_topic_vector_for_article(article_index, network_graph, lda_vectors)
        if vector is not None:
            inferred_vectors[str(article_index)] = vector.tolist()

    # Persist the inferred vectors.
    with open('inferred_LDA_vectors.json', 'w') as outfile:
        json.dump(inferred_vectors, outfile, indent=4)


if __name__ == "__main__":
    main()


In [107]:
import json

def load_json(filename):
    """Return the decoded content of the JSON file `filename`.

    The file is read as UTF-8 explicitly, matching how this notebook writes its
    JSON files, instead of relying on the platform's default encoding.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        return json.load(file)

def convert_to_lda_format(inferred_vectors):
    """Turn ``{index: vector}`` into a list of ``{"Index": int, "Vector": vector}`` records."""
    records = []
    for index, vector in inferred_vectors.items():
        records.append({"Index": int(index), "Vector": vector})
    return records

def main():
    """Merge the first-round inferred vectors into the LDA vector list.

    Reads 'document_topics_vectors.json' and 'inferred_LDA_vectors.json',
    concatenates them, sorts by "Index" and writes 'LDA_vectors_None1.json'.
    """
    base_vectors = load_json('document_topics_vectors.json')
    inferred_vectors = load_json('inferred_LDA_vectors.json')

    # Bring the inferred vectors into the same record format, then combine.
    merged_vectors = base_vectors + convert_to_lda_format(inferred_vectors)
    merged_vectors.sort(key=lambda entry: entry["Index"])

    with open('LDA_vectors_None1.json', 'w', encoding='utf-8') as outfile:
        json.dump(merged_vectors, outfile, indent=4, ensure_ascii=False)


if __name__ == "__main__":
    main()


In [108]:
import json
import numpy as np

def load_json(filename):
    """Return the decoded content of the JSON file `filename`.

    The file is read as UTF-8 explicitly, matching how this notebook writes its
    JSON files, instead of relying on the platform's default encoding.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        return json.load(file)

def infer_topic_vector_for_article(article_index, network_graph, lda_vectors):
    """Estimate an article's topic vector as the mean of its neighbours' vectors.

    Args:
        article_index: Article id; looked up in `network_graph` as a string.
        network_graph: Mapping of id -> dict with optional 'in' and 'out' lists.
        lda_vectors: Mapping of string id -> topic vector.

    Returns:
        The element-wise mean (NumPy array) over all 'in' and 'out' neighbours
        that have a vector, or None when no neighbour has one.
    """
    links = network_graph.get(str(article_index), {})
    neighbor_keys = (str(n) for n in links.get('in', []) + links.get('out', []))
    known = [lda_vectors[key] for key in neighbor_keys if key in lda_vectors]

    if not known:
        return None
    return np.mean(known, axis=0)

def main():
    """Infer topic vectors for graph nodes that still have none (second round).

    Reads 'LDA_vectors_None1.json' and 'network_graph.json', infers a vector
    from the neighbours of every node without one, and writes the inferred
    vectors to 'inferred_LDA_vectors_2.json'.
    """
    # Index the known vectors by the string form of their article index.
    lda_vectors = {
        str(entry["Index"]): entry["Vector"]
        for entry in load_json('LDA_vectors_None1.json')
    }
    network_graph = load_json('network_graph.json')

    inferred_vectors = {}
    for article_index in network_graph.keys():
        if str(article_index) in lda_vectors:
            continue
        vector = infer_topic_vector_for_article(article_index, network_graph, lda_vectors)
        if vector is not None:
            inferred_vectors[str(article_index)] = vector.tolist()

    # Persist the inferred vectors.
    with open('inferred_LDA_vectors_2.json', 'w') as outfile:
        json.dump(inferred_vectors, outfile, indent=4)


if __name__ == "__main__":
    main()


In [109]:
import json

def load_json(filename):
    """Return the decoded content of the JSON file `filename`.

    The file is read as UTF-8 explicitly, matching how this notebook writes its
    JSON files, instead of relying on the platform's default encoding.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        return json.load(file)

def convert_to_lda_format(inferred_vectors):
    """Turn ``{index: vector}`` into a list of ``{"Index": int, "Vector": vector}`` records."""
    records = []
    for index, vector in inferred_vectors.items():
        records.append({"Index": int(index), "Vector": vector})
    return records

def main():
    """Merge the second-round inferred vectors into the LDA vector list.

    Reads 'LDA_vectors_None1.json' and 'inferred_LDA_vectors_2.json',
    concatenates them, sorts by "Index" and writes 'LDA_vectors_None2.json'.
    """
    base_vectors = load_json('LDA_vectors_None1.json')
    inferred_vectors = load_json('inferred_LDA_vectors_2.json')

    # Bring the inferred vectors into the same record format, then combine.
    merged_vectors = base_vectors + convert_to_lda_format(inferred_vectors)
    merged_vectors.sort(key=lambda entry: entry["Index"])

    with open('LDA_vectors_None2.json', 'w', encoding='utf-8') as outfile:
        json.dump(merged_vectors, outfile, indent=4, ensure_ascii=False)


if __name__ == "__main__":
    main()


In [110]:
import json
import numpy as np

def load_json(filename):
    """Return the decoded content of the JSON file `filename`.

    The file is read as UTF-8 explicitly, matching how this notebook writes its
    JSON files, instead of relying on the platform's default encoding.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        return json.load(file)

def infer_topic_vector_for_article(article_index, network_graph, lda_vectors):
    """Estimate an article's topic vector as the mean of its neighbours' vectors.

    Args:
        article_index: Article id; looked up in `network_graph` as a string.
        network_graph: Mapping of id -> dict with optional 'in' and 'out' lists.
        lda_vectors: Mapping of string id -> topic vector.

    Returns:
        The element-wise mean (NumPy array) over all 'in' and 'out' neighbours
        that have a vector, or None when no neighbour has one.
    """
    links = network_graph.get(str(article_index), {})
    neighbor_keys = (str(n) for n in links.get('in', []) + links.get('out', []))
    known = [lda_vectors[key] for key in neighbor_keys if key in lda_vectors]

    if not known:
        return None
    return np.mean(known, axis=0)

def main():
    """Infer topic vectors for graph nodes that still have none (third round).

    Reads 'LDA_vectors_None2.json' and 'network_graph.json', infers a vector
    from the neighbours of every node without one, and writes the inferred
    vectors to 'inferred_LDA_vectors_3.json'.
    """
    # Index the known vectors by the string form of their article index.
    lda_vectors = {
        str(entry["Index"]): entry["Vector"]
        for entry in load_json('LDA_vectors_None2.json')
    }
    network_graph = load_json('network_graph.json')

    inferred_vectors = {}
    for article_index in network_graph.keys():
        if str(article_index) in lda_vectors:
            continue
        vector = infer_topic_vector_for_article(article_index, network_graph, lda_vectors)
        if vector is not None:
            inferred_vectors[str(article_index)] = vector.tolist()

    # Persist the inferred vectors.
    with open('inferred_LDA_vectors_3.json', 'w') as outfile:
        json.dump(inferred_vectors, outfile, indent=4)


if __name__ == "__main__":
    main()


In [111]:
import json

def load_json(filename):
    """Return the decoded content of the JSON file `filename`.

    The file is read as UTF-8 explicitly, matching how this notebook writes its
    JSON files, instead of relying on the platform's default encoding.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        return json.load(file)

def convert_to_lda_format(inferred_vectors):
    """Turn ``{index: vector}`` into a list of ``{"Index": int, "Vector": vector}`` records."""
    records = []
    for index, vector in inferred_vectors.items():
        records.append({"Index": int(index), "Vector": vector})
    return records

def main():
    """Merge the third-round inferred vectors into the final LDA vector list.

    Reads 'LDA_vectors_None2.json' and 'inferred_LDA_vectors_3.json',
    concatenates them, sorts by "Index" and writes 'LDA_vectors.json'.
    """
    base_vectors = load_json('LDA_vectors_None2.json')
    inferred_vectors = load_json('inferred_LDA_vectors_3.json')

    # Bring the inferred vectors into the same record format, then combine.
    merged_vectors = base_vectors + convert_to_lda_format(inferred_vectors)
    merged_vectors.sort(key=lambda entry: entry["Index"])

    with open('LDA_vectors.json', 'w', encoding='utf-8') as outfile:
        json.dump(merged_vectors, outfile, indent=4, ensure_ascii=False)


if __name__ == "__main__":
    main()


In [112]:
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def load_json(filename):
    """Return the decoded content of the JSON file `filename`.

    The file is read as UTF-8 explicitly, matching how this notebook writes its
    JSON files, instead of relying on the platform's default encoding.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        return json.load(file)

def calculate_cosine_similarity(vector1, vector2):
    """Return the cosine similarity of two vectors as a scalar.

    Each vector is wrapped into a one-row matrix for ``cosine_similarity``;
    the single entry of the resulting 1x1 matrix is returned.
    """
    similarity_matrix = cosine_similarity([vector1], [vector2])
    return similarity_matrix[0][0]

def main():
    """Score every citation pair by the cosine similarity of its LDA vectors.

    Reads 'LDA_vectors.json' (the merged LDA vectors) and
    'merged_network_graph.json' (the citation pairs) and writes one
    ``{"Cite", "Cited", "LDA_Score"}`` record per pair to 'LDA.json'.
    """
    lda_vectors = load_json('LDA_vectors.json')
    network_graph = load_json('merged_network_graph.json')

    # Lookup table: article index -> topic vector.
    lda_dict = {entry['Index']: entry['Vector'] for entry in lda_vectors}

    def pair_score(cite_id, cited_id):
        cite_vector = lda_dict.get(cite_id)
        cited_vector = lda_dict.get(cited_id)
        # No score (None) when either article has no LDA vector.
        if cite_vector is None or cited_vector is None:
            return None
        return calculate_cosine_similarity(cite_vector, cited_vector)

    results = []
    for relation in network_graph:
        cite_id, cited_id = relation['Cite'], relation['Cited']
        results.append({
            "Cite": cite_id,
            "Cited": cited_id,
            "LDA_Score": pair_score(cite_id, cited_id),
        })

    with open('LDA.json', 'w', encoding='utf-8') as outfile:
        json.dump(results, outfile, indent=4, ensure_ascii=False)


if __name__ == "__main__":
    main()


***Cos方法***

In [114]:
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
def load_json(filename):
    """Return the decoded content of the JSON file `filename`.

    The file is read as UTF-8 explicitly, matching how this notebook writes its
    JSON files, instead of relying on the platform's default encoding.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        return json.load(file)
def load_json_lines(filename):
    """Read a JSON Lines file and return the decoded objects as a list.

    The file is decoded as UTF-8. Blank lines (for example a trailing empty
    line) are skipped instead of raising a JSON decode error.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        return [json.loads(line) for line in file if line.strip()]

def compute_similarity(converted_graph, vectors):
    """Add a "similarity" entry to every citation pair and return the pairs.

    Args:
        converted_graph: Iterable of dicts with "Cite" and "Cited" keys. Each
            dict is modified in place.
        vectors: Iterable of dicts with "Index" and "Vector"; only
            ``Vector[0]`` of each entry is used.

    Returns:
        List of the same dict objects, each with "similarity" set to the
        cosine similarity of the two article vectors, or None when either
        article has no vector.
    """
    lookup = {}
    for record in vectors:
        lookup[record["Index"]] = np.array(record["Vector"][0])

    result = []
    for edge in tqdm(converted_graph, desc="Computing similarities"):
        source_vec = lookup.get(edge["Cite"])
        target_vec = lookup.get(edge["Cited"])
        if source_vec is None or target_vec is None:
            edge["similarity"] = None
        else:
            edge["similarity"] = cosine_similarity([source_vec], [target_vec])[0][0]
        result.append(edge)
    return result

def save_json(data, filename):
    """Write the pairs in `data` to `filename` as Cos-score records.

    Each item must have "Cite", "Cited" and "similarity"; the output is a JSON
    list of ``{"Cite", "Cited", "Cos_Score"}`` dicts (indent 4). Any other keys
    of the items are dropped.
    """
    records = [
        {"Cite": item["Cite"], "Cited": item["Cited"], "Cos_Score": item["similarity"]}
        for item in data
    ]

    with open(filename, 'w', encoding='utf-8') as file:
        json.dump(records, file, indent=4)

# Citation pairs come from the merged graph; vectors from the JSON Lines file.
converted_network_graph = load_json('merged_network_graph.json')
output_vectors = load_json_lines('output_vectors.json')

# Compute the cosine similarity of every pair.
result = compute_similarity(converted_network_graph, output_vectors)

# Store the pairs with their Cos_Score.
save_json(result, 'Cos.json')

print("Completed!")


# NOTE: a disabled earlier copy of this cell used to be kept here inside a string
# literal. It differed from the live code above only in that it read
# 'converted_network_graph.json' and wrote every item (including its
# "similarity" key) as one JSON object per line to
# 'new_file_with_similarities.json'.

Computing similarities: 100%|██████████| 26533/26533 [00:09<00:00, 2666.76it/s]


Completed!


'\nimport json\nimport numpy as np\nfrom sklearn.metrics.pairwise import cosine_similarity\nfrom tqdm import tqdm\ndef load_json(filename):\n    with open(filename, \'r\') as file:\n        return json.load(file)\ndef load_json_lines(filename):\n    with open(filename, \'r\') as file:\n        return [json.loads(line) for line in file]\n\ndef compute_similarity(converted_graph, vectors):\n    vectors_dict = {item["Index"]: np.array(item["Vector"][0]) for item in vectors}\n    result = []\n    for item in tqdm(converted_graph, desc="Computing similarities"):\n        cite_vector = vectors_dict.get(item["Cite"])\n        cited_vector = vectors_dict.get(item["Cited"])\n        if cite_vector is not None and cited_vector is not None:\n            similarity = cosine_similarity([cite_vector], [cited_vector])[0][0]\n        else:\n            similarity = None\n        item["similarity"] = similarity\n        result.append(item)\n    return result\n\ndef save_json(data, filename):\n    with 

***融合***

In [119]:
import json

# Load JSON files
def load_json_file(file_path):
    """Return the decoded content of the JSON file at `file_path`.

    The file is read as UTF-8 explicitly, matching how the score files are
    written, instead of relying on the platform's default encoding.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Merge the JSON data
def merge_json_data(lof_data, cos_data, lda_data, anomaly_count=20):
    """Join the LOF, Cos and LDA scores of every citation pair into one record.

    Args:
        lof_data: List of ``{"Cite", "Cited", "LOF_Score"}`` dicts; it defines
            the order and the 1-based "index" of the output.
        cos_data: List of ``{"Cite", "Cited", "Cos_Score"}`` dicts.
        lda_data: List of ``{"Cite", "Cited", "LDA_Score"}`` dicts.
        anomaly_count: Number of trailing entries of `lof_data` that are
            flagged 1. Defaults to 20, the value that was previously hard-coded.

    Returns:
        List of merged records. "Cos_Score" / "LDA_Score" are None when the
        pair is missing from the respective input, "Score" is an empty list,
        and "flag" is 1 for the last `anomaly_count` records and an empty list
        otherwise.
    """
    cos_dict = {(item["Cite"], item["Cited"]): item["Cos_Score"] for item in cos_data}
    lda_dict = {(item["Cite"], item["Cited"]): item["LDA_Score"] for item in lda_data}

    # Records with an index above this threshold belong to the flagged tail.
    first_flagged_after = len(lof_data) - anomaly_count

    merged_data = []
    for index, lof_item in enumerate(lof_data, start=1):
        pair = (lof_item["Cite"], lof_item["Cited"])
        merged_data.append({
            "index": index,
            "Cite": pair[0],
            "Cited": pair[1],
            "LOF_Score": lof_item["LOF_Score"],
            "Cos_Score": cos_dict.get(pair),
            "LDA_Score": lda_dict.get(pair),
            "Score": [],
            "flag": 1 if index > first_flagged_after else [],
        })

    return merged_data

# Load the three per-pair score files and join them.
lof_json = load_json_file("LOF.json")
cos_json = load_json_file("Cos.json")
lda_json = load_json_file("LDA.json")
merged_json_data = merge_json_data(lof_json, cos_json, lda_json)

# Keep only the records whose LDA_Score is below 1.
# NOTE(review): merge_json_data always sets 'LDA_Score', so the default of 0 is
# never used; a record whose LDA_Score is None raises TypeError here — confirm
# how such records should be handled.
data = [item for item in merged_json_data if item.get('LDA_Score', 0) < 1]

with open("Merged_Data.json", 'w', encoding='utf-8') as file:
    json.dump(data, file, indent=4)

print("Merged data saved to Merged_Data.json")


Merged data saved to Merged_Data.json


***添加flag = 0***

In [120]:
import json

def composite_score(item):
    """Return the combined score ``LDA + Cos + (1 - LOF)`` of one record.

    A score that is missing OR stored as None falls back to its default:
    0 for 'LDA_Score' and 'Cos_Score', 2 for 'LOF_Score'. The None case matters
    because the Cos and LDA stages write None for pairs without vectors, which
    a plain ``dict.get(key, default)`` would pass through and then fail on.
    """
    def value(key, default):
        score = item.get(key)
        return default if score is None else score

    return value('LDA_Score', 0) + value('Cos_Score', 0) + (1 - value('LOF_Score', 2))

# Read the merged records.
with open('Merged_Data.json', 'r') as file:
    data = json.load(file)

# Score every record that is not already flagged 1, then take the indices of
# the 80 highest composite scores.
scored_data = [(item['index'], composite_score(item)) for item in data if item.get('flag') != 1]
sorted_indices = [index for index, _ in sorted(scored_data, key=lambda x: x[1], reverse=True)[:80]]
# Mark those top-scoring records with flag = 0.
for item in data:
    if item['index'] in sorted_indices:
        item['flag'] = 0

# The same file is overwritten in place with the updated flags.
with open('Merged_Data.json', 'w') as file:
    json.dump(data, file, indent=4)