In [74]:
import os
import sys
import yaml
import time
from tqdm import tqdm
from src.utils.logger import setup_logger
from src.utils.data_loader import DataLoader
from src.feature_extraction import FeatureExtractor
from src.fingerprint.minhash import MinHash
from src.fingerprint.simhash import SimHash
from src.fingerprint.bitsampling import BitSampling
from src.lsh.lsh_index import MinHashLSHIndex, SimHashLSHIndex, BitSamplingLSHIndex
from src.lsh.evaluation import Evaluator
from src.preprocessing import preprocess_text

In [60]:


def load_config(config_path: str) -> dict:
    with open(config_path, "r") as file:
        return yaml.safe_load(file)

In [61]:
# 加载配置文件
config_path = "config/config.yaml"
config = load_config(config_path)

log_file = config["logging"]["log_file"]
log_level = config["logging"]["log_level"]
logger = setup_logger(log_file, log_level)
logger.info("系统启动，加载配置完成。")

2025-04-12 16:51:09,319 - data/processed/system.log - INFO - 系统启动，加载配置完成。


In [62]:
# 加载数据
data_loader = DataLoader()
raw_data_path = config["data"]["raw_data_path"]
logger.info(f"加载数据路径：{raw_data_path}")

raw_data = []

if os.path.isfile(raw_data_path) and raw_data_path.endswith(".parquet"):
    try:
        logger.info(f"加载单个文件：{raw_data_path}")
        raw_data = data_loader.load_data(raw_data_path)
    except Exception as e:
        logger.error(f"文件加载失败：{e}")
elif os.path.isdir(raw_data_path):
    logger.info(f"加载目录中的 Parquet 文件：{raw_data_path}")
    parquet_files = [os.path.join(raw_data_path, f) for f in os.listdir(
        raw_data_path) if f.endswith(".parquet")]
    for file_path in parquet_files:
        try:
            logger.info(f"加载文件：{file_path}")
            raw_data.extend(data_loader.load_data(file_path))
        except Exception as e:
            logger.warning(f"{file_path} 加载失败：{e}")
else:
    logger.error("无效路径，退出。")
    raise SystemExit()

logger.info(f"数据加载完成，共加载 {len(raw_data)} 条记录。")

print(raw_data[:5])

2025-04-12 16:51:11,969 - data/processed/system.log - INFO - 加载数据路径：data/raw/test/0000.parquet
2025-04-12 16:51:12,293 - data/processed/system.log - INFO - 加载单个文件：data/raw/test/0000.parquet
2025-04-12 16:51:12,626 - data/processed/system.log - INFO - 数据加载完成，共加载 81137 条记录。


['\n_START_ARTICLE_\n1882 Prince Edward Island general election\n_START_PARAGRAPH_\nThe 1882 Prince Edward Island election was held on May 8, 1882 to elect members of the House of Assembly of the province of Prince Edward Island, Canada. It was won by the Conservative Party._NEWLINE_The election is currently listed on the website of Elections Prince Edward Island as taking place in 1883 — however, contemporaneous sources place the election in 1882.', '\n_START_ARTICLE_\n1917 Spalding by-election\n_START_PARAGRAPH_\nThe Spalding by-election, 1917 was a parliamentary by-election held for the House of Commons constituency of Spalding  in Lincolnshire on 25 October 1917.\n_START_SECTION_\nVacancy\n_START_PARAGRAPH_\nThe by-election was caused by the death of the sitting Liberal MP, the Hon. Francis McLaren. McClaren was the younger son of Charles McLaren, 1st Baron Aberconway. He was first elected as Liberal MP for Spalding at the January 1910 general election. In 1916, he joined the Royal

In [58]:
# 读取“data/results/candidate_pairs.csv”并print出前五个候选对
candidate_pairs_path = "data/results/candidate_pairs.csv"

with open(candidate_pairs_path, "r") as f:
    candidate_pairs = [tuple(map(int, line.strip().split(","))) for line in f]
print(candidate_pairs)
candidate_pairs = list(candidate_pairs)
pair = candidate_pairs[1]
print(pair)
print(raw_data[pair[0]])
print()
print(raw_data[pair[1]])
print("-----")

[(22063, 57729), (2491, 76015), (24349, 31749), (7079, 40538), (10399, 46220), (8943, 30104), (56284, 61956), (62612, 67014), (64057, 71823), (18953, 31217), (23907, 74909), (15301, 69076), (14397, 40525), (28131, 75219), (68039, 80450), (23899, 39133), (14573, 36666), (39607, 55946), (8943, 71839), (30571, 51957), (58610, 63109), (57729, 79081), (64153, 77511), (24312, 77474), (29789, 35297), (55413, 72544), (38800, 40727), (2491, 37591), (66145, 67528), (12132, 24349), (5989, 77776), (27482, 42357), (10271, 44205), (7079, 52035), (35759, 74209), (27332, 57087), (38800, 63623), (36666, 44740), (14277, 51957), (49793, 52418), (11230, 62569), (12047, 25967), (31217, 52418), (9571, 55170), (27572, 77484), (8130, 26451), (71127, 71839), (39474, 55430), (26443, 54221), (68578, 75079), (31217, 40525), (38197, 55043), (27, 20500), (45746, 74965), (37073, 52418), (15143, 64679), (3082, 54517), (20371, 36144), (58003, 63576), (40538, 43142), (8069, 8391), (61425, 79450), (37073, 40525), (5931,

In [28]:
# 文本处理和特征提取示例
logger.info("开始文本预处理...")
preprocessed_data = [preprocess_text(text) for text in raw_data]

feature_method = config["feature_extraction"]["method"]
ngram_size = config["feature_extraction"].get("ngram_size", 3)

logger.info(f"特征提取方法：{feature_method}")
extractor = FeatureExtractor(method=feature_method, n=ngram_size)
features = [extractor.extract_features(text) for text in preprocessed_data]
logger.info("预处理和特征提取完成。")

2025-04-12 14:06:57,484 - data/processed/system.log - INFO - 开始文本预处理...
2025-04-12 14:07:12,665 - data/processed/system.log - INFO - 特征提取方法：ngram
2025-04-12 14:07:36,143 - data/processed/system.log - INFO - 预处理和特征提取完成。


In [29]:
# 指纹生成
fingerprint_method = config["fingerprint"]["method"]
logger.info(f"开始指纹生成，方法：{fingerprint_method}")

if fingerprint_method == "minhash":
    minhash = MinHash(config["fingerprint"]["num_hashes"],
                      seed=config["fingerprint"].get("seed"))
    signatures = [minhash.compute_signature(
        f) for f in tqdm(features, desc="生成 MinHash 签名")]
elif fingerprint_method == "simhash":
    simhash = SimHash(hash_bits=config["fingerprint"]["hash_bits"])
    signatures = [simhash.compute_signature(
        f) for f in tqdm(features, desc="生成 SimHash 签名")]
elif fingerprint_method == "bitsampling":
    bitsampling = BitSampling(
        sample_size=config["fingerprint"]["sample_size"],
        hash_bits=config["fingerprint"]["hash_bits"],
        seed=config["fingerprint"].get("seed")
    )
    signatures = [bitsampling.compute_signature(
        f) for f in tqdm(features, desc="生成 BitSampling 签名")]
else:
    logger.error("未知指纹方法，退出。")
    raise SystemExit()

# 可选：保存签名
fingerprint_output_path = config["output"]["fingerpritnts_path"]
data_loader.save_signatures(signatures, fingerprint_output_path)
logger.info(f"签名保存至：{fingerprint_output_path}")

2025-04-12 14:07:36,154 - data/processed/system.log - INFO - 开始指纹生成，方法：minhash
生成 MinHash 签名: 100%|██████████| 162274/162274 [00:56<00:00, 2893.43it/s]
2025-04-12 14:08:38,994 - data/processed/system.log - INFO - 签名保存至：data/processed/fingerprints.csv


数据已成功保存为 CSV 文件: data/processed/fingerprints.csv


In [68]:
# 从文件中提取签名
# data/processed/fingerprints.csv
# 文件路径
fingerprint_input_path = "data/processed/fingerprints.csv"

# 读取 CSV 文件内容
with open(fingerprint_input_path, "r") as file:
    fingerprints = [int(line.strip()) for line in file.readlines()[1:]]

In [76]:
# LSH 索引构建
lsh_method = config["lsh"]["method"]
logger.info(f"开始构建 LSH 索引，方法：{lsh_method}")

if lsh_method == "minhash":
    lsh_index = MinHashLSHIndex(
        config["lsh"]["num_bands"], config["lsh"]["rows_per_band"])
elif lsh_method == "simhash":
    lsh_index = SimHashLSHIndex(radius=config["lsh"]["radius"])
elif lsh_method == "bitsampling":
    lsh_index = BitSamplingLSHIndex(
        config["lsh"]["num_hash_tables"], config["lsh"]["bits_per_table"])
else:
    logger.error("未知 LSH 方法，退出。")
    raise SystemExit()

lsh_index.index(signatures)
candidate_pairs = lsh_index.get_candidate_pairs()
logger.info(f"生成候选文档对数量：{len(candidate_pairs)}")

2025-04-12 17:04:26,653 - data/processed/system.log - INFO - 开始构建 LSH 索引，方法：bitsampling


TypeError: unsupported operand type(s) for &: 'list' and 'int'

In [31]:
# 评估
evaluator = Evaluator(candidate_pairs)
duplicate_rate = evaluator.compute_duplicate_rate()
logger.info(f"候选对中的近重复文档比率：{duplicate_rate:.2f}")

2025-04-12 14:09:41,646 - data/processed/system.log - INFO - 候选对中的近重复文档比率：1.00


In [32]:
# 保存候选对
results_path = config["output"]["results_path"]
os.makedirs(os.path.dirname(results_path), exist_ok=True)

with open(results_path, "w") as f:
    for pair in candidate_pairs:
        f.write(f"{pair[0]},{pair[1]}\n")

logger.info(f"候选对写入完成：{results_path}")

2025-04-12 14:09:41,666 - data/processed/system.log - INFO - 候选对写入完成：data/results/candidate_pairs.csv
