# 中文新词/多词条识别 · 分步版（Colab）
将原始长脚本拆成若干小单元格，便于在 Google Colab 里逐步执行、调参与排错。

> 运行顺序：从上到下依次执行。如果你只改了参数，通常只需重跑 **配置** 之后相关统计单元格即可。

## 0) （可选）挂载 Google Drive
如果你的数据/输出路径在 Drive 上，请先挂载。

In [2]:
# 在 Colab 里执行本单元以挂载 Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 1) 导入与全局配置
- 路径：参考词表、语料 CSV（需包含 `content` 列，`date` 可选）、输出目录
- 阈值：频次、PMI、左右熵、去嵌套比例
- 断句与停用词：根据你的分词器输出微调

In [None]:
from pathlib import Path
import pandas as pd
from collections import Counter, defaultdict
import numpy as np
import unicodedata, re, math

# =========================
# 配置（根据你的实际路径修改）
# =========================
IN_REF = Path("/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/liste de référence/all_ch.txt")
IN_CORPUS = Path("/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/Token/ch2_all_data_2015_2025_tok_thulac.csv")
OUT_DIR = Path("/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# 阈值（可按语料调整）
MINF_UNI, MINDF_UNI = 5, 3     # unigram 词频/文档频
MINF_BI,  MINDF_BI  = 5, 3     # bigram 词频/文档频
MINF_TRI, MINDF_TRI = 3, 2     # trigram 词频/文档频
PMI2_MIN, PMI3_MIN  = 3.5, 2.5 # PMI 阈值
ENT_MIN             = 2.0      # 左右熵阈值（L+R）
ABSORB_RATIO        = 0.80     # 去嵌套：被更长单位覆盖比例阈值
SENT_SEP = set("。！？!?；;：:") # 断句标记（基于分好词后仍保留的中文标点）

STOP = {
    "的","了","和","与","及","等","在","把","被","对","于","之","其",
    "并","或","而","但","若","即","乃","则","所","从","自","向","往",
    "比","将","因","由","以","而是","还是","还是","以及","其中","通过",
    "作为","对于","关于","按照","根据","由于"
}  # 边界停用词 #语法功能词（助词、介词、连词等）；边界性虚词（容易出现在词组/短语边缘，削弱搭配意义）。
print("输出目录：", OUT_DIR)

输出目录： /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification


## 2) 读取参考词表
参考词表用于对候选项标注 `in_reference/new`。若不需要，可置空。

In [None]:
ref_set = set()
with IN_REF.open("r", encoding="utf-8") as f:
    for line in f:
        t = line.strip()
        if t:
            ref_set.add(t)
print(f"参考词表条目数：{len(ref_set)}")

参考词表条目数：126358


## 3) 读取语料并做基础清洗
- 需要 `content` 列（空格分词后的文本）
- `date` 列可选，用于首现年份与“爆发度”等后续扩展

In [None]:
df = pd.read_csv(IN_CORPUS, encoding="utf-8")
if "content" not in df.columns:
    raise ValueError("CSV 缺少 'content' 列。")
df["date_parsed"] = pd.to_datetime(df.get("date", None), errors="coerce")
df["year"] = df["date_parsed"].dt.year
df["content"] = df["content"].fillna("").astype(str)

def normalize(s: str) -> str:
    s = unicodedata.normalize("NFKC", s)
    s = s.replace("\u3000", " ").strip()
    return s

df["content"] = df["content"].map(normalize)
print(df.shape, df.columns.tolist())
df.head(2)

(1282, 6) ['author', 'title', 'date', 'content', 'date_parsed', 'year']


Unnamed: 0,author,title,date,content,date_parsed,year
0,(王连香、高雷),,,人民网 北京 9月 28日 电 ( 记者 王 连 香 ) 在 今日 举行 的 交通 运输部 ...,NaT,
1,(王连香、高雷),,,人民网 北京 9月 28日 电 ( 记者 王 连 香 ) 据 中国 国 家 铁路 集团 有限...,NaT,


## 4) 基本噪声过滤与 token 判别
可按实际分词器输出微调（数字、百分号、URL、@# 等）。

In [None]:
import re, unicodedata

punct_or_space = lambda ch: unicodedata.category(ch).startswith("P") or ch.isspace()

def is_punct_token(tok):
    return all(punct_or_space(ch) for ch in tok)

# 数字（含小数点、全角点）
num_pat = re.compile(r"^[0-9０-９一二三四五六七八九十百千万亿〇零壹贰叁肆伍陆柒捌玖拾佰仟]+([\.．][0-9０-９]+)?$")
def is_number(tok):
    return bool(num_pat.match(tok))

# 百分比
percent_pat = re.compile(r"^([0-9０-９]+(\.[0-9０-９]+)?%|百分之[一二三四五六七八九十百千万亿〇零壹贰叁肆伍陆柒捌玖拾佰仟]+)$")
def is_percent(tok):
    return bool(percent_pat.match(tok))

# URL 和 mention
url_pat = re.compile(r"https?://|www\.", re.I)
mention_pat = re.compile(r"^[@#]")

def bad_token(tok):
    if not tok:
        return True
    if is_punct_token(tok) or is_number(tok) or is_percent(tok):
        return True
    if url_pat.search(tok) or mention_pat.search(tok):
        return True
    return False


## 5) 句内 n-gram 生成（不跨句）
避免跨句拼接产生伪搭配；并对边界停用词过滤。

In [None]:
def split_by_sentence(tokens):
    sents, cur = [], []
    for t in tokens:
        cur.append(t)
        if t in SENT_SEP:
            sents.append(cur); cur = []
    if cur: sents.append(cur)
    return sents

def bad_edge_ngram(ng):
    parts = ng.split()
    return (parts[0] in STOP) or (parts[-1] in STOP)

以下两步为改进

In [None]:
import re, unicodedata

# —— 建议的句末分隔符与“贴边闭合符” ——
SENT_SEP = {"。","！","？","!","?","；",";","…","……"}
CLOSE_TAIL = {"」","』","）",")","]","】","”","’","'"}

def split_by_sentence(tokens, keep_sep=False):
    """
    将 token 序列切成句子（list[list[str]]）。
    - 支持中文标点与连续省略号作为分句点
    - 将句末的闭合引号/括号等并入当前句子后再断句
    - keep_sep=True 时保留分隔符（更利于还原原文）
    """
    sents, cur = [], []
    i, n = 0, len(tokens)
    while i < n:
        t = tokens[i]
        cur.append(t)

        if t in SENT_SEP:
            # 把紧随其后的闭合符号吃进来（如 “。” + ”】） ）
            j = i + 1
            while j < n and tokens[j] in CLOSE_TAIL:
                cur.append(tokens[j])
                j += 1
            if not keep_sep:
                # 如果不想保留分隔符，可在断句前去掉它（但保留贴边闭合符）
                # 注意：只有当最后一个是真正分隔符时才弹出
                # 这里的策略：去掉最右侧第一个 SENT_SEP
                for k in range(len(cur)-1, -1, -1):
                    if cur[k] in SENT_SEP:
                        cur.pop(k)
                        break
            sents.append(cur)
            cur = []
            i = j
            continue
        i += 1

    if cur:
        sents.append(cur)
    return sents


In [None]:
# —— n-gram 边界过滤：更稳健的版本 ——

def _as_parts(ng):
    # 既兼容传入 "a b c" 也兼容 ["a","b","c"]
    if isinstance(ng, str):
        return ng.split()
    return list(ng)

def bad_edge_ngram(ng, STOP=None, bad_token=None, min_len=1):
    """
    返回 True 表示“该 n-gram 不合格，应丢弃”。
    规则：
      - 空或长度 < min_len
      - 左/右边是 STOP
      - 左/右边是 bad_token（纯标点/空白、数字、小数、百分比、URL、@/# 等）
    """
    parts = _as_parts(ng)
    if not parts or len(parts) < min_len:
        return True

    left, right = parts[0], parts[-1]

    if STOP is not None and (left in STOP or right in STOP):
        return True

    if bad_token is not None and (bad_token(left) or bad_token(right)):
        return True

    # 可选：单字纯符号（保险起见，避免孤立标点或噪音）
    def punct_or_space(ch):
        return unicodedata.category(ch).startswith("P") or ch.isspace()

    if all(punct_or_space(ch) for ch in left):
        return True
    if all(punct_or_space(ch) for ch in right):
        return True

    return False


## 6) 统计 1/2/3-gram 频次与文档频（DF）
同时记录首现年份（以 unigram 为例，可扩展到 n-gram）。

In [None]:
uni_freq, bi_freq, tri_freq = Counter(), Counter(), Counter()
uni_df,   bi_df,   tri_df   = Counter(), Counter(), Counter()
first_year = {}
year_counts_uni = defaultdict(lambda: Counter())  # 首现/爆发度可用（此处以 unigram 为主）

for _, row in df.iterrows():
    year = row["year"]
    tokens = [t for t in row["content"].split() if not bad_token(t)]
    sents = split_by_sentence(tokens)

    seen_uni, seen_bi, seen_tri = set(), set(), set()

    for sent in sents:
        clean = [w for w in sent if w.strip() and w not in SENT_SEP]

        # 1-gram
        uni_freq.update(clean)
        seen_uni.update(set(clean))

        # 2-gram / 3-gram（句内滚动）
        for n in (2,3):
            for i in range(len(clean)-n+1):
                ng = " ".join(clean[i:i+n])
                if bad_edge_ngram(ng):
                    continue
                if n == 2:
                    bi_freq[ng] += 1; seen_bi.add(ng)
                else:
                    tri_freq[ng] += 1; seen_tri.add(ng)

    # 文档频
    uni_df.update(seen_uni)
    bi_df.update(seen_bi)
    tri_df.update(seen_tri)

    # 首现年份（以 unigram 为例）
    if pd.notna(year):
        y = int(year)
        for w in seen_uni:
            if w not in first_year:
                first_year[w] = y
            year_counts_uni[y][w] += 1

print("统计完成：",
      f"uni={len(uni_freq)}, bi={len(bi_freq)}, tri={len(tri_freq)}")

统计完成： uni=52475, bi=465497, tri=746467


以下为改进版本

In [None]:
from collections import Counter, defaultdict
import pandas as pd
import unicodedata

uni_freq, bi_freq, tri_freq = Counter(), Counter(), Counter()
uni_df,   bi_df,   tri_df   = Counter(), Counter(), Counter()
first_year = {}
year_counts_uni = defaultdict(lambda: Counter())  # 统计每年首现/文档计数（以unigram为主）

def normalize_token(t: str) -> str:
    # 可选：统一NFKC，去两端空白
    return unicodedata.normalize("NFKC", t).strip()

for row in df.itertuples(index=False):
    year = getattr(row, "year", None)
    content = getattr(row, "content", "")

    # 先切词，再做NFKC与噪声过滤
    tokens = []
    for t in content.split():
        t = normalize_token(t)
        if t and not bad_token(t):
            tokens.append(t)

    # 句子切分
    sents = split_by_sentence(tokens)

    # 文档内去重容器
    seen_uni, seen_bi, seen_tri = set(), set(), set()

    for sent in sents:
        # 句内清洗：去空白与分句符
        clean = [w for w in sent if w and w not in SENT_SEP]
        if not clean:
            continue

        # 1-gram
        uni_freq.update(clean)
        seen_uni.update(clean)  # set自动去重

        # 2-gram / 3-gram（句内滚动）
        L = len(clean)
        for n in (2, 3):
            if L < n:
                continue
            for i in range(L - n + 1):
                parts = clean[i:i+n]                # list切片传给过滤器
                if bad_edge_ngram(parts):           # 通过你新版的边界过滤
                    continue
                key = " ".join(parts)               # 只在通过后再join，减少开销
                if n == 2:
                    bi_freq[key] += 1; seen_bi.add(key)
                else:
                    tri_freq[key] += 1; seen_tri.add(key)

    # 文档频（DF）
    if seen_uni: uni_df.update(seen_uni)
    if seen_bi:  bi_df.update(seen_bi)
    if seen_tri: tri_df.update(seen_tri)

    # 首现年份与年度文档计数（以unigram为例）
    try:
        if pd.notna(year):
            y = int(year)
            for w in seen_uni:
                if w not in first_year:
                    first_year[w] = y
                year_counts_uni[y][w] += 1
    except Exception:
        # 年份缺失或格式异常时跳过
        pass

print("统计完成：",
      f"uni={len(uni_freq)}, bi={len(bi_freq)}, tri={len(tri_freq)}")


统计完成： uni=52475, bi=465497, tri=746467


## 7) PMI 与 左右熵（凝固度）
- PMI 对 2/3-gram 计算搭配强度  
- 左右熵：用首词的左熵 + 末词的右熵，衡量“固定搭配”程度

In [None]:
# 概率与 PMI
N_uni = sum(uni_freq.values())
def p_uni(w): return (uni_freq[w] + 1) / (N_uni + len(uni_freq))

def p_bi(ng):
    N_bi = sum(bi_freq.values())
    return (bi_freq[ng] + 1) / (N_bi + len(bi_freq) + 1e-9)

def p_tri(ng):
    N_tri = sum(tri_freq.values())
    return (tri_freq[ng] + 1) / (N_tri + len(tri_freq) + 1e-9)

def PMI2(ng):
    a,b = ng.split()
    return math.log( p_bi(ng) / (p_uni(a)*p_uni(b)) + 1e-12 )

def PMI3(ng):
    a,b,c = ng.split()
    return math.log( p_tri(ng) / (p_uni(a)*p_uni(b)*p_uni(c)) + 1e-12 )

# 左右熵所需的左右邻接
left_ctx, right_ctx = defaultdict(Counter), defaultdict(Counter)
for _, row in df.iterrows():
    toks = [t for t in row["content"].split() if t.strip()]
    for i, w in enumerate(toks):
        if i>0: left_ctx[w][toks[i-1]] += 1
        if i<len(toks)-1: right_ctx[w][toks[i+1]] += 1

def entropy(counter):
    total = sum(counter.values()) or 1
    return -sum((c/total)*math.log((c/total)+1e-12) for c in counter.values())

def lr_entropy_ng(ng):
    parts = ng.split()
    return entropy(left_ctx[parts[0]]) + entropy(right_ctx[parts[-1]])

改进版本

In [None]:
import math
from collections import defaultdict, Counter
import unicodedata

# ---- 预计算：总频与“词表大小”缓存 ----
N_uni = sum(uni_freq.values())
N_bi  = sum(bi_freq.values())
N_tri = sum(tri_freq.values())
V_uni = len(uni_freq)
V_bi  = len(bi_freq)
V_tri = len(tri_freq)

# 可选：拉普拉斯平滑强度（alpha=1 即加一平滑）
ALPHA = 1.0
EPS = 1e-12

def p_uni(w):
    return (uni_freq[w] + ALPHA) / (N_uni + ALPHA * max(V_uni, 1))

def p_bi_key(key):
    # key: "a b"
    return (bi_freq[key] + ALPHA) / (N_bi + ALPHA * max(V_bi, 1))

def p_tri_key(key):
    # key: "a b c"
    return (tri_freq[key] + ALPHA) / (N_tri + ALPHA * max(V_tri, 1))

# ---- PMI（含阈值、PPMI、NPMI 选项）----
def PMI2(key, min_count=3, kind="pmi"):  # kind: "pmi" | "ppmi" | "npmi"
    a, b = key.split()
    if bi_freq[key] < min_count or uni_freq[a] < min_count or uni_freq[b] < min_count:
        return float("-inf") if kind != "ppmi" else 0.0
    p_ab = p_bi_key(key)
    p_a  = p_uni(a)
    p_b  = p_uni(b)
    val = math.log((p_ab / (p_a * p_b)) + EPS)
    if kind == "ppmi":
        return max(val, 0.0)
    if kind == "npmi":
        return val / (-math.log(p_ab + EPS))
    return val

def PMI3(key, min_count=3, kind="pmi"):
    a, b, c = key.split()
    if tri_freq[key] < min_count or min(uni_freq[a], uni_freq[b], uni_freq[c]) < min_count:
        return float("-inf") if kind != "ppmi" else 0.0
    p_abc = p_tri_key(key)
    p_a, p_b, p_c = p_uni(a), p_uni(b), p_uni(c)
    val = math.log((p_abc / (p_a * p_b * p_c)) + EPS)
    if kind == "ppmi":
        return max(val, 0.0)
    if kind == "npmi":
        return val / (-math.log(p_abc + EPS))
    return val

# ---- 左右熵：基于“清洗后的句内邻接”构建上下文 ----
left_ctx, right_ctx = defaultdict(Counter), defaultdict(Counter)

def normalize_token(t: str) -> str:
    return unicodedata.normalize("NFKC", t).strip()

for row in df.itertuples(index=False):
    content = getattr(row, "content", "")
    # 与主流程一致的清洗
    tokens = []
    for t in content.split():
        t = normalize_token(t)
        if t and not bad_token(t):
            tokens.append(t)
    sents = split_by_sentence(tokens)

    for sent in sents:
        clean = [w for w in sent if w and w not in SENT_SEP]
        for i, w in enumerate(clean):
            if i > 0:
                left_ctx[w][clean[i-1]] += 1
            if i < len(clean) - 1:
                right_ctx[w][clean[i+1]] += 1

def entropy(counter: Counter):
    total = sum(counter.values())
    if total <= 0:
        return 0.0
    H = 0.0
    for c in counter.values():
        p = c / total
        H -= p * math.log(p + EPS)
    return H

def lr_entropy_ng(key):
    parts = key.split()
    # 左取首词的左熵，右取末词的右熵
    return entropy(left_ctx[parts[0]]) + entropy(right_ctx[parts[-1]])


## 8) 去嵌套（C-value 思路）
当某个 bigram 的大部分频次被某个 trigram 覆盖时，剔除该 bigram。

In [None]:
def suppress_nested_bi(bi_freq, tri_freq, ratio=0.8):
    keep = {}
    index_max = defaultdict(int)  # bigram -> 覆盖它的 trigram 最大频次
    for tri, ftri in tri_freq.items():
        parts = tri.split()
        for i in range(2):
            index_max[" ".join(parts[i:i+2])] = max(index_max[" ".join(parts[i:i+2])], ftri)
    for bi, fbi in bi_freq.items():
        if index_max[bi] >= ratio * fbi:
            continue
        keep[bi] = fbi
    return Counter(keep)

bi_freq = suppress_nested_bi(bi_freq, tri_freq, ABSORB_RATIO)
print("去嵌套后 bigram 数量：", len(bi_freq))

去嵌套后 bigram 数量： 72335


改进版本

In [None]:
from collections import defaultdict, Counter

def suppress_nested_bi(
    bi_freq: Counter,
    tri_freq: Counter,
    ratio: float = 0.8,
    min_bi: int = 3,
    entropy_fn=None,          # 可传入你上面的 lr_entropy_ng
    entropy_min: float = None # 若设定阈值，则高于该阈值的 bi 不抑制
) -> Counter:
    """
    抑制主要作为 trigram 组成部分出现的 bigram。
    - 使用“累计覆盖频次”而非 max
    - 对覆盖进行上限裁剪（不超过该 bigram 自身频次）
    - 提供最小频次与左右熵保护选项
    """
    # 统计每个 bigram 被哪些 trigram 覆盖的“累计覆盖频次”
    cover_sum = defaultdict(int)
    for tri, ftri in tri_freq.items():
        parts = tri.split()
        if len(parts) != 3 or ftri <= 0:
            continue
        bi_left  = f"{parts[0]} {parts[1]}"
        bi_right = f"{parts[1]} {parts[2]}"
        cover_sum[bi_left]  += ftri
        cover_sum[bi_right] += ftri

    keep = {}
    for bi, fbi in bi_freq.items():
        if fbi < min_bi:
            # 低频的保守保留（避免数据稀疏下的过拟合抑制）
            keep[bi] = fbi
            continue

        covered = cover_sum.get(bi, 0)
        # 覆盖频次不应超过自身出现频次
        covered = min(covered, fbi)
        cover_ratio = covered / max(fbi, 1)

        # 熵保护：若左右熵高（上下文多样），认为是“可独立成块”的搭配→保留
        if entropy_fn is not None and entropy_min is not None:
            try:
                if entropy_fn(bi) >= entropy_min:
                    keep[bi] = fbi
                    continue
            except Exception:
                # 熵计算异常时忽略保护，走默认逻辑
                pass

        if cover_ratio >= ratio:
            # 被 trigram “吸收”占比过高，抑制之（不加入 keep）
            continue
        else:
            keep[bi] = fbi

    return Counter(keep)


In [None]:
# 仅按覆盖比例抑制
bi_freq = suppress_nested_bi(bi_freq, tri_freq, ratio=ABSORB_RATIO, min_bi=3)

# 或者加上左右熵保护（例如阈值 1.5~2.5 之间可试）
# bi_freq = suppress_nested_bi(bi_freq, tri_freq, ratio=ABSORB_RATIO, min_bi=3,
#                              entropy_fn=lr_entropy_ng, entropy_min=2.0)

print("去嵌套后 bigram 数量：", len(bi_freq))


去嵌套后 bigram 数量： 20634


## 9) 候选选择与打标签（按 n 分表）
- 过滤：频次/文档频、PMI、左右熵、边界停用词  
- 标注：是否在参考词表 `in_reference`

In [None]:
import pandas as pd

def _finalize_df(rows, cols_order, sort_cols, ascending):
    df_out = pd.DataFrame(rows)
    # 空表保护：返回带齐列名的空表（不会再因缺列而报错）
    if df_out.empty:
        return pd.DataFrame(columns=cols_order)
    # 让 status 有固定排序：in_reference 优先于 new
    if "status" in df_out.columns:
        df_out["status"] = pd.Categorical(
            df_out["status"],
            categories=["in_reference", "new"],
            ordered=True
        )
    # 重排列顺序（多余列保留在后）
    cols = [c for c in cols_order if c in df_out.columns]
    cols += [c for c in df_out.columns if c not in cols]
    return df_out[cols].sort_values(sort_cols, ascending=ascending).reset_index(drop=True)

def select_unigram():
    rows=[]
    for w,f in uni_freq.items():
        if f<MINF_UNI or uni_df[w]<MINDF_UNI:
            continue
        in_ref = (w in ref_set)
        rows.append({
            "term": w, "n":1, "freq":int(f), "doc_freq":int(uni_df[w]),
            "first_year": first_year.get(w), "in_reference": in_ref,
            "status": "in_reference" if in_ref else "new"
        })
    return _finalize_df(
        rows,
        cols_order=["term","n","freq","doc_freq","first_year","in_reference","status"],
        sort_cols=["status","freq","doc_freq"],
        ascending=[True,False,False]
    )

def select_bigram():
    rows=[]
    for ng,f in bi_freq.items():
        if f<MINF_BI or bi_df[ng]<MINDF_BI:
            continue
        if bad_edge_ngram(ng):
            continue
        pmi = PMI2(ng); ent = lr_entropy_ng(ng)
        if pmi<PMI2_MIN or ent<ENT_MIN:
            continue
        in_ref = (ng in ref_set)
        rows.append({
            "term": ng, "n":2, "freq":int(f), "doc_freq":int(bi_df[ng]),
            "PMI":round(pmi,3), "LRent":round(ent,3),
            "in_reference": in_ref, "status": "in_reference" if in_ref else "new"
        })
    return _finalize_df(
        rows,
        cols_order=["term","n","freq","doc_freq","PMI","LRent","in_reference","status"],
        sort_cols=["status","PMI","freq"],
        ascending=[True,False,False]
    )

def select_trigram():
    rows=[]
    for ng,f in tri_freq.items():
        if f<MINF_TRI or tri_df[ng]<MINDF_TRI:
            continue
        if bad_edge_ngram(ng):
            continue
        pmi = PMI3(ng); ent = lr_entropy_ng(ng)
        if pmi<PMI3_MIN or ent<ENT_MIN:
            continue
        in_ref = (ng in ref_set)
        rows.append({
            "term": ng, "n":3, "freq":int(f), "doc_freq":int(tri_df[ng]),
            "PMI":round(pmi,3), "LRent":round(ent,3),
            "in_reference": in_ref, "status": "in_reference" if in_ref else "new"
        })
    return _finalize_df(
        rows,
        cols_order=["term","n","freq","doc_freq","PMI","LRent","in_reference","status"],
        sort_cols=["status","PMI","freq"],
        ascending=[True,False,False]
    )


改进的代码，前面代码运行不了

In [None]:
# --- Step 1: 生成三个结果表 ---
res_uni = select_unigram()
res_bi  = select_bigram()
res_tri = select_trigram()

# --- Step 2: 定义四字格检测函数 ---
def is_4char_ng(ng):
    chars = "".join(ng.split())   # 去掉空格，直接按汉字数算
    return len(chars) == 4

# --- Step 3: 找到所有四字格三元组 ---
good_tri_4 = set(t for t in res_tri["term"] if is_4char_ng(t))

# --- Step 4: 用四字格覆盖过滤 bigram ---
if len(good_tri_4) > 0:
    def covered_by_4(bi):
        b = "".join(bi.split())
        for tri in good_tri_4:
            if b in "".join(tri.split()):
                return True
        return False
    res_bi = res_bi[~res_bi["term"].apply(covered_by_4)]

print("四字格数：", len(good_tri_4), "；过滤后 bigram 数：", len(res_bi))


四字格数： 14298 ；过滤后 bigram 数： 0


## 11) 导出分表 + 合并总表（长项优先保留）
最终表：`neo_final.csv`。  
> 注意：修正了原始脚本末尾打印文件名的小笔误。

In [None]:
# 导出分表
res_uni.to_csv(OUT_DIR/"chi_neo_uni.csv", index=False, encoding="utf-8-sig")
res_bi.to_csv( OUT_DIR/"chi_neo_bi.csv",  index=False, encoding="utf-8-sig")
res_tri.to_csv(OUT_DIR/"chi_neo_tri.csv", index=False, encoding="utf-8-sig")

# 合并并去重（如果同一个短项被更长项包含，则保留更长的）
chosen = []
def norm_chars(s): return "".join(s.split())

for df_part in [res_tri, res_bi, res_uni]:  # 优先保留更长
    for _, r in df_part.iterrows():
        t = r["term"]; t_norm = norm_chars(t)
        # 若被已有更长项完全覆盖，则跳过
        if any(t_norm in norm_chars(x["term"]) for x in chosen if len(norm_chars(x["term"])) >= len(t_norm)):
            continue
        chosen.append(r)

res_final = pd.DataFrame(chosen).reset_index(drop=True)
res_final.to_csv(OUT_DIR/"neo_final.csv", index=False, encoding="utf-8-sig")

print("已保存：")
print(" -", OUT_DIR/"chi_neo_uni.csv")
print(" -", OUT_DIR/"chi_neo_bi.csv")
print(" -", OUT_DIR/"chi_neo_tri.csv")
print(" -", OUT_DIR/"chi_neo_final.csv")

print(f"候选总数：uni={len(res_uni)}, bi={len(res_bi)}, tri={len(res_tri)}, final={len(res_final)}")
display(res_final.head(10))

已保存：
 - /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/chi_neo_uni.csv
 - /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/chi_neo_bi.csv
 - /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/chi_neo_tri.csv
 - /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/chi_neo_final.csv
候选总数：uni=13726, bi=0, tri=64841, final=68174


Unnamed: 0,term,n,freq,doc_freq,PMI,LRent,in_reference,status,first_year
0,木质 擀 面杖,3,4,2,24.002,2.023,False,new,
1,对齐 排尾 跟进,3,3,3,23.569,2.167,False,new,
2,大字 熠熠生辉 1945年,3,3,3,23.443,2.254,False,new,
3,千伏 输变 电线,3,4,4,23.399,2.109,False,new,
4,延安 枣园 为人民服务,3,3,3,23.399,2.084,False,new,
5,急就章 竭泽而渔 唯,3,3,3,23.399,2.084,False,new,
6,苏 Wesley So,3,3,3,23.399,2.119,False,new,
7,战士 卫生员 机要员,3,3,3,23.281,2.003,False,new,
8,宗地 分宗 事宜,3,3,3,23.227,2.293,False,new,
9,白沙县 畜牧 兽医,3,3,3,23.176,2.172,False,new,


## 12) 小贴士与调参建议
- **阈值**：先放宽（如 PMI/ENT 稍低），看 Top-N 结果，再逐步收紧。
- **边界停用词**：注意分词器产出是否将“的/了/之”等独立出来，必要时扩充 `STOP`。
- **去嵌套**：如发现很多高质量 bigram 被强势 trigram “吞没”，可将 `ABSORB_RATIO` 调低些（如 0.6~0.7）。
- **四字格优先**：仅在你目标确实偏成语/四字格时启用。

#Méthode 3 - 1-5gramme

In [None]:
#Étape 0 — Imports & configuration

from pathlib import Path
import pandas as pd
from collections import Counter, defaultdict
import numpy as np
import unicodedata, re, math

# --- Chemins (adapte si besoin) ---
IN_REF = Path("/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/liste de référence/all_ch.txt")
IN_CORPUS = Path("/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/Token/ch2_all_data_2015_2025_tok_thulac.csv")
OUT_DIR = Path("/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification")
OUT_DIR.mkdir(parents=True, exist_ok=True)
print("Sortie ->", OUT_DIR)

# --- Seuils (tu peux ajuster) ---
MINF = {1:5, 2:5, 3:3, 4:3, 5:3}        # fréquence min
MINDF = {1:3, 2:3, 3:2, 4:2, 5:2}       # doc-freq min
PMI_MIN = {2:3.5, 3:2.5, 4:2.0, 5:1.5}  # seuil PMI par n
ENT_MIN = 2.0                            # somme entropie gauche+droite (L+R)
ABSORB_RATIO = 0.80                      # suppression des emboîtements (k absorbé par k+1)
ALPHA = 1.0                              # lissage Laplace
EPS = 1e-12

# Délimiteurs de phrase (conseillé : conserver semblables aux tiens)
SENT_SEP = {"。","！","？","!","?","；",";","…","……"}
CLOSE_TAIL = {"」","』","）",")","]","】","”","’","'"}

# Mots “faibles” aux bords
STOP = {
    "的","了","和","与","及","等","在","把","被","对","于","之","其",
    "并","或","而","但","若","即","乃","则","所","从","自","向","往",
    "比","将","因","由","以","而是","还是","还是","以及","其中","通过",
    "作为","对于","关于","按照","根据","由于"
}


Sortie -> /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification


In [None]:
#Étape 1 — Lecture des données & normalisation

# Référence (liste blanche/lexique)。这里这一段要改了，不然它把 all_ch.txt 的每一行都原封不动加进去了。
#→ 所以 "的"、"在"、"是" 这些 function words 也进去了。
#→ 在 select_unigram() 里判定 in_ref = (w in ref_set) 时，结果就是常见字都被当成 "in_reference"。

#ref_set = set()
#with IN_REF.open("r", encoding="utf-8") as f:
#    for line in f:
#        t = line.strip()
#        if t:
# ref_set.add(t)
#print("Entrées référence:", len(ref_set))

# Référence (liste blanche/lexique) — version corrigée
ref_set = set()
zh_char_re = re.compile(r"[\u4e00-\u9fff]")

def is_cjk_string(x: str) -> bool:
    return bool(x) and all(zh_char_re.match(ch) for ch in x)

def hanzi_len(x: str) -> int:
    return sum(1 for ch in x if zh_char_re.match(ch))

with IN_REF.open("r", encoding="utf-8") as f:
    for line in f:
        t = line.strip()
        if t and is_cjk_string(t) and 1 <= hanzi_len(t) <= 5:
            ref_set.add(t)

print("Entrées référence (净化后):", len(ref_set))





# Corpus
df = pd.read_csv(IN_CORPUS, encoding="utf-8")
if "content" not in df.columns:
    raise ValueError("Le CSV doit contenir une colonne 'content'.")
df["date_parsed"] = pd.to_datetime(df.get("date", None), errors="coerce")
df["year"] = df["date_parsed"].dt.year
df["content"] = df["content"].fillna("").astype(str)

def normalize(s: str) -> str:
    s = unicodedata.normalize("NFKC", s)
    s = s.replace("\u3000", " ").strip()
    return s

df["content"] = df["content"].map(normalize)
print(df.shape, df.columns.tolist())
display(df.head(2))

Entrées référence (净化后): 123778
(1282, 6) ['author', 'title', 'date', 'content', 'date_parsed', 'year']


Unnamed: 0,author,title,date,content,date_parsed,year
0,(王连香、高雷),,,人民网 北京 9月 28日 电 ( 记者 王 连 香 ) 在 今日 举行 的 交通 运输部 ...,NaT,
1,(王连香、高雷),,,人民网 北京 9月 28日 电 ( 记者 王 连 香 ) 据 中国 国 家 铁路 集团 有限...,NaT,


In [None]:
#Étape 2 — Utilitaires (nettoyage token, découpe phrase, filtres de bord)

# Détections "mauvais tokens"
def punct_or_space(ch): return unicodedata.category(ch).startswith("P") or ch.isspace()

def is_punct_token(tok): return all(punct_or_space(ch) for ch in tok)

num_pat = re.compile(r"^[0-9０-９一二三四五六七八九十百千万亿〇零壹贰叁肆伍陆柒捌玖拾佰仟]+([\.．][0-9０-９]+)?$")
def is_number(tok): return bool(num_pat.match(tok))

percent_pat = re.compile(r"^([0-9０-９]+(\.[0-9０-９]+)?%|百分之[一二三四五六七八九十百千万亿〇零壹贰叁肆伍陆柒捌玖拾佰仟]+)$")
def is_percent(tok): return bool(percent_pat.match(tok))

url_pat = re.compile(r"https?://|www\.", re.I)
mention_pat = re.compile(r"^[@#]")

def bad_token(tok):
    if not tok: return True
    if is_punct_token(tok) or is_number(tok) or is_percent(tok): return True
    if url_pat.search(tok) or mention_pat.search(tok): return True
    return False

def split_by_sentence(tokens, keep_sep=False):
    sents, cur = [], []
    i, n = 0, len(tokens)
    while i < n:
        t = tokens[i]
        cur.append(t)
        if t in SENT_SEP:
            j = i + 1
            while j < n and tokens[j] in CLOSE_TAIL:
                cur.append(tokens[j]); j += 1
            if not keep_sep:
                for k in range(len(cur)-1, -1, -1):
                    if cur[k] in SENT_SEP:
                        cur.pop(k); break
            sents.append(cur); cur = []; i = j; continue
        i += 1
    if cur: sents.append(cur)
    return sents

def _as_parts(ng):
    return ng.split() if isinstance(ng, str) else list(ng)

def bad_edge_ngram(ng, STOP=STOP, bad_token_fn=bad_token, min_len=1):
    parts = _as_parts(ng)
    if not parts or len(parts) < min_len: return True
    left, right = parts[0], parts[-1]
    if STOP and (left in STOP or right in STOP): return True
    if bad_token_fn and (bad_token_fn(left) or bad_token_fn(right)): return True
    if all(punct_or_space(ch) for ch in left): return True
    if all(punct_or_space(ch) for ch in right): return True
    return False

def normalize_token(t: str) -> str:
    return unicodedata.normalize("NFKC", t).strip()


In [None]:
#Étape 3 — Comptage 1→5-grammes + DF + métadonnées

# Freq/DF par n (1..5)
freq = {n: Counter() for n in range(1,6)}
dfreq = {n: Counter() for n in range(1,6)}

# Contexte pour entropies (sur tokens unitaires)
left_ctx, right_ctx = defaultdict(Counter), defaultdict(Counter)

# Méta pour unigrams
first_year = {}
year_counts_uni = defaultdict(lambda: Counter())

for row in df.itertuples(index=False):
    year = getattr(row, "year", None)
    content = getattr(row, "content", "")

    tokens = []
    for t in content.split():
        t = normalize_token(t)
        if t and not bad_token(t):
            tokens.append(t)

    # Contexte gauche/droite (unités)
    for i, w in enumerate(tokens):
        if i > 0: left_ctx[w][tokens[i-1]] += 1
        if i < len(tokens)-1: right_ctx[w][tokens[i+1]] += 1

    # Phrases
    sents = split_by_sentence(tokens)

    # DF par document (sets)
    seen = {n: set() for n in range(1,6)}

    for sent in sents:
        clean = [w for w in sent if w and w not in SENT_SEP]
        L = len(clean)
        if not L: continue

        # 1-gram
        freq[1].update(clean)
        seen[1].update(clean)

        # n=2..5 (fenêtre glissante intra-phrase)
        for n in (2,3,4,5):
            if L < n: continue
            for i in range(L - n + 1):
                parts = clean[i:i+n]
                if bad_edge_ngram(parts):
                    continue
                key = " ".join(parts)
                freq[n][key] += 1
                seen[n].add(key)

    # DF
    for n in range(1,6):
        if seen[n]:
            dfreq[n].update(seen[n])

    # First-year & stats sur unigrams
    try:
        if pd.notna(year):
            y = int(year)
            for w in seen[1]:
                if w not in first_year:
                    first_year[w] = y
                year_counts_uni[y][w] += 1
    except Exception:
        pass

print("Comptage terminé :",
      "uni=",len(freq[1]), "bi=",len(freq[2]), "tri=",len(freq[3]),
      "quad=",len(freq[4]), "penta=",len(freq[5]))


Comptage terminé : uni= 52475 bi= 384247 tri= 576035 quad= 627828 penta= 643772


In [None]:
#Étape 4 — Probabilités lissées & PMI généralisé

# Totaux & vocabulaires
N = {n: sum(freq[n].values()) for n in range(1,6)}
V = {n: len(freq[n])          for n in range(1,6)}

def p_uni(w):
    return (freq[1][w] + ALPHA) / (N[1] + ALPHA * max(V[1],1))

def p_ng(n, key):
    # key "a b ...", n>=2
    return (freq[n][key] + ALPHA) / (N[n] + ALPHA * max(V[n],1))

def PMI_n(n, key, min_count=3, kind="pmi"):
    """
    PMI généralisé pour n>=2 :
    PMI_n = log( p(w1..wn) / prod_i p(wi) )
    kind : "pmi" | "ppmi" | "npmi"
    """
    parts = key.split()
    if len(parts) != n: return float("-inf")
    # seuils de fréquence (sécurité)
    if freq[n][key] < min_count or any(freq[1][w] < min_count for w in parts):
        return 0.0 if kind=="ppmi" else float("-inf")
    p_joint = p_ng(n, key)
    p_prod  = 1.0
    for w in parts: p_prod *= p_uni(w)
    val = math.log((p_joint / (p_prod + EPS)) + EPS)
    if kind == "ppmi": return max(val, 0.0)
    if kind == "npmi": return val / (-math.log(p_joint + EPS))
    return val

def entropy(counter: Counter):
    total = sum(counter.values())
    if total <= 0: return 0.0
    H = 0.0
    for c in counter.values():
        p = c / total
        H -= p * math.log(p + EPS)
    return H

def lr_entropy_ng(key):
    parts = key.split()
    return entropy(left_ctx[parts[0]]) + entropy(right_ctx[parts[-1]])


In [None]:
#Étape 5 — Suppression des emboîtements (k absorbé par k+1)

def suppress_nested_k_by_kplus1(freq_k: Counter, freq_k1: Counter,
                                ratio: float = 0.8, min_k: int = 3,
                                entropy_fn=None, entropy_min: float = None) -> Counter:
    """
    Supprime les k-grammes surtout "absorbés" par des (k+1)-grammes qui les contiennent
    (à gauche et à droite). Protection par entropie optionnelle.
    """
    if not freq_k or not freq_k1: return freq_k

    cover_sum = defaultdict(int)
    for key_k1, f_k1 in freq_k1.items():
        parts = key_k1.split()
        if len(parts) != len(parts): pass  # placeholder pour lisibilité
        # toutes les sous-fenêtres de taille k dans le k+1
        for i in range(len(parts)-1):
            sub = " ".join(parts[i:i+len(parts)-1])
        # correction : faire k= len(parts)-1
    # Correction propre :
    cover_sum = defaultdict(int)
    for key_k1, f_k1 in freq_k1.items():
        parts = key_k1.split()
        k1 = len(parts)
        k = k1 - 1
        if k < 1 or f_k1 <= 0:
            continue
        for i in range(k1 - k + 1):  # = 2 positions
            sub = " ".join(parts[i:i+k])
            cover_sum[sub] += f_k1

    keep = {}
    for key_k, f_k in freq_k.items():
        if f_k < min_k:
            keep[key_k] = f_k
            continue
        covered = min(cover_sum.get(key_k, 0), f_k)
        cover_ratio = covered / max(f_k, 1)

        if entropy_fn is not None and entropy_min is not None:
            try:
                if entropy_fn(key_k) >= entropy_min:
                    keep[key_k] = f_k
                    continue
            except Exception:
                pass

        if cover_ratio >= ratio:
            continue  # absorbé
        keep[key_k] = f_k

    return Counter(keep)

# Appliquer séquentiellement : 2<-3, 3<-4, 4<-5
freq[2] = suppress_nested_k_by_kplus1(freq[2], freq[3], ratio=ABSORB_RATIO, min_k=3)
freq[3] = suppress_nested_k_by_kplus1(freq[3], freq[4], ratio=ABSORB_RATIO, min_k=3)
freq[4] = suppress_nested_k_by_kplus1(freq[4], freq[5], ratio=ABSORB_RATIO, min_k=3)

print("Après suppression :",
      "bi=",len(freq[2]), "tri=",len(freq[3]), "quad=",len(freq[4]))


Après suppression : bi= 318552 tri= 524287 quad= 583443


In [None]:
#Étape 6 — Sélections par n (filtres freq/DF + bords + PMI + entropies)

def _finalize_df(rows, cols_order, sort_cols, ascending):
    df_out = pd.DataFrame(rows)
    if df_out.empty:
        return pd.DataFrame(columns=cols_order)
    if "status" in df_out.columns:
        df_out["status"] = pd.Categorical(df_out["status"],
                                          categories=["in_reference", "new"], ordered=True)
    cols = [c for c in cols_order if c in df_out.columns]
    cols += [c for c in df_out.columns if c not in cols]
    return df_out[cols].sort_values(sort_cols, ascending=ascending).reset_index(drop=True)

#这一段换成下面这一段
"""
def select_unigram():
    rows=[]
    for w, f in freq[1].items():
        if f < MINF[1] or dfreq[1][w] < MINDF[1]:
            continue
        #in_ref = (w in ref_set)
        #rows.append({
        #    "term": w, "n":1, "freq":int(f), "doc_freq":int(dfreq[1][w]),
        #    "first_year": first_year.get(w), "in_reference": in_ref,
        #    "status": "in_reference" if in_ref else "new"
        #})

        # 改这里：只要不在 reference 就算新词
        if w in ref_set:
            continue  # 丢弃掉在 reference 里的
        rows.append({
            "term": w, "n":1, "freq":int(f), "doc_freq":int(dfreq[1][w]),
            "first_year": first_year.get(w),
            "in_reference": False,
            "status": "new"
        })

    return _finalize_df(
        rows,
        cols_order=["term","n","freq","doc_freq","first_year","in_reference","status"],
        sort_cols=["status","freq","doc_freq"],
        ascending=[True,False,False]
    )
"""


#换成下面这一段以改变unigramme -> new

# 先在这里加一个工具函数（放在 select_unigram 前面即可）
zh_char_re = re.compile(r"[\u4e00-\u9fff]")

def is_cjk_char(x: str) -> bool:
    return bool(x) and all(zh_char_re.match(ch) for ch in x)

# 然后改写 select_unigram
def select_unigram():
    rows=[]
    for w, f in freq[1].items():
        if f < MINF[1] or dfreq[1][w] < MINDF[1]:
            continue
        # 新增过滤：只保留单个汉字
        if not is_cjk_char(w) or len(w) != 1:
            continue
        #  丢弃参考词表里的
        if w in ref_set:
            continue
        rows.append({
            "term": w, "n":1, "freq":int(f), "doc_freq":int(dfreq[1][w]),
            "first_year": first_year.get(w),
            "in_reference": False,
            "status": "new"
        })
    return _finalize_df(
        rows,
        cols_order=["term","n","freq","doc_freq","first_year","in_reference","status"],
        sort_cols=["freq","doc_freq"],
        ascending=[False,False]
    )




def select_ng(n):
    assert 2 <= n <= 5
    rows=[]
    for key, f in freq[n].items():
        if f < MINF[n] or dfreq[n][key] < MINDF[n]:
            continue
        if bad_edge_ngram(key):    # bords faibles/parasites
            continue
        pmi = PMI_n(n, key, min_count=MINF[n], kind="pmi")
        ent = lr_entropy_ng(key)
        if pmi < PMI_MIN[n] or ent < ENT_MIN:
            continue
        in_ref = (key in ref_set)    # 2-5 正常的原因
        rows.append({
            "term": key, "n":n, "freq":int(f), "doc_freq":int(dfreq[n][key]),
            "PMI": round(pmi,3), "LRent": round(ent,3),
            "in_reference": in_ref, "status": "in_reference" if in_ref else "new"  #2-5正常的原因
        })
    return _finalize_df(
        rows,
        cols_order=["term","n","freq","doc_freq","PMI","LRent","in_reference","status"],
        sort_cols=["status","PMI","freq"],
        ascending=[True,False,False]
    )

# Générer les 5 tableaux
res_uni = select_unigram()
res_bi  = select_ng(2)
res_tri = select_ng(3)
res_quad= select_ng(4)
res_pent= select_ng(5)

for name, part in [("uni",res_uni),("bi",res_bi),("tri",res_tri),("quad",res_quad),("penta",res_pent)]:
    print(name, part.shape)
    display(part.head(5))


uni (15, 7)


Unnamed: 0,term,n,freq,doc_freq,first_year,in_reference,status
0,們,1,46,10,,False,new
1,網,1,18,15,,False,new
2,個,1,18,10,2019.0,False,new
3,電,1,16,16,,False,new
4,動,1,16,8,,False,new


bi (188, 8)


Unnamed: 0,term,n,freq,doc_freq,PMI,LRent,in_reference,status
0,助 學金,2,8,4,10.252,2.718,False,new
1,丝羽 乌骨鸡,2,8,4,10.252,2.857,False,new
2,犯 强奸罪,2,9,3,10.229,2.622,False,new
3,抽样 复制,2,6,3,9.665,3.838,False,new
4,猎鹰9 火箭,2,7,3,9.624,5.128,False,new


tri (1092, 8)


Unnamed: 0,term,n,freq,doc_freq,PMI,LRent,in_reference,status
0,自然 和谐 共生,3,30,18,16.293,6.44,False,new
1,盈 转 亏,3,11,3,15.903,2.891,False,new
2,非 繁殖 季节,3,12,4,15.749,6.083,False,new
3,基 补足 精神,3,8,8,15.579,5.826,False,new
4,实践 专题 宣介会,3,6,6,15.256,4.072,False,new


quad (944, 8)


Unnamed: 0,term,n,freq,doc_freq,PMI,LRent,in_reference,status
0,习近平 同志 为 核心,4,68,37,17.31,8.483,False,new
1,广发 价 值 领航,4,24,3,16.619,6.794,False,new
2,习近平 总书记 关于 党,4,19,7,16.18,7.448,False,new
3,出席 在 阿斯塔纳 举行,4,13,13,16.019,8.31,False,new
4,哈萨克斯坦 是 山水相连 唇齿相依,4,12,12,15.968,3.344,False,new


penta (39425, 8)


Unnamed: 0,term,n,freq,doc_freq,PMI,LRent,in_reference,status
0,新 时代 中国 特色 社会主义,5,134,47,18.297,8.571,False,new
1,时代 中国 特色 社会主义 思想,5,129,47,18.261,7.683,False,new
2,分享 让 更 多 人,5,127,127,18.241,10.992,False,new
3,让 更 多 人 看到,5,127,127,18.239,10.213,False,new
4,http w w w people,5,121,2,18.198,2.228,False,new


In [None]:
#Étape 7 — (Optionnel) Règle “quatre caractères” (chinois) pour filtrer des bigrammes trop génériques

def is_4char_ng(ng):
    return len("".join(ng.split())) == 4

good_len4_longer = set()
for df_part in [res_tri, res_quad, res_pent]:
    good_len4_longer |= set(t for t in df_part["term"] if is_4char_ng(t))

if len(good_len4_longer) > 0:
    def covered_by_len4(bi):
        b = "".join(bi.split())
        for longer in good_len4_longer:
            if b in "".join(longer.split()):
                return True
        return False
    res_bi = res_bi[~res_bi["term"].apply(covered_by_len4)]

print("Nb n-grammes (longueur caractères = 4) repérés parmi n≥3 :", len(good_len4_longer))
print("Bigrammes après filtre longueur-4 :", len(res_bi))


Nb n-grammes (longueur caractères = 4) repérés parmi n≥3 : 165
Bigrammes après filtre longueur-4 : 186


In [None]:
#Étape 8 — Export CSV 1→5-grammes + fusion finale sans doublons

# Export par niveau
res_uni.to_csv(OUT_DIR/"chi3_neo_uni.csv",  index=False, encoding="utf-8-sig")
res_bi.to_csv( OUT_DIR/"chi3_neo_bi.csv",   index=False, encoding="utf-8-sig")
res_tri.to_csv(OUT_DIR/"chi3_neo_tri.csv",  index=False, encoding="utf-8-sig")
res_quad.to_csv(OUT_DIR/"chi3_neo_quad.csv",index=False, encoding="utf-8-sig")
res_pent.to_csv(OUT_DIR/"chi3_neo_penta.csv",index=False, encoding="utf-8-sig")

# Fusion (on garde les plus longs en premier)
def norm_chars(s): return "".join(s.split())

chosen = []
for df_part in [res_pent, res_quad, res_tri, res_bi, res_uni]:  # priorité aux plus longs
    for _, r in df_part.iterrows():
        t = r["term"]; t_norm = norm_chars(t)
        # si déjà couvert totalement par un terme plus long choisi, on saute
        if any(t_norm in norm_chars(x["term"]) for x in chosen
               if len(norm_chars(x["term"])) >= len(t_norm)):
            continue
        chosen.append(r)

res_final = pd.DataFrame(chosen).reset_index(drop=True)
res_final.to_csv(OUT_DIR/"chi_neo_final.csv", index=False, encoding="utf-8-sig")

print("Fichiers enregistrés :")
print(" -", OUT_DIR/"chi3_neo_uni.csv")
print(" -", OUT_DIR/"chi3_neo_bi.csv")
print(" -", OUT_DIR/"chi3_neo_tri.csv")
print(" -", OUT_DIR/"chi3_neo_quad.csv")
print(" -", OUT_DIR/"chi3_neo_penta.csv")
print(" -", OUT_DIR/"chi3_neo_final.csv")

print(f"Tailles : uni={len(res_uni)}, bi={len(res_bi)}, tri={len(res_tri)}, quad={len(res_quad)}, penta={len(res_pent)}, final={len(res_final)}")
display(res_final.head(10))


Fichiers enregistrés :
 - /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/chi3_neo_uni.csv
 - /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/chi3_neo_bi.csv
 - /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/chi3_neo_tri.csv
 - /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/chi3_neo_quad.csv
 - /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/chi3_neo_penta.csv
 - /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/chi3_neo_final.csv
Tailles : uni=15, bi=186, tri=1092, quad=944, penta=39425, final=40505


Unnamed: 0,term,n,freq,doc_freq,PMI,LRent,in_reference,status,first_year
0,新 时代 中国 特色 社会主义,5,134,47,18.297,8.571,False,new,
1,时代 中国 特色 社会主义 思想,5,129,47,18.261,7.683,False,new,
2,分享 让 更 多 人,5,127,127,18.241,10.992,False,new,
3,让 更 多 人 看到,5,127,127,18.239,10.213,False,new,
4,http w w w people,5,121,2,18.198,2.228,False,new,
5,w w w people com,5,121,2,18.198,2.422,False,new,
6,w w people com cn,5,121,2,18.198,2.956,False,new,
7,com cn n c html,5,121,2,18.198,2.056,False,new,
8,政务 微博 办 实事 排行榜,5,118,2,18.173,6.184,False,new,
9,cn n c html 政务,5,117,2,18.165,2.083,False,new,


Notes d’usage / réglages rapides

Seuils : commence avec ceux fournis, puis ajuste PMI_MIN (par n) et ENT_MIN selon la densité/qualité du corpus.

Absorption : si trop agressif, baisse ABSORB_RATIO (ex. 0.7) ou augmente min_k dans suppress_nested_k_by_kplus1.

PMI : change kind="pmi" en "ppmi" dans PMI_n si tu veux des valeurs ≥0, souvent utiles pour des tris rapides.

Multilingue : la logique ne dépend pas de la langue tant que la tokenisation est déjà faite (espaces entre tokens).

Référence : si tu veux surligner/filtrer uniquement les n-grammes non référencés, garde la colonne status.

#Méthode 4 ： 1-5 characters

In [None]:
# Améliorations par rapport au code initial :
# 1) Passage d'un n-gram "mots" -> balayage caractère (句内滑窗) ;
# 2) Ne compter QUE les entrées déjà présentes dans le lexique ;
# 3) Filtrage du lexique : uniquement CJK + longueur 1~5；
# 4) Fréquence totale + DF + année de 1ère apparition；
# 5) Exports séparés 1/2/3/4/5 汉字为 5 个 CSV。
# ============================================================

# Étape 1 — Paramètres & imports
from pathlib import Path
import pandas as pd
from collections import Counter, defaultdict
import numpy as np
import unicodedata, re, math

# === Chemins à adapter si besoin ===
IN_REF = Path("/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/liste de référence/all_ch.txt")
IN_CORPUS = Path("/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/Token/ch2_all_data_2015_2025_tok_thulac.csv")
OUT_DIR = Path("/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification")
OUT_DIR.mkdir(parents=True, exist_ok=True)

print("Dossier de sortie :", OUT_DIR)

# Séparateurs de phrase courants（中文）
SENT_SEP = set("。！？!?；;：:")

# --- Utilitaires Unicode / normalisation ---
def normalize(s: str) -> str:
    """Normalisation Unicode (+ nettoyage des espaces atypiques)."""
    if s is None:
        return ""
    s = unicodedata.normalize("NFKC", str(s))
    s = s.replace("\u3000", " ").replace("\u00A0", " ")
    return s.strip()

# Filtre “是汉字吗？”
zh_char_re = re.compile(r"[\u4e00-\u9fff]")

def is_cjk_string(x: str) -> bool:
    """True si la chaîne est composée uniquement de 汉字（CJK 基本区）."""
    return bool(x) and all(zh_char_re.match(ch) for ch in x)

def hanzi_len(x: str) -> int:
    """Nombre de 汉字（仅统计 CJK 基本区字符数）."""
    return sum(1 for ch in x if zh_char_re.match(ch))

# Étape 2 — Charger le lexique de référence & le corpus
# ----------------------------------------------------
# Lexique de référence → ne garder que：纯汉字 & 长度 1~5
ref_set_raw = set()
with IN_REF.open("r", encoding="utf-8") as f:
    for line in f:
        t = normalize(line)
        if t:
            ref_set_raw.add(t)

# 过滤为“仅CJK & 长度 1~5”
ref_by_len = {n: set() for n in range(1, 6)}
for t in ref_set_raw:
    # 仅保留纯汉字（避免夹杂阿拉伯数字/字母/标点）
    if not is_cjk_string(t):
        continue
    L = hanzi_len(t)
    if 1 <= L <= 5:
        ref_by_len[L].add(t)

# 汇总：有效词条总数
ref_total = sum(len(s) for s in ref_by_len.values())
print("Entrées dans la liste de référence (CJK pur, len 1~5) :", ref_total)
for n in range(1, 6):
    print(f" - {n} 字：{len(ref_by_len[n])} 条")

# Corpus (THULAC tokenisé：colonne 'content' = tokens séparés par espaces)
df = pd.read_csv(IN_CORPUS, encoding="utf-8")
if "content" not in df.columns:
    raise ValueError("Le CSV doit contenir une colonne 'content' (texte tokenisé).")

# Normalisation et année
df["content"] = df["content"].fillna("").astype(str).map(normalize)
df["date_parsed"] = pd.to_datetime(df.get("date", None), errors="coerce")
df["year"] = df["date_parsed"].dt.year

print("Corpus chargé :", df.shape, "colonnes =", list(df.columns)[:10], "...")

# Étape 3 — 转为“字符序列”，句内滑窗扫描（只匹配词表）
# ----------------------------------------------------
def to_char_list(text: str):
    """
    从 THULAC 的空格分词文本中提取：句读符 + 纯汉字。
    目的：做“句内滑窗”，避免跨句匹配。
    """
    chars = []
    for ch in text:
        if ch in SENT_SEP:
            chars.append(ch)
        elif zh_char_re.match(ch):
            chars.append(ch)
        # 非 CJK & 非句读 → 忽略（过滤掉空格、字母、数字等）
    return chars

def split_char_sent(chars):
    """按句读符切分为句子（每句为字符列表）。"""
    sents, cur = [], []
    for ch in chars:
        cur.append(ch)
        if ch in SENT_SEP:
            sents.append(cur)
            cur = []
    if cur:
        sents.append(cur)
    return sents

def char_windows_in_sentence(sent_chars, n):
    """产出句内长度为 n 的连续汉字窗口（允许重叠）。"""
    clean = [c for c in sent_chars if c not in SENT_SEP]
    for i in range(len(clean) - n + 1):
        yield "".join(clean[i:i+n])

# Étape 4 — Comptage ciblé（仅词表内 1~5 字）
# ----------------------------------------------------
# 频次 & 文档频次（DF）
freq_by_len = {n: Counter() for n in range(1, 6)}
df_by_len   = {n: Counter() for n in range(1, 6)}
first_year  = {n: {} for n in range(1, 6)}  # {n: {term: year}}

for idx, row in df.iterrows():
    year = row.get("year", np.nan)
    content = row["content"]

    chars = to_char_list(content)
    sents = split_char_sent(chars)

    # 本文档内已出现（用于 DF）
    seen_doc = {n: set() for n in range(1, 6)}

    for sent in sents:
        # 针对每个长度（只在词表内查找，加速）
        for n in range(1, 6):
            if not ref_by_len[n]:
                continue
            for cand in char_windows_in_sentence(sent, n):
                # 只统计“在词表里”的候选
              #  if cand in ref_by_len[n]:   #代码在 滑窗统计时只保留 cand in ref_by_len[n] 的候选，也就是： 它只会统计 已经在参考词表里 的词。
              #     freq_by_len[n][cand] += 1
              #     seen_doc[n].add(cand)

  # 只统计 “不在参考词表里的” 候选
                if cand not in ref_by_len[n]:
                    freq_by_len[n][cand] += 1
                    seen_doc[n].add(cand)

    # 累积 DF + 首见年份
    for n in range(1, 6):
        if not seen_doc[n]:
            continue
        df_by_len[n].update(seen_doc[n])
        if not np.isnan(year):
            y = int(year)
            for t in seen_doc[n]:
                if t not in first_year[n]:
                    first_year[n][t] = y

print("Comptage terminé.")
for n in range(1, 6):
    print(f"  len={n} : {len(freq_by_len[n])} 条有计数")



    # Étape 5 — 构建结果表并导出 CSV（各 1~5 字各一份）
# ----------------------------------------------------
def build_df_for_len(n: int) -> pd.DataFrame:
    rows = []
    fcnt = freq_by_len[n]
    dcnt = df_by_len[n]
    for term, f in fcnt.items():
        rows.append({
            "term": term,
            "n_chars": n,
            "freq": int(f),
            "doc_freq": int(dcnt.get(term, 0)),
            "first_year": first_year[n].get(term, np.nan),
           # "in_reference": True,
           # "status": "in_reference"
            "in_reference": False,
            "status": "new_word"

        })
    if not rows:
        return pd.DataFrame(columns=["term","n_chars","freq","doc_freq","first_year","in_reference","status"])
    out = pd.DataFrame(rows)
    out = out.sort_values(["status","freq","doc_freq","term"], ascending=[True, False, False, True])
    return out.reset_index(drop=True)

# === 导出文件，命名与原始风格保持一致 ===
outputs = {}
for n in range(1, 6):
    df_n = build_df_for_len(n)
    out_path = OUT_DIR / f"chi4_neo_char{n}.csv"   # 修改命名规则
    df_n.to_csv(out_path, index=False, encoding="utf-8-sig")
    outputs[n] = (df_n.shape, out_path)

print("\nExports：")
for n in range(1, 6):
    shape, path_ = outputs[n]
    print(f" - {path_}  (rows={shape[0]})")


Dossier de sortie : /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification
Entrées dans la liste de référence (CJK pur, len 1~5) : 123778
 - 1 字：9069 条
 - 2 字：81210 条
 - 3 字：17524 条
 - 4 字：14880 条
 - 5 字：1095 条
Corpus chargé : (1282, 6) colonnes = ['author', 'title', 'date', 'content', 'date_parsed', 'year'] ...
Comptage terminé.
  len=1 : 327 条有计数
  len=2 : 254343 条有计数
  len=3 : 770010 条有计数
  len=4 : 1064463 条有计数
  len=5 : 1182217 条有计数

Exports：
 - /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/chi4_neo_char1.csv  (rows=327)
 - /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/chi4_neo_char2.csv  (rows=254343)
 - /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/chi4_neo_char3.csv  (rows=770010)
 - /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/chi4_neo_char4.csv  (rows=1064463)
 - /content/drive/MyDrive/Colab Notebooks/S

#En vietnamien

In [None]:
#Étape 1 — Imports & paramètres
from pathlib import Path
import pandas as pd
from collections import Counter, defaultdict
import unicodedata, re

# === Chemins (Vietnamien) ===
IN_REF = Path("/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/liste de référence/all2004_2015_vi copie.txt")
IN_CORPUS = Path("/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/Token/vi_all_data_2015_2025_tokenized.csv")
OUT_DIR = Path("/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification")
OUT_DIR.mkdir(parents=True, exist_ok=True)

print("Dossier de sortie :", OUT_DIR)


Dossier de sortie : /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification


In [None]:
#Étape 2 — Charger lexique & corpus


# Lexique de référence
ref_set = set()
with IN_REF.open("r", encoding="utf-8") as f:
    for line in f:
        t = line.strip()
        if t:
            ref_set.add(t)
print(f"Entrées dans la liste de référence : {len(ref_set)}")

# Corpus
df = pd.read_csv(IN_CORPUS, encoding="utf-8")
if "content" not in df.columns:
    raise ValueError("Le CSV doit contenir une colonne 'content'")

df["date_parsed"] = pd.to_datetime(df.get("date", None), errors="coerce")
df["year"] = df["date_parsed"].dt.year
df["content"] = df["content"].fillna("").astype(str)
print(df.shape, df.columns.tolist())
df.head(2)


Entrées dans la liste de référence : 73438
(758, 10) ['author', 'title', 'date', 'content', 'url', 'source', 'year_hint', 'date_parsed', 'content_tokenized', 'year']


  df["date_parsed"] = pd.to_datetime(df.get("date", None), errors="coerce")


Unnamed: 0,author,title,date,content,url,source,year_hint,date_parsed,content_tokenized,year
0,,Những bài học của Tesla và Big Tech trong năm ...,31/12/2022 10:03 AM,Công thức 20 năm\nTheo tờ Financial Times (FT)...,,cafebize,2022,2022-12-31 10:03:00,Công_thức 20 năm Theo tờ Financial_Times ( FT ...,2022.0
1,,Những bài học của Tesla và Big Tech trong năm ...,31/12/2022 10:03 AM,Công thức 20 năm\nTheo tờ Financial Times (FT)...,,cafebize,2022,2022-12-31 10:03:00,Công_thức 20 năm Theo tờ Financial_Times ( FT ...,2022.0


In [None]:
#Étape 3 — Fonctions de nettoyage (tokens)


# ponctuation
def is_punctuation(token: str) -> bool:
    return all(unicodedata.category(ch).startswith("P") or ch.isspace() for ch in token)

# chiffres (0-9 et variantes)
def has_any_digit(token: str) -> bool:
    return any(ch.isdigit() for ch in token)

# pourcentages : "50%", "50,5%", "50.5%"
percent_pattern = re.compile(r"^\d+(?:[.,]\d+)?%$")
def is_percentage(token: str) -> bool:
    return bool(percent_pattern.match(token))

def bad_token(tok):
    return (not tok) or is_punctuation(tok) or has_any_digit(tok) or is_percentage(tok)


In [None]:
#Étape 4 — Comptage fréquence & première apparition

freq = Counter()
first_year = {}

for _, row in df.iterrows():
    year = row["year"]
    tokens = [tok for tok in row["content"].split() if not bad_token(tok)]
    freq.update(tokens)
    if pd.isna(year):
        continue
    y = int(year)
    for tok in tokens:
        if tok not in first_year:
            first_year[tok] = y



In [None]:
#Étape 5 — Construire le tableau resultat

rows = []
for tok, f in freq.items():
    in_ref = tok in ref_set
    rows.append({
        "term": tok,
        "frequency": int(f),
        "first_year": first_year.get(tok, None),
        "in_reference": in_ref,
        "status": "in_reference" if in_ref else "new"
    })

result = pd.DataFrame(rows).sort_values(
    by=["status","frequency","first_year","term"],
    ascending=[True,False,True,True]
).reset_index(drop=True)

new_df = result[result["status"]=="new"].copy()
old_df = result[result["status"]=="in_reference"].copy()

# Create res_uni, res_bi, and res_tri for Vietnamese section
# Since only unigrams were calculated, res_bi and res_tri will be empty DataFrames with appropriate columns
res_uni = result.copy() # All results in 'result' are unigrams in this section

# Define columns for bi and tri grams based on the Chinese section output structure
bi_cols = ["term","n_words","freq","doc_freq","PMI","LRent","in_reference","status"]
tri_cols = ["term","n_words","freq","doc_freq","PMI","LRent","in_reference","status"]

res_bi = pd.DataFrame(columns=bi_cols)
res_tri = pd.DataFrame(columns=tri_cols)

In [None]:
#Étape 6 — Sauvegarde



# niveau mot
res_uni.to_csv(OUT_DIR/"vi2_neo_uni.csv", index=False, encoding="utf-8-sig")
# Use the defined empty dataframes for bi and tri
res_bi.to_csv( OUT_DIR/"vi2_neo_bi.csv",  index=False, encoding="utf-8-sig")
res_tri.to_csv(OUT_DIR/"vi2_neo_tri.csv", index=False, encoding="utf-8-sig")

print("Export (Vietnamien, niveau mot) :")
print(" -", OUT_DIR/"vi2_neo_uni.csv")
print(" -", OUT_DIR/"vi2_neo_bi.csv")
print(" -", OUT_DIR/"vi2_neo_tri.csv")

Export (Vietnamien, niveau mot) :
 - /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/vi2_neo_uni.csv
 - /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/vi2_neo_bi.csv
 - /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/vi2_neo_tri.csv


In [None]:
#Étape 7 — Vérification rapide

result.head(20)


Unnamed: 0,term,frequency,first_year,in_reference,status
0,của,8168,2022.0,True,in_reference
1,và,7739,2022.0,True,in_reference
2,là,6481,2022.0,True,in_reference
3,có,6140,2022.0,True,in_reference
4,trong,5079,2022.0,True,in_reference
5,với,4883,2022.0,True,in_reference
6,các,4881,2022.0,True,in_reference
7,được,4808,2022.0,True,in_reference
8,một,4425,2022.0,True,in_reference
9,những,4104,2022.0,True,in_reference


#Méthode amélioration 3

In [3]:
#Étape 1 – Préparer les chemins et importer les modules

from pathlib import Path
import pandas as pd
from collections import Counter, defaultdict
import unicodedata, re, math

# =========================
# Config (Vietnamien)
# =========================
IN_REF = Path("/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/liste de référence/all2004_2015_vi copie.txt")
IN_CORPUS = Path("/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/Token/vi_all_data_2015_2025_tokenized.csv")
OUT_DIR = Path("/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Seuils (ajustables selon ton corpus vietnamien)
MINF_UNI, MINDF_UNI = 5, 3
MINF_BI,  MINDF_BI  = 5, 3
MINF_TRI, MINDF_TRI = 3, 2
PMI2_MIN, PMI3_MIN  = 3.0, 2.5
ENT_MIN             = 2.0
ABSORB_RATIO        = 0.80
SENT_SEP = set(".!?;:")  # ponctuation vietnamienne après tokenisation


In [4]:
#Étape 2 – Charger les données


# Référence
ref_set = set()
with IN_REF.open("r", encoding="utf-8") as f:
    for line in f:
        t = line.strip()
        if t:
            ref_set.add(t)
print(f"Nb termes référence : {len(ref_set)}")

# Corpus
df = pd.read_csv(IN_CORPUS, encoding="utf-8")
df["date_parsed"] = pd.to_datetime(df["date"], errors="coerce")
df = df.sort_values("date_parsed").reset_index(drop=True)
df["year"] = df["date_parsed"].dt.year
df["content"] = df["content"].fillna("").astype(str)


Nb termes référence : 73438


  df["date_parsed"] = pd.to_datetime(df["date"], errors="coerce")


In [5]:
#Étape 3 – Fonctions de nettoyage pour le vietnamien

def is_punct_token(tok):
    return all(unicodedata.category(ch).startswith("P") or ch.isspace() for ch in tok)

# Nombres (ex : 2025, 3,14)
num_pat = re.compile(r"^[0-9]+([.,][0-9]+)?$")
def is_number(tok):
    return bool(num_pat.match(tok))

# Pourcentage (50%, 12,3%)
percent_pat = re.compile(r"^[0-9]+([.,][0-9]+)?%$")
def is_percent(tok):
    return bool(percent_pat.match(tok))

def bad_token(tok):
    if not tok:
        return True
    if is_punct_token(tok) or is_number(tok) or is_percent(tok):
        return True
    return False


In [6]:
#Étape 4 – Compter les n-grammes

uni_freq, bi_freq, tri_freq = Counter(), Counter(), Counter()
uni_df,   bi_df,   tri_df   = Counter(), Counter(), Counter()
first_year = {}

for _, row in df.iterrows():
    year = row["year"]
    tokens = [t for t in row["content"].split() if not bad_token(t)]

    seen_uni, seen_bi, seen_tri = set(), set(), set()

    for i,w in enumerate(tokens):
        uni_freq[w]+=1
        seen_uni.add(w)
        if i<len(tokens)-1:
            bigram = f"{tokens[i]} {tokens[i+1]}"
            bi_freq[bigram]+=1
            seen_bi.add(bigram)
        if i<len(tokens)-2:
            trigram = f"{tokens[i]} {tokens[i+1]} {tokens[i+2]}"
            tri_freq[trigram]+=1
            seen_tri.add(trigram)

    uni_df.update(seen_uni)
    bi_df.update(seen_bi)
    tri_df.update(seen_tri)

    if pd.notna(year):
        y=int(year)
        for w in seen_uni:
            if w not in first_year:
                first_year[w]=y


In [7]:
#Étape 5 – Fonctions PMI et entropie

N_uni, N_bi, N_tri = sum(uni_freq.values()), sum(bi_freq.values()), sum(tri_freq.values())
EPS=1e-12

def p_uni(w): return (uni_freq[w]+1)/(N_uni+len(uni_freq))
def p_bi(ng): return (bi_freq[ng]+1)/(N_bi+len(bi_freq))
def p_tri(ng): return (tri_freq[ng]+1)/(N_tri+len(tri_freq))

def PMI2(ng):
    a,b=ng.split()
    return math.log((p_bi(ng))/(p_uni(a)*p_uni(b))+EPS)

def PMI3(ng):
    a,b,c=ng.split()
    return math.log((p_tri(ng))/(p_uni(a)*p_uni(b)*p_uni(c))+EPS)

# Contexte gauche/droite
left_ctx, right_ctx = defaultdict(Counter), defaultdict(Counter)
for _, row in df.iterrows():
    toks=[t for t in row["content"].split() if not bad_token(t)]
    for i,w in enumerate(toks):
        if i>0: left_ctx[w][toks[i-1]]+=1
        if i<len(toks)-1: right_ctx[w][toks[i+1]]+=1

def entropy(counter):
    total=sum(counter.values()) or 1
    return -sum((c/total)*math.log((c/total)+EPS) for c in counter.values())

def lr_entropy_ng(ng):
    parts=ng.split()
    return entropy(left_ctx[parts[0]])+entropy(right_ctx[parts[-1]])


In [8]:

# Étape 6 – Sélection des candidats (只保留新词)

def select_unigram():
    rows = []
    for w, f in uni_freq.items():
        if f < MINF_UNI or uni_df[w] < MINDF_UNI:
            continue
        in_ref = (w in ref_set)
        if in_ref:   #  跳过参考表里已有的
            continue
        rows.append({
            "term": w, "n": 1, "freq": f, "doc_freq": uni_df[w],
            "first_year": first_year.get(w),
            "in_reference": False,
            "status": "new"
        })
    return pd.DataFrame(rows)


def select_bigram():
    rows = []
    for ng, f in bi_freq.items():
        if f < MINF_BI or bi_df[ng] < MINDF_BI:
            continue
        pmi = PMI2(ng); ent = lr_entropy_ng(ng)
        if pmi < PMI2_MIN or ent < ENT_MIN:
            continue
        in_ref = (ng in ref_set)
        if in_ref:   #  跳过参考表里已有的
            continue
        rows.append({
            "term": ng, "n": 2, "freq": f, "doc_freq": bi_df[ng],
            "PMI": round(pmi, 3), "LRent": round(ent, 3),
            "in_reference": False,
            "status": "new"
        })
    return pd.DataFrame(rows)


def select_trigram():
    rows = []
    for ng, f in tri_freq.items():
        if f < MINF_TRI or tri_df[ng] < MINDF_TRI:
            continue
        pmi = PMI3(ng); ent = lr_entropy_ng(ng)
        if pmi < PMI3_MIN or ent < ENT_MIN:
            continue
        in_ref = (ng in ref_set)
        if in_ref:   # 跳过参考表里已有的
            continue
        rows.append({
            "term": ng, "n": 3, "freq": f, "doc_freq": tri_df[ng],
            "PMI": round(pmi, 3), "LRent": round(ent, 3),
            "in_reference": False,
            "status": "new"
        })
    return pd.DataFrame(rows)



In [9]:
#Étape 7 – Export
res_uni=select_unigram()
res_bi=select_bigram()
res_tri=select_trigram()

res_uni.to_csv(OUT_DIR/"vi3_neo_uni.csv",index=False,encoding="utf-8-sig")
res_bi.to_csv(OUT_DIR/"vi3_neo_bi.csv",index=False,encoding="utf-8-sig")
res_tri.to_csv(OUT_DIR/"vi3_neo_tri.csv",index=False,encoding="utf-8-sig")

print("Export terminé :")
print(" -", OUT_DIR/"vi3_neo_uni.csv")
print(" -", OUT_DIR/"vi3_neo_bi.csv")
print(" -", OUT_DIR/"vi3_neo_tri.csv")


Export terminé :
 - /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/vi3_neo_uni.csv
 - /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/vi3_neo_bi.csv
 - /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/vi3_neo_tri.csv


4 Gramme & 5 Gramme

In [10]:
# ============================================================
# Détection candidats (vi) en 1..5-grammes avec PMI & Entropie
# ============================================================

from pathlib import Path
import pandas as pd
from collections import Counter, defaultdict
import unicodedata, re, math

# =========================
# Config (Vietnamien)
# =========================
IN_REF = Path("/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/liste de référence/all2004_2015_vi copie.txt")
IN_CORPUS = Path("/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/Token/vi_all_data_2015_2025_tokenized.csv")
OUT_DIR = Path("/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Seuils (ajustables selon ton corpus)
MINF_UNI, MINDF_UNI = 5, 3       # 1-gram
MINF_BI,  MINDF_BI  = 5, 3       # 2-gram
MINF_TRI, MINDF_TRI = 3, 2       # 3-gram
MINF_QUA, MINDF_QUA = 2, 2       # 4-gram
MINF_PEN, MINDF_PEN = 2, 2       # 5-gram

PMI2_MIN, PMI3_MIN, PMI4_MIN, PMI5_MIN = 3.0, 2.5, 2.0, 1.5
ENT_MIN = 2.0
EPS = 1e-12

print("Dossier de sortie :", OUT_DIR)

# =========================
# Étape 2 – Charger les données
# =========================

# Référence
ref_set = set()
with IN_REF.open("r", encoding="utf-8") as f:
    for line in f:
        t = line.strip()
        if t:
            ref_set.add(t)
print(f"Nb termes dans la liste de référence : {len(ref_set)}")

# Corpus (supposé tokenisé : content = mots séparés par espaces)
df = pd.read_csv(IN_CORPUS, encoding="utf-8")
if not {"content", "date"}.issubset(df.columns):
    raise ValueError("Le CSV doit contenir les colonnes 'content' et 'date'.")

df["date_parsed"] = pd.to_datetime(df["date"], errors="coerce")
df = df.sort_values("date_parsed").reset_index(drop=True)
df["year"] = df["date_parsed"].dt.year
df["content"] = df["content"].fillna("").astype(str)

# =========================
# Étape 3 – Nettoyage (VI)
# =========================

def is_punct_token(tok: str) -> bool:
    # Un token est « ponctuation » s'il n'est composé que de ponctuation / espaces
    return all(unicodedata.category(ch).startswith("P") or ch.isspace() for ch in tok)

# Nombres (ex: 2025, 3,14)
num_pat = re.compile(r"^[0-9]+([.,][0-9]+)?$")
def is_number(tok: str) -> bool:
    return bool(num_pat.match(tok))

# Pourcentage (ex: 50%, 12,3%)
percent_pat = re.compile(r"^[0-9]+([.,][0-9]+)?%$")
def is_percent(tok: str) -> bool:
    return bool(percent_pat.match(tok))

# URL / mentions (optionnel, souvent utile)
url_pat = re.compile(r"https?://|www\.", re.I)
mention_pat = re.compile(r"^[@#]")

def bad_token(tok: str) -> bool:
    if not tok:
        return True
    if is_punct_token(tok) or is_number(tok) or is_percent(tok):
        return True
    if url_pat.search(tok) or mention_pat.search(tok):
        return True
    return False

# =========================
# Étape 4 – Compter les n-grammes (1..5)
# =========================
uni_freq, bi_freq, tri_freq, quad_freq, penta_freq = Counter(), Counter(), Counter(), Counter(), Counter()
uni_df,   bi_df,   tri_df,   quad_df,   penta_df   = Counter(), Counter(), Counter(), Counter(), Counter()
first_year = {}  # pour unigrams

for _, row in df.iterrows():
    year = row["year"]
    tokens = [t for t in row["content"].split() if not bad_token(t)]

    seen_uni, seen_bi, seen_tri, seen_quad, seen_penta = set(), set(), set(), set(), set()

    for i, w in enumerate(tokens):
        # 1-gram
        uni_freq[w] += 1
        seen_uni.add(w)

        # 2-gram
        if i < len(tokens) - 1:
            bigram = f"{tokens[i]} {tokens[i+1]}"
            bi_freq[bigram] += 1
            seen_bi.add(bigram)

        # 3-gram
        if i < len(tokens) - 2:
            trigram = f"{tokens[i]} {tokens[i+1]} {tokens[i+2]}"
            tri_freq[trigram] += 1
            seen_tri.add(trigram)

        # 4-gram
        if i < len(tokens) - 3:
            quadgram = f"{tokens[i]} {tokens[i+1]} {tokens[i+2]} {tokens[i+3]}"
            quad_freq[quadgram] += 1
            seen_quad.add(quadgram)

        # 5-gram
        if i < len(tokens) - 4:
            pentagram = f"{tokens[i]} {tokens[i+1]} {tokens[i+2]} {tokens[i+3]} {tokens[i+4]}"
            penta_freq[pentagram] += 1
            seen_penta.add(pentagram)

    # DF (document frequency)
    uni_df.update(seen_uni)
    bi_df.update(seen_bi)
    tri_df.update(seen_tri)
    quad_df.update(seen_quad)
    penta_df.update(seen_penta)

    # première année d'apparition (unigrammes)
    if pd.notna(year):
        y = int(year)
        for w in seen_uni:
            if w not in first_year:
                first_year[w] = y

print("Comptage terminé :",
      f"uni={len(uni_freq)}, bi={len(bi_freq)}, tri={len(tri_freq)}, quad={len(quad_freq)}, penta={len(penta_freq)}")

# =========================
# Étape 5 – Probabilités & PMI & Entropie
# =========================
N_uni = sum(uni_freq.values()); V_uni = len(uni_freq)
N_bi  = sum(bi_freq.values());  V_bi  = len(bi_freq)
N_tri = sum(tri_freq.values()); V_tri = len(tri_freq)
N_qua = sum(quad_freq.values()); V_qua = len(quad_freq)
N_pen = sum(penta_freq.values()); V_pen = len(penta_freq)

ALPHA = 1.0  # lissage Laplace

def p_uni(w):   return (uni_freq[w]  + ALPHA) / (max(N_uni, 0) + ALPHA * max(V_uni, 1))
def p_bi(ng):   return (bi_freq[ng]  + ALPHA) / (max(N_bi,  0) + ALPHA * max(V_bi,  1))
def p_tri(ng):  return (tri_freq[ng] + ALPHA) / (max(N_tri, 0) + ALPHA * max(V_tri, 1))
def p_qua(ng):  return (quad_freq[ng]+ ALPHA) / (max(N_qua, 0) + ALPHA * max(V_qua, 1))
def p_pen(ng):  return (penta_freq[ng]+ALPHA) / (max(N_pen, 0) + ALPHA * max(V_pen, 1))

def PMI2(ng):
    a, b = ng.split()
    return math.log((p_bi(ng)) / (p_uni(a) * p_uni(b)) + EPS)

def PMI3(ng):
    a, b, c = ng.split()
    return math.log((p_tri(ng)) / (p_uni(a) * p_uni(b) * p_uni(c)) + EPS)

def PMI4(ng):
    a, b, c, d = ng.split()
    return math.log((p_qua(ng)) / (p_uni(a) * p_uni(b) * p_uni(c) * p_uni(d)) + EPS)

def PMI5(ng):
    a, b, c, d, e = ng.split()
    return math.log((p_pen(ng)) / (p_uni(a) * p_uni(b) * p_uni(c) * p_uni(d) * p_uni(e)) + EPS)

# Contexte gauche/droite (pour entropie)
left_ctx, right_ctx = defaultdict(Counter), defaultdict(Counter)
for _, row in df.iterrows():
    toks = [t for t in row["content"].split() if not bad_token(t)]
    for i, w in enumerate(toks):
        if i > 0:
            left_ctx[w][toks[i-1]] += 1
        if i < len(toks) - 1:
            right_ctx[w][toks[i+1]] += 1

def entropy(counter: Counter) -> float:
    total = sum(counter.values())
    if total <= 0:
        return 0.0
    H = 0.0
    for c in counter.values():
        p = c / total
        H -= p * math.log(p + EPS)
    return H

def lr_entropy_ng(ng: str) -> float:
    parts = ng.split()
    # entropie gauche du 1er token + entropie droite du dernier token
    return entropy(left_ctx[parts[0]]) + entropy(right_ctx[parts[-1]])


# =========================
# Étape 6 – Sélection (1..5) 只保留新词
# =========================
def select_unigram():
    rows = []
    for w, f in uni_freq.items():
        if f < MINF_UNI or uni_df[w] < MINDF_UNI:
            continue
        in_ref = (w in ref_set)
        if in_ref:   # 跳过参考表里已有的
            continue
        rows.append({
            "term": w, "n": 1, "freq": int(f), "doc_freq": int(uni_df[w]),
            "first_year": first_year.get(w),
            "in_reference": False, "status": "new"
        })
    df_uni = pd.DataFrame(rows)
    if len(df_uni):
        df_uni = df_uni.sort_values(["freq", "doc_freq"], ascending=[False, False])
    return df_uni


def select_bigram():
    rows = []
    for ng, f in bi_freq.items():
        if f < MINF_BI or bi_df[ng] < MINDF_BI:
            continue
        pmi = PMI2(ng); ent = lr_entropy_ng(ng)
        if pmi < PMI2_MIN or ent < ENT_MIN:
            continue
        in_ref = (ng in ref_set)
        if in_ref:
            continue
        rows.append({
            "term": ng, "n": 2, "freq": int(f), "doc_freq": int(bi_df[ng]),
            "PMI": round(pmi, 3), "LRent": round(ent, 3),
            "in_reference": False, "status": "new"
        })
    df_bi = pd.DataFrame(rows)
    if len(df_bi):
        df_bi = df_bi.sort_values(["PMI", "freq"], ascending=[False, False])
    return df_bi


def select_trigram():
    rows = []
    for ng, f in tri_freq.items():
        if f < MINF_TRI or tri_df[ng] < MINDF_TRI:
            continue
        pmi = PMI3(ng); ent = lr_entropy_ng(ng)
        if pmi < PMI3_MIN or ent < ENT_MIN:
            continue
        in_ref = (ng in ref_set)
        if in_ref:
            continue
        rows.append({
            "term": ng, "n": 3, "freq": int(f), "doc_freq": int(tri_df[ng]),
            "PMI": round(pmi, 3), "LRent": round(ent, 3),
            "in_reference": False, "status": "new"
        })
    df_tri = pd.DataFrame(rows)
    if len(df_tri):
        df_tri = df_tri.sort_values(["PMI", "freq"], ascending=[False, False])
    return df_tri


def select_quadgram():
    rows = []
    for ng, f in quad_freq.items():
        if f < MINF_QUA or quad_df[ng] < MINDF_QUA:
            continue
        pmi = PMI4(ng); ent = lr_entropy_ng(ng)
        if pmi < PMI4_MIN or ent < ENT_MIN:
            continue
        in_ref = (ng in ref_set)
        if in_ref:
            continue
        rows.append({
            "term": ng, "n": 4, "freq": int(f), "doc_freq": int(quad_df[ng]),
            "PMI": round(pmi, 3), "LRent": round(ent, 3),
            "in_reference": False, "status": "new"
        })
    df_qua = pd.DataFrame(rows)
    if len(df_qua):
        df_qua = df_qua.sort_values(["PMI", "freq"], ascending=[False, False])
    return df_qua


def select_pentagram():
    rows = []
    for ng, f in penta_freq.items():
        if f < MINF_PEN or penta_df[ng] < MINDF_PEN:
            continue
        pmi = PMI5(ng); ent = lr_entropy_ng(ng)
        if pmi < PMI5_MIN or ent < ENT_MIN:
            continue
        in_ref = (ng in ref_set)
        if in_ref:
            continue
        rows.append({
            "term": ng, "n": 5, "freq": int(f), "doc_freq": int(penta_df[ng]),
            "PMI": round(pmi, 3), "LRent": round(ent, 3),
            "in_reference": False, "status": "new"
        })
    df_pen = pd.DataFrame(rows)
    if len(df_pen):
        df_pen = df_pen.sort_values(["PMI", "freq"], ascending=[False, False])
    return df_pen


# =========================
# Étape 7 – Exports CSV
# =========================
res_uni  = select_unigram()
res_bi   = select_bigram()
res_tri  = select_trigram()
res_qua  = select_quadgram()
res_pent = select_pentagram()

res_uni.to_csv( OUT_DIR / "vi4_neo_uni.csv",  index=False, encoding="utf-8-sig")
res_bi.to_csv(  OUT_DIR / "vi4_neo_bi.csv",   index=False, encoding="utf-8-sig")
res_tri.to_csv( OUT_DIR / "vi4_neo_tri.csv",  index=False, encoding="utf-8-sig")
res_qua.to_csv( OUT_DIR / "vi4_neo_quad.csv", index=False, encoding="utf-8-sig")
res_pent.to_csv(OUT_DIR / "vi4_neo_penta.csv",index=False, encoding="utf-8-sig")

print("Exports :")
print(" -", OUT_DIR / "vi4_neo_uni.csv")
print(" -", OUT_DIR / "vi4_neo_bi.csv")
print(" -", OUT_DIR / "vi4_neo_tri.csv")
print(" -", OUT_DIR / "vi4_neo_quad.csv")
print(" -", OUT_DIR / "vi4_neo_penta.csv")


Dossier de sortie : /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification
Nb termes dans la liste de référence : 73438


  df["date_parsed"] = pd.to_datetime(df["date"], errors="coerce")


Comptage terminé : uni=31423, bi=230055, tri=431254, quad=520384, penta=547855
Exports :
 - /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/vi4_neo_uni.csv
 - /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/vi4_neo_bi.csv
 - /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/vi4_neo_tri.csv
 - /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/vi4_neo_quad.csv
 - /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/vi4_neo_penta.csv
