In [None]:
# Mount Google Drive so the files under /content/drive are reachable from this runtime.
# NOTE(review): the original heading read "Step 1: merge all CSV files into one file",
# but this cell only mounts Drive; no merging is done here.

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


# En chinois

# Méthode 1 avec Stanza

In [None]:
# -*- coding: utf-8 -*-
from pathlib import Path
import pandas as pd
from collections import Counter

# ========= 配置文件路径 =========
path_refs = Path(r"/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/liste de référence/all_ch.txt")
path_corpus = Path(r"/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/Token/ch_all_data_2015_2025_tok_stanza.csv")

# 输出目录与文件
out_dir = Path(r"/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification")
out_file = out_dir / "ch_néologisme_data.csv"

# 确保输出目录存在
out_dir.mkdir(parents=True, exist_ok=True)

# ========= Load the reference term list =========
# One entry per line. Runs of whitespace inside an entry are collapsed to a single
# space so multi-token entries compare equal to the space-joined corpus n-grams.
# "utf-8-sig" decodes plain UTF-8 identically but also drops a leading byte-order
# mark; with "utf-8" a BOM would stay glued to the first entry, because str.split()
# and str.strip() do not treat U+FEFF as whitespace.
with path_refs.open("r", encoding="utf-8-sig") as f:
    ref_terms = [t for t in (" ".join(line.split()) for line in f) if t]

# De-duplicate while preserving first-seen order (dict keys keep insertion order).
ref_terms = list(dict.fromkeys(ref_terms))

def ntoks(s: str) -> int:
    """Count the whitespace-delimited tokens of ``s`` (0 for an empty or blank string)."""
    tokens = s.split()
    return len(tokens)

# Longest reference entry, measured in tokens (1 when the reference list is empty).
max_len = max((ntoks(entry) for entry in ref_terms), default=1)

# ========= Load the corpus =========
df = pd.read_csv(path_corpus, encoding="utf-8")
if "content" not in set(df.columns):
    raise ValueError("CSV 中没有 'content' 列。")

# Missing cells become empty strings so that every document can be split safely.
content_column = df["content"].fillna("")
contents = content_column.astype(str).tolist()

# ========= Count occurrences of reference terms in the corpus =========
# Only reference terms are looked up afterwards, so only those are stored.
# Keeping one Counter entry per distinct corpus n-gram (for every order up to the
# longest reference entry) grows with the corpus and can exhaust the runtime's memory.
ref_lookup = set(ref_terms)
# n-gram orders that actually occur in the reference list, ascending.
ref_lengths = sorted({len(t.split()) for t in ref_terms})

ngram_counter = Counter()
for text in contents:
    toks = text.split()
    n_tokens = len(toks)
    if n_tokens == 0:
        continue
    for n in ref_lengths:
        if n > n_tokens:
            break  # lengths are sorted: no longer n-gram fits in this document
        for i in range(n_tokens - n + 1):
            ngram = " ".join(toks[i:i + n])
            if ngram in ref_lookup:
                ngram_counter[ngram] += 1

# ========= Build the result table (one row per reference entry) =========
result_columns = ["term", "n_tokens", "frequency", "found"]
rows = []
for term in ref_terms:
    freq = ngram_counter.get(term, 0)
    rows.append({
        "term": term,
        "n_tokens": ntoks(term),
        "frequency": int(freq),
        "found": bool(freq > 0),
    })

# Naming the columns explicitly keeps sort_values() from raising KeyError when
# the reference list is empty and `rows` therefore has no keys to infer from.
result = pd.DataFrame(rows, columns=result_columns).sort_values(
    by=["found", "frequency", "n_tokens", "term"],
    ascending=[False, False, True, True]
).reset_index(drop=True)

# ========= Save =========
# utf-8-sig writes a BOM so that Excel detects the encoding.
result.to_csv(out_file, index=False, encoding="utf-8-sig")

n_missing = sum(1 for r in rows if not r["found"])
print(f"结果已保存到：{out_file}")
print(f"总参考项数：{len(ref_terms)}")
# found == False marks a *reference* entry that never occurs in the corpus. Such
# entries are not neologisms, so the former "新词数量" label was misleading.
print(f"未在语料中出现的参考项数：{n_missing}")


结果已保存到：/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/ch_néologisme_data.csv
总参考项数：126358
新词数量：111120


#méthode 2 avec stanza

In [None]:
#添加对应的年份


# -*- coding: utf-8 -*-
from pathlib import Path
import pandas as pd
from collections import Counter

# ========= 配置文件路径 =========
path_refs = Path(r"/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/liste de référence/all_ch.txt")
path_corpus = Path(r"/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/Token/ch_all_data_2015_2025_tok_stanza.csv")

# 输出目录与文件
out_dir = Path(r"/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification")
out_file = out_dir / "ch_néologisme_data_years.csv"
out_dir.mkdir(parents=True, exist_ok=True)

# ========= Load the reference term list =========
# One entry per line; internal whitespace runs are collapsed to one space.
# "utf-8-sig" reads plain UTF-8 unchanged and additionally strips a leading
# byte-order mark, which would otherwise stay attached to the first entry
# (U+FEFF is not whitespace for str.split()/str.strip()).
with path_refs.open("r", encoding="utf-8-sig") as f:
    ref_terms = [t for t in (" ".join(line.split()) for line in f) if t]

# De-duplicate, keeping the first occurrence of each entry in file order.
ref_terms = list(dict.fromkeys(ref_terms))

def ntoks(s: str) -> int:
    """Return how many whitespace-separated tokens ``s`` contains."""
    pieces = s.split()
    return len(pieces)

# Longest reference entry, measured in tokens (1 when the reference list is empty).
max_len = max((ntoks(entry) for entry in ref_terms), default=1)

# ========= Load the corpus =========
df = pd.read_csv(path_corpus, encoding="utf-8")
needed_columns = {"content", "date"}
if needed_columns - set(df.columns):
    raise ValueError("CSV 中必须有 'content' 和 'date' 列。")

df["content"] = df["content"].fillna("").astype(str)

# Unparseable dates are coerced to NaT, which yields a missing year.
parsed_dates = pd.to_datetime(df["date"], errors="coerce")
df["year"] = parsed_dates.dt.year

# ========= Count reference terms and record their earliest year =========
# Only reference terms are looked up afterwards, so only those are stored; counting
# every corpus n-gram up to the longest reference entry grows with the corpus.
ref_lookup = set(ref_terms)
# n-gram orders that actually occur in the reference list, ascending.
ref_lengths = sorted({len(t.split()) for t in ref_terms})

ngram_counter = Counter()
# term -> earliest year in which it occurs in a dated document
first_year_dict = {}

# Iterate the two needed columns directly; DataFrame.iterrows() builds a Series per row.
for text, year in zip(df["content"], df["year"]):
    toks = text.split()
    n_tokens = len(toks)
    if n_tokens == 0:
        continue
    doc_year = int(year) if pd.notnull(year) else None
    for n in ref_lengths:
        if n > n_tokens:
            break  # lengths are sorted: no longer n-gram fits in this document
        for i in range(n_tokens - n + 1):
            ngram = " ".join(toks[i:i + n])
            if ngram not in ref_lookup:
                continue
            ngram_counter[ngram] += 1
            if doc_year is None:
                continue
            # The corpus is not sorted by date in this cell, so the first document
            # met is not necessarily the oldest: keep the minimum year instead.
            known = first_year_dict.get(ngram)
            if known is None or doc_year < known:
                first_year_dict[ngram] = doc_year

# ========= Build the result table (one row per reference entry) =========
result_columns = ["term", "n_tokens", "frequency", "found", "first_year"]
rows = []
for term in ref_terms:
    freq = ngram_counter.get(term, 0)
    rows.append({
        "term": term,
        "n_tokens": ntoks(term),
        "frequency": int(freq),
        "found": bool(freq > 0),
        # "" (not None) keeps the column as objects, so years are written as
        # "2015" rather than "2015.0".
        "first_year": first_year_dict.get(term, ""),
    })

# Naming the columns explicitly keeps sort_values() from raising KeyError when
# the reference list is empty.
result = pd.DataFrame(rows, columns=result_columns).sort_values(
    by=["found", "frequency", "n_tokens", "term"],
    ascending=[False, False, True, True]
).reset_index(drop=True)

# ========= Save =========
# utf-8-sig writes a BOM so that Excel detects the encoding.
result.to_csv(out_file, index=False, encoding="utf-8-sig")

n_missing = sum(1 for r in rows if not r["found"])
print(f"结果已保存到：{out_file}")
print(f"总参考项数：{len(ref_terms)}")
# found == False marks a *reference* entry that never occurs in the corpus. Such
# entries are not neologisms, so the former "新词数量" label was misleading.
print(f"未在语料中出现的参考项数：{n_missing}")


  df["year"] = pd.to_datetime(df["date"], errors="coerce").dt.year


结果已保存到：/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/ch_néologisme_data_years.csv
总参考项数：126358
新词数量：111120


#méthode 3 avec stanza

---



In [None]:
#参考词汇不在语料库


# -*- coding: utf-8 -*-
from pathlib import Path
import pandas as pd
from collections import Counter

# ========== 配置路径 ==========
# 参考词表（2015 年之前的词典词汇），每行一个词/词组
path_refs = Path(r"/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/liste de référence/all_ch.txt")

# 新闻语料（2015–2025），列包含 author, title, date, content, url
# 其中 content 已经是分词后的空格序列
path_corpus = Path(r"/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/Token/ch_all_data_2015_2025_tok_stanza.csv")




# 输出目录与文件
out_dir = Path(r"/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification")
out_all  = out_dir / "ch_néologisme_data_all.csv"
out_new  = out_dir / "ch_néologisme_new_only.csv"
out_old  = out_dir / "ch_néologisme_in_reference.csv"
out_dir.mkdir(parents=True, exist_ok=True)

# ========== Load the reference word list ==========
# One entry per line; a line containing spaces is kept as a single entry.
# "utf-8-sig" reads plain UTF-8 unchanged and also strips a leading byte-order mark,
# which str.strip() would otherwise leave attached to the first entry.
with path_refs.open("r", encoding="utf-8-sig") as f:
    ref_terms = [t for t in (line.strip() for line in f) if t]

# De-duplicate, keeping first-seen order.
ref_terms = list(dict.fromkeys(ref_terms))
ref_set = set(ref_terms)

# ========== Load the corpus ==========
df = pd.read_csv(path_corpus, encoding="utf-8")

required_cols = {"content", "date"}
if not required_cols.issubset(df.columns):
    raise ValueError(f"CSV 缺少必要列：{required_cols - set(df.columns)}")

# Parse dates (invalid ones become NaT) and sort ascending, so that the first
# document in which a token is met is also its earliest dated occurrence.
df["date_parsed"] = pd.to_datetime(df["date"], errors="coerce")
df = df.sort_values("date_parsed").reset_index(drop=True)
df["year"] = df["date_parsed"].dt.year

# Missing content becomes an empty string so it can be split safely.
df["content"] = df["content"].fillna("").astype(str)

# ========== Corpus token frequencies & first year of occurrence ==========
freq = Counter()
first_year = {}

# Iterate the two needed columns directly; DataFrame.iterrows() builds a Series per row.
for text, year in zip(df["content"], df["year"]):
    tokens = text.split()
    # Undated documents still count towards frequency; they just cannot date a token.
    freq.update(tokens)
    if pd.isna(year):
        continue
    y = int(year)
    for tok in tokens:
        # df is sorted by date ascending, so the first year stored is the earliest.
        first_year.setdefault(tok, y)

# ========== Result table (one row per distinct corpus token) ==========
result_columns = ["term", "frequency", "first_year", "in_reference", "status"]
rows = []
for tok, f in freq.items():
    is_new = tok not in ref_set          # absent from the reference list => new word
    rows.append({
        "term": tok,
        "frequency": int(f),
        "first_year": first_year.get(tok),   # None: only seen in undated documents
        "in_reference": (not is_new),
        "status": "new" if is_new else "in_reference"
    })

# Explicit columns: an empty corpus would otherwise give a column-less frame and
# sort_values() would raise KeyError.
result = pd.DataFrame(rows, columns=result_columns)
# A None among ints makes pandas infer float64, which is written as "2015.0";
# the nullable Int64 dtype keeps whole years and leaves missing ones blank.
result["first_year"] = result["first_year"].astype("Int64")

# New words first, then by descending frequency, then by earliest year.
# "in_reference" sorts before "new" alphabetically, so status must be sorted in
# DESCENDING order to put new words on top (ascending order listed them last).
result = result.sort_values(
    by=["status", "frequency", "first_year", "term"],
    ascending=[False, False, True, True]
).reset_index(drop=True)

# ========== Split and save ==========
new_df = result[result["status"] == "new"].copy()
old_df = result[result["status"] == "in_reference"].copy()

# utf-8-sig writes a BOM so that Excel detects the encoding.
result.to_csv(out_all, index=False, encoding="utf-8-sig")
new_df.to_csv(out_new, index=False, encoding="utf-8-sig")
old_df.to_csv(out_old, index=False, encoding="utf-8-sig")

print(f"总词数（语料词表大小）：{len(result)}")
print(f"新词数（不在参考表）：{len(new_df)}")
print(f"旧词数（在参考表）：{len(old_df)}")
print(f"已保存：\n- {out_all}\n- {out_new}\n- {out_old}")


  df["date_parsed"] = pd.to_datetime(df["date"], errors="coerce")


总词数（语料词表大小）：32811
新词数（不在参考表）：17573
旧词数（在参考表）：15238
已保存：
- /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/ch_néologisme_data_all.csv
- /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/ch_néologisme_new_only.csv
- /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/ch_néologisme_in_reference.csv


# Méthode 1 avec THULAC

In [None]:
# 去掉标点符号和数字，百分比

from pathlib import Path
import pandas as pd
from collections import Counter
import unicodedata
import re

# ========== 配置路径 ==========
path_refs = Path(r"/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/liste de référence/all_ch.txt")
#path_corpus = Path(r"/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/Token/ch_all_data_2015_2025_tok_stanza.csv")
path_corpus = Path(r"/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/Token/ch2_all_data_2015_2025_tok_thulac.csv")

out_dir = Path(r"/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification")
out_all  = out_dir / "ch1_néologisme_data_all.csv"
out_new  = out_dir / "ch1_néologisme_new_only.csv"
out_old  = out_dir / "ch1_néologisme_in_reference.csv"
out_dir.mkdir(parents=True, exist_ok=True)

# ========== Load the reference word list ==========
# One entry per line. "utf-8-sig" reads plain UTF-8 unchanged and also strips a
# leading byte-order mark, which str.strip() would otherwise leave on the first entry.
with path_refs.open("r", encoding="utf-8-sig") as f:
    ref_terms = [t for t in (line.strip() for line in f) if t]

# De-duplicate, keeping first-seen order.
ref_terms = list(dict.fromkeys(ref_terms))
ref_set = set(ref_terms)

# ========== Load the corpus ==========
df = pd.read_csv(path_corpus, encoding="utf-8")

required_cols = {"content", "date"}
if not required_cols.issubset(df.columns):
    raise ValueError(f"CSV 缺少必要列：{required_cols - set(df.columns)}")

# Sort by parsed date (invalid dates become NaT) so that the first document in
# which a token is met is also its earliest dated occurrence.
df["date_parsed"] = pd.to_datetime(df["date"], errors="coerce")
df = df.sort_values("date_parsed").reset_index(drop=True)
df["year"] = df["date_parsed"].dt.year
df["content"] = df["content"].fillna("").astype(str)

# ========== 过滤函数 ==========
# Punctuation filter
def is_punctuation(token):
    """Return True when every character of ``token`` is whitespace or Unicode punctuation (P*).

    An empty token is vacuously True.
    """
    for ch in token:
        if ch.isspace():
            continue
        if unicodedata.category(ch)[0] != "P":
            return False
    return True

# Number filter: ASCII or full-width digits and Chinese numerals (everyday and
# financial forms), optionally split into groups by a decimal point or thousands
# separator, e.g. "2015", "１２", "一百", "3.5", "1,000", "3.5万".
# The former pattern accepted only an unbroken run of these characters, so
# decimals such as "3.5" were kept and reported as candidate words.
_NUM_CHARS = "0-9０-９一二三四五六七八九十百千万亿〇零壹贰叁肆伍陆柒捌玖拾佰仟"
num_pattern = re.compile(rf"^[{_NUM_CHARS}]+(?:[.,．][{_NUM_CHARS}]+)*$")
def is_number(token):
    """Return True when ``token`` consists solely of a numeric expression."""
    return bool(num_pattern.match(token))

# Percentage filter: digits with an optional decimal part followed by "%" or "％"
# ("50%", "3.5%", "５０％"), or the Chinese "百分之…" form, including decimals
# written with "点" ("百分之五", "百分之三点五", "百分之50").
# The former pattern missed decimal and full-width percentages, which therefore
# survived the filter as candidate words.
_PCT_DIGITS = "0-9０-９"
_PCT_CN = "一二三四五六七八九十百千万亿〇零壹贰叁肆伍陆柒捌玖拾佰仟"
percent_pattern = re.compile(
    rf"^(?:[{_PCT_DIGITS}]+(?:[.,．][{_PCT_DIGITS}]+)?[%％]"
    rf"|百分之[{_PCT_DIGITS}{_PCT_CN}点.]+)$"
)
def is_percentage(token):
    """Return True when ``token`` is a percentage expression."""
    return bool(percent_pattern.match(token))

# ========== Token frequencies & first year of occurrence ==========
freq = Counter()
first_year = {}

# Iterate the two needed columns directly; DataFrame.iterrows() builds a Series per row.
for text, year in zip(df["content"], df["year"]):
    # Drop punctuation-only tokens, numbers and percentages.
    tokens = [
        tok for tok in text.split()
        if not (is_punctuation(tok) or is_number(tok) or is_percentage(tok))
    ]
    freq.update(tokens)
    if pd.isna(year):
        continue  # undated documents count towards frequency only
    y = int(year)
    for tok in tokens:
        # df is sorted by date ascending, so the first year stored is the earliest.
        first_year.setdefault(tok, y)

# ========== Result table (one row per distinct corpus token) ==========
result_columns = ["term", "frequency", "first_year", "in_reference", "status"]
rows = []
for tok, f in freq.items():
    is_new = tok not in ref_set
    rows.append({
        "term": tok,
        "frequency": int(f),
        "first_year": first_year.get(tok),   # None: only seen in undated documents
        "in_reference": (not is_new),
        "status": "new" if is_new else "in_reference"
    })

# Explicit columns: an empty corpus would otherwise give a column-less frame and
# sort_values() would raise KeyError.
result = pd.DataFrame(rows, columns=result_columns)
# A None among ints makes pandas infer float64, which is written as "2015.0";
# the nullable Int64 dtype keeps whole years and leaves missing ones blank.
result["first_year"] = result["first_year"].astype("Int64")
# NOTE(review): ascending order on "status" lists "in_reference" rows before "new"
# rows in the combined file — confirm that this is the intended order.
result = result.sort_values(
    by=["status", "frequency", "first_year", "term"],
    ascending=[True, False, True, True]
).reset_index(drop=True)

# ========== Split and save ==========
new_df = result[result["status"] == "new"].copy()
old_df = result[result["status"] == "in_reference"].copy()

# utf-8-sig writes a BOM so that Excel detects the encoding.
result.to_csv(out_all, index=False, encoding="utf-8-sig")
new_df.to_csv(out_new, index=False, encoding="utf-8-sig")
old_df.to_csv(out_old, index=False, encoding="utf-8-sig")

print(f"总词数（去标点+数字+百分比）：{len(result)}")
print(f"新词数：{len(new_df)}")
print(f"旧词数：{len(old_df)}")
print(f"已保存：\n- {out_all}\n- {out_new}\n- {out_old}")


总词数（去标点+数字+百分比）：52674
新词数：29972
旧词数：22702
已保存：
- /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/ch1_néologisme_data_all.csv
- /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/ch1_néologisme_new_only.csv
- /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/ch1_néologisme_in_reference.csv


#méthode 2 avec Thulac

In [None]:
# 去掉标点符号和数字，百分比

from pathlib import Path
import pandas as pd
from collections import Counter
import unicodedata
import re

# ========== 配置路径 ==========
path_refs = Path(r"/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/liste de référence/all_ch.txt")
path_corpus = Path(r"/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/Token/ch2_all_data_2015_2025_tok_thulac.csv")

out_dir = Path(r"/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification")
out_all  = out_dir / "ch2_néologisme_data_all.csv"
out_new  = out_dir / "ch2_néologisme_new_only.csv"
out_old  = out_dir / "ch2_néologisme_in_reference.csv"
out_dir.mkdir(parents=True, exist_ok=True)

# ========== Load the reference word list ==========
# One entry per line. "utf-8-sig" reads plain UTF-8 unchanged and also strips a
# leading byte-order mark, which str.strip() would otherwise leave on the first entry.
with path_refs.open("r", encoding="utf-8-sig") as f:
    ref_terms = [t for t in (line.strip() for line in f) if t]

# De-duplicate, keeping first-seen order.
ref_terms = list(dict.fromkeys(ref_terms))
ref_set = set(ref_terms)

# ========== Load the corpus ==========
df = pd.read_csv(path_corpus, encoding="utf-8")

required_cols = {"content", "date"}
if not required_cols.issubset(df.columns):
    raise ValueError(f"CSV 缺少必要列：{required_cols - set(df.columns)}")

# Sort by parsed date (invalid dates become NaT) so that the first document in
# which a token is met is also its earliest dated occurrence.
df["date_parsed"] = pd.to_datetime(df["date"], errors="coerce")
df = df.sort_values("date_parsed").reset_index(drop=True)
df["year"] = df["date_parsed"].dt.year
df["content"] = df["content"].fillna("").astype(str)

# ========== 过滤函数 ==========
# Punctuation filter
def is_punctuation(token):
    """Return True when every character of ``token`` is whitespace or Unicode punctuation (P*).

    An empty token is vacuously True.
    """
    for ch in token:
        if ch.isspace():
            continue
        if unicodedata.category(ch)[0] != "P":
            return False
    return True

# Number filter: ASCII or full-width digits and Chinese numerals (everyday and
# financial forms), optionally split into groups by a decimal point or thousands
# separator, e.g. "2015", "１２", "一百", "3.5", "1,000", "3.5万".
# The former pattern accepted only an unbroken run of these characters, so
# decimals such as "3.5" were kept and reported as candidate words.
_NUM_CHARS = "0-9０-９一二三四五六七八九十百千万亿〇零壹贰叁肆伍陆柒捌玖拾佰仟"
num_pattern = re.compile(rf"^[{_NUM_CHARS}]+(?:[.,．][{_NUM_CHARS}]+)*$")
def is_number(token):
    """Return True when ``token`` consists solely of a numeric expression."""
    return bool(num_pattern.match(token))

# Percentage filter: digits with an optional decimal part followed by "%" or "％"
# ("50%", "3.5%", "５０％"), or the Chinese "百分之…" form, including decimals
# written with "点" ("百分之五", "百分之三点五", "百分之50").
# The former pattern missed decimal and full-width percentages, which therefore
# survived the filter as candidate words.
_PCT_DIGITS = "0-9０-９"
_PCT_CN = "一二三四五六七八九十百千万亿〇零壹贰叁肆伍陆柒捌玖拾佰仟"
percent_pattern = re.compile(
    rf"^(?:[{_PCT_DIGITS}]+(?:[.,．][{_PCT_DIGITS}]+)?[%％]"
    rf"|百分之[{_PCT_DIGITS}{_PCT_CN}点.]+)$"
)
def is_percentage(token):
    """Return True when ``token`` is a percentage expression."""
    return bool(percent_pattern.match(token))

# ========== Token frequencies & first year of occurrence ==========
freq = Counter()
first_year = {}

# Iterate the two needed columns directly; DataFrame.iterrows() builds a Series per row.
for text, year in zip(df["content"], df["year"]):
    # Drop punctuation-only tokens, numbers and percentages.
    tokens = [
        tok for tok in text.split()
        if not (is_punctuation(tok) or is_number(tok) or is_percentage(tok))
    ]
    freq.update(tokens)
    if pd.isna(year):
        continue  # undated documents count towards frequency only
    y = int(year)
    for tok in tokens:
        # df is sorted by date ascending, so the first year stored is the earliest.
        first_year.setdefault(tok, y)

# ========== Result table (one row per distinct corpus token) ==========
result_columns = ["term", "frequency", "first_year", "in_reference", "status"]
rows = []
for tok, f in freq.items():
    is_new = tok not in ref_set
    rows.append({
        "term": tok,
        "frequency": int(f),
        "first_year": first_year.get(tok),   # None: only seen in undated documents
        "in_reference": (not is_new),
        "status": "new" if is_new else "in_reference"
    })

# Explicit columns: an empty corpus would otherwise give a column-less frame and
# sort_values() would raise KeyError.
result = pd.DataFrame(rows, columns=result_columns)
# A None among ints makes pandas infer float64, which is written as "2015.0";
# the nullable Int64 dtype keeps whole years and leaves missing ones blank.
result["first_year"] = result["first_year"].astype("Int64")
# NOTE(review): ascending order on "status" lists "in_reference" rows before "new"
# rows in the combined file — confirm that this is the intended order.
result = result.sort_values(
    by=["status", "frequency", "first_year", "term"],
    ascending=[True, False, True, True]
).reset_index(drop=True)

# ========== Split and save ==========
new_df = result[result["status"] == "new"].copy()
old_df = result[result["status"] == "in_reference"].copy()

# utf-8-sig writes a BOM so that Excel detects the encoding.
result.to_csv(out_all, index=False, encoding="utf-8-sig")
new_df.to_csv(out_new, index=False, encoding="utf-8-sig")
old_df.to_csv(out_old, index=False, encoding="utf-8-sig")

print(f"总词数（去标点+数字+百分比）：{len(result)}")
print(f"新词数：{len(new_df)}")
print(f"旧词数：{len(old_df)}")
print(f"已保存：\n- {out_all}\n- {out_new}\n- {out_old}")


总词数（去标点+数字+百分比）：52674
新词数：29972
旧词数：22702
已保存：
- /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/ch2_néologisme_data_all.csv
- /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/ch2_néologisme_new_only.csv
- /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/ch2_néologisme_in_reference.csv


#méthode 3 avec Thulac

In [None]:
# 去掉标点符号和数字，百分比

from pathlib import Path
import pandas as pd
from collections import Counter
import unicodedata
import re

# ========== 配置路径 ==========
path_refs = Path(r"/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/liste de référence/all_ch.txt")
path_corpus = Path(r"/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/Token/ch2_all_data_2015_2025_tok_thulac.csv")

out_dir = Path(r"/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification")
out_all  = out_dir / "ch3_néologisme_data_all.csv"
out_new  = out_dir / "ch3_néologisme_new_only.csv"
out_old  = out_dir / "ch3_néologisme_in_reference.csv"
out_dir.mkdir(parents=True, exist_ok=True)

# ========== Load the reference word list ==========
# One entry per line. "utf-8-sig" reads plain UTF-8 unchanged and also strips a
# leading byte-order mark, which str.strip() would otherwise leave on the first entry.
with path_refs.open("r", encoding="utf-8-sig") as f:
    ref_terms = [t for t in (line.strip() for line in f) if t]

# De-duplicate, keeping first-seen order.
ref_terms = list(dict.fromkeys(ref_terms))
ref_set = set(ref_terms)

# ========== Load the corpus ==========
df = pd.read_csv(path_corpus, encoding="utf-8")

required_cols = {"content", "date"}
if not required_cols.issubset(df.columns):
    raise ValueError(f"CSV 缺少必要列：{required_cols - set(df.columns)}")

# Sort by parsed date (invalid dates become NaT) so that the first document in
# which a token is met is also its earliest dated occurrence.
df["date_parsed"] = pd.to_datetime(df["date"], errors="coerce")
df = df.sort_values("date_parsed").reset_index(drop=True)
df["year"] = df["date_parsed"].dt.year
df["content"] = df["content"].fillna("").astype(str)

# ========== 过滤函数 ==========
# Punctuation filter
def is_punctuation(token):
    """Return True when every character of ``token`` is whitespace or Unicode punctuation (P*).

    An empty token is vacuously True.
    """
    for ch in token:
        if ch.isspace():
            continue
        if unicodedata.category(ch)[0] != "P":
            return False
    return True

# Number filter: ASCII or full-width digits and Chinese numerals (everyday and
# financial forms), optionally split into groups by a decimal point or thousands
# separator, e.g. "2015", "１２", "一百", "3.5", "1,000", "3.5万".
# The former pattern accepted only an unbroken run of these characters, so
# decimals such as "3.5" were kept and reported as candidate words.
_NUM_CHARS = "0-9０-９一二三四五六七八九十百千万亿〇零壹贰叁肆伍陆柒捌玖拾佰仟"
num_pattern = re.compile(rf"^[{_NUM_CHARS}]+(?:[.,．][{_NUM_CHARS}]+)*$")
def is_number(token):
    """Return True when ``token`` consists solely of a numeric expression."""
    return bool(num_pattern.match(token))

# Percentage filter: digits with an optional decimal part followed by "%" or "％"
# ("50%", "3.5%", "５０％"), or the Chinese "百分之…" form, including decimals
# written with "点" ("百分之五", "百分之三点五", "百分之50").
# The former pattern missed decimal and full-width percentages, which therefore
# survived the filter as candidate words.
_PCT_DIGITS = "0-9０-９"
_PCT_CN = "一二三四五六七八九十百千万亿〇零壹贰叁肆伍陆柒捌玖拾佰仟"
percent_pattern = re.compile(
    rf"^(?:[{_PCT_DIGITS}]+(?:[.,．][{_PCT_DIGITS}]+)?[%％]"
    rf"|百分之[{_PCT_DIGITS}{_PCT_CN}点.]+)$"
)
def is_percentage(token):
    """Return True when ``token`` is a percentage expression."""
    return bool(percent_pattern.match(token))

# ========== Token frequencies & first year of occurrence ==========
freq = Counter()
first_year = {}

# Iterate the two needed columns directly; DataFrame.iterrows() builds a Series per row.
for text, year in zip(df["content"], df["year"]):
    # Drop punctuation-only tokens, numbers and percentages.
    tokens = [
        tok for tok in text.split()
        if not (is_punctuation(tok) or is_number(tok) or is_percentage(tok))
    ]
    freq.update(tokens)
    if pd.isna(year):
        continue  # undated documents count towards frequency only
    y = int(year)
    for tok in tokens:
        # df is sorted by date ascending, so the first year stored is the earliest.
        first_year.setdefault(tok, y)

# ========== Result table (one row per distinct corpus token) ==========
result_columns = ["term", "frequency", "first_year", "in_reference", "status"]
rows = []
for tok, f in freq.items():
    is_new = tok not in ref_set
    rows.append({
        "term": tok,
        "frequency": int(f),
        "first_year": first_year.get(tok),   # None: only seen in undated documents
        "in_reference": (not is_new),
        "status": "new" if is_new else "in_reference"
    })

# Explicit columns: an empty corpus would otherwise give a column-less frame and
# sort_values() would raise KeyError.
result = pd.DataFrame(rows, columns=result_columns)
# A None among ints makes pandas infer float64, which is written as "2015.0";
# the nullable Int64 dtype keeps whole years and leaves missing ones blank.
result["first_year"] = result["first_year"].astype("Int64")
# NOTE(review): ascending order on "status" lists "in_reference" rows before "new"
# rows in the combined file — confirm that this is the intended order.
result = result.sort_values(
    by=["status", "frequency", "first_year", "term"],
    ascending=[True, False, True, True]
).reset_index(drop=True)

# ========== Split and save ==========
new_df = result[result["status"] == "new"].copy()
old_df = result[result["status"] == "in_reference"].copy()

# utf-8-sig writes a BOM so that Excel detects the encoding.
result.to_csv(out_all, index=False, encoding="utf-8-sig")
new_df.to_csv(out_new, index=False, encoding="utf-8-sig")
old_df.to_csv(out_old, index=False, encoding="utf-8-sig")

print(f"总词数（去标点+数字+百分比）：{len(result)}")
print(f"新词数：{len(new_df)}")
print(f"旧词数：{len(old_df)}")
print(f"已保存：\n- {out_all}\n- {out_new}\n- {out_old}")


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/liste de référence/all_ch.txt'

# Méthode améliorée

1.   Élément de liste
2.   Élément de liste



In [None]:
import time
start = time.time()

from tqdm.auto import tqdm
def log(msg):
    """Print ``msg`` prefixed with the number of seconds elapsed since ``start``."""
    elapsed = time.time() - start
    print(f"[{elapsed:8.2f}s] {msg}")





from pathlib import Path
import pandas as pd
from collections import Counter, defaultdict
import numpy as np
import unicodedata, re, math

# =========================
# Configuration
# =========================
# Reference word list, tokenised corpus CSV and output directory (created if missing).
IN_REF = Path("/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/liste de référence/all_ch.txt")
IN_CORPUS = Path("/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/Token/ch2_all_data_2015_2025_tok_thulac.csv")
OUT_DIR = Path("/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Thresholds (tune them to the corpus)
MINF_UNI, MINDF_UNI = 5, 3     # unigram: minimum term frequency / document frequency
MINF_BI,  MINDF_BI  = 5, 3     # bigram: minimum term frequency / document frequency
MINF_TRI, MINDF_TRI = 3, 2     # trigram: minimum term frequency / document frequency
PMI2_MIN, PMI3_MIN  = 3.5, 2.5 # minimum PMI for bigrams / trigrams
ENT_MIN             = 2.0      # minimum left + right entropy
ABSORB_RATIO        = 0.80     # de-nesting: a bigram is dropped when a covering trigram reaches this share of its frequency
SENT_SEP = set("。！？!?；;：:") # sentence-ending tokens (punctuation that is still present after tokenisation)

STOP = {"的","了","和","与","及","等","在","把","被","对","于","之","其"}  # function words not allowed at either edge of an n-gram

# =========================
# Load the reference word list
# =========================
# One entry per line; blank lines are skipped. "utf-8-sig" reads plain UTF-8
# unchanged and also strips a leading byte-order mark, which str.strip() would
# otherwise leave attached to the first entry.
with IN_REF.open("r", encoding="utf-8-sig") as f:
    ref_set = {t for t in (line.strip() for line in f) if t}

# =========================
# Load the corpus
# Required column: content (space-separated tokens). Optional column: date.
# =========================
df = pd.read_csv(IN_CORPUS, encoding="utf-8")
if "content" not in df.columns:
    raise ValueError("CSV 缺少 'content' 列。")

if "date" in df.columns:
    # Unparseable dates become NaT, i.e. a missing year.
    df["date_parsed"] = pd.to_datetime(df["date"], errors="coerce")
else:
    # `date` is documented as optional, but pd.to_datetime(df.get("date", None))
    # can hand back None for a missing column (pandas-version dependent); the
    # resulting object column has no .dt accessor and the next line would fail.
    # Build an explicit all-NaT datetime column instead.
    df["date_parsed"] = pd.Series(pd.NaT, index=df.index, dtype="datetime64[ns]")
df["year"] = df["date_parsed"].dt.year
df["content"] = df["content"].fillna("").astype(str)

def normalize(s: str) -> str:
    """Apply NFKC folding, turn ideographic spaces into ASCII spaces and trim both ends."""
    folded = unicodedata.normalize("NFKC", s)
    return folded.replace("\u3000", " ").strip()

df["content"] = df["content"].map(normalize)

# Noise filters (same rules as the earlier cells)
def punct_or_space(ch):
    """True when ``ch`` is whitespace or belongs to a Unicode punctuation category (P*)."""
    return ch.isspace() or unicodedata.category(ch).startswith("P")

def is_punct_token(tok):
    """True when ``tok`` holds nothing but punctuation/whitespace (an empty token is True)."""
    for ch in tok:
        if not punct_or_space(ch):
            return False
    return True

# Numeric tokens: ASCII or full-width digits and Chinese numerals (everyday and
# financial forms), optionally split into groups by a decimal point or thousands
# separator ("2015", "一百", "3.5", "1,000", "3.5万"). The former pattern accepted
# only an unbroken run of these characters, so decimals were kept as candidates.
_NUM_CHARS = "0-9０-９一二三四五六七八九十百千万亿〇零壹贰叁肆伍陆柒捌玖拾佰仟"
num_pat = re.compile(rf"^[{_NUM_CHARS}]+(?:[.,．][{_NUM_CHARS}]+)*$")
def is_number(tok):
    """Return True when ``tok`` consists solely of a numeric expression."""
    return bool(num_pat.match(tok))

# Percentages: digits with an optional decimal part followed by "%" or "％", or the
# Chinese "百分之…" form, including decimals written with "点". The former pattern
# missed "3.5%" and "百分之三点五", which were therefore kept as candidates.
_PCT_DIGITS = "0-9０-９"
_PCT_CN = "一二三四五六七八九十百千万亿〇零壹贰叁肆伍陆柒捌玖拾佰仟"
percent_pat = re.compile(
    rf"^(?:[{_PCT_DIGITS}]+(?:[.,．][{_PCT_DIGITS}]+)?[%％]"
    rf"|百分之[{_PCT_DIGITS}{_PCT_CN}点.]+)$"
)
def is_percent(tok):
    """Return True when ``tok`` is a percentage expression."""
    return bool(percent_pat.match(tok))

url_pat = re.compile(r"https?://|www\.", re.I)   # URL fragment anywhere in the token
mention_pat = re.compile(r"^[@#]")                # token starting with @ or #
def bad_token(tok):
    """Return True for tokens that must not become candidates.

    Rejected: empty tokens, punctuation-only tokens, numbers, percentages,
    tokens containing a URL fragment and tokens starting with ``@`` or ``#``.
    """
    if not tok:
        return True
    return bool(
        is_punct_token(tok)
        or is_number(tok)
        or is_percent(tok)
        or url_pat.search(tok)
        or mention_pat.search(tok)
    )

# =========================
# Helper: cut a token list into sentences (n-grams are built per sentence)
# =========================
def split_by_sentence(tokens):
    """Split ``tokens`` into sentences, closing a sentence after each SENT_SEP token.

    The separator stays at the end of its sentence; trailing tokens without a
    separator form a final sentence. Returns a list of token lists.
    """
    tokens = list(tokens)
    sentences = []
    begin = 0
    for pos, tok in enumerate(tokens):
        if tok in SENT_SEP:
            sentences.append(tokens[begin:pos + 1])
            begin = pos + 1
    if begin < len(tokens):
        sentences.append(tokens[begin:])
    return sentences

def bad_edge_ngram(ng):
    """Return True when the first or the last token of the n-gram ``ng`` is in STOP."""
    words = ng.split()
    first, last = words[0], words[-1]
    return first in STOP or last in STOP

# =========================
# Counting pass: term frequency and document frequency for 1/2/3-grams
# =========================
uni_freq, bi_freq, tri_freq = Counter(), Counter(), Counter()
uni_df,   bi_df,   tri_df   = Counter(), Counter(), Counter()
first_year = {}
# year -> {unigram: number of documents of that year containing it}
year_counts_uni = defaultdict(Counter)

# Iterate the two needed columns directly; DataFrame.iterrows() builds a Series per row.
for text, year in zip(df["content"], df["year"]):
    # Split into sentences BEFORE removing noise tokens. bad_token() rejects every
    # punctuation-only token, which includes all SENT_SEP marks; filtering first
    # left split_by_sentence() nothing to split on, so n-grams ran across
    # sentence boundaries.
    sents = split_by_sentence(text.split())

    seen_uni, seen_bi, seen_tri = set(), set(), set()

    for sent in sents:
        clean = [w for w in sent if w not in SENT_SEP and not bad_token(w)]

        # 1-grams
        uni_freq.update(clean)
        seen_uni.update(clean)

        # 2-grams / 3-grams inside the sentence, skipping those with a stop word at an edge
        for n in (2, 3):
            for i in range(len(clean) - n + 1):
                ng = " ".join(clean[i:i + n])
                if bad_edge_ngram(ng):
                    continue
                if n == 2:
                    bi_freq[ng] += 1
                    seen_bi.add(ng)
                else:
                    tri_freq[ng] += 1
                    seen_tri.add(ng)

    # Document frequency: each n-gram counts once per document.
    uni_df.update(seen_uni)
    bi_df.update(seen_bi)
    tri_df.update(seen_tri)

    # First year of occurrence and per-year document counts (unigrams only).
    if pd.notna(year):
        y = int(year)
        for w in seen_uni:
            # The corpus is not sorted by date in this cell, so the first document
            # met is not necessarily the oldest: keep the minimum year.
            known = first_year.get(w)
            if known is None or y < known:
                first_year[w] = y
            year_counts_uni[y][w] += 1

# =========================
# PMI (only for n >= 2)
# =========================
N_uni = sum(uni_freq.values())  # total number of unigram occurrences
def p_uni(w):
    """Add-one smoothed probability of the unigram ``w``."""
    vocab_size = len(uni_freq)
    return (uni_freq[w] + 1) / (N_uni + vocab_size)

def p_bi(ng):
    """Add-one smoothed probability of the bigram ``ng``.

    The total is re-summed from ``bi_freq`` on every call, so it reflects the
    counter's current contents at the cost of a full pass per lookup.
    """
    total = sum(bi_freq.values())
    smoothed = bi_freq[ng] + 1
    return smoothed / (total + len(bi_freq) + 1e-9)

def p_tri(ng):
    """Add-one smoothed probability of the trigram ``ng``.

    The total is re-summed from ``tri_freq`` on every call, so it reflects the
    counter's current contents at the cost of a full pass per lookup.
    """
    total = sum(tri_freq.values())
    smoothed = tri_freq[ng] + 1
    return smoothed / (total + len(tri_freq) + 1e-9)

def PMI2(ng):
    """Pointwise mutual information (natural log) of a two-token n-gram.

    ``ng`` must split into exactly two tokens; 1e-12 guards against log(0).
    """
    first, second = ng.split()
    ratio = p_bi(ng) / (p_uni(first) * p_uni(second))
    return math.log(ratio + 1e-12)

def PMI3(ng):
    """Pointwise mutual information (natural log) of a three-token n-gram.

    ``ng`` must split into exactly three tokens; 1e-12 guards against log(0).
    """
    first, second, third = ng.split()
    ratio = p_tri(ng) / (p_uni(first) * p_uni(second) * p_uni(third))
    return math.log(ratio + 1e-12)

# =========================
# Left/right neighbour counts (input of the left/right entropy)
# =========================
# left_ctx[w][p]  : how often token p immediately precedes token w
# right_ctx[w][q] : how often token q immediately follows token w
# Built from the raw token stream of each document (no noise filtering).
left_ctx, right_ctx = defaultdict(Counter), defaultdict(Counter)
for text in df["content"]:
    toks = text.split()
    for prev_tok, next_tok in zip(toks, toks[1:]):
        left_ctx[next_tok][prev_tok] += 1
        right_ctx[prev_tok][next_tok] += 1

def entropy(counter):
    """Shannon entropy (natural log) of the distribution given by ``counter``'s counts.

    An empty counter yields 0; 1e-12 inside the log guards against log(0).
    """
    counts = counter.values()
    total = sum(counts) or 1
    probs = [c / total for c in counts]
    return -sum(p * math.log(p + 1e-12) for p in probs)

def lr_entropy_ng(ng):
    """Sum of the left entropy of the first token and the right entropy of the last token.

    NOTE(review): the contexts are those of the edge tokens taken alone, not of
    the whole n-gram — confirm this approximation is intended.
    """
    words = ng.split()
    left_h = entropy(left_ctx[words[0]])
    right_h = entropy(right_ctx[words[-1]])
    return left_h + right_h

# =========================
# De-nesting (C-value idea)
# A bigram that mostly occurs inside one trigram is dropped
# =========================
def suppress_nested_bi(bi_freq, tri_freq, ratio=0.8):
    """Drop bigrams that are largely covered by a single trigram.

    For each bigram the highest frequency among the trigrams containing it (as
    first or second adjacent pair) is compared with ``ratio`` times the bigram's
    own frequency; the bigram is kept only when that highest frequency is lower.

    Returns a new Counter with the surviving bigrams and their frequencies.
    """
    strongest_cover = {}
    for tri, tri_count in tri_freq.items():
        words = tri.split()
        for offset in (0, 1):
            inner = " ".join(words[offset:offset + 2])
            if tri_count > strongest_cover.get(inner, 0):
                strongest_cover[inner] = tri_count

    kept = Counter()
    for bi, bi_count in bi_freq.items():
        if strongest_cover.get(bi, 0) < ratio * bi_count:
            kept[bi] = bi_count
    return kept

bi_freq = suppress_nested_bi(bi_freq, tri_freq, ABSORB_RATIO)

# （可选）unigram 被高频 bigram“吸收”时降权/剔除，这里先不做，保留 unigram 的可见性

# =========================
# 选择与打标签（按 n 分表）
# =========================
def select_unigram():
    """Return the unigrams passing the MINF_UNI / MINDF_UNI thresholds as a table.

    Columns: term, n, freq, doc_freq, first_year, in_reference, status
    ("in_reference" or "new"). Rows are sorted by status, then by descending
    frequency and document frequency. An empty selection yields an empty table
    that still carries all the columns.
    """
    columns = ["term", "n", "freq", "doc_freq", "first_year", "in_reference", "status"]
    rows = []
    for w, f in uni_freq.items():
        if f < MINF_UNI or uni_df[w] < MINDF_UNI:
            continue
        in_ref = (w in ref_set)
        rows.append({
            "term": w, "n": 1, "freq": int(f), "doc_freq": int(uni_df[w]),
            "first_year": first_year.get(w), "in_reference": in_ref,
            "status": "in_reference" if in_ref else "new"
        })
    # Explicit columns: with no rows, sort_values() would otherwise raise KeyError.
    table = pd.DataFrame(rows, columns=columns)
    # A missing year among ints makes pandas infer float64 ("2015.0" in the CSV);
    # the nullable Int64 dtype keeps whole years and leaves missing ones blank.
    table["first_year"] = table["first_year"].astype("Int64")
    return table.sort_values(["status", "freq", "doc_freq"], ascending=[True, False, False])

def select_bigram():
    """Return the bigrams passing the frequency, edge, PMI and entropy filters as a table.

    Columns: term, n, freq, doc_freq, PMI, LRent, in_reference, status. Rows are
    sorted by status, then by descending PMI and frequency. An empty selection
    yields an empty table that still carries all the columns.
    """
    columns = ["term", "n", "freq", "doc_freq", "PMI", "LRent", "in_reference", "status"]
    rows = []
    for ng, f in bi_freq.items():
        if f < MINF_BI or bi_df[ng] < MINDF_BI:
            continue
        if bad_edge_ngram(ng):
            continue
        pmi = PMI2(ng)
        ent = lr_entropy_ng(ng)
        if pmi < PMI2_MIN or ent < ENT_MIN:
            continue
        in_ref = (ng in ref_set)
        rows.append({
            "term": ng, "n": 2, "freq": int(f), "doc_freq": int(bi_df[ng]),
            "PMI": round(pmi, 3), "LRent": round(ent, 3),
            "in_reference": in_ref, "status": "in_reference" if in_ref else "new"
        })
    # Explicit columns: with no rows, sort_values() would otherwise raise KeyError.
    table = pd.DataFrame(rows, columns=columns)
    return table.sort_values(["status", "PMI", "freq"], ascending=[True, False, False])

def select_trigram():
    """Return the trigrams passing the frequency, edge, PMI and entropy filters as a table.

    Columns: term, n, freq, doc_freq, PMI, LRent, in_reference, status. Rows are
    sorted by status, then by descending PMI and frequency. An empty selection
    yields an empty table that still carries all the columns, so later
    ``["term"]`` lookups keep working.
    """
    columns = ["term", "n", "freq", "doc_freq", "PMI", "LRent", "in_reference", "status"]
    rows = []
    for ng, f in tri_freq.items():
        if f < MINF_TRI or tri_df[ng] < MINDF_TRI:
            continue
        if bad_edge_ngram(ng):
            continue
        pmi = PMI3(ng)
        ent = lr_entropy_ng(ng)
        if pmi < PMI3_MIN or ent < ENT_MIN:
            continue
        in_ref = (ng in ref_set)
        rows.append({
            "term": ng, "n": 3, "freq": int(f), "doc_freq": int(tri_df[ng]),
            "PMI": round(pmi, 3), "LRent": round(ent, 3),
            "in_reference": in_ref, "status": "in_reference" if in_ref else "new"
        })
    # Explicit columns: with no rows, sort_values() would otherwise raise KeyError.
    table = pd.DataFrame(rows, columns=columns)
    return table.sort_values(["status", "PMI", "freq"], ascending=[True, False, False])

res_uni = select_unigram()
res_bi  = select_bigram()
res_tri = select_trigram()

# =========================
# Four-character expressions take priority: keep trigrams whose tokens total
# four characters and drop the bigrams they cover
# =========================
def is_4char_ng(ng):
    """Return True when the tokens of ``ng`` add up to exactly four characters."""
    return sum(len(part) for part in ng.split()) == 4

# Selected trigrams whose tokens total four characters.
good_tri_4 = {t for t in res_tri["term"] if is_4char_ng(t)}
if good_tri_4:
    def covered_by_4(bi):
        """True when the bigram's characters occur contiguously inside one of those trigrams."""
        needle = "".join(bi.split())
        return any(needle in "".join(tri.split()) for tri in good_tri_4)

    keep_mask = ~res_bi["term"].apply(covered_by_4)
    res_bi = res_bi[keep_mask]

# =========================
# 导出分表 + 合并总表（按 n 优先级：tri > bi > uni）
# =========================
res_uni.to_csv(OUT_DIR/"chi_neo_uni.csv", index=False, encoding="utf-8-sig")
res_bi.to_csv( OUT_DIR/"chi_neo_bi.csv",  index=False, encoding="utf-8-sig")
res_tri.to_csv( OUT_DIR/"chi_neo_tri.csv", index=False, encoding="utf-8-sig")

# Merge the three tables and de-duplicate: a term whose characters are contained
# in an already kept (longer or equal) term is skipped.
chosen = []        # kept rows, longest n-grams first
chosen_norm = []   # space-free form of each kept term, parallel to `chosen`

def norm_chars(s):
    """Return ``s`` with all whitespace removed."""
    return "".join(s.split())

for df_part in [res_tri, res_bi, res_uni]:  # longer n-grams are considered first
    for _, r in df_part.iterrows():
        t_norm = norm_chars(r["term"])
        # A string can only be contained in one that is at least as long, so no
        # explicit length test is needed; the kept forms are normalised once
        # instead of on every comparison.
        if any(t_norm in kept for kept in chosen_norm):
            continue
        chosen.append(r)
        chosen_norm.append(t_norm)

res_final = pd.DataFrame(chosen).reset_index(drop=True)
# One path object for both the write and the report: the file used to be saved as
# "neo_final.csv" while the message below announced "chi_neo_final.csv".
out_final = OUT_DIR/"chi_neo_final.csv"
res_final.to_csv(out_final, index=False, encoding="utf-8-sig")

print("已保存：")
print(" -", OUT_DIR/"chi_neo_uni.csv")
print(" -", OUT_DIR/"chi_neo_bi.csv")
print(" -", OUT_DIR/"chi_neo_tri.csv")
print(" -", out_final)
print(f"候选总数：uni={len(res_uni)}, bi={len(res_bi)}, tri={len(res_tri)}, final={len(res_final)}")


end = time.time()
print(f"总耗时：{end - start:.2f} 秒")


# En vietnamien

In [None]:
# 去掉标点符号 + 含数字的词 + 百分比（越南语版）

from pathlib import Path
import pandas as pd
from collections import Counter
import unicodedata
import re

# ========== 配置路径（越南语） ==========
# 2015 年之前的参考词表（越南语）
path_refs = Path(r"/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/liste de référence/all2004_2015_vi copie.txt")

# 2015–2025 越南语新闻语料（content 已分词：空格分隔）
path_corpus = Path(r"/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/Token/vi_all_data_2015_2025_tokenized.csv")

# 输出目录与文件（越南语前缀）
out_dir = Path(r"/content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification")
out_all  = out_dir / "vi_néologisme_data_all.csv"
out_new  = out_dir / "vi_néologisme_new_only.csv"
out_old  = out_dir / "vi_néologisme_in_reference.csv"
out_dir.mkdir(parents=True, exist_ok=True)

# ========== Load the reference word list ==========
# One entry per line. "utf-8-sig" reads plain UTF-8 unchanged and also strips a
# leading byte-order mark, which str.strip() would otherwise leave on the first entry.
with path_refs.open("r", encoding="utf-8-sig") as f:
    ref_terms = [t for t in (line.strip() for line in f) if t]

# De-duplicate, keeping first-seen order.
ref_terms = list(dict.fromkeys(ref_terms))
ref_set = set(ref_terms)

# ========== Load the corpus ==========
df = pd.read_csv(path_corpus, encoding="utf-8")

required_cols = {"content", "date"}
if not required_cols.issubset(df.columns):
    raise ValueError(f"CSV 缺少必要列：{required_cols - set(df.columns)}")

# Parse dates (invalid ones become NaT) and sort ascending, so that the first
# document in which a token is met is also its earliest dated occurrence.
df["date_parsed"] = pd.to_datetime(df["date"], errors="coerce")
df = df.sort_values("date_parsed").reset_index(drop=True)
df["year"] = df["date_parsed"].dt.year
df["content"] = df["content"].fillna("").astype(str)

# ========== 过滤函数 ==========
# Tokens made only of punctuation (Unicode category P*) and/or whitespace
def is_punctuation(token: str) -> bool:
    """Return True when every character of ``token`` is whitespace or punctuation.

    An empty token is vacuously True.
    """
    for ch in token:
        if ch.isspace():
            continue
        if unicodedata.category(ch)[0] != "P":
            return False
    return True

# Does the token contain any digit character (0-9, full-width digits, ...)?
def has_any_digit(token: str) -> bool:
    """Return True when at least one character of ``token`` satisfies ``str.isdigit()``."""
    for ch in token:
        if ch.isdigit():
            return True
    return False

# Percentages: digits, an optional decimal part (comma or dot), then "%",
# e.g. 50%, 50,5%, 50.5%
percent_pattern = re.compile(r"^\d+(?:[.,]\d+)?%$")
def is_percentage(token: str) -> bool:
    """Return True when ``token`` is a number immediately followed by ``%``."""
    found = percent_pattern.match(token)
    return found is not None

# ========== Token frequencies & first year of occurrence ==========
freq = Counter()
first_year = {}

# Iterate the two needed columns directly; DataFrame.iterrows() builds a Series per row.
for text, year in zip(df["content"], df["year"]):
    # Drop punctuation-only tokens, tokens containing a digit, and percentages.
    tokens = [
        tok for tok in text.split()
        if not (is_punctuation(tok) or has_any_digit(tok) or is_percentage(tok))
    ]
    freq.update(tokens)
    if pd.isna(year):
        continue  # undated documents count towards frequency only
    y = int(year)
    for tok in tokens:
        # df is sorted by date ascending, so the first year stored is the earliest.
        first_year.setdefault(tok, y)

# ========== Result table (one row per distinct corpus token) ==========
# NOTE(review): tokens are compared with the reference entries verbatim. If the
# tokenizer joins syllables with "_" or the two sources differ in letter case or
# Unicode normalisation form, matching entries will be reported as new — confirm.
result_columns = ["term", "frequency", "first_year", "in_reference", "status"]
rows = []
for tok, f in freq.items():
    in_ref = tok in ref_set
    rows.append({
        "term": tok,
        "frequency": int(f),
        "first_year": first_year.get(tok),   # None: only seen in undated documents
        "in_reference": in_ref,              # True = known word, False = new word
        "status": "in_reference" if in_ref else "new"
    })

# Explicit columns: an empty corpus would otherwise give a column-less frame and
# sort_values() would raise KeyError.
result = pd.DataFrame(rows, columns=result_columns)
# A None among ints makes pandas infer float64, which is written as "2015.0";
# the nullable Int64 dtype keeps whole years and leaves missing ones blank.
result["first_year"] = result["first_year"].astype("Int64")
# NOTE(review): ascending order on "status" lists "in_reference" rows before "new"
# rows in the combined file — confirm that this is the intended order.
result = result.sort_values(
    by=["status", "frequency", "first_year", "term"],
    ascending=[True, False, True, True]
).reset_index(drop=True)

# ========== Split and save ==========
new_df = result[result["status"] == "new"].copy()
old_df = result[result["status"] == "in_reference"].copy()

# utf-8-sig writes a BOM so that Excel detects the encoding.
result.to_csv(out_all, index=False, encoding="utf-8-sig")
new_df.to_csv(out_new, index=False, encoding="utf-8-sig")
old_df.to_csv(out_old, index=False, encoding="utf-8-sig")

print(f"总词数（去标点/含数字/百分比）：{len(result)}")
print(f"新词数：{len(new_df)}")
print(f"旧词数：{len(old_df)}")
print(f"已保存：\n- {out_all}\n- {out_new}\n- {out_old}")


  df["date_parsed"] = pd.to_datetime(df["date"], errors="coerce")


总词数（去标点/含数字/百分比）：28083
新词数：24416
旧词数：3667
已保存：
- /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/vi_néologisme_data_all.csv
- /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/vi_néologisme_new_only.csv
- /content/drive/MyDrive/Colab Notebooks/STAGE_CRLAO_CNRS/Corpus_data/identification/vi_néologisme_in_reference.csv


#Version améliorée