In [10]:
#!/usr/bin/env python3
import re
import os
import json
import time
import requests
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# ===== Paths =====
INPUT_CSV  = "/mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/zephyr_7b_beta_all_questions_with_metrics.csv"
OUTPUT_CSV = "/mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/zephyr_7b_beta_all_questions_with_metrics_with_popularity.csv"

# Кэши рядом с OUTPUT
CACHE_LABEL2QID = Path(OUTPUT_CSV).with_suffix(".label2qid.json")
CACHE_SITELINKS = Path(OUTPUT_CSV).with_suffix(".sitelinks_cache.json")

# ===== Settings =====
LANGS = ["en"]   # порядок поиска меток , "ru", "de"
SLEEP_BETWEEN = 0.05         # чтобы не ловить 429
TIMEOUT = 20

QID_RE = re.compile(r"Q\d+")

# ===== HTTP session =====
def build_session():
    sess = requests.Session()
    retries = Retry(
        total=5,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
        respect_retry_after_header=True,
    )
    adapter = HTTPAdapter(max_retries=retries)
    sess.mount("https://", adapter)
    sess.headers.update({
        "User-Agent": "UNLamb-Wikidata/1.0 (contact: youremail@example.com)",
        "Accept": "application/json",
    })
    return sess

# ===== Caching helpers =====
def load_json(path):
    p = Path(path)
    if p.exists():
        try:
            with open(p, "r", encoding="utf-8") as f:
                return json.load(f)
        except Exception:
            return {}
    return {}

def save_json(path, data):
    try:
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False)
    except Exception:
        pass

# ===== QID / sitelinks =====
def extract_qid(val):
    """Если в строке уже есть QID — вернём его, иначе None."""
    if pd.isna(val):
        return None
    s = str(val)
    m = QID_RE.search(s)
    return m.group(0) if m else None

def search_wikidata_qid(label, session, langs=LANGS, timeout=TIMEOUT):
    """
    Ищем QID по текстовому label через wbsearchentities.
    1) пробуем найти точное совпадение label (case-insensitive)
    2) иначе берём первый результат.
    Возвращает QID или None.
    """
    label_stripped = str(label).strip()
    if not label_stripped:
        return None

    # частый кейс: в строке прямо QID
    q = extract_qid(label_stripped)
    if q:
        return q

    for lang in langs:
        params = {
            "action": "wbsearchentities",
            "format": "json",
            "language": lang,
            "uselang": lang,
            "type": "item",
            "search": label_stripped,
            "limit": 5,
        }
        try:
            r = session.get("https://www.wikidata.org/w/api.php", params=params, timeout=timeout)
            r.raise_for_status()
            data = r.json()
            results = data.get("search", []) or []

            if not results:
                continue

            # 1) ищем точное совпадение по label
            lower = label_stripped.lower()
            for item in results:
                if item.get("label", "").lower() == lower:
                    return item.get("id")

            # 2) иначе берём первый
            return results[0].get("id")
        except Exception:
            continue

    return None

def get_sitelinks_count(qid, session, timeout=TIMEOUT):
    """Количество sitelinks по QID."""
    if not qid:
        return 0
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"
    try:
        r = session.get(url, timeout=timeout)
        if r.status_code == 404:
            return 0
        r.raise_for_status()
        data = r.json()
        ent = data.get("entities", {}).get(qid, {})
        return len(ent.get("sitelinks", {}))
    except Exception:
        return 0

# ===== Main =====
def main():
    df = pd.read_csv(INPUT_CSV)

    if "subject" not in df.columns or "object" not in df.columns:
        raise ValueError("Ожидаю колонки 'subject' и 'object' в датасете.")

    session = build_session()

    # Соберём уникальные строки
    subj_vals = df["subject"].fillna("").astype(str)
    obj_vals  = df["object"].fillna("").astype(str)
    unique_labels = sorted(set(subj_vals.tolist() + obj_vals.tolist()))

    # Загрузим кэши
    label2qid = load_json(CACHE_LABEL2QID)
    qid2sitelinks = load_json(CACHE_SITELINKS)

    # 1) Разрешаем label → QID
    to_resolve = [lbl for lbl in unique_labels if lbl and lbl not in label2qid]
    if to_resolve:
        for lbl in tqdm(to_resolve, desc="Resolve labels → QID"):
            qid = search_wikidata_qid(lbl, session)
            label2qid[lbl] = qid or ""  # пустая строка если не нашли
            time.sleep(SLEEP_BETWEEN)
        save_json(CACHE_LABEL2QID, label2qid)

    # Добавим вспомогательные колонки с QID
    df["subject_qid"] = subj_vals.map(lambda s: label2qid.get(s, "") or extract_qid(s) or "")
    df["object_qid"]  = obj_vals.map(lambda s: label2qid.get(s, "") or extract_qid(s) or "")

    # 2) QID → sitelinks
    qids = sorted({q for q in pd.concat([df["subject_qid"], df["object_qid"]]).tolist() if q})
    to_fetch = [q for q in qids if q not in qid2sitelinks]
    if to_fetch:
        for q in tqdm(to_fetch, desc="Fetch sitelinks"):
            qid2sitelinks[q] = get_sitelinks_count(q, session)
            time.sleep(SLEEP_BETWEEN)
        save_json(CACHE_SITELINKS, qid2sitelinks)

    # 3) Преобразуем в счётчики
    df["subject_popularity_sitelinks"] = df["subject_qid"].map(lambda q: int(qid2sitelinks.get(q, 0)))
    df["object_popularity_sitelinks"]  = df["object_qid"].map(lambda q: int(qid2sitelinks.get(q, 0)))
    df["popularity_sitelinks_sum"]     = df["subject_popularity_sitelinks"] + df["object_popularity_sitelinks"]

    # Немного статистики в консоль
    resolved_subject = (df["subject_qid"] != "").mean() * 100
    resolved_object  = (df["object_qid"]  != "").mean() * 100
    print(f"Resolved subject labels to QIDs: {resolved_subject:.1f}%")
    print(f"Resolved object  labels to QIDs: {resolved_object:.1f}%")

    # Сохраняем
    df.to_csv(OUTPUT_CSV, index=False)
    print(f"✅ Saved: {OUTPUT_CSV}")
    print(f"ℹ️ Caches: {CACHE_LABEL2QID}, {CACHE_SITELINKS}")

if __name__ == "__main__":
    main()


Resolve labels → QID: 100%|██████████| 25972/25972 [1:55:12<00:00,  3.76it/s]  
Fetch sitelinks: 100%|██████████| 25857/25857 [3:22:42<00:00,  2.13it/s]  


Resolved subject labels to QIDs: 99.9%
Resolved object  labels to QIDs: 99.9%
✅ Saved: /mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/zephyr_7b_beta_all_questions_with_metrics_with_popularity.csv
ℹ️ Caches: /mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/zephyr_7b_beta_all_questions_with_metrics_with_popularity.label2qid.json, /mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/zephyr_7b_beta_all_questions_with_metrics_with_popularity.sitelinks_cache.json


In [11]:
pd.read_csv('/mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/zephyr_7b_beta_all_questions_with_metrics_with_popularity.csv')

Unnamed: 0,file,question,answer,subject,relation,object,PPL_Llama3_1-8B_Instruct,best_gen_Llama_8b_Instract,gen_recall_Llama_8b_Instract,bert_sim_Llama_8b_Instract,...,bert_sim_Gemma_7b_IT,PPL_Zephyr_7B_Beta,best_gen_Zephyr_7b_Beta,gen_recall_Zephyr_7b_Beta,bert_sim_Zephyr_7b_Beta,subject_qid,object_qid,subject_popularity_sitelinks,object_popularity_sitelinks,popularity_sitelinks_sum
0,technology_programming_language.csv,What does JavaScript has use?,web development,JavaScript,has use,web development,35865.451403,Front-end web development and scripting language,1.000000,0.699027,...,0.665370,7.220331e+09,JavaScript is a scripting language used for cr...,0.000000,0.417127,Q2005,Q386275,157,38,195
1,technology_programming_language.csv,What is the programming paradigm of JavaScript?,aspect-oriented programming,JavaScript,programming paradigm,aspect-oriented programming,200.512601,"Object-Oriented, Functional programming",0.666667,0.639641,...,0.197448,1.977540e+06,JavaScript is a multiparadigm programming,0.333333,0.225238,Q2005,Q30267,157,33,190
2,technology_programming_language.csv,What is the programming paradigm of JavaScript?,event-driven programming,JavaScript,programming paradigm,event-driven programming,236.456111,"Object-oriented, event-driven, and scripting l...",0.666667,0.786737,...,0.416374,8.271734e+07,JavaScript follows a multiparadigm programming,0.333333,0.198329,Q2005,Q1135914,157,29,186
3,technology_programming_language.csv,What is the programming paradigm of JavaScript?,imperative programming,JavaScript,programming paradigm,imperative programming,42.875039,"Multi-paradigm: OOP, imperative,",0.500000,0.574319,...,0.378432,2.660150e+08,JavaScript's programming paradigm is dynamic,0.500000,0.404129,Q2005,Q275596,157,50,207
4,technology_programming_language.csv,What is the programming paradigm of JavaScript?,generic programming,JavaScript,programming paradigm,generic programming,386.563954,Multithreaded object-oriented imperative progr...,0.500000,0.439199,...,0.541070,2.157036e+10,JavaScript is a multi-paradigm,0.000000,0.332423,Q2005,Q1051282,157,33,190
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55350,event_film.csv,What is the location of Moscow Jewish Film Fes...,Moscow,Moscow Jewish Film Festival,location,Moscow,1499.950819,Moscow,1.000000,1.000000,...,0.600137,1.235002e+11,The location of the Moscow Jewish Film Festiva...,1.000000,0.540264,Q30124890,Q649,3,332,335
55351,event_film.csv,What is the location of Moscow Jewish Film Fes...,Moscow,Moscow Jewish Film Festival,location,Moscow,1499.950819,"Tchaikovsky Passage, Moscow, Russia",1.000000,0.577317,...,0.548567,1.235002e+11,The location of the Moscow Jewish Film Festiva...,1.000000,0.540264,Q30124890,Q649,3,332,335
55352,event_film.csv,What is the official language of Huesca Intern...,English,Huesca International Film Festival,official language,English,815.949818,"Spanish, English",1.000000,0.811516,...,1.000000,5.337788e+13,The official language of Huesca International ...,0.000000,0.349974,Q59590889,Q1860,5,392,397
55353,event_film.csv,What is the official language of Huesca Intern...,Spanish,Huesca International Film Festival,official language,Spanish,2148.108094,"Spanish, Aragonese",1.000000,0.714963,...,0.698930,1.346292e+09,The official language of Huesca International ...,0.000000,0.370240,Q59590889,Q1321,5,349,354


In [1]:
#!/usr/bin/env python3
import pandas as pd
from pathlib import Path

# --- входные файлы ---
BASE_CSV  = "/mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/zephyr_7b_beta_all_questions_with_metrics_with_popularity.csv"
DONOR_CSV = "/mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/phi3_5_mini_instruct_all_questions_with_metrics.csv"
OUT_CSV   = "/mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/zephyr_7b_beta_all_questions_WITH_PHI35_cols.csv"

# --- какие колонки забираем из DONOR ---
PHI_COLS = [
    "PPL_Phi3_5_mini_Instruct",
    "best_gen_Phi3_5_mini_Instruct",
    "gen_recall_Phi3_5_mini_Instruct",
    "bert_sim_Phi3_5_mini_Instruct",
]

# ==== load ====
base  = pd.read_csv(BASE_CSV)
donor = pd.read_csv(DONOR_CSV)

# Проверка наличия колонок
for c in PHI_COLS:
    if c not in donor.columns:
        raise ValueError(f"В донорском файле нет колонки: {c}")
if "question" not in base.columns or "answer" not in base.columns:
    raise ValueError("В базовом файле ожидаются колонки 'question' и 'answer'.")
if "question" not in donor.columns or "answer" not in donor.columns:
    raise ValueError("В донорском файле ожидаются колонки 'question' и 'answer'.")

# ==== сначала пробуем прямое выравнивание по индексу ====
use_index_align = False
if len(base) == len(donor):
    # на всякий случай проверим долю совпадений question по позициям
    same_q = (base["question"].astype(str).fillna("") == donor["question"].astype(str).fillna("")).mean()
    # если совпадает >= 95% — считаем, что порядок одинаковый
    if same_q >= 0.95:
        use_index_align = True

if use_index_align:
    # выравнивание по индексу
    for c in PHI_COLS:
        base[c] = donor[c].values
    method = "index"
else:
    # безопасный merge по (question, answer)
    # Сужаем donor до ключей + нужных колонок
    donor_slim = donor[["question", "answer"] + PHI_COLS].copy()

    # Если в donor есть дубликаты по ключу, возьмем первое вхождение
    if donor_slim.duplicated(["question", "answer"]).any():
        donor_slim = donor_slim.drop_duplicates(["question", "answer"], keep="first")

    merged = base.merge(donor_slim, on=["question", "answer"], how="left", suffixes=("", "_donor"))

    # На случай, если какие-то из PHI_COLS уже есть в base — перезапишем их одноимёнными из donor
    for c in PHI_COLS:
        if c in merged.columns:
            # уже пришёл из donor с точным именем
            pass
        elif c + "_donor" in merged.columns:
            merged[c] = merged[c + "_donor"]
            merged.drop(columns=[c + "_donor"], inplace=True)
        else:
            # не найдено после merge — создаём пустую колонку
            merged[c] = pd.NA

    base = merged
    method = "merge(question,answer)"

# ==== отчёт и сохранение ====
added_info = {c: base[c].notna().mean()*100 for c in PHI_COLS}
print(f"Alignment method: {method}")
for c, pct in added_info.items():
    print(f"Filled {c}: {pct:.1f}% rows")

# Сохраняем
Path(OUT_CSV).parent.mkdir(parents=True, exist_ok=True)
base.to_csv(OUT_CSV, index=False)
print(f"✅ Saved: {OUT_CSV}")
base

Alignment method: index
Filled PPL_Phi3_5_mini_Instruct: 100.0% rows
Filled best_gen_Phi3_5_mini_Instruct: 99.9% rows
Filled gen_recall_Phi3_5_mini_Instruct: 100.0% rows
Filled bert_sim_Phi3_5_mini_Instruct: 100.0% rows
✅ Saved: /mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/zephyr_7b_beta_all_questions_WITH_PHI35_cols.csv


Unnamed: 0,file,question,answer,subject,relation,object,PPL_Llama3_1-8B_Instruct,best_gen_Llama_8b_Instract,gen_recall_Llama_8b_Instract,bert_sim_Llama_8b_Instract,...,bert_sim_Zephyr_7b_Beta,subject_qid,object_qid,subject_popularity_sitelinks,object_popularity_sitelinks,popularity_sitelinks_sum,PPL_Phi3_5_mini_Instruct,best_gen_Phi3_5_mini_Instruct,gen_recall_Phi3_5_mini_Instruct,bert_sim_Phi3_5_mini_Instruct
0,technology_programming_language.csv,What does JavaScript has use?,web development,JavaScript,has use,web development,35865.451403,Front-end web development and scripting language,1.000000,0.699027,...,0.417127,Q2005,Q386275,157,38,195,1.109592e+14,Semicolons (;) as statements terminators,0.000000,0.071977
1,technology_programming_language.csv,What is the programming paradigm of JavaScript?,aspect-oriented programming,JavaScript,programming paradigm,aspect-oriented programming,200.512601,"Object-Oriented, Functional programming",0.666667,0.639641,...,0.225238,Q2005,Q30267,157,33,190,4.402190e+09,Object-Oriented Programming (OOP),0.666667,0.630127
2,technology_programming_language.csv,What is the programming paradigm of JavaScript?,event-driven programming,JavaScript,programming paradigm,event-driven programming,236.456111,"Object-oriented, event-driven, and scripting l...",0.666667,0.786737,...,0.198329,Q2005,Q1135914,157,29,186,8.118268e+05,Object-Oriented Programming (OOP),0.333333,0.484787
3,technology_programming_language.csv,What is the programming paradigm of JavaScript?,imperative programming,JavaScript,programming paradigm,imperative programming,42.875039,"Multi-paradigm: OOP, imperative,",0.500000,0.574319,...,0.404129,Q2005,Q275596,157,50,207,1.513656e+11,"Multi-paradigm: imperative,",0.500000,0.566366
4,technology_programming_language.csv,What is the programming paradigm of JavaScript?,generic programming,JavaScript,programming paradigm,generic programming,386.563954,Multithreaded object-oriented imperative progr...,0.500000,0.439199,...,0.332423,Q2005,Q1051282,157,33,190,2.520896e+16,Object-Oriented Programming (OOP),0.500000,0.507858
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55350,event_film.csv,What is the location of Moscow Jewish Film Fes...,Moscow,Moscow Jewish Film Festival,location,Moscow,1499.950819,Moscow,1.000000,1.000000,...,0.540264,Q30124890,Q649,3,332,335,6.292632e+17,"Moscow, Russia",1.000000,0.941657
55351,event_film.csv,What is the location of Moscow Jewish Film Fes...,Moscow,Moscow Jewish Film Festival,location,Moscow,1499.950819,"Tchaikovsky Passage, Moscow, Russia",1.000000,0.577317,...,0.540264,Q30124890,Q649,3,332,335,6.292632e+17,"Moscow, Russia",1.000000,0.941657
55352,event_film.csv,What is the official language of Huesca Intern...,English,Huesca International Film Festival,official language,English,815.949818,"Spanish, English",1.000000,0.811516,...,0.349974,Q59590889,Q1860,5,392,397,2.912514e+17,Spanish is the official language used during t...,0.000000,0.339224
55353,event_film.csv,What is the official language of Huesca Intern...,Spanish,Huesca International Film Festival,official language,Spanish,2148.108094,"Spanish, Aragonese",1.000000,0.714963,...,0.370240,Q59590889,Q1321,5,349,354,1.603046e+17,Spanish,1.000000,1.000000


In [2]:
#!/usr/bin/env python3
import pandas as pd
from pathlib import Path

# ВХОД/ВЫХОД — при необходимости поменяй путь на свой файл
INPUT_CSV  = "/mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/zephyr_7b_beta_all_questions_WITH_PHI35_cols.csv"
OUTPUT_CSV = "/mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/final_QA_triplets.csv"

# Какие колонки унести в конец
TAIL_COLS = [
    "subject_qid",
    "object_qid",
    "subject_popularity_sitelinks",
    "object_popularity_sitelinks",
    "popularity_sitelinks_sum",
]

df = pd.read_csv(INPUT_CSV)

# Оставляем только реально существующие из списка (на случай, если чего-то нет)
tail_existing = [c for c in TAIL_COLS if c in df.columns]

# Новый порядок: все остальные + хвост
new_cols = [c for c in df.columns if c not in tail_existing] + tail_existing
df = df.reindex(columns=new_cols)

# Сортировка по возрастанию popularity_sitelinks_sum (если колонка есть)
if "popularity_sitelinks_sum" in df.columns:
    df = df.sort_values("popularity_sitelinks_sum", ascending=True, kind="mergesort")  # стабильная

# Сохранение
Path(OUTPUT_CSV).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(OUTPUT_CSV, index=False)
print(f"✅ Saved: {OUTPUT_CSV}")


✅ Saved: /mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/final_QA_triplets.csv


In [3]:
df

Unnamed: 0,file,question,answer,subject,relation,object,PPL_Llama3_1-8B_Instruct,best_gen_Llama_8b_Instract,gen_recall_Llama_8b_Instract,bert_sim_Llama_8b_Instract,...,bert_sim_Zephyr_7b_Beta,PPL_Phi3_5_mini_Instruct,best_gen_Phi3_5_mini_Instruct,gen_recall_Phi3_5_mini_Instruct,bert_sim_Phi3_5_mini_Instruct,subject_qid,object_qid,subject_popularity_sitelinks,object_popularity_sitelinks,popularity_sitelinks_sum
932,business_industry.csv,What did book industry said to be the same as?,book publishing,book industry,said to be the same as,book publishing,1865.235713,Publishing was said to be like running a river,0.5,0.512184,...,0.468156,1.587546e+17,Digital distribution platforms Book industry o...,0.50,0.615013,Q56560668,Q112165919,0,0,0
2751,places_city.csv,What did Calais twinned administrative body?,Xiangtan,Calais,twinned administrative body,Xiangtan,46.282780,France – via Lille and Kortrijk (,0.0,0.260455,...,-0.016905,1.862347e+15,Not applicable,0.00,0.205666,Q87300250,Q113491656,0,0,0
13126,places_city.csv,What is the located in the administrative terr...,Tlajomulco de Zúñiga Municipality,San Agustín (Jalisco),located in the administrative territorial entity,Tlajomulco de Zúñiga Municipality,3.066345,Mina Grande de los Alamos municipality in N,0.5,0.356967,...,0.324772,2.733490e+00,"Plaza de Armas San Agustín,",0.25,0.311132,,,0,0,0
15089,health_symptom.csv,What is the drug or therapy used for treatment...,propantheline,spasm,drug or therapy used for treatment,propantheline,5.538411,"Examples include baclofen, dantrolene",0.0,0.279883,...,0.355749,4.429714e+12,"Beta-blockers, muscle relaxants",0.00,0.342819,Q65632660,Q95594627,0,0,0
16992,health_disease.csv,What is the health specialty of sleeprunning?,sleep medicine,sleeprunning,health specialty,sleep medicine,2457.903565,Sleep Medicine,1.0,1.000000,...,0.653341,5.246328e+16,Sleep Medicine,1.00,1.000000,,Q15762248,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27854,places_country.csv,What is the country of Japan?,Japan,Japan,country,Japan,12709.168794,Japan is a country,1.0,0.744425,...,0.636226,4.784381e+14,Japan is a country in East Asia,1.00,0.697189,Q17,Q17,410,410,820
32683,places_country.csv,What is the country of Russia?,Russia,Russia,country,Russia,11398.500244,Russia is a country in Eastern Europe and Nort...,1.0,0.672289,...,0.595697,8.400571e+15,Russia is a transcontinental country in Northe...,1.00,0.605403,Q159,Q159,410,410,820
32742,places_country.csv,What is the diplomatic relation of Russia?,Japan,Russia,diplomatic relation,Japan,1651.651733,Russia has relations with:,0.0,0.302459,...,0.202271,7.640358e+17,"Multifaceted and complex, with partners",0.00,0.091473,Q159,Q17,410,410,820
30641,places_country.csv,What is the diplomatic relation of Turkey?,Japan,Turkey,diplomatic relation,Japan,3224.247831,Turkey has diplomatic relations with 195 UN re...,0.0,0.119216,...,0.057623,2.014042e+18,Relations vary with each country; please speci...,0.00,0.371126,Q43,Q17,414,410,824


In [3]:
import pandas as pd
df = pd.read_csv('/mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/FIXED_final_QA_triplets.csv')

In [4]:
import pandas as pd

# Кол-во пропусков по колонкам
na_counts = df.isna().sum().sort_values(ascending=False)
print("=== Missing values per column (count) ===")
print(na_counts.to_string())

# Дополнительно: с долей в %
missing_report = (
    pd.DataFrame({
        "missing": df.isna().sum(),
        "missing_%": (df.isna().mean() * 100).round(2),
    })
    .sort_values("missing", ascending=False)
)
print("\n=== Missing values per column (count & %) ===")
print(missing_report.to_string())


=== Missing values per column (count) ===
subject_qid                        56
object_qid                         40
file                                0
answer                              0
question                            0
subject                             0
relation                            0
best_gen_Llama_8b_Instract          0
gen_recall_Llama_8b_Instract        0
object                              0
PPL_Llama3_1-8B_Instruct            0
best_gen_Llama_3b_Instract          0
gen_recall_Llama_3b_Instract        0
bert_sim_Llama_3b_Instract          0
PPL_Llama3_2-1B_Instruct            0
best_gen_Llama_1b_Instract          0
gen_recall_Llama_1b_Instract        0
bert_sim_Llama_8b_Instract          0
PPL_Llama3_2-3B_Instruct            0
PPL_Gemma_7B_IT                     0
bert_sim_Llama_1b_Instract          0
best_gen_Gemma_7b_IT                0
gen_recall_Gemma_7b_IT              0
best_gen_Zephyr_7b_Beta             0
gen_recall_Zephyr_7b_Beta           0
bert_sim

In [9]:
#!/usr/bin/env python3
import re
import json
import pandas as pd
from pathlib import Path

# поменяй путь при необходимости
PATH = "/mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/FIXED_final_QA_triplets.csv"

# что считаем пропуском для *_qid
def is_missing_qid(x) -> bool:
    if pd.isna(x):
        return True
    s = str(x).strip().lower()
    return s in {"", "unknown", "uknown", "n/a", "-", "none"}

df = pd.read_csv(PATH)

# базовые проверки
for col in ["subject","object","subject_qid","object_qid"]:
    if col not in df.columns:
        raise ValueError(f"Колонка отсутствует: {col}")

m_subj = df["subject_qid"].apply(is_missing_qid)
m_obj  = df["object_qid"].apply(is_missing_qid)

print("=== Missing counts ===")
print(f"subject_qid: {int(m_subj.sum())}")
print(f"object_qid : {int(m_obj.sum())}")

# ------- детальные таблицы по строкам -------
subj_missing_rows = (
    df.loc[m_subj, ["subject","subject_qid"]]
      .reset_index()
      .rename(columns={"index":"row_id"})
)
obj_missing_rows = (
    df.loc[m_obj, ["object","object_qid"]]
      .reset_index()
      .rename(columns={"index":"row_id"})
)

print("\n=== First few missing SUBJECT rows ===")
print(subj_missing_rows.head(10).to_string(index=False))
print("\n=== First few missing OBJECT rows ===")
print(obj_missing_rows.head(10).to_string(index=False))

# ------- уникальные значения + частоты -------
subj_unique = (
    df.loc[m_subj, "subject"]
      .astype(str).str.strip()
      .value_counts(dropna=False)
      .rename_axis("subject")
      .reset_index(name="count")
)
obj_unique = (
    df.loc[m_obj, "object"]
      .astype(str).str.strip()
      .value_counts(dropna=False)
      .rename_axis("object")
      .reset_index(name="count")
)

print("\n=== Unique unresolved SUBJECT values (top 20) ===")
print(subj_unique.head(20).to_string(index=False))
print("\n=== Unique unresolved OBJECT values (top 20) ===")
print(obj_unique.head(20).to_string(index=False))

# ------- маппинги: значение -> список индексов строк -------
subj_map = (
    df.loc[m_subj, ["subject"]]
      .reset_index()
      .groupby("subject")["index"]
      .apply(list)
      .to_dict()
)
obj_map = (
    df.loc[m_obj, ["object"]]
      .reset_index()
      .groupby("object")["index"]
      .apply(list)
      .to_dict()
)

# ------- сохранение результатов рядом с исходным файлом -------
out_dir = Path(PATH).parent
subj_rows_csv = out_dir / "missing_subject_qid_rows.csv"
obj_rows_csv  = out_dir / "missing_object_qid_rows.csv"
subj_unique_csv = out_dir / "missing_subject_qid_unique.csv"
obj_unique_csv  = out_dir / "missing_object_qid_unique.csv"
subj_idx_json = out_dir / "missing_subject_qid_indices.json"
obj_idx_json  = out_dir / "missing_object_qid_indices.json"

subj_missing_rows.to_csv(subj_rows_csv, index=False)
obj_missing_rows.to_csv(obj_rows_csv, index=False)
subj_unique.to_csv(subj_unique_csv, index=False)
obj_unique.to_csv(obj_unique_csv, index=False)

subj_idx_json.write_text(json.dumps(subj_map, ensure_ascii=False, indent=2), encoding="utf-8")
obj_idx_json.write_text(json.dumps(obj_map, ensure_ascii=False, indent=2), encoding="utf-8")

print(f"\n✅ Saved:")
print(f"- {subj_rows_csv}")
print(f"- {obj_rows_csv}")
print(f"- {subj_unique_csv}")
print(f"- {obj_unique_csv}")
print(f"- {subj_idx_json}")
print(f"- {obj_idx_json}")


=== Missing counts ===
subject_qid: 0
object_qid : 0

=== First few missing SUBJECT rows ===
Empty DataFrame
Columns: [row_id, subject, subject_qid]
Index: []

=== First few missing OBJECT rows ===
Empty DataFrame
Columns: [row_id, object, object_qid]
Index: []

=== Unique unresolved SUBJECT values (top 20) ===
Empty DataFrame
Columns: [subject, count]
Index: []

=== Unique unresolved OBJECT values (top 20) ===
Empty DataFrame
Columns: [object, count]
Index: []

✅ Saved:
- /mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/missing_subject_qid_rows.csv
- /mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/missing_object_qid_rows.csv
- /mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/missing_subject_qid_unique.csv
- /mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/missing_object_qid_unique.csv
- /mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/missing_subject_qid_indice

In [6]:
#!/usr/bin/env python3
import re
import json
import time
import unicodedata
import requests
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# ===== IO =====
INPUT_CSV  = "/mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/final_QA_triplets.csv"
OUT        = "/mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/FIXED_final_QA_triplets.csv"

CACHE_LABEL2QID = Path(OUT).with_suffix(".label2qid.json")
CACHE_SITELINKS = Path(OUT).with_suffix(".sitelinks_cache.json")

# ===== SETTINGS =====
LANGS_WB   = ["en", "es", "ru", "de", "fr", "it", "uk", "pl"]
WIKI_SITES = ["enwiki", "eswiki", "ruwiki", "dewiki", "frwiki"]
SLEEP_BETWEEN = 0.08
TIMEOUT       = 20
QID_RE = re.compile(r"\bQ\d+\b", re.IGNORECASE)

# ——— РУЧНЫЕ ПРАВКИ (добавляй сюда найденные тобой соответствия) ———
# ключ — НОРМАЛИЗОВАННАЯ строка (см. canon_label), значение — QID
MANUAL_QID = {
    # Примеры (раскомментируй/поправь после проверки):
    "Bayamo, Cuba" : "Q115382",
    "Konstantin Tsiolkovskii" : "Q41239",
    
    # "bayamo cuba": "Q155833",                     # Bayamo
    # "konstantin tsiolkovskii": "Q6679",          # Konstantin Tsiolkovsky (проверь QID)
    # "cayetana fitz james stuarthu": "Q155962",   # Cayetana Fitz-James Stuart (проверь QID)
    # "milowice sosnowiec": "Q...?",               # район в Sosnowiec
    # "tlajomulco de zuniga municipality": "Q...", # муниципалитет (Mexico)
    # "liam pane": "Q340608",                      # Liam Payne (если это опечатка)
    # "lhaj adam opel": "Q158251",                 # Adam Opel (если «lhaj» это «haj»/титул)
    # "sleeprunning": "",                          # если сущности нет — пусть останется пустым
    # "azulfina": "",                              # если сущности нет
}

# ===== HTTP =====
def build_session():
    sess = requests.Session()
    retries = Retry(
        total=6, backoff_factor=0.6,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"], respect_retry_after_header=True,
    )
    adapter = HTTPAdapter(max_retries=retries)
    sess.mount("https://", adapter)
    sess.headers.update({
        "User-Agent": "UNLamb-Wikidata/1.2 (contact: youremail@example.com)",
        "Accept": "application/json",
    })
    return sess

# ===== NORMALIZATION =====
STOPWORDS_PREFIX = {"el","al","haj","lhaj","sheikh","shaikh","mr","mrs","ms","dr"}
DROP_TAIL_WORDS = {"municipality","city","district","county","province","state","town","village"}

def strip_accents(s: str) -> str:
    norm = unicodedata.normalize("NFKD", s or "")
    return "".join(ch for ch in norm if not unicodedata.combining(ch))

def normalize_base(s: str) -> str:
    s = str(s or "").strip()
    s = s.replace("—","-").replace("–","-").replace("’","'").replace("‘","'").replace("“",'"').replace("”",'"')
    s = re.sub(r"\s*\(.*?\)\s*", " ", s)          # удалить скобки
    s = re.sub(r"[^\w\s,\-]", " ", s)             # оставить буквы/цифры/пробел/дефис/зпт
    s = s.replace("_"," ").strip()
    s = re.sub(r"\s+", " ", s)
    return s

def canon_label(s: str) -> str:
    s = normalize_base(s).lower().rstrip(".").strip()
    s = strip_accents(s)
    # убрать префиксные титулы
    parts = [p for p in s.split() if p]
    while parts and parts[0] in STOPWORDS_PREFIX:
        parts = parts[1:]
    s = " ".join(parts)
    # удалить завершающее служебное слово
    tail = s.split()
    if tail and tail[-1] in DROP_TAIL_WORDS:
        s = " ".join(tail[:-1])
    return s

def generate_variants(orig: str):
    """Порождаем набор кандидатных строк для поиска."""
    raw  = str(orig or "").strip()
    norm = normalize_base(raw)
    fold = strip_accents(norm)

    base = canon_label(raw)
    out = []
    def add(x):
        if x and x not in out:
            out.append(x)

    add(raw)
    add(norm)
    add(fold)
    add(base)
    # убрать точку
    add(base.rstrip("."))
    # часть до запятой (часто 'City, Country')
    if "," in base:
        left = base.split(",", 1)[0].strip()
        right = base.split(",", 1)[1].strip()
        add(left)
        add(right)
        # перестановка
        add(f"{right} {left}".strip())
    # удалить слово 'municipality' в середине
    add(re.sub(r"\bmunicipality\b", " ", base).strip())
    # исправление 'ii' -> 'y' для фамилий типа Tsiolkovskii
    add(re.sub(r"skii\b", "sky", base))
    # убрать одиночные служебные слова в начале
    toks = base.split()
    if toks and toks[0] in STOPWORDS_PREFIX:
        add(" ".join(toks[1:]))

    # уникальные
    return [x for x in out if x]

# ===== WIKIDATA =====
def wbsearchentities(label, session, lang="en", limit=10, timeout=TIMEOUT):
    params = {
        "action":"wbsearchentities","format":"json",
        "language":lang,"uselang":lang,"type":"item",
        "search":label,"limit":limit,
    }
    r = session.get("https://www.wikidata.org/w/api.php", params=params, timeout=timeout)
    r.raise_for_status()
    return (r.json() or {}).get("search", []) or []

def pick_qid_from_results(label_raw: str, results: list) -> str | None:
    if not results: return None
    lab_norm = strip_accents(normalize_base(label_raw)).lower()
    # точное совпадение по label
    for it in results:
        lab = strip_accents(normalize_base(it.get("label",""))).lower()
        if lab and lab == lab_norm:
            return it.get("id")
    # точное совпадение по алиасам
    for it in results:
        for al in it.get("aliases") or []:
            if strip_accents(normalize_base(al)).lower() == lab_norm:
                return it.get("id")
    # иначе первый
    return results[0].get("id")

def wiki_search_title(term: str, session, lang="en", timeout=TIMEOUT):
    params = {"action":"query","list":"search","format":"json",
              "srsearch":term,"srlimit":1,"srwhat":"nearmatch","srprop":""}
    r = session.get(f"https://{lang}.wikipedia.org/w/api.php", params=params, timeout=timeout)
    r.raise_for_status()
    hits = ((r.json() or {}).get("query", {}) or {}).get("search", []) or []
    return hits[0].get("title") if hits else None

def wiki_title_to_qid(site: str, title: str, session, timeout=TIMEOUT):
    params = {"action":"wbgetentities","format":"json","sites":site,"titles":title,"props":"info"}
    r = session.get("https://www.wikidata.org/w/api.php", params=params, timeout=timeout)
    r.raise_for_status()
    for k in (r.json() or {}).get("entities", {}):
        if k.startswith("Q"): return k
    return None

def resolve_label_to_qid(label: str, session, manual_map: dict) -> str | None:
    if not label or str(label).strip().lower() in {"unknown","uknown","n/a","-","none"}:
        return None

    q = extract_qid(label)
    if q: return q

    # 1) ручной словарь
    key = canon_label(label)
    if key in manual_map and manual_map[key]:
        return manual_map[key]

    # 2) варианты + wbsearchentities
    variants = generate_variants(label)
    for cand in variants:
        for lang in LANGS_WB:
            try:
                res = wbsearchentities(cand, session, lang=lang, limit=10)
                qid = pick_qid_from_results(cand, res)
                if qid:
                    return qid
            except Exception:
                pass
            time.sleep(SLEEP_BETWEEN/2)

    # 3) wiki search → title → QID
    for cand in variants:
        for l, site in zip(["en","es","ru","de","fr"], WIKI_SITES):
            try:
                title = wiki_search_title(cand, session, lang=l)
                if title:
                    qid = wiki_title_to_qid(site, title, session)
                    if qid:
                        return qid
            except Exception:
                pass
            time.sleep(SLEEP_BETWEEN)

    return None

def extract_qid(val):
    if pd.isna(val): return None
    m = QID_RE.search(str(val))
    return m.group(0).upper() if m else None

def get_sitelinks_count(qid, session, timeout=TIMEOUT):
    if not qid: return 0
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"
    try:
        r = session.get(url, timeout=timeout)
        if r.status_code == 404: return 0
        r.raise_for_status()
        ent = (r.json() or {}).get("entities", {}).get(qid, {})
        return len(ent.get("sitelinks", {}))
    except Exception:
        return 0

# ===== MAIN =====
def is_missing_qid(x) -> bool:
    if pd.isna(x): return True
    s = str(x).strip().lower()
    return s in {"", "unknown", "uknown", "n/a", "-", "none"}

def main():
    df = pd.read_csv(INPUT_CSV)

    for col in ["subject","object"]:
        if col not in df.columns:
            raise ValueError(f"Нет обязательной колонки: {col}")

    # гарантируем наличие целевых полей
    for col in ["subject_qid","object_qid","subject_popularity_sitelinks","object_popularity_sitelinks","popularity_sitelinks_sum"]:
        if col not in df.columns:
            df[col] = pd.NA

    m_subj = df["subject_qid"].apply(is_missing_qid)
    m_obj  = df["object_qid"].apply(is_missing_qid)
    print(f"Missing subject_qid: {int(m_subj.sum())}")
    print(f"Missing object_qid : {int(m_obj.sum())}")

    if m_subj.sum() == 0 and m_obj.sum() == 0:
        df.to_csv(OUT, index=False)
        print(f"✅ Nothing to fill. Saved: {OUT}")
        return

    session = build_session()
    label2qid = {}  # локальный кэш текущего прогона + manual
    label2qid.update(MANUAL_QID)
    # подгрузи предыдущий кэш, если был
    if CACHE_LABEL2QID.exists():
        try:
            prev = json.loads(CACHE_LABEL2QID.read_text("utf-8"))
            label2qid.update(prev)
        except Exception:
            pass

    # SUBJECT
    for idx in tqdm(df.index[m_subj], desc="resolve subject_qid"):
        lbl = str(df.at[idx, "subject"])
        qid = resolve_label_to_qid(lbl, session, label2qid)
        # сохраняем в label2qid по каноническому ключу, чтобы ускорять следующие попадания
        label2qid[canon_label(lbl)] = qid or label2qid.get(canon_label(lbl), "")
        if qid:
            df.at[idx, "subject_qid"] = qid
        time.sleep(SLEEP_BETWEEN)

    # OBJECT
    for idx in tqdm(df.index[m_obj], desc="resolve object_qid"):
        lbl = str(df.at[idx, "object"])
        qid = resolve_label_to_qid(lbl, session, label2qid)
        label2qid[canon_label(lbl)] = qid or label2qid.get(canon_label(lbl), "")
        if qid:
            df.at[idx, "object_qid"] = qid
        time.sleep(SLEEP_BETWEEN)

    # сохранить кэш маппинга
    try:
        CACHE_LABEL2QID.write_text(json.dumps(label2qid, ensure_ascii=False), encoding="utf-8")
    except Exception:
        pass

    # подтянем sitelinks только для новых QID из строк, где ИЗНАЧАЛЬНО были пропуски
    touched = (m_subj | m_obj)
    # загрузим существующий кэш sitelinks
    qid2sitelinks = {}
    if CACHE_SITELINKS.exists():
        try:
            qid2sitelinks.update(json.loads(CACHE_SITELINKS.read_text("utf-8")))
        except Exception:
            pass

    new_qids = set()
    for idx in df.index[touched]:
        for col in ["subject_qid", "object_qid"]:
            q = str(df.at[idx, col]).strip()
            if q and q.upper().startswith("Q"):
                new_qids.add(q)

    to_fetch = [q for q in sorted(new_qids) if q not in qid2sitelinks]
    if to_fetch:
        print(f"Fetching sitelinks for {len(to_fetch)} new QIDs...")
        for q in tqdm(to_fetch, desc="qid→sitelinks"):
            qid2sitelinks[q] = get_sitelinks_count(q, session)
            time.sleep(SLEEP_BETWEEN)
        try:
            CACHE_SITELINKS.write_text(json.dumps(qid2sitelinks, ensure_ascii=False), encoding="utf-8")
        except Exception:
            pass

    # пересчёт популярностей только на touched
    rows = df.index[touched]
    if len(rows) > 0:
        sub_vals = df.loc[rows, "subject_qid"].astype(str).str.strip()
        obj_vals = df.loc[rows, "object_qid"].astype(str).str.strip()
        df.loc[rows, "subject_popularity_sitelinks"] = sub_vals.map(lambda q: int(qid2sitelinks.get(q, 0)) if q.upper().startswith("Q") else 0)
        df.loc[rows, "object_popularity_sitelinks"]  = obj_vals.map(lambda q: int(qid2sitelinks.get(q, 0)) if q.upper().startswith("Q") else 0)
        df.loc[rows, "popularity_sitelinks_sum"]     = (
            df.loc[rows, "subject_popularity_sitelinks"].fillna(0).astype(int)
            + df.loc[rows, "object_popularity_sitelinks"].fillna(0).astype(int)
        )

    # отчёт по остаткам — и выгрузки для ручной правки
    still_subj = df["subject_qid"].apply(is_missing_qid)
    still_obj  = df["object_qid"].apply(is_missing_qid)
    print(f"\nRemaining subject_qid: {int(still_subj.sum())}")
    print(f"Remaining object_qid : {int(still_obj.sum())}")

    # выведем уникальные нерешённые значения и их индексы — чтобы ты дописала MANUAL_QID
    def dump_missing(col_label, col_qid, fname_prefix):
        miss = df[col_qid].apply(is_missing_qid)
        rows = df.loc[miss, [col_label, col_qid]].reset_index().rename(columns={"index":"row_id"})
        uniq = (
            rows[col_label].astype(str).str.strip()
            .value_counts(dropna=False)
            .rename_axis(col_label)
            .reset_index(name="count")
        )
        out_dir = Path(OUT).parent
        rows.to_csv(out_dir / f"{fname_prefix}_rows.csv", index=False)
        uniq.to_csv(out_dir / f"{fname_prefix}_unique.csv", index=False)

        # маппинг значение -> индексы
        mapping = rows.groupby(col_label)["row_id"].apply(list).to_dict()
        (out_dir / f"{fname_prefix}_indices.json").write_text(json.dumps(mapping, ensure_ascii=False, indent=2), encoding="utf-8")

        print(f"\nSaved unresolved {col_label} lists to:")
        print(f"- {(out_dir / f'{fname_prefix}_rows.csv')}")
        print(f"- {(out_dir / f'{fname_prefix}_unique.csv')}")
        print(f"- {(out_dir / f'{fname_prefix}_indices.json')}")

    if still_subj.any():
        dump_missing("subject", "subject_qid", "missing_subject_qid")
    if still_obj.any():
        dump_missing("object", "object_qid", "missing_object_qid")

    # save
    Path(OUT).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(OUT, index=False)
    print(f"\n✅ Saved: {OUT}")

if __name__ == "__main__":
    main()


Missing subject_qid: 71
Missing object_qid : 46


resolve subject_qid: 100%|██████████| 71/71 [02:32<00:00,  2.15s/it]
resolve object_qid: 100%|██████████| 46/46 [03:55<00:00,  5.12s/it]


Fetching sitelinks for 19 new QIDs...


qid→sitelinks: 100%|██████████| 19/19 [00:08<00:00,  2.25it/s]



Remaining subject_qid: 15
Remaining object_qid : 24

Saved unresolved subject lists to:
- /mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/missing_subject_qid_rows.csv
- /mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/missing_subject_qid_unique.csv
- /mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/missing_subject_qid_indices.json

Saved unresolved object lists to:
- /mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/missing_object_qid_rows.csv
- /mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/missing_object_qid_unique.csv
- /mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/missing_object_qid_indices.json

✅ Saved: /mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/FIXED_final_QA_triplets.csv


In [7]:
#!/usr/bin/env python3
import re
import json
import time
import unicodedata
import requests
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# ===== IO =====
INPUT_CSV  = "/mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/final_QA_triplets.csv"
OUT        = "/mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/FIXED_final_QA_triplets.csv"

CACHE_LABEL2QID = Path(OUT).with_suffix(".label2qid.json")
CACHE_SITELINKS = Path(OUT).with_suffix(".sitelinks_cache.json")

# ===== SETTINGS =====
LANGS_WB   = ["en", "es", "ru", "de", "fr", "it", "uk", "pl"]
WIKI_SITES = ["enwiki", "eswiki", "ruwiki", "dewiki", "frwiki"]
SLEEP_BETWEEN = 0.08
TIMEOUT       = 20
QID_RE = re.compile(r"\bQ\d+\b", re.IGNORECASE)

# ——— РУЧНЫЕ ПРАВКИ (можно добавлять в любом регистре; работает и с канонич. ключом) ———
MANUAL_QID = {
    "Bayamo, Cuba"              : "Q115382",  # город Bayamo (Cuba)
    "Konstantin Tsiolkovskii"   : "Q41239",   # Konstantin Tsiolkovsky
    # примеры для дописывания:
    # "Milowice, Sosnowiec": "Q672082",  # проверить реальный QID
    # "Tlajomulco de Zúñiga Municipality": "Q184216",  # примерный, проверь
    # "Liam Pane": "Q340608",  # если опечатка от Liam Payne
    # "lhaj Adam Opel": "Q158251",  # Adam Opel — если "lhaj" это титул
}

# ===== HTTP =====
def build_session():
    sess = requests.Session()
    retries = Retry(
        total=6, backoff_factor=0.6,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"], respect_retry_after_header=True,
    )
    adapter = HTTPAdapter(max_retries=retries)
    sess.mount("https://", adapter)
    sess.headers.update({
        "User-Agent": "UNLamb-Wikidata/1.2 (contact: youremail@example.com)",
        "Accept": "application/json",
    })
    return sess

# ===== NORMALIZATION =====
STOPWORDS_PREFIX = {"el","al","haj","lhaj","sheikh","shaikh","mr","mrs","ms","dr"}
DROP_TAIL_WORDS = {"municipality","city","district","county","province","state","town","village"}

def strip_accents(s: str) -> str:
    norm = unicodedata.normalize("NFKD", s or "")
    return "".join(ch for ch in norm if not unicodedata.combining(ch))

def normalize_base(s: str) -> str:
    s = str(s or "").strip()
    s = s.replace("—","-").replace("–","-").replace("’","'").replace("‘","'").replace("“",'"').replace("”",'"')
    s = re.sub(r"\s*\(.*?\)\s*", " ", s)          # удалить скобки
    s = re.sub(r"[^\w\s,\-]", " ", s)             # оставить буквы/цифры/пробел/дефис/зпт
    s = s.replace("_"," ").strip()
    s = re.sub(r"\s+", " ", s)
    return s

def canon_label(s: str) -> str:
    s = normalize_base(s).lower().rstrip(".").strip()
    s = strip_accents(s)
    # убрать префиксные титулы
    parts = [p for p in s.split() if p]
    while parts and parts[0] in STOPWORDS_PREFIX:
        parts = parts[1:]
    s = " ".join(parts)
    # удалить завершающее служебное слово
    tail = s.split()
    if tail and tail[-1] in DROP_TAIL_WORDS:
        s = " ".join(tail[:-1])
    return s

def generate_variants(orig: str):
    """Порождаем набор кандидатных строк для поиска."""
    raw  = str(orig or "").strip()
    norm = normalize_base(raw)
    fold = strip_accents(norm)

    base = canon_label(raw)
    out = []
    def add(x):
        if x and x not in out:
            out.append(x)

    add(raw)
    add(norm)
    add(fold)
    add(base)
    # убрать точку
    add(base.rstrip("."))
    # часть до запятой (часто 'City, Country')
    if "," in base:
        left = base.split(",", 1)[0].strip()
        right = base.split(",", 1)[1].strip()
        add(left); add(right); add(f"{right} {left}".strip())
    # удалить слово 'municipality' в середине
    add(re.sub(r"\bmunicipality\b", " ", base).strip())
    # исправление 'ii' -> 'y' для фамилий типа Tsiolkovskii
    add(re.sub(r"skii\b", "sky", base))
    # убрать титулы в начале
    toks = base.split()
    if toks and toks[0] in STOPWORDS_PREFIX:
        add(" ".join(toks[1:]))

    return [x for x in out if x]

# ===== WIKIDATA =====
def wbsearchentities(label, session, lang="en", limit=10, timeout=TIMEOUT):
    params = {
        "action":"wbsearchentities","format":"json",
        "language":lang,"uselang":lang,"type":"item",
        "search":label,"limit":limit,
    }
    r = session.get("https://www.wikidata.org/w/api.php", params=params, timeout=timeout)
    r.raise_for_status()
    return (r.json() or {}).get("search", []) or []

def pick_qid_from_results(label_raw: str, results: list):
    """
    Возвращает (qid, match_type):
      - 'wb-label-exact'  — точное совпадение по label (после нормализации/деакцента)
      - 'wb-alias-exact'  — точное совпадение по alias
      - 'wb-top'          — просто первый результат
    """
    if not results:
        return None, None
    lab_norm = strip_accents(normalize_base(label_raw)).lower()
    for it in results:
        lab = strip_accents(normalize_base(it.get("label",""))).lower()
        if lab and lab == lab_norm:
            return it.get("id"), "wb-label-exact"
    for it in results:
        for al in it.get("aliases") or []:
            if strip_accents(normalize_base(al)).lower() == lab_norm:
                return it.get("id"), "wb-alias-exact"
    return results[0].get("id"), "wb-top"

def wiki_search_title(term: str, session, lang="en", timeout=TIMEOUT):
    params = {"action":"query","list":"search","format":"json",
              "srsearch":term,"srlimit":1,"srwhat":"nearmatch","srprop":""}
    r = session.get(f"https://{lang}.wikipedia.org/w/api.php", params=params, timeout=timeout)
    r.raise_for_status()
    hits = ((r.json() or {}).get("query", {}) or {}).get("search", []) or []
    return hits[0].get("title") if hits else None

def wiki_title_to_qid(site: str, title: str, session, timeout=TIMEOUT):
    params = {"action":"wbgetentities","format":"json","sites":site,"titles":title,"props":"info"}
    r = session.get("https://www.wikidata.org/w/api.php", params=params, timeout=timeout)
    r.raise_for_status()
    for k in (r.json() or {}).get("entities", {}):
        if k.startswith("Q"):
            return k
    return None

def extract_qid(val):
    if pd.isna(val):
        return None
    m = QID_RE.search(str(val))
    return m.group(0).upper() if m else None

def resolve_label_to_qid(label: str, session, manual_map: dict):
    """
    Возвращает (qid, meta) где meta ~ {'method':..., 'candidate':..., 'lang':...}
    method ∈ {'manual-exact','manual-canon','wb-label-exact','wb-alias-exact','wb-top','wiki-title', None}
    """
    meta = {"method": None, "candidate": None, "lang": None}
    if not label or str(label).strip().lower() in {"unknown","uknown","n/a","-","none"}:
        return None, meta

    # уже QID?
    q = extract_qid(label)
    if q:
        meta["method"] = "given-qid"
        return q, meta

    # manual: пробуем разными ключами
    raw_key   = label
    canon_key = canon_label(label)
    lower_key = str(label).lower().strip()
    if raw_key in manual_map and manual_map[raw_key]:
        meta["method"] = "manual-exact"
        return manual_map[raw_key], meta
    if canon_key in manual_map and manual_map[canon_key]:
        meta["method"] = "manual-canon"
        return manual_map[canon_key], meta
    if lower_key in manual_map and manual_map[lower_key]:
        meta["method"] = "manual-lower"
        return manual_map[lower_key], meta

    # варианты + wbsearchentities
    variants = generate_variants(label)
    for cand in variants:
        for lang in LANGS_WB:
            try:
                res = wbsearchentities(cand, session, lang=lang, limit=10)
                qid, how = pick_qid_from_results(cand, res)
                if qid:
                    meta.update({"method": how, "candidate": cand, "lang": lang})
                    return qid, meta
            except Exception:
                pass
            time.sleep(SLEEP_BETWEEN/2)

    # wiki search → title → QID
    for cand in variants:
        for l, site in zip(["en","es","ru","de","fr"], WIKI_SITES):
            try:
                title = wiki_search_title(cand, session, lang=l)
                if title:
                    qid = wiki_title_to_qid(site, title, session)
                    if qid:
                        meta.update({"method": "wiki-title", "candidate": title, "lang": l})
                        return qid, meta
            except Exception:
                pass
            time.sleep(SLEEP_BETWEEN)

    return None, meta

def get_sitelinks_count(qid, session, timeout=TIMEOUT):
    if not qid:
        return 0
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"
    try:
        r = session.get(url, timeout=timeout)
        if r.status_code == 404:
            return 0
        r.raise_for_status()
        ent = (r.json() or {}).get("entities", {}).get(qid, {})
        return len(ent.get("sitelinks", {}))
    except Exception:
        return 0

# ===== MISSING CHECK =====
def is_missing_qid(x) -> bool:
    if pd.isna(x):
        return True
    s = str(x).strip().lower()
    return s in {"", "unknown", "uknown", "n/a", "-", "none"}

# ===== MAIN =====
def main():
    df = pd.read_csv(INPUT_CSV)

    for col in ["subject","object"]:
        if col not in df.columns:
            raise ValueError(f"Нет обязательной колонки: {col}")

    # гарантируем наличие целевых полей
    for col in ["subject_qid","object_qid","subject_popularity_sitelinks","object_popularity_sitelinks","popularity_sitelinks_sum"]:
        if col not in df.columns:
            df[col] = pd.NA

    m_subj = df["subject_qid"].apply(is_missing_qid)
    m_obj  = df["object_qid"].apply(is_missing_qid)
    print(f"Missing subject_qid: {int(m_subj.sum())}")
    print(f"Missing object_qid : {int(m_obj.sum())}")

    if m_subj.sum() == 0 and m_obj.sum() == 0:
        df.to_csv(OUT, index=False)
        print(f"✅ Nothing to fill. Saved: {OUT}")
        return

    session = build_session()

    # подгружаем предыдущий кэш label->qid (плюс manual)
    label2qid = {}
    label2qid.update(MANUAL_QID)
    if CACHE_LABEL2QID.exists():
        try:
            prev = json.loads(CACHE_LABEL2QID.read_text("utf-8"))
            label2qid.update(prev)
        except Exception:
            pass

    # ========= SUBJECT: только пропуски + печать найденных =========
    for idx in tqdm(df.index[m_subj], desc="resolve subject_qid"):
        lbl = str(df.at[idx, "subject"])
        qid, meta = resolve_label_to_qid(lbl, session, label2qid)

        # кэшируем ключи для быстрого повторного попадания
        label2qid[lbl] = qid or label2qid.get(lbl, "")
        label2qid[canon_label(lbl)] = qid or label2qid.get(canon_label(lbl), "")

        if qid:
            df.at[idx, "subject_qid"] = qid
            print(f"[SUBJ row {idx}] '{lbl}'  ->  {qid}  | method={meta.get('method')} cand={meta.get('candidate')} lang={meta.get('lang')}")
        else:
            print(f"[SUBJ row {idx}] '{lbl}'  ->  NOT FOUND")

        time.sleep(SLEEP_BETWEEN)

    # ========= OBJECT: только пропуски + печать найденных =========
    for idx in tqdm(df.index[m_obj], desc="resolve object_qid"):
        lbl = str(df.at[idx, "object"])
        qid, meta = resolve_label_to_qid(lbl, session, label2qid)

        label2qid[lbl] = qid or label2qid.get(lbl, "")
        label2qid[canon_label(lbl)] = qid or label2qid.get(canon_label(lbl), "")

        if qid:
            df.at[idx, "object_qid"] = qid
            print(f"[OBJ  row {idx}] '{lbl}'  ->  {qid}  | method={meta.get('method')} cand={meta.get('candidate')} lang={meta.get('lang')}")
        else:
            print(f"[OBJ  row {idx}] '{lbl}'  ->  NOT FOUND")

        time.sleep(SLEEP_BETWEEN)

    # сохранить кэш маппинга
    try:
        CACHE_LABEL2QID.write_text(json.dumps(label2qid, ensure_ascii=False), encoding="utf-8")
    except Exception:
        pass

    # ——— sitelinks только для новых QID из строк, где ИЗНАЧАЛЬНО были пропуски ———
    touched = (m_subj | m_obj)

    qid2sitelinks = {}
    if CACHE_SITELINKS.exists():
        try:
            qid2sitelinks.update(json.loads(CACHE_SITELINKS.read_text("utf-8")))
        except Exception:
            pass

    new_qids = set()
    for idx in df.index[touched]:
        for col in ["subject_qid", "object_qid"]:
            q = str(df.at[idx, col]).strip()
            if q and q.upper().startswith("Q"):
                new_qids.add(q)

    to_fetch = [q for q in sorted(new_qids) if q not in qid2sitelinks]
    if to_fetch:
        print(f"Fetching sitelinks for {len(to_fetch)} new QIDs...")
        for q in tqdm(to_fetch, desc="qid→sitelinks"):
            qid2sitelinks[q] = get_sitelinks_count(q, session)
            time.sleep(SLEEP_BETWEEN)
        try:
            CACHE_SITELINKS.write_text(json.dumps(qid2sitelinks, ensure_ascii=False), encoding="utf-8")
        except Exception:
            pass

    # ——— пересчёт популярностей только на touched ———
    rows = df.index[touched]
    if len(rows) > 0:
        sub_vals = df.loc[rows, "subject_qid"].astype(str).str.strip()
        obj_vals = df.loc[rows, "object_qid"].astype(str).str.strip()
        df.loc[rows, "subject_popularity_sitelinks"] = sub_vals.map(lambda q: int(qid2sitelinks.get(q, 0)) if q.upper().startswith("Q") else 0)
        df.loc[rows, "object_popularity_sitelinks"]  = obj_vals.map(lambda q: int(qid2sitelinks.get(q, 0)) if q.upper().startswith("Q") else 0)
        df.loc[rows, "popularity_sitelinks_sum"]     = (
            df.loc[rows, "subject_popularity_sitelinks"].fillna(0).astype(int)
            + df.loc[rows, "object_popularity_sitelinks"].fillna(0).astype(int)
        )

    # отчёт и сохранение
    still_subj = int(df["subject_qid"].apply(is_missing_qid).sum())
    still_obj  = int(df["object_qid"].apply(is_missing_qid).sum())
    print(f"\nRemaining subject_qid: {still_subj}")
    print(f"Remaining object_qid : {still_obj}")

    Path(OUT).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(OUT, index=False)
    print(f"\n✅ Saved: {OUT}")

if __name__ == "__main__":
    main()


Missing subject_qid: 71
Missing object_qid : 46


resolve subject_qid:   0%|          | 0/71 [00:00<?, ?it/s]

[SUBJ row 2] 'San Agustín (Jalisco)'  ->  Q4407227  | method=manual-exact cand=None lang=None


resolve subject_qid:   3%|▎         | 2/71 [00:04<02:18,  2.01s/it]

[SUBJ row 4] 'sleeprunning'  ->  NOT FOUND
[SUBJ row 158] 'Bayamo, Cuba.'  ->  Q115382  | method=manual-canon cand=None lang=None
[SUBJ row 618] 'lhaj Adam Opel'  ->  Q57479  | method=manual-canon cand=None lang=None


resolve subject_qid:   8%|▊         | 6/71 [00:04<00:33,  1.95it/s]

[SUBJ row 619] 'lhaj Adam Opel'  ->  Q57479  | method=manual-exact cand=None lang=None
[SUBJ row 761] 'Konstantin Tsiolkovskii'  ->  Q41239  | method=manual-canon cand=None lang=None
[SUBJ row 887] 'Konstantin Tsiolkovskii'  ->  Q41239  | method=manual-exact cand=None lang=None


resolve subject_qid:  11%|█▏        | 8/71 [00:04<00:21,  2.93it/s]

[SUBJ row 969] 'lhaj Adam Opel'  ->  Q57479  | method=manual-exact cand=None lang=None
[SUBJ row 1290] 'Bayamo, Cuba.'  ->  Q115382  | method=manual-exact cand=None lang=None


resolve subject_qid:  14%|█▍        | 10/71 [00:07<00:49,  1.22it/s]

[SUBJ row 1454] 'sleeprunning'  ->  NOT FOUND


resolve subject_qid:  15%|█▌        | 11/71 [00:13<01:54,  1.91s/it]

[SUBJ row 1569] 'Azulfina'  ->  NOT FOUND
[SUBJ row 1630] 'lhaj Adam Opel'  ->  Q57479  | method=manual-exact cand=None lang=None
[SUBJ row 1730] 'Konstantin Tsiolkovskii'  ->  Q41239  | method=manual-exact cand=None lang=None


resolve subject_qid:  20%|█▉        | 14/71 [00:17<01:33,  1.63s/it]

[SUBJ row 1760] 'flat feet problems'  ->  NOT FOUND
[SUBJ row 1817] 'lhaj Adam Opel'  ->  Q57479  | method=manual-exact cand=None lang=None
[SUBJ row 1996] 'lhaj Adam Opel'  ->  Q57479  | method=manual-exact cand=None lang=None


resolve subject_qid:  25%|██▌       | 18/71 [00:17<00:37,  1.40it/s]

[SUBJ row 1997] 'lhaj Adam Opel'  ->  Q57479  | method=manual-exact cand=None lang=None
[SUBJ row 2108] 'Ryn, Giżycko County'  ->  Q616895  | method=manual-canon cand=None lang=None
[SUBJ row 2865] 'Concepción, Santa Cruz'  ->  Q751077  | method=manual-exact cand=None lang=None


resolve subject_qid:  28%|██▊       | 20/71 [00:17<00:25,  1.98it/s]

[SUBJ row 3426] 'Sonsonate Oeste (Acajutla)'  ->  Q127416243  | method=manual-exact cand=None lang=None
[SUBJ row 4104] 'Aybak, Samangan'  ->  Q1020649  | method=manual-exact cand=None lang=None


resolve subject_qid:  31%|███       | 22/71 [00:24<01:07,  1.38s/it]

[SUBJ row 5994] 'Falun Mine Museum'  ->  NOT FOUND
[SUBJ row 6132] 'Ryn, Giżycko County'  ->  Q616895  | method=manual-exact cand=None lang=None


resolve subject_qid:  34%|███▍      | 24/71 [00:27<01:09,  1.48s/it]

[SUBJ row 6836] 'sleeprunning'  ->  NOT FOUND


resolve subject_qid:  35%|███▌      | 25/71 [00:37<02:15,  2.96s/it]

[SUBJ row 7627] 'Fox Glacier / Te Moeka o Tuawe'  ->  NOT FOUND
[SUBJ row 7682] 'Konstantin Tsiolkovskii'  ->  Q41239  | method=manual-exact cand=None lang=None
[SUBJ row 7683] 'Konstantin Tsiolkovskii'  ->  Q41239  | method=manual-exact cand=None lang=None


resolve subject_qid:  39%|███▉      | 28/71 [00:40<01:35,  2.23s/it]

[SUBJ row 8293] 'zahracleaning'  ->  NOT FOUND
[SUBJ row 8679] 'lhaj Adam Opel'  ->  Q57479  | method=manual-exact cand=None lang=None


resolve subject_qid:  42%|████▏     | 30/71 [00:47<01:47,  2.62s/it]

[SUBJ row 9486] 'Band 4 Band'  ->  NOT FOUND


resolve subject_qid:  44%|████▎     | 31/71 [00:53<02:14,  3.36s/it]

[SUBJ row 10082] 'Falun Mine Museum'  ->  NOT FOUND
[SUBJ row 11454] 'The Iron Gate (Jerusalem)'  ->  Q119361435  | method=manual-exact cand=None lang=None
[SUBJ row 11862] 'lhaj Adam Opel'  ->  Q57479  | method=manual-exact cand=None lang=None


resolve subject_qid:  46%|████▋     | 33/71 [00:53<01:21,  2.15s/it]

[SUBJ row 11915] 'Konstantin Tsiolkovskii'  ->  Q41239  | method=manual-exact cand=None lang=None


resolve subject_qid:  49%|████▉     | 35/71 [01:03<01:50,  3.08s/it]

[SUBJ row 12670] 'Fox Glacier / Te Moeka o Tuawe'  ->  NOT FOUND
[SUBJ row 13727] 'Konstantin Tsiolkovskii'  ->  Q41239  | method=manual-exact cand=None lang=None
[SUBJ row 14856] 'Anavatos, Chios'  ->  Q21682902  | method=manual-canon cand=None lang=None


resolve subject_qid:  55%|█████▍    | 39/71 [01:03<00:46,  1.44s/it]

[SUBJ row 15077] 'The Iron Gate (Jerusalem)'  ->  Q119361435  | method=manual-exact cand=None lang=None
[SUBJ row 15496] 'lhaj Adam Opel'  ->  Q57479  | method=manual-exact cand=None lang=None
[SUBJ row 16115] 'lhaj Adam Opel'  ->  Q57479  | method=manual-exact cand=None lang=None


resolve subject_qid:  58%|█████▊    | 41/71 [01:03<00:30,  1.02s/it]

[SUBJ row 16612] 'Konstantin Tsiolkovskii'  ->  Q41239  | method=manual-exact cand=None lang=None
[SUBJ row 17310] 'Konstantin Tsiolkovskii'  ->  Q41239  | method=manual-exact cand=None lang=None
[SUBJ row 19020] 'Naolinco, Veracruz'  ->  Q948343  | method=manual-canon cand=None lang=None


resolve subject_qid:  63%|██████▎   | 45/71 [01:04<00:13,  1.89it/s]

[SUBJ row 19875] 'Konstantin Tsiolkovskii'  ->  Q41239  | method=manual-exact cand=None lang=None
[SUBJ row 20368] 'lhaj Adam Opel'  ->  Q57479  | method=manual-exact cand=None lang=None
[SUBJ row 22199] 'Konstantin Tsiolkovskii'  ->  Q41239  | method=manual-exact cand=None lang=None


resolve subject_qid:  66%|██████▌   | 47/71 [01:04<00:09,  2.55it/s]

[SUBJ row 22531] 'Sonsonate Oeste (Acajutla)'  ->  Q127416243  | method=manual-exact cand=None lang=None


resolve subject_qid:  69%|██████▉   | 49/71 [01:10<00:27,  1.25s/it]

[SUBJ row 22592] 'Falun Mine Museum'  ->  NOT FOUND
[SUBJ row 23938] 'Konstantin Tsiolkovskii'  ->  Q41239  | method=manual-exact cand=None lang=None
[SUBJ row 24858] 'Konstantin Tsiolkovskii'  ->  Q41239  | method=manual-exact cand=None lang=None


resolve subject_qid:  72%|███████▏  | 51/71 [01:10<00:17,  1.12it/s]

[SUBJ row 26375] 'lhaj Adam Opel'  ->  Q57479  | method=manual-exact cand=None lang=None
[SUBJ row 26376] 'lhaj Adam Opel'  ->  Q57479  | method=manual-exact cand=None lang=None
[SUBJ row 28155] 'Konstantin Tsiolkovskii'  ->  Q41239  | method=manual-exact cand=None lang=None


resolve subject_qid:  77%|███████▋  | 55/71 [01:17<00:22,  1.43s/it]

[SUBJ row 28345] 'Falun Mine Museum'  ->  NOT FOUND
[SUBJ row 28630] 'Konstantin Tsiolkovskii'  ->  Q41239  | method=manual-exact cand=None lang=None
[SUBJ row 30724] 'lhaj Adam Opel'  ->  Q57479  | method=manual-exact cand=None lang=None


resolve subject_qid:  80%|████████  | 57/71 [01:17<00:14,  1.02s/it]

[SUBJ row 35183] 'Konstantin Tsiolkovskii'  ->  Q41239  | method=manual-exact cand=None lang=None
[SUBJ row 35184] 'Konstantin Tsiolkovskii'  ->  Q41239  | method=manual-exact cand=None lang=None
[SUBJ row 36318] 'The Iron Gate (Jerusalem)'  ->  Q119361435  | method=manual-exact cand=None lang=None


resolve subject_qid:  86%|████████▌ | 61/71 [01:17<00:05,  1.85it/s]

[SUBJ row 37140] 'San Matías, Santa Cruz'  ->  Q176823  | method=manual-exact cand=None lang=None
[SUBJ row 37150] 'Concepción, Santa Cruz'  ->  Q751077  | method=manual-exact cand=None lang=None


resolve subject_qid:  89%|████████▊ | 63/71 [01:24<00:10,  1.36s/it]

[SUBJ row 38567] 'Pueblo Marron'  ->  NOT FOUND
[SUBJ row 38690] 'Bayamo, Cuba.'  ->  Q115382  | method=manual-exact cand=None lang=None
[SUBJ row 38691] 'Bayamo, Cuba.'  ->  Q115382  | method=manual-exact cand=None lang=None


resolve subject_qid:  92%|█████████▏| 65/71 [01:24<00:05,  1.02it/s]

[SUBJ row 39061] 'Lundu, Sarawak'  ->  Q6704167  | method=manual-exact cand=None lang=None


resolve subject_qid:  93%|█████████▎| 66/71 [01:34<00:12,  2.52s/it]

[SUBJ row 39332] 'Fox Glacier / Te Moeka o Tuawe'  ->  NOT FOUND
[SUBJ row 41416] 'Aybak, Samangan'  ->  Q1020649  | method=manual-exact cand=None lang=None
[SUBJ row 42525] 'The Iron Gate (Jerusalem)'  ->  Q119361435  | method=manual-exact cand=None lang=None


resolve subject_qid:  99%|█████████▊| 70/71 [01:34<00:01,  1.18s/it]

[SUBJ row 44529] 'Naolinco, Veracruz'  ->  Q948343  | method=manual-exact cand=None lang=None
[SUBJ row 44538] 'San Agustín (Jalisco)'  ->  Q4407227  | method=manual-exact cand=None lang=None
[SUBJ row 44542] 'José Cardel, Veracruz'  ->  Q5938733  | method=manual-exact cand=None lang=None


resolve subject_qid: 100%|██████████| 71/71 [01:34<00:00,  1.33s/it]
resolve object_qid:   0%|          | 0/46 [00:00<?, ?it/s]

[OBJ  row 2] 'Tlajomulco de Zúñiga Municipality'  ->  Q20249211  | method=manual-canon cand=None lang=None


resolve object_qid:   4%|▍         | 2/46 [00:06<02:21,  3.22s/it]

[OBJ  row 116] 'Liam Pane'  ->  NOT FOUND
[OBJ  row 150] 'Bayamo, Cuba.'  ->  Q115382  | method=manual-exact cand=None lang=None
[OBJ  row 215] 'Milowice, Sosnowiec'  ->  Q11781308  | method=manual-canon cand=None lang=None


resolve object_qid:   9%|▊         | 4/46 [00:06<00:57,  1.37s/it]

[OBJ  row 622] 'lhaj Adam Opel'  ->  Q57479  | method=manual-exact cand=None lang=None


resolve object_qid:  13%|█▎        | 6/46 [00:13<01:34,  2.36s/it]

[OBJ  row 1241] 'Cayetana Fitz-James Stuarthu'  ->  NOT FOUND
[OBJ  row 1942] 'Tlajomulco de Zúñiga Municipality'  ->  Q20249211  | method=manual-exact cand=None lang=None


resolve object_qid:  17%|█▋        | 8/46 [00:23<02:08,  3.37s/it]

[OBJ  row 1981] 'Manufacture of machinery and equipment n.e.c.'  ->  NOT FOUND


resolve object_qid:  20%|█▉        | 9/46 [00:30<02:32,  4.12s/it]

[OBJ  row 1989] 'Princely House of Thurn and Taxis'  ->  NOT FOUND


resolve object_qid:  22%|██▏       | 10/46 [00:36<02:50,  4.73s/it]

[OBJ  row 5086] 'Santa Maria in Cosmedin Church'  ->  NOT FOUND


resolve object_qid:  24%|██▍       | 11/46 [00:43<03:00,  5.14s/it]

[OBJ  row 5087] 'Santa Maria in Cosmedin Church'  ->  NOT FOUND
[OBJ  row 5231] 'Tinn Municipality'  ->  Q2365  | method=manual-canon cand=None lang=None


resolve object_qid:  28%|██▊       | 13/46 [00:49<02:22,  4.33s/it]

[OBJ  row 5479] 'Saint Petersburg Eparchy'  ->  NOT FOUND
[OBJ  row 6077] 'Hernando, Córdoba'  ->  Q18920947  | method=manual-canon cand=None lang=None
[OBJ  row 6104] 'Ryn, Giżycko County'  ->  Q616895  | method=manual-exact cand=None lang=None


resolve object_qid:  37%|███▋      | 17/46 [00:50<00:52,  1.81s/it]

[OBJ  row 6994] 'Los Cabos, México'  ->  Q5979421  | method=manual-canon cand=None lang=None
[OBJ  row 7123] 'Los Cabos, México'  ->  Q5979421  | method=manual-exact cand=None lang=None


resolve object_qid:  39%|███▉      | 18/46 [00:56<01:16,  2.73s/it]

[OBJ  row 7127] 'Saint Petersburg Eparchy'  ->  NOT FOUND


resolve object_qid:  41%|████▏     | 19/46 [01:02<01:35,  3.55s/it]

[OBJ  row 7619] 'Saint Petersburg Eparchy'  ->  NOT FOUND
[OBJ  row 7688] 'Baalbek, Lebanon'  ->  Q178835  | method=manual-canon cand=None lang=None


resolve object_qid:  46%|████▌     | 21/46 [01:09<01:24,  3.40s/it]

[OBJ  row 7833] 'Saint Petersburg Eparchy'  ->  NOT FOUND
[OBJ  row 9930] 'BASF (Czechia)'  ->  Q9401  | method=manual-exact cand=None lang=None
[OBJ  row 10012] 'Spijk, Groningen'  ->  Q2694306  | method=manual-exact cand=None lang=None


resolve object_qid:  52%|█████▏    | 24/46 [01:15<01:07,  3.06s/it]

[OBJ  row 10878] 'Institution of Noble Ladies of the Prague Castle'  ->  NOT FOUND
[OBJ  row 12325] 'Charlotte de Constant de Rebecque (1769-1845)'  ->  Q55901812  | method=manual-exact cand=None lang=None


resolve object_qid:  57%|█████▋    | 26/46 [01:22<01:02,  3.10s/it]

[OBJ  row 12863] 'Mohammed Abdelmonem'  ->  NOT FOUND


resolve object_qid:  59%|█████▊    | 27/46 [01:28<01:12,  3.80s/it]

[OBJ  row 13730] 'Numidia Kingdom'  ->  NOT FOUND
[OBJ  row 15396] 'Sielec, Sosnowiec'  ->  Q9336466  | method=manual-canon cand=None lang=None
[OBJ  row 15397] 'Milowice, Sosnowiec'  ->  Q11781308  | method=manual-exact cand=None lang=None


resolve object_qid:  63%|██████▎   | 29/46 [01:28<00:41,  2.41s/it]

[OBJ  row 15784] 'Stellantis Italy'  ->  Q3744458  | method=manual-exact cand=None lang=None


resolve object_qid:  67%|██████▋   | 31/46 [01:35<00:39,  2.67s/it]

[OBJ  row 16444] 'Liam Pane'  ->  NOT FOUND
[OBJ  row 17986] 'Lujhu District'  ->  Q713367  | method=manual-canon cand=None lang=None
[OBJ  row 17987] 'Qieding District'  ->  Q713090  | method=manual-exact cand=None lang=None


resolve object_qid:  74%|███████▍  | 34/46 [01:42<00:33,  2.83s/it]

[OBJ  row 18135] 'MAS IPSP'  ->  NOT FOUND
[OBJ  row 18355] 'Chengguan District, Lanzhou'  ->  Q1069975  | method=manual-canon cand=None lang=None
[OBJ  row 20087] 'Huamantla, Tlaxcala'  ->  Q576636  | method=manual-canon cand=None lang=None


resolve object_qid:  80%|████████  | 37/46 [01:51<00:30,  3.35s/it]

[OBJ  row 22486] 'Ralph H. Fowwlerr'  ->  NOT FOUND


resolve object_qid:  83%|████████▎ | 38/46 [01:58<00:32,  4.04s/it]

[OBJ  row 26940] 'Taking acetaminophen after smoking cannabis'  ->  NOT FOUND


resolve object_qid:  85%|████████▍ | 39/46 [02:07<00:38,  5.43s/it]

[OBJ  row 34525] 'Tuila'epa Sa'ilele Malielegaoi'  ->  NOT FOUND


resolve object_qid:  87%|████████▋ | 40/46 [02:17<00:38,  6.46s/it]

[OBJ  row 34954] 'Yeico Cáceres'  ->  NOT FOUND


resolve object_qid:  89%|████████▉ | 41/46 [02:24<00:32,  6.51s/it]

[OBJ  row 35031] 'flag of Dominican Republic'  ->  NOT FOUND


resolve object_qid:  91%|█████████▏| 42/46 [02:33<00:29,  7.38s/it]

[OBJ  row 36591] 'Isaías Fortis'  ->  NOT FOUND


resolve object_qid:  93%|█████████▎| 43/46 [02:43<00:24,  8.04s/it]

[OBJ  row 36592] 'Isaías Fortis'  ->  NOT FOUND
[OBJ  row 36799] 'Centre-Nord Region'  ->  Q850064  | method=manual-exact cand=None lang=None


resolve object_qid:  98%|█████████▊| 45/46 [02:50<00:05,  5.96s/it]

[OBJ  row 36800] 'Est Region'  ->  NOT FOUND


resolve object_qid: 100%|██████████| 46/46 [02:56<00:00,  3.85s/it]

[OBJ  row 41453] 'Brahui tribe'  ->  NOT FOUND

Remaining subject_qid: 15
Remaining object_qid : 24






✅ Saved: /mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/FIXED_final_QA_triplets.csv


In [8]:
import pandas as pd

PATH = "/mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/FIXED_final_QA_triplets.csv"

def is_missing_qid(x) -> bool:
    if pd.isna(x):
        return True
    s = str(x).strip().lower()
    return s in {"", "unknown", "uknown", "n/a", "-", "none"}

df = pd.read_csv(PATH)

# Проверим наличие колонок
for col in ["subject_qid", "object_qid"]:
    if col not in df.columns:
        raise ValueError(f"Нет обязательной колонки: {col}")

mask_bad = df["subject_qid"].apply(is_missing_qid) | df["object_qid"].apply(is_missing_qid)

removed = int(mask_bad.sum())
kept = int(len(df) - removed)

print(f"Всего строк: {len(df)}")
print(f"Удаляю (пустые subject_qid или object_qid): {removed}")
print(f"Останется: {kept}")

df_clean = df.loc[~mask_bad].reset_index(drop=True)
df_clean.to_csv(PATH, index=False)

print(f"✅ Перезаписано: {PATH}")


Всего строк: 55355
Удаляю (пустые subject_qid или object_qid): 39
Останется: 55316
✅ Перезаписано: /mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/FIXED_final_QA_triplets.csv


In [6]:
#!/usr/bin/env python3
import pandas as pd
import itertools as it
from pathlib import Path

# === ФАЙЛЫ ===
PATH = "/mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/final_QA_triplets.csv"
OUT  = "/mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/final_QA_triplets.csv"

# Точные имена best_gen-колонок (как у тебя в данных)
BEST_COLS = [
    "best_gen_Llama_1b_Instract",
    "best_gen_Phi3_5_mini_Instruct",
    "best_gen_Llama_3b_Instract",
    "best_gen_Llama_8b_Instract",
    "best_gen_Gemma_7b_IT",
]

df = pd.read_csv(PATH)

# Оставим только реально существующие (на всякий случай)
best_existing = [c for c in BEST_COLS if c in df.columns]
if not best_existing:
    raise ValueError("Не нашёл ни одной колонки из BEST_COLS в файле.")

# === 1) Анализ пересечений пропусков ===
nan_masks = {c: df[c].isna() for c in best_existing}
nan_counts = {c: m.sum() for c, m in nan_masks.items()}

print("=== NaN counts by best_gen_* column ===")
for c in best_existing:
    print(f"{c:35s}  {nan_counts[c]}")

# Попарные пересечения
pairs = list(it.combinations(best_existing, 2))
if pairs:
    print("\n=== Pairwise intersection sizes (NaN indices) ===")
    for a, b in pairs:
        inter = (nan_masks[a] & nan_masks[b]).sum()
        print(f"{a} ∩ {b}: {inter}")

# Пересечение всех
all_inter_mask = None
for c in best_existing:
    all_inter_mask = nan_masks[c] if all_inter_mask is None else (all_inter_mask & nan_masks[c])
all_inter_cnt = int(all_inter_mask.sum())
print(f"\nRows NaN in ALL listed best_gen_* columns: {all_inter_cnt}")

# === 2) Заполнение пропусков в best_gen_* -> "Uknown"
# === 3) В этих же строках нули в bert_sim_* и gen_recall_*
def suffix_from_best(colname: str) -> str:
    # 'best_gen_' -> возвращаем хвост
    return colname[len("best_gen_"):] if colname.startswith("best_gen_") else None

changed_report = {}
for best_col in best_existing:
    mask_nan = nan_masks[best_col]  # запоминаем изначальные пропуски для этой колонки
    n_rows = int(mask_nan.sum())
    if n_rows == 0:
        changed_report[best_col] = 0
        continue

    # 2) best_gen_* = "Uknown"
    df.loc[mask_nan, best_col] = "Uknown"

    # 3) найти соответствующие bert_sim_* и gen_recall_* и занулить в тех же индексах
    suf = suffix_from_best(best_col)
    if suf:
        sim_col = f"bert_sim_{suf}"
        rec_col = f"gen_recall_{suf}"
        if sim_col in df.columns:
            df.loc[mask_nan, sim_col] = 0.0
        if rec_col in df.columns:
            df.loc[mask_nan, rec_col] = 0.0

    changed_report[best_col] = n_rows

print("\n=== Filled 'Uknown' & zeroed metrics for rows that had NaN in best_gen_* ===")
for c in best_existing:
    print(f"{c:35s}  rows affected: {changed_report[c]}")

# Сохранение
Path(OUT).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(OUT, index=False)
print(f"\n✅ Saved: {OUT}")


=== NaN counts by best_gen_* column ===
best_gen_Llama_1b_Instract           123
best_gen_Phi3_5_mini_Instruct        56
best_gen_Llama_3b_Instract           43
best_gen_Llama_8b_Instract           30
best_gen_Gemma_7b_IT                 2

=== Pairwise intersection sizes (NaN indices) ===
best_gen_Llama_1b_Instract ∩ best_gen_Phi3_5_mini_Instruct: 1
best_gen_Llama_1b_Instract ∩ best_gen_Llama_3b_Instract: 1
best_gen_Llama_1b_Instract ∩ best_gen_Llama_8b_Instract: 0
best_gen_Llama_1b_Instract ∩ best_gen_Gemma_7b_IT: 0
best_gen_Phi3_5_mini_Instruct ∩ best_gen_Llama_3b_Instract: 7
best_gen_Phi3_5_mini_Instruct ∩ best_gen_Llama_8b_Instract: 10
best_gen_Phi3_5_mini_Instruct ∩ best_gen_Gemma_7b_IT: 0
best_gen_Llama_3b_Instract ∩ best_gen_Llama_8b_Instract: 1
best_gen_Llama_3b_Instract ∩ best_gen_Gemma_7b_IT: 0
best_gen_Llama_8b_Instract ∩ best_gen_Gemma_7b_IT: 0

Rows NaN in ALL listed best_gen_* columns: 0

=== Filled 'Uknown' & zeroed metrics for rows that had NaN in best_gen_* ===
best_g

In [None]:
# #!/usr/bin/env python3
# import re
# import json
# import time
# import requests
# import pandas as pd
# from pathlib import Path
# from tqdm import tqdm
# from requests.adapters import HTTPAdapter
# from urllib3.util.retry import Retry

# # ======= CONFIG =======
# INPUT_CSV  = "/mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/zephyr_7b_beta_all_questions_with_metrics.csv"
# OUTPUT_CSV = "/mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/zephyr_7b_beta_all_questions_with_metrics_SAMPLE10_with_popularity.csv"

# # Кэши для теста (отдельно, чтобы не мешали прод-запуску)
# CACHE_LABEL2QID = Path(OUTPUT_CSV).with_suffix(".label2qid.sample10.json")
# CACHE_SITELINKS = Path(OUTPUT_CSV).with_suffix(".sitelinks.sample10.json")

# SAMPLE_N = 10            # берём 10 строк
# LANGS = ["en", "ru", "de"]
# STRICT_EXACT_ONLY = False  # если True — берём только точное совпадение метки
# SLEEP_BETWEEN = 0.05
# TIMEOUT = 20

# QID_RE = re.compile(r"Q\d+")

# # ======= HTTP session =======
# def build_session():
#     sess = requests.Session()
#     retries = Retry(
#         total=5,
#         backoff_factor=0.5,
#         status_forcelist=[429, 500, 502, 503, 504],
#         allowed_methods=["GET"],
#         respect_retry_after_header=True,
#     )
#     adapter = HTTPAdapter(max_retries=retries)
#     sess.mount("https://", adapter)
#     sess.headers.update({
#         "User-Agent": "UNLamb-Wikidata/1.0 (contact: youremail@example.com)",
#         "Accept": "application/json",
#     })
#     return sess

# # ======= utils =======
# def load_json(path):
#     p = Path(path)
#     if p.exists():
#         try:
#             return json.loads(p.read_text("utf-8"))
#         except Exception:
#             return {}
#     return {}

# def save_json(path, data):
#     try:
#         Path(path).write_text(json.dumps(data, ensure_ascii=False), encoding="utf-8")
#     except Exception:
#         pass

# def extract_qid(val):
#     if pd.isna(val):
#         return None
#     s = str(val)
#     m = QID_RE.search(s)
#     return m.group(0) if m else None

# def search_wikidata_qid(label, session, langs=LANGS, timeout=TIMEOUT, strict_exact=STRICT_EXACT_ONLY):
#     label_stripped = str(label).strip()
#     if not label_stripped:
#         return None

#     # если уже есть QID в строке
#     q = extract_qid(label_stripped)
#     if q:
#         return q

#     for lang in langs:
#         params = {
#             "action": "wbsearchentities",
#             "format": "json",
#             "language": lang,
#             "uselang": lang,
#             "type": "item",
#             "search": label_stripped,
#             "limit": 5,
#         }
#         try:
#             r = session.get("https://www.wikidata.org/w/api.php", params=params, timeout=timeout)
#             r.raise_for_status()
#             results = (r.json() or {}).get("search", []) or []
#             if not results:
#                 continue

#             lower = label_stripped.lower()

#             # 1) точное совпадение по label (case-insensitive)
#             for item in results:
#                 if item.get("label", "").lower() == lower:
#                     return item.get("id")

#             # 2) если строгий режим выключен — берём топ-результат
#             if not strict_exact:
#                 return results[0].get("id")
#         except Exception:
#             continue

#     return None

# def get_sitelinks_count(qid, session, timeout=TIMEOUT):
#     if not qid:
#         return 0
#     url = f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"
#     try:
#         r = session.get(url, timeout=timeout)
#         if r.status_code == 404:
#             return 0
#         r.raise_for_status()
#         ent = (r.json() or {}).get("entities", {}).get(qid, {})
#         return len(ent.get("sitelinks", {}))
#     except Exception:
#         return 0

# # ======= main =======
# def main():
#     df = pd.read_csv(INPUT_CSV)
#     if "subject" not in df.columns or "object" not in df.columns:
#         raise ValueError("Ожидаю колонки 'subject' и 'object'.")

#     # берём первые 10 (или случайные 10 — раскомментируй следующую строку)
#     df_test = df.head(SAMPLE_N).copy()
#     # df_test = df.sample(n=SAMPLE_N, random_state=42).copy()

#     session = build_session()
#     label2qid = load_json(CACHE_LABEL2QID)
#     qid2sitelinks = load_json(CACHE_SITELINKS)

#     subj_vals = df_test["subject"].fillna("").astype(str)
#     obj_vals  = df_test["object"].fillna("").astype(str)
#     unique_labels = sorted(set(subj_vals.tolist() + obj_vals.tolist()))

#     # 1) label -> QID
#     to_resolve = [lbl for lbl in unique_labels if lbl and lbl not in label2qid]
#     for lbl in tqdm(to_resolve, desc="Resolve labels → QID (sample10)"):
#         qid = search_wikidata_qid(lbl, session)
#         label2qid[lbl] = qid or ""
#         time.sleep(SLEEP_BETWEEN)
#     save_json(CACHE_LABEL2QID, label2qid)

#     df_test["subject_qid"] = subj_vals.map(lambda s: label2qid.get(s, "") or extract_qid(s) or "")
#     df_test["object_qid"]  = obj_vals.map(lambda s: label2qid.get(s, "") or extract_qid(s) or "")

#     # 2) QID -> sitelinks
#     qids = sorted({q for q in pd.concat([df_test["subject_qid"], df_test["object_qid"]]).tolist() if q})
#     to_fetch = [q for q in qids if q not in qid2sitelinks]
#     for q in tqdm(to_fetch, desc="Fetch sitelinks (sample10)"):
#         qid2sitelinks[q] = get_sitelinks_count(q, session)
#         time.sleep(SLEEP_BETWEEN)
#     save_json(CACHE_SITELINKS, qid2sitelinks)

#     df_test["subject_popularity_sitelinks"] = df_test["subject_qid"].map(lambda q: int(qid2sitelinks.get(q, 0)))
#     df_test["object_popularity_sitelinks"]  = df_test["object_qid"].map(lambda q: int(qid2sitelinks.get(q, 0)))
#     df_test["popularity_sitelinks_sum"]     = df_test["subject_popularity_sitelinks"] + df_test["object_popularity_sitelinks"]

#     # краткий отчёт
#     res_subj = (df_test["subject_qid"] != "").mean() * 100
#     res_obj  = (df_test["object_qid"]  != "").mean() * 100
#     print(f"Resolved subject → QID: {res_subj:.1f}%  |  object → QID: {res_obj:.1f}%")

#     cols = ["subject","object","subject_qid","object_qid",
#             "subject_popularity_sitelinks","object_popularity_sitelinks","popularity_sitelinks_sum"]
#     print("\n=== SAMPLE(10) RESULTS ===")
#     print(df_test[cols].to_string(index=False))

#     df_test.to_csv(OUTPUT_CSV, index=False)
#     print(f"\n✅ Saved sample(10) file: {OUTPUT_CSV}")
#     print(f"ℹ️ Caches: {CACHE_LABEL2QID}, {CACHE_SITELINKS}")

# if __name__ == "__main__":
#     main()


Resolve labels → QID (sample10): 100%|██████████| 12/12 [00:03<00:00,  3.47it/s]
Fetch sitelinks (sample10): 100%|██████████| 12/12 [00:05<00:00,  2.11it/s]

Resolved subject → QID: 100.0%  |  object → QID: 100.0%

=== SAMPLE(10) RESULTS ===
     subject                        object subject_qid object_qid  subject_popularity_sitelinks  object_popularity_sitelinks  popularity_sitelinks_sum
  JavaScript               web development       Q2005    Q386275                           157                           38                       195
  JavaScript   aspect-oriented programming       Q2005     Q30267                           157                           33                       190
  JavaScript      event-driven programming       Q2005   Q1135914                           157                           29                       186
  JavaScript        imperative programming       Q2005    Q275596                           157                           50                       207
  JavaScript           generic programming       Q2005   Q1051282                           157                           33                       190
  JavaScri




In [11]:
#!/usr/bin/env python3
import re
import json
import time
import unicodedata
import requests
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# ========= IO =========
INPUT_CSV  = "/mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/final_QA_triplets.csv"
OUT        = "/mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/FIXED_final_QA_triplets.csv"

# Кэши рядом с OUT
CACHE_LABEL2QID = Path(OUT).with_suffix(".label2qid.json")
CACHE_SITELINKS = Path(OUT).with_suffix(".sitelinks_cache.json")

# ========= SETTINGS =========
LANGS_WB   = ["en", "ru", "de", "fr", "es", "it", "uk", "pl"]
WIKI_SITES = ["enwiki", "ruwiki", "dewiki", "frwiki", "eswiki"]
SLEEP_BETWEEN = 0.08
TIMEOUT       = 20
QID_RE = re.compile(r"\bQ\d+\b", re.IGNORECASE)

# ========= HTTP =========
def build_session():
    sess = requests.Session()
    retries = Retry(
        total=6,
        backoff_factor=0.6,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
        respect_retry_after_header=True,
    )
    adapter = HTTPAdapter(max_retries=retries)
    sess.mount("https://", adapter)
    sess.headers.update({
        "User-Agent": "UNLamb-Wikidata/1.1 (contact: youremail@example.com)",
        "Accept": "application/json",
    })
    return sess

# ========= UTILS =========
def strip_accents(text: str) -> str:
    norm = unicodedata.normalize("NFKD", text or "")
    return "".join(ch for ch in norm if not unicodedata.combining(ch))

def normalize_label(label: str) -> str:
    s = str(label or "").strip()
    if not s:
        return s
    s = s.replace("—", "-").replace("–", "-").replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
    s = re.sub(r"\s*\(.*?\)\s*", " ", s)       # убрать пояснения в скобках
    s = re.sub(r"[^\w\s\-]", " ", s)           # только буквы/цифры/пробел/дефис/_
    s = s.replace("_", " ")
    s = re.sub(r"\s+", " ", s).strip()
    return s

def extract_qid(val):
    if pd.isna(val):
        return None
    m = QID_RE.search(str(val))
    return m.group(0).upper() if m else None

def is_missing_qid(x) -> bool:
    if pd.isna(x):
        return True
    s = str(x).strip().lower()
    return s in {"", "unknown", "uknown", "n/a", "-", "none"}

def is_meaningful_label(s: str) -> bool:
    lo = str(s or "").strip().lower()
    return lo not in {"", "unknown", "uknown", "n/a", "-", "none"}

# ========= CACHE I/O =========
def load_json(path):
    p = Path(path)
    if p.exists():
        try:
            return json.loads(p.read_text("utf-8"))
        except Exception:
            return {}
    return {}

def save_json(path, data):
    try:
        Path(path).write_text(json.dumps(data, ensure_ascii=False), encoding="utf-8")
    except Exception:
        pass

# ========= WIKIDATA =========
def wbsearchentities(label, session, lang="en", limit=10, timeout=TIMEOUT):
    params = {
        "action": "wbsearchentities",
        "format": "json",
        "language": lang,
        "uselang": lang,
        "type": "item",
        "search": label,
        "limit": limit,
    }
    r = session.get("https://www.wikidata.org/w/api.php", params=params, timeout=timeout)
    r.raise_for_status()
    return (r.json() or {}).get("search", []) or []

def pick_qid_from_results(label_raw: str, results: list) -> str | None:
    if not results:
        return None
    lab_norm = strip_accents(normalize_label(label_raw)).lower()
    # точное совпадение по label
    for it in results:
        lab = strip_accents(normalize_label(it.get("label", ""))).lower()
        if lab and lab == lab_norm:
            return it.get("id")
    # точное совпадение по aliases
    for it in results:
        for al in it.get("aliases") or []:
            al_norm = strip_accents(normalize_label(al)).lower()
            if al_norm and al_norm == lab_norm:
                return it.get("id")
    # иначе первый
    return results[0].get("id")

def wiki_search_title(term: str, session, lang="en", timeout=TIMEOUT):
    params = {
        "action": "query",
        "list": "search",
        "format": "json",
        "srsearch": term,
        "srlimit": 1,
        "srwhat": "nearmatch",
        "srprop": "",
    }
    r = session.get(f"https://{lang}.wikipedia.org/w/api.php", params=params, timeout=timeout)
    r.raise_for_status()
    hits = ((r.json() or {}).get("query", {}) or {}).get("search", []) or []
    return hits[0].get("title") if hits else None

def wiki_title_to_qid(site: str, title: str, session, timeout=TIMEOUT):
    params = {
        "action": "wbgetentities",
        "format": "json",
        "sites": site,
        "titles": title,
        "props": "info",
    }
    r = session.get("https://www.wikidata.org/w/api.php", params=params, timeout=timeout)
    r.raise_for_status()
    for k in (r.json() or {}).get("entities", {}):
        if k.startswith("Q"):
            return k
    return None

def resolve_label_to_qid(label: str, session) -> str | None:
    if not is_meaningful_label(label):
        return None
    # уже QID?
    q = extract_qid(label)
    if q:
        return q

    raw  = str(label).strip()
    norm = normalize_label(raw)
    fold = strip_accents(norm)

    candidates = [raw]
    if norm and norm != raw: candidates.append(norm)
    if fold and fold not in candidates: candidates.append(fold)

    # A) wbsearchentities
    for cand in candidates:
        for lang in LANGS_WB:
            try:
                res = wbsearchentities(cand, session, lang=lang, limit=10)
                qid = pick_qid_from_results(cand, res)
                if qid:
                    return qid
            except Exception:
                pass
            time.sleep(SLEEP_BETWEEN/2)

    # B) wiki search → title → wbgetentities
    for cand in candidates:
        for l, site in zip(["en", "ru", "de", "fr", "es"], WIKI_SITES):
            try:
                title = wiki_search_title(cand, session, lang=l)
                if title:
                    qid = wiki_title_to_qid(site, title, session)
                    if qid:
                        return qid
            except Exception:
                pass
            time.sleep(SLEEP_BETWEEN)

    return None

def get_sitelinks_count(qid, session, timeout=TIMEOUT):
    if not qid:
        return 0
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"
    try:
        r = session.get(url, timeout=timeout)
        if r.status_code == 404:
            return 0
        r.raise_for_status()
        ent = (r.json() or {}).get("entities", {}).get(qid, {})
        return len(ent.get("sitelinks", {}))
    except Exception:
        return 0

# ========= MAIN =========
def main():
    df = pd.read_csv(INPUT_CSV)

    for col in ["subject", "object"]:
        if col not in df.columns:
            raise ValueError(f"Нет обязательной колонки: {col}")

    # убедимся, что целевые колонки существуют
    for col in ["subject_qid","object_qid","subject_popularity_sitelinks","object_popularity_sitelinks","popularity_sitelinks_sum"]:
        if col not in df.columns:
            df[col] = pd.NA

    # маски «только пропуски»
    m_subj = df["subject_qid"].apply(is_missing_qid)
    m_obj  = df["object_qid"].apply(is_missing_qid)
    print(f"To fill subject_qid: {int(m_subj.sum())}")
    print(f"To fill object_qid : {int(m_obj.sum())}")

    if m_subj.sum() == 0 and m_obj.sum() == 0:
        df.to_csv(OUT, index=False)
        print(f"✅ Nothing to fill. Saved as-is: {OUT}")
        return

    session = build_session()
    label2qid = load_json(CACHE_LABEL2QID)
    qid2sitelinks = load_json(CACHE_SITELINKS)

    # —— SUBJECT only-missing
    for idx in tqdm(df.index[m_subj], desc="resolve subject_qid"):
        lbl = str(df.at[idx, "subject"])
        if not is_meaningful_label(lbl):
            continue
        if lbl in label2qid and label2qid[lbl]:
            qid = label2qid[lbl]
        else:
            qid = resolve_label_to_qid(lbl, session)
            label2qid[lbl] = qid or ""
            time.sleep(SLEEP_BETWEEN)
        if qid:
            df.at[idx, "subject_qid"] = qid

    # —— OBJECT only-missing
    for idx in tqdm(df.index[m_obj], desc="resolve object_qid"):
        lbl = str(df.at[idx, "object"])
        if not is_meaningful_label(lbl):
            continue
        if lbl in label2qid and label2qid[lbl]:
            qid = label2qid[lbl]
        else:
            qid = resolve_label_to_qid(lbl, session)
            label2qid[lbl] = qid or ""
            time.sleep(SLEEP_BETWEEN)
        if qid:
            df.at[idx, "object_qid"] = qid

    # сохранить кэш маппинга
    save_json(CACHE_LABEL2QID, label2qid)

    # строки, где ИЗНАЧАЛЬНО были пропуски (их и обновляем)
    touched = (m_subj | m_obj)

    # QID из затронутых строк → тянуть sitelinks только для новых
    new_qids = set()
    for idx in df.index[touched]:
        for col in ["subject_qid", "object_qid"]:
            q = str(df.at[idx, col]).strip()
            if q and q.upper().startswith("Q"):
                new_qids.add(q)

    to_fetch = [q for q in sorted(new_qids) if q not in qid2sitelinks]
    if to_fetch:
        print(f"Fetching sitelinks for {len(to_fetch)} new QIDs...")
        for q in tqdm(to_fetch, desc="qid→sitelinks"):
            qid2sitelinks[q] = get_sitelinks_count(q, session)
            time.sleep(SLEEP_BETWEEN)
        save_json(CACHE_SITELINKS, qid2sitelinks)

    # пересчёт популярностей только на touched
    rows = df.index[touched]
    if len(rows) > 0:
        sub_vals = df.loc[rows, "subject_qid"].astype(str).str.strip()
        obj_vals = df.loc[rows, "object_qid"].astype(str).str.strip()

        df.loc[rows, "subject_popularity_sitelinks"] = sub_vals.map(lambda q: int(qid2sitelinks.get(q, 0)) if q.upper().startswith("Q") else 0)
        df.loc[rows, "object_popularity_sitelinks"]  = obj_vals.map(lambda q: int(qid2sitelinks.get(q, 0)) if q.upper().startswith("Q") else 0)
        df.loc[rows, "popularity_sitelinks_sum"]     = (
            df.loc[rows, "subject_popularity_sitelinks"].fillna(0).astype(int)
            + df.loc[rows, "object_popularity_sitelinks"].fillna(0).astype(int)
        )

    # отчёт
    still_subj = int(df["subject_qid"].apply(is_missing_qid).sum())
    still_obj  = int(df["object_qid"].apply(is_missing_qid).sum())
    print(f"\nRemaining empty subject_qid: {still_subj}")
    print(f"Remaining empty object_qid : {still_obj}")

    # save
    Path(OUT).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(OUT, index=False)
    print(f"\n✅ Saved: {OUT}")
    print(f"ℹ️ Caches: {CACHE_LABEL2QID}, {CACHE_SITELINKS}")

if __name__ == "__main__":
    main()


To fill subject_qid: 71
To fill object_qid : 46


resolve subject_qid: 100%|██████████| 71/71 [03:52<00:00,  3.27s/it]
resolve object_qid: 100%|██████████| 46/46 [03:26<00:00,  4.49s/it]


Fetching sitelinks for 108 new QIDs...


qid→sitelinks: 100%|██████████| 108/108 [00:57<00:00,  1.87it/s]



Remaining empty subject_qid: 56
Remaining empty object_qid : 40

✅ Saved: /mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/FIXED_final_QA_triplets.csv
ℹ️ Caches: /mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/FIXED_final_QA_triplets.label2qid.json, /mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/FIXED_final_QA_triplets.sitelinks_cache.json


In [11]:
import pandas as pd

PATH = "/mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/FIXED_final_QA_triplets.csv"

df = pd.read_csv(PATH)

if "popularity_sitelinks_sum" not in df.columns:
    raise ValueError("Нет колонки 'popularity_sitelinks_sum'.")

# стабильная сортировка (сохраняет порядок строк с одинаковым значением)
df = df.sort_values("popularity_sitelinks_sum", ascending=True, kind="mergesort")

df.to_csv(PATH, index=False)
print(f"✅ Отсортировано и сохранено: {PATH}")
df

✅ Отсортировано и сохранено: /mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/FIXED_final_QA_triplets.csv


Unnamed: 0,file,question,answer,subject,relation,object,PPL_Llama3_1-8B_Instruct,best_gen_Llama_8b_Instract,gen_recall_Llama_8b_Instract,bert_sim_Llama_8b_Instract,...,bert_sim_Zephyr_7b_Beta,PPL_Phi3_5_mini_Instruct,best_gen_Phi3_5_mini_Instruct,gen_recall_Phi3_5_mini_Instruct,bert_sim_Phi3_5_mini_Instruct,subject_qid,object_qid,subject_popularity_sitelinks,object_popularity_sitelinks,popularity_sitelinks_sum
0,business_industry.csv,What did book industry said to be the same as?,book publishing,book industry,said to be the same as,book publishing,1865.235713,Publishing was said to be like running a river,0.5,0.512184,...,0.468156,1.587546e+17,Digital distribution platforms Book industry o...,0.5,0.615013,Q56560668,Q112165919,0,0,0
1,places_city.csv,What did Calais twinned administrative body?,Xiangtan,Calais,twinned administrative body,Xiangtan,46.282780,France – via Lille and Kortrijk (,0.0,0.260455,...,-0.016905,1.862347e+15,Not applicable,0.0,0.205666,Q87300250,Q113491656,0,0,0
3,health_symptom.csv,What is the drug or therapy used for treatment...,propantheline,spasm,drug or therapy used for treatment,propantheline,5.538411,"Examples include baclofen, dantrolene",0.0,0.279883,...,0.355749,4.429714e+12,"Beta-blockers, muscle relaxants",0.0,0.342819,Q65632660,Q95594627,0,0,0
4,health_disease.csv,What is the health specialty of cerebrovascula...,cardiology,cerebrovascular trauma,health specialty,cardiology,139.951523,Neurosurgery and Vascular Surgery,0.0,0.357400,...,0.296120,2.191506e+17,Neurology,0.0,0.407211,Q55092798,Q15751978,0,0,0
5,health_disease.csv,What is the health specialty of traumatic suba...,cardiology,traumatic subarachnoid hemorrhage,health specialty,cardiology,49.679041,Neurocritical care,0.0,0.258862,...,0.135701,8.355436e+13,Neurosurgery and Neurology,0.0,0.323818,Q55093271,Q15751978,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55311,places_country.csv,What is the country of Japan?,Japan,Japan,country,Japan,12709.168794,Japan is a country,1.0,0.744425,...,0.636226,4.784381e+14,Japan is a country in East Asia,1.0,0.697189,Q17,Q17,410,410,820
55312,places_country.csv,What is the country of Russia?,Russia,Russia,country,Russia,11398.500244,Russia is a country in Eastern Europe and Nort...,1.0,0.672289,...,0.595697,8.400571e+15,Russia is a transcontinental country in Northe...,1.0,0.605403,Q159,Q159,410,410,820
55313,places_country.csv,What is the diplomatic relation of Russia?,Japan,Russia,diplomatic relation,Japan,1651.651733,Russia has relations with:,0.0,0.302459,...,0.202271,7.640358e+17,"Multifaceted and complex, with partners",0.0,0.091473,Q159,Q17,410,410,820
55314,places_country.csv,What is the diplomatic relation of Turkey?,Japan,Turkey,diplomatic relation,Japan,3224.247831,Turkey has diplomatic relations with 195 UN re...,0.0,0.119216,...,0.057623,2.014042e+18,Relations vary with each country; please speci...,0.0,0.371126,Q43,Q17,414,410,824


In [14]:
df.columns

Index(['file', 'question', 'answer', 'subject', 'relation', 'object',
       'PPL_Llama3_1-8B_Instruct', 'best_gen_Llama_8b_Instract',
       'gen_recall_Llama_8b_Instract', 'bert_sim_Llama_8b_Instract',
       'PPL_Llama3_2-3B_Instruct', 'best_gen_Llama_3b_Instract',
       'gen_recall_Llama_3b_Instract', 'bert_sim_Llama_3b_Instract',
       'PPL_Llama3_2-1B_Instruct', 'best_gen_Llama_1b_Instract',
       'gen_recall_Llama_1b_Instract', 'bert_sim_Llama_1b_Instract',
       'PPL_Gemma_7B_IT', 'best_gen_Gemma_7b_IT', 'gen_recall_Gemma_7b_IT',
       'bert_sim_Gemma_7b_IT', 'PPL_Zephyr_7B_Beta', 'best_gen_Zephyr_7b_Beta',
       'gen_recall_Zephyr_7b_Beta', 'bert_sim_Zephyr_7b_Beta',
       'PPL_Phi3_5_mini_Instruct', 'best_gen_Phi3_5_mini_Instruct',
       'gen_recall_Phi3_5_mini_Instruct', 'bert_sim_Phi3_5_mini_Instruct',
       'subject_qid', 'object_qid', 'subject_popularity_sitelinks',
       'object_popularity_sitelinks', 'popularity_sitelinks_sum'],
      dtype='object')

In [16]:
#!/usr/bin/env python3
import os
import math
import pandas as pd
from pathlib import Path

# ==== входные данные ====
# df = pd.read_csv("...")  # у тебя df уже есть в памяти, просто убери эту строку
OUTPUT_DIR = "/mnt/extremessd10tb/borisiuk/open-unlearning/tripunlamb_splits"

# соответствия моделей их колонкам
MODEL_COLS = {
    "llama3_1b": ("gen_recall_Llama_1b_Instract", "bert_sim_Llama_1b_Instract"),
    "llama3_3b": ("gen_recall_Llama_3b_Instract", "bert_sim_Llama_3b_Instract"),
    "llama3_8b": ("gen_recall_Llama_8b_Instract", "bert_sim_Llama_8b_Instract"),
    "gemma7"   : ("gen_recall_Gemma_7b_IT",       "bert_sim_Gemma_7b_IT"),
    "zephyr7"  : ("gen_recall_Zephyr_7b_Beta",    "bert_sim_Zephyr_7b_Beta"),
    "phi3"     : ("gen_recall_Phi3_5_mini_Instruct","bert_sim_Phi3_5_mini_Instruct"),
}

PCTS = [1, 5, 10, 15]
TYPES = ["popular", "rare"]  # popular: с конца; rare: с начала

def ensure_dir(p):
    Path(p).mkdir(parents=True, exist_ok=True)

def select_forget_indices(df: pd.DataFrame, side: str, k: int,
                          recall_col: str, sim_col: str) -> list[int]:
    """Идём от нужного конца и набираем k индексов, где recall>0.5 и sim>0.5."""
    # валидные строки для модели
    cond = (df[recall_col] > 0.5) & (df[sim_col] > 0.5)
    cond = cond.fillna(False)

    order_idx = df.index if side == "rare" else df.index[::-1]  # rare: с начала, popular: с конца
    chosen = []
    for i in order_idx:
        if cond.at[i]:
            chosen.append(i)
            if len(chosen) >= k:
                break

    # вернуть в исходном порядке датафрейма (стабильно)
    chosen.sort()
    return chosen

def save_split(df: pd.DataFrame, name: str):
    ensure_dir(OUTPUT_DIR)
    path = Path(OUTPUT_DIR) / f"{name}.csv"
    df.to_csv(path, index=False)
    print(f"saved: {path}  (rows={len(df)})")

def make_splits(df: pd.DataFrame):
    ensure_dir(OUTPUT_DIR)

    # full
    save_split(df, "full")

    total = len(df)
    print(f"Total rows: {total}")

    for model, (rec_col, sim_col) in MODEL_COLS.items():
        # sanity check
        for col in (rec_col, sim_col):
            if col not in df.columns:
                raise ValueError(f"Нет колонки '{col}' для модели '{model}'")

        for side in TYPES:
            for p in PCTS:
                k_target = int(total * p / 100)  # ⌊N% * |df|⌋
                if k_target <= 0:
                    print(f"[{model}][{side}][{p}%] target=0 — пропускаю")
                    continue

                forget_idx = select_forget_indices(df, side, k_target, rec_col, sim_col)
                if len(forget_idx) < k_target:
                    print(f"⚠️ [{model}][{side}][{p}%] нужно {k_target}, но нашлось только {len(forget_idx)} подходящих (recall&sim>0.5)")

                forget_df = df.loc[forget_idx].copy()
                retain_df = df.drop(index=forget_idx).copy()

                # имена сплитов
                forget_name = f"{side}_forget_{p}_{model}"
                retain_name = f"retain_{p}_{model}"  # комплемент к forget

                save_split(forget_df, forget_name)
                save_split(retain_df, retain_name)

# === вызов ===
make_splits(df)


saved: /mnt/extremessd10tb/borisiuk/open-unlearning/tripunlamb_splits/full.csv  (rows=55316)
Total rows: 55316
saved: /mnt/extremessd10tb/borisiuk/open-unlearning/tripunlamb_splits/popular_forget_1_llama3_1b.csv  (rows=553)
saved: /mnt/extremessd10tb/borisiuk/open-unlearning/tripunlamb_splits/retain_1_llama3_1b.csv  (rows=54763)
saved: /mnt/extremessd10tb/borisiuk/open-unlearning/tripunlamb_splits/popular_forget_5_llama3_1b.csv  (rows=2765)
saved: /mnt/extremessd10tb/borisiuk/open-unlearning/tripunlamb_splits/retain_5_llama3_1b.csv  (rows=52551)
saved: /mnt/extremessd10tb/borisiuk/open-unlearning/tripunlamb_splits/popular_forget_10_llama3_1b.csv  (rows=5531)
saved: /mnt/extremessd10tb/borisiuk/open-unlearning/tripunlamb_splits/retain_10_llama3_1b.csv  (rows=49785)
saved: /mnt/extremessd10tb/borisiuk/open-unlearning/tripunlamb_splits/popular_forget_15_llama3_1b.csv  (rows=8297)
saved: /mnt/extremessd10tb/borisiuk/open-unlearning/tripunlamb_splits/retain_15_llama3_1b.csv  (rows=47019)
sa

In [19]:
# !pip install datasets

In [21]:
#!/usr/bin/env python3
import os
import pandas as pd
from pathlib import Path
from datasets import Dataset, DatasetDict  # pip install datasets

# Папка со сплитами, которые создавали ранее
SPLITS_DIR = Path("/mnt/extremessd10tb/borisiuk/open-unlearning/tripunlamb_splits")
OUT_DIR    = Path("/mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/tripunlamb_hfds")  # куда сохранить HF-датасет

def read_header_cols(csv_path: Path) -> list[str]:
    return list(pd.read_csv(csv_path, nrows=0).columns)

def load_split(csv_path: Path, all_cols: list[str]) -> pd.DataFrame:
    df = pd.read_csv(csv_path)
    # досоздать недостающие колонки и привести к единому порядку
    for c in all_cols:
        if c not in df.columns:
            df[c] = pd.NA
    df = df.reindex(columns=all_cols)
    return df

def main():
    assert SPLITS_DIR.exists(), f"Не найдена папка со сплитами: {SPLITS_DIR}"

    csvs = sorted([p for p in SPLITS_DIR.glob("*.csv") if p.is_file()])
    if not csvs:
        raise RuntimeError(f"В {SPLITS_DIR} нет .csv файлов")

    # базовая схема: из full.csv, иначе — по первому файлу
    full_csv = next((p for p in csvs if p.stem == "full"), None)
    schema_src = full_csv or csvs[0]
    all_cols = read_header_cols(schema_src)

    # собираем датасет дикт
    dsets = {}
    for p in csvs:
        df = load_split(p, all_cols)
        dsets[p.stem] = Dataset.from_pandas(df, preserve_index=False)
        print(f"Loaded split: {p.stem:>30s}  rows={len(df)}")

    dd = DatasetDict(dsets)

    # краткий отчёт
    print("\n=== DatasetDict summary ===")
    for name, ds in dd.items():
        print(f"{name:>30s}: {ds.num_rows} rows, {len(ds.features)} cols")

    # сохранить на диск в формате HF datasets (Arrow)
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    dd.save_to_disk(str(OUT_DIR))
    print(f"\n✅ Saved DatasetDict to: {OUT_DIR}")

    # (опционально) пуш на Hugging Face Hub:
    dd.push_to_hub("SwetieePawsss/TripUNLamb")
    # dd.push_to_hub("user_or_org/tripunlamb", private=True)  # потребуются токены HF

if __name__ == "__main__":
    main()


Loaded split:                           full  rows=55316
Loaded split:       popular_forget_10_gemma7  rows=5531
Loaded split:    popular_forget_10_llama3_1b  rows=5531
Loaded split:    popular_forget_10_llama3_3b  rows=5531
Loaded split:    popular_forget_10_llama3_8b  rows=5531
Loaded split:         popular_forget_10_phi3  rows=5531
Loaded split:      popular_forget_10_zephyr7  rows=5531
Loaded split:       popular_forget_15_gemma7  rows=8297
Loaded split:    popular_forget_15_llama3_1b  rows=8297
Loaded split:    popular_forget_15_llama3_3b  rows=8297
Loaded split:    popular_forget_15_llama3_8b  rows=8297
Loaded split:         popular_forget_15_phi3  rows=8297
Loaded split:      popular_forget_15_zephyr7  rows=6234
Loaded split:        popular_forget_1_gemma7  rows=553
Loaded split:     popular_forget_1_llama3_1b  rows=553
Loaded split:     popular_forget_1_llama3_3b  rows=553
Loaded split:     popular_forget_1_llama3_8b  rows=553
Loaded split:          popular_forget_1_phi3  rows=

Saving the dataset (1/1 shards): 100%|██████████| 55316/55316 [00:00<00:00, 314035.32 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 5531/5531 [00:00<00:00, 213917.36 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 5531/5531 [00:00<00:00, 218810.20 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 5531/5531 [00:00<00:00, 218435.23 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 5531/5531 [00:00<00:00, 217631.95 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 5531/5531 [00:00<00:00, 217932.49 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 5531/5531 [00:00<00:00, 218211.28 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 8297/8297 [00:00<00:00, 249710.40 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 8297/8297 [00:00<00:00, 250665.49 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 8297/8297 [00:00<00:00, 250420.17 examples/s]
Saving the dataset (1/1 shards): 100%|


✅ Saved DatasetDict to: /mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/tripunlamb_hfds


Creating parquet from Arrow format: 100%|██████████| 56/56 [00:00<00:00, 241.88ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.08s/ shards]
Creating parquet from Arrow format: 100%|██████████| 6/6 [00:00<00:00, 273.31ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.36s/ shards]
Creating parquet from Arrow format: 100%|██████████| 6/6 [00:00<00:00, 272.20ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.47s/ shards]
Creating parquet from Arrow format: 100%|██████████| 6/6 [00:00<00:00, 275.18ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.39s/ shards]
Creating parquet from Arrow format: 100%|██████████| 6/6 [00:00<00:00, 279.46ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.25s/ shards]
Creating parquet from Arrow format: 100%|██████████| 6/6 [00:00<00:00, 279.88ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.33s/ shards]
Creating parqu

In [1]:
!export HF_HOME="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"

import os
from huggingface_hub import login

# Задаем токен как переменную окружения в коде
os.environ["HF_TOKEN"] = "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"
login(os.getenv("HF_TOKEN"))

from huggingface_hub import login

# Ваш токен, полученный в настройках Hugging Face
token = "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"

# Авторизация с использованием токена
login(token)

  from .autonotebook import tqdm as notebook_tqdm
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [27]:
import numpy as np
import pandas as pd
from functools import reduce
from operator import mul

# df уже в памяти

RECALL_COLS = [
    "gen_recall_Llama_8b_Instract",
    "gen_recall_Llama_3b_Instract",
    "gen_recall_Llama_1b_Instract",
    "gen_recall_Gemma_7b_IT",
    "gen_recall_Zephyr_7b_Beta",
    "gen_recall_Phi3_5_mini_Instruct",
]
SIM_COLS = [
    "bert_sim_Llama_8b_Instract",
    "bert_sim_Llama_3b_Instract",
    "bert_sim_Llama_1b_Instract",
    "bert_sim_Gemma_7b_IT",
    "bert_sim_Zephyr_7b_Beta",
    "bert_sim_Phi3_5_mini_Instruct",
]

# sanity
missing = [c for c in RECALL_COLS + SIM_COLS if c not in df.columns]
if missing:
    raise ValueError(f"В df отсутствуют колонки: {missing}")

total = len(df)

def per_column_report(cols, title):
    print(f"\n=== {title} (>0.5) per column ===")
    for c in cols:
        s = df[c]
        n = int((s > 0.5).sum())
        pct = 100.0 * n / total if total else 0.0
        na = int(s.isna().sum())
        print(f"{c:35s}  {n:7d} / {total}  ({pct:5.1f}%)   NaN: {na}")

per_column_report(RECALL_COLS, "gen_recall")
per_column_report(SIM_COLS,    "bert_sim")

rec_all_mask  = (df[RECALL_COLS] > 0.5).all(axis=1)
sim_all_mask  = (df[SIM_COLS] > 0.5).all(axis=1)
both_all_mask = rec_all_mask & sim_all_mask

def p(n):  # формат процентов
    return f"{(100.0*n/total if total else 0.0):.1f}%"

print("\n=== Intersections (>0.5) ===")
print(f"All gen_recalls: {int(rec_all_mask.sum())} / {total} ({p(rec_all_mask.sum())})")
print(f"All bert_sims : {int(sim_all_mask.sum())} / {total} ({p(sim_all_mask.sum())})")
print(f"ALL (rec & sim): {int(both_all_mask.sum())} / {total} ({p(both_all_mask.sum())})")

# Пошаговое (кумулятивное) сужение пересечения, чтобы увидеть «узкие» колонки
def cumulative_intersection(cols, title):
    print(f"\n=== Cumulative intersection: {title} ===")
    mask = pd.Series(True, index=df.index)
    prev = int(mask.sum())
    print(f"start -> {prev}")
    for c in cols:
        mask = mask & (df[c] > 0.5)
        now = int(mask.sum())
        print(f"+ {c:35s} => {now}  (Δ {now - prev})")
        prev = now
    return mask

_ = cumulative_intersection(RECALL_COLS, "gen_recalls")
_ = cumulative_intersection(SIM_COLS,    "bert_sims")
_ = cumulative_intersection(RECALL_COLS + SIM_COLS, "ALL (recalls then sims)")

# Наивная оценка: произведение индивидуальных долей (чтобы понять порядок величины)
rates = [(df[c] > 0.5).mean() for c in (RECALL_COLS + SIM_COLS)]
naive_expected = int(round(total * reduce(mul, rates, 1.0)))
print(f"\nNaive expected ALL-pass (product of per-column rates): ~{naive_expected} rows "
      f"({(100.0*naive_expected/total if total else 0.0):.1f}%)")



=== gen_recall (>0.5) per column ===
gen_recall_Llama_8b_Instract           22126 / 55316  ( 40.0%)   NaN: 0
gen_recall_Llama_3b_Instract           18305 / 55316  ( 33.1%)   NaN: 0
gen_recall_Llama_1b_Instract           14098 / 55316  ( 25.5%)   NaN: 0
gen_recall_Gemma_7b_IT                 12416 / 55316  ( 22.4%)   NaN: 0
gen_recall_Zephyr_7b_Beta               9729 / 55316  ( 17.6%)   NaN: 0
gen_recall_Phi3_5_mini_Instruct        17930 / 55316  ( 32.4%)   NaN: 0

=== bert_sim (>0.5) per column ===
bert_sim_Llama_8b_Instract             25493 / 55316  ( 46.1%)   NaN: 0
bert_sim_Llama_3b_Instract             23070 / 55316  ( 41.7%)   NaN: 0
bert_sim_Llama_1b_Instract             20703 / 55316  ( 37.4%)   NaN: 0
bert_sim_Gemma_7b_IT                   13382 / 55316  ( 24.2%)   NaN: 0
bert_sim_Zephyr_7b_Beta                 8915 / 55316  ( 16.1%)   NaN: 0
bert_sim_Phi3_5_mini_Instruct          20151 / 55316  ( 36.4%)   NaN: 0

=== Intersections (>0.5) ===
All gen_recalls: 4807 / 55316 (8

In [3]:
# pip install datasets
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict

# df: pd.DataFrame уже в памяти и ОТСОРТИРОВАН по популярности (вниз -> rare ... вверх -> popular)
df = pd.read_csv('/mnt/extremessd10tb/borisiuk/open-unlearning/constuction_full_triplets_ds/FIXED_final_QA_triplets.csv')

RECALL_COLS = [
    "gen_recall_Llama_8b_Instract",
    "gen_recall_Llama_3b_Instract",
    "gen_recall_Llama_1b_Instract",
    "gen_recall_Gemma_7b_IT",
    "gen_recall_Zephyr_7b_Beta",
    "gen_recall_Phi3_5_mini_Instruct",
]
SIM_COLS = [
    "bert_sim_Llama_8b_Instract",
    "bert_sim_Llama_3b_Instract",
    "bert_sim_Llama_1b_Instract",
    "bert_sim_Gemma_7b_IT",
    "bert_sim_Zephyr_7b_Beta",
    "bert_sim_Phi3_5_mini_Instruct",
]

need = RECALL_COLS + SIM_COLS + ["popularity_sitelinks_sum"]
miss = [c for c in need if c not in df.columns]
if miss:
    raise ValueError(f"В df отсутствуют колонки: {miss}")

total = len(df)

# маска «знания» по всем моделям
quality_mask = (df[RECALL_COLS] > 0.5).all(axis=1) & (df[SIM_COLS] > 0.5).all(axis=1)

def target_n(pct: int) -> int:
    return int(total * pct / 100)  # floor

def subset_dataset(dataframe: pd.DataFrame, idx_seq) -> Dataset:
    return Dataset.from_pandas(dataframe.iloc[idx_seq], preserve_index=False)

def forget_indices_ranking(kind: str, pct: int) -> list[int]:
    """Без пересечений знаний: rare = первые n; popular = последние n (в обратном порядке)."""
    n = target_n(pct)
    if n <= 0:
        return []
    if 2*n > total:
        raise ValueError(f"N слишком велик: 2*N% > 100% (pct={pct}, total={total})")
    if kind == "rare":
        return list(range(0, n))
    if kind == "popular":
        return list(range(total - 1, total - 1 - n, -1))  # с конца к началу
    raise ValueError("kind должен быть 'popular' или 'rare'")

def forget_indices_intersection(kind: str, pct: int) -> list[int]:
    """Полное пересечение знаний: набираем ровно n, двигаясь от края и пропуская некачественные."""
    n = target_n(pct)
    if n <= 0:
        return []
    if 2*n > total:
        raise ValueError(f"N слишком велик: 2*N% > 100% (pct={pct}, total={total})")

    out = []
    if kind == "rare":
        for i in range(total):
            if quality_mask.iat[i]:
                out.append(i)
                if len(out) == n: break
    elif kind == "popular":
        for i in range(total-1, -1, -1):
            if quality_mask.iat[i]:
                out.append(i)
                if len(out) == n: break
    else:
        raise ValueError("kind должен быть 'popular' или 'rare'")

    if len(out) != n:
        have = int(quality_mask.sum())
        raise ValueError(
            f"[know_intersection {kind} {pct}%] Недостаточно качественных строк: "
            f"нужно {n}, доступно {have}."
        )
    return out

def complement_retain_idx(rare_idx: list[int], popular_idx: list[int]) -> list[int]:
    """Ретейн — строки, которые не попали ни в rare, ни в popular (т.е. середина).
       Размер ≈ total-2n (точно для ранжирования; может быть > при пересечении наборов в know_intersection)."""
    forget_union = set(rare_idx) | set(popular_idx)
    retain_idx = [i for i in range(total) if i not in forget_union]
    return retain_idx

def report_forget(kind: str, pct: int, forget_idx: list[int], tag: str):
    pop_vals = df.iloc[forget_idx]["popularity_sitelinks_sum"].tolist()
    boundary = pop_vals[-1] if kind == "rare" else pop_vals[0]
    print(
        f"[{tag} {kind} {pct}%] total={total} target={target_n(pct)} "
        f"chosen_forget={len(forget_idx)} | boundary popularity_sitelinks_sum={boundary}"
    )

# ---- строим DatasetDict ----
splits = {}
splits["full"] = Dataset.from_pandas(df, preserve_index=False)

# A) know_intersection: только 1% и 5%
for pct in [1, 5]:
    rare_f = forget_indices_intersection("rare", pct)
    pop_f  = forget_indices_intersection("popular", pct)
    retain = complement_retain_idx(rare_f, pop_f)

    report_forget("rare", pct, rare_f,  tag="know_intersection")
    report_forget("popular", pct, pop_f, tag="know_intersection")
    print(f"[know_intersection retain {100-2*pct}%] size={len(retain)}")

    splits[f"know_intersection_rare_forget_{pct}"]      = subset_dataset(df, rare_f)
    splits[f"know_intersection_popular_forget_{pct}"]   = subset_dataset(df, pop_f)
    splits[f"know_intersection_retain_{100-2*pct}"]     = subset_dataset(df, retain)

# B) Ранжирование: 1,5,10,15%
for pct in [1, 5, 10, 15]:
    rare_f = forget_indices_ranking("rare", pct)
    pop_f  = forget_indices_ranking("popular", pct)
    retain = complement_retain_idx(rare_f, pop_f)

    report_forget("rare", pct, rare_f,  tag="ranking")
    report_forget("popular", pct, pop_f, tag="ranking")
    print(f"[ranking retain {100-2*pct}%] size={len(retain)}")

    splits[f"rare_forget_{pct}"]    = subset_dataset(df, rare_f)
    splits[f"popular_forget_{pct}"] = subset_dataset(df, pop_f)
    splits[f"retain_{100-2*pct}"]   = subset_dataset(df, retain)

# Готовый датадикт (в памяти)
tripunlamb = DatasetDict(splits)
print(tripunlamb)


[know_intersection rare 1%] total=55316 target=553 chosen_forget=553 | boundary popularity_sitelinks_sum=133
[know_intersection popular 1%] total=55316 target=553 chosen_forget=553 | boundary popularity_sitelinks_sum=820
[know_intersection retain 98%] size=54210
[know_intersection rare 5%] total=55316 target=2765 chosen_forget=2765 | boundary popularity_sitelinks_sum=745
[know_intersection popular 5%] total=55316 target=2765 chosen_forget=2765 | boundary popularity_sitelinks_sum=820
[know_intersection retain 90%] size=52536
[ranking rare 1%] total=55316 target=553 chosen_forget=553 | boundary popularity_sitelinks_sum=6
[ranking popular 1%] total=55316 target=553 chosen_forget=553 | boundary popularity_sitelinks_sum=824
[ranking retain 98%] size=54210
[ranking rare 5%] total=55316 target=2765 chosen_forget=2765 | boundary popularity_sitelinks_sum=20
[ranking popular 5%] total=55316 target=2765 chosen_forget=2765 | boundary popularity_sitelinks_sum=824
[ranking retain 90%] size=49786
[ra

In [4]:
# dataset_splits.push_to_hub("SwetieePawsss/TripUNLamb")
tripunlamb.push_to_hub("SwetieePawsss/TripUNLamb")

Creating parquet from Arrow format: 100%|██████████| 56/56 [00:00<00:00, 234.25ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.68s/ shards]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 302.25ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.08 shards/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 383.04ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.17s/ shards]
Creating parquet from Arrow format: 100%|██████████| 55/55 [00:00<00:00, 256.42ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.35s/ shards]
Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 265.29ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.73s/ shards]
Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 262.30ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.36s/ shards]
Creating par

CommitInfo(commit_url='https://huggingface.co/datasets/SwetieePawsss/TripUNLamb/commit/e39802a67535fae8cada8fab45d67cbccecf2c73', commit_message='Upload dataset', commit_description='', oid='e39802a67535fae8cada8fab45d67cbccecf2c73', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/SwetieePawsss/TripUNLamb', endpoint='https://huggingface.co', repo_type='dataset', repo_id='SwetieePawsss/TripUNLamb'), pr_revision=None, pr_num=None)

In [4]:
from datasets import load_dataset, DatasetDict

# --- параметры ---
REPO_IN = "SwetieePawsss/TripUNLamb"
SOURCE_SPLIT = "know_intersection_retain_98"
METRIC_COL = "gen_recall_Llama_8b_Instract"
FAST_SIZE = 500
SEED = 42

# --- загрузка ---
ds: DatasetDict = load_dataset(REPO_IN)
print("Splits found:", list(ds.keys()))

# проверим наличие нужных колонок (ровно по именам)
for split_name, dset in ds.items():
    cols = set(dset.column_names)
    if not {"subject", "object"} <= cols:
        raise KeyError(f"В сплите '{split_name}' нет колонок 'subject' и/или 'object'. Есть: {sorted(cols)}")

# --- 1) фильтрация во всех сплитах: оставляем только где subject != object ---
filtered_splits = {}
for split_name, dset in ds.items():
    n_before = len(dset)
    dset2 = dset.filter(lambda ex: ex["subject"] != ex["object"])
    n_after = len(dset2)
    print(f"[{split_name}] before={n_before}, after={n_after}, removed={n_before - n_after}")
    filtered_splits[split_name] = dset2

filtered = DatasetDict(filtered_splits)

# --- 2) fast_retain из know_intersection_retain_98 с метрикой > 0.6 ---
if SOURCE_SPLIT not in filtered:
    raise KeyError(f"Нет сплита '{SOURCE_SPLIT}'. Доступные: {list(filtered.keys())}")

if METRIC_COL not in filtered[SOURCE_SPLIT].column_names:
    raise KeyError(f"Нет колонки '{METRIC_COL}'. Есть: {filtered[SOURCE_SPLIT].column_names}")

# Оставляем кандидатов с метрикой > 0.6 (без нормализации, просто приводим к float для надёжности)
def pred_gt_06(ex):
    v = ex[METRIC_COL]
    try:
        v = float(v)
    except Exception:
        return False
    return v > 0.6

cands = filtered[SOURCE_SPLIT].filter(pred_gt_06)
print(f"Candidates in '{SOURCE_SPLIT}' with {METRIC_COL}>0.6: {len(cands)}")

cands = cands.shuffle(seed=SEED)
take_n = min(FAST_SIZE, len(cands))
fast_retain = cands.select(range(take_n))

filtered["fast_retain"] = fast_retain
print(f"Added split 'fast_retain' with {len(fast_retain)} rows.")


Splits found: ['full', 'know_intersection_rare_forget_1', 'know_intersection_popular_forget_1', 'know_intersection_retain_98', 'know_intersection_rare_forget_5', 'know_intersection_popular_forget_5', 'know_intersection_retain_90', 'rare_forget_1', 'popular_forget_1', 'retain_98', 'rare_forget_5', 'popular_forget_5', 'retain_90', 'rare_forget_10', 'popular_forget_10', 'retain_80', 'rare_forget_15', 'popular_forget_15', 'retain_70']


Filter: 100%|██████████| 55316/55316 [00:01<00:00, 46508.16 examples/s]


[full] before=55316, after=54975, removed=341


Filter: 100%|██████████| 553/553 [00:00<00:00, 33253.29 examples/s]


[know_intersection_rare_forget_1] before=553, after=516, removed=37


Filter: 100%|██████████| 553/553 [00:00<00:00, 34160.80 examples/s]


[know_intersection_popular_forget_1] before=553, after=415, removed=138


Filter: 100%|██████████| 54210/54210 [00:01<00:00, 47031.22 examples/s]


[know_intersection_retain_98] before=54210, after=54044, removed=166


Filter: 100%|██████████| 2765/2765 [00:00<00:00, 42605.62 examples/s]


[know_intersection_rare_forget_5] before=2765, after=2548, removed=217


Filter: 100%|██████████| 2765/2765 [00:00<00:00, 43104.44 examples/s]


[know_intersection_popular_forget_5] before=2765, after=2540, removed=225


Filter: 100%|██████████| 52536/52536 [00:01<00:00, 46751.08 examples/s]


[know_intersection_retain_90] before=52536, after=52424, removed=112


Filter: 100%|██████████| 553/553 [00:00<00:00, 33593.31 examples/s]


[rare_forget_1] before=553, after=542, removed=11


Filter: 100%|██████████| 553/553 [00:00<00:00, 34256.16 examples/s]


[popular_forget_1] before=553, after=540, removed=13


Filter: 100%|██████████| 54210/54210 [00:01<00:00, 47298.23 examples/s]


[retain_98] before=54210, after=53893, removed=317


Filter: 100%|██████████| 2765/2765 [00:00<00:00, 42368.10 examples/s]


[rare_forget_5] before=2765, after=2739, removed=26


Filter: 100%|██████████| 2765/2765 [00:00<00:00, 43218.01 examples/s]


[popular_forget_5] before=2765, after=2720, removed=45


Filter: 100%|██████████| 49786/49786 [00:01<00:00, 46724.45 examples/s]


[retain_90] before=49786, after=49516, removed=270


Filter: 100%|██████████| 5531/5531 [00:00<00:00, 44149.22 examples/s]


[rare_forget_10] before=5531, after=5495, removed=36


Filter: 100%|██████████| 5531/5531 [00:00<00:00, 44976.06 examples/s]


[popular_forget_10] before=5531, after=5388, removed=143


Filter: 100%|██████████| 44254/44254 [00:00<00:00, 46776.88 examples/s]


[retain_80] before=44254, after=44092, removed=162


Filter: 100%|██████████| 8297/8297 [00:00<00:00, 45069.09 examples/s]


[rare_forget_15] before=8297, after=8253, removed=44


Filter: 100%|██████████| 8297/8297 [00:00<00:00, 46264.35 examples/s]


[popular_forget_15] before=8297, after=8147, removed=150


Filter: 100%|██████████| 38722/38722 [00:00<00:00, 46831.75 examples/s]


[retain_70] before=38722, after=38575, removed=147


Filter: 100%|██████████| 54044/54044 [00:04<00:00, 12065.20 examples/s]

Candidates in 'know_intersection_retain_98' with gen_recall_Llama_8b_Instract>0.6: 20814
Added split 'fast_retain' with 500 rows.





In [5]:
filtered

DatasetDict({
    full: Dataset({
        features: ['file', 'question', 'answer', 'subject', 'relation', 'object', 'PPL_Llama3_1-8B_Instruct', 'best_gen_Llama_8b_Instract', 'gen_recall_Llama_8b_Instract', 'bert_sim_Llama_8b_Instract', 'PPL_Llama3_2-3B_Instruct', 'best_gen_Llama_3b_Instract', 'gen_recall_Llama_3b_Instract', 'bert_sim_Llama_3b_Instract', 'PPL_Llama3_2-1B_Instruct', 'best_gen_Llama_1b_Instract', 'gen_recall_Llama_1b_Instract', 'bert_sim_Llama_1b_Instract', 'PPL_Gemma_7B_IT', 'best_gen_Gemma_7b_IT', 'gen_recall_Gemma_7b_IT', 'bert_sim_Gemma_7b_IT', 'PPL_Zephyr_7B_Beta', 'best_gen_Zephyr_7b_Beta', 'gen_recall_Zephyr_7b_Beta', 'bert_sim_Zephyr_7b_Beta', 'PPL_Phi3_5_mini_Instruct', 'best_gen_Phi3_5_mini_Instruct', 'gen_recall_Phi3_5_mini_Instruct', 'bert_sim_Phi3_5_mini_Instruct', 'subject_qid', 'object_qid', 'subject_popularity_sitelinks', 'object_popularity_sitelinks', 'popularity_sitelinks_sum'],
        num_rows: 54975
    })
    know_intersection_rare_forget_1: Dataset(

In [6]:
# dataset_splits.push_to_hub("SwetieePawsss/TripUNLamb")
filtered.push_to_hub("SwetieePawsss/TripUNLamb")

Creating parquet from Arrow format: 100%|██████████| 55/55 [00:11<00:00,  4.68ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:13<00:00, 13.95s/ shards]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 11.58ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.32s/ shards]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 14.67ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.14s/ shards]
Creating parquet from Arrow format: 100%|██████████| 55/55 [00:11<00:00,  4.85ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:13<00:00, 13.14s/ shards]
Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00,  6.25ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.50s/ shards]
Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00,  6.18ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.88s/ shards]
Creating parquet f

CommitInfo(commit_url='https://huggingface.co/datasets/SwetieePawsss/TripUNLamb/commit/97ef7a76641ad08b4e927058d8b905fa4290d248', commit_message='Upload dataset', commit_description='', oid='97ef7a76641ad08b4e927058d8b905fa4290d248', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/SwetieePawsss/TripUNLamb', endpoint='https://huggingface.co', repo_type='dataset', repo_id='SwetieePawsss/TripUNLamb'), pr_revision=None, pr_num=None)