In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Python dependencies for the pipeline; only fasttext is version-pinned here.
!pip -q install pandas numpy scikit-learn unidecode fasttext==0.9.2
# System packages required to build libpostal from source.
!apt-get update -qq
!apt-get install -y -qq autoconf automake libtool pkg-config curl build-essential

# Fetch, configure, compile and install libpostal.
# Every `!` line runs in its own shell, hence the repeated `cd libpostal &&`.
!git clone --depth 1 https://github.com/openvenues/libpostal
!cd libpostal && ./bootstrap.sh
# NOTE(review): /content/... looks like a Colab-style path — confirm it is writable in the target runtime.
!cd libpostal && ./configure --datadir=/content/libpostal_data
!cd libpostal && make -j"$(nproc)"
!cd libpostal && make install

# Refresh the dynamic-linker cache after `make install`.
!ldconfig


In [None]:
# Upgrade the packaging tools before installing the remaining packages.
!pip install -U pip setuptools wheel
# `postal` provides postal.parser.parse_address, which a later cell imports.
!pip install postal
# sentence-transformers and faiss are imported by the reranking cell further down.
!pip install -q sentence-transformers faiss-cpu


In [None]:
import pandas as pd, numpy as np, re, os, gc
from unidecode import unidecode
import fasttext
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize as sk_normalize
from sklearn.metrics.pairwise import cosine_similarity

_ABBR = {
    # mahalle
    r"\bmah\.?\b": "mahalle", r"\bmh\.?\b": "mahalle", r"\bmhl\b": "mahalle",
    r"\bmahallesi\b": "mahalle",
    # sokak
    r"\bsok\.?\b": "sokak", r"\bsk\.?\b": "sokak", r"\bsoka?\.?\b": "sokak",
    # cadde
    r"\bcadd?\.?\b": "cadde", r"\bcad\.?\b": "cadde", r"\bcd\.?\b": "cadde",
    # bulvar
    r"\bblv\.?\b": "bulvar", r"\bbulv?\.?\b": "bulvar",
    # apartman / site / blok
    r"\bapt\.?\b": "apartman", r"\bap\.?\b": "apartman",
    r"\bsitesi?\b": "sitesi", r"\bsit\.\b": "sitesi",
    r"\bblok\b": "blok",
    # organize sanayi
    r"\bosb\b": "organize sanayi bolgesi", r"\borg\.?\b": "organize",
    # posta kutusu
    r"\bpk\.?\b": "postakutusu",
    # merkez
    r"\bmerkez\b": "merkez"
}
_punct_re = re.compile(r"[^a-z0-9ğüşöçıİĞÜŞÖÇ]+", flags=re.IGNORECASE)

def _casefold_tr(s: str) -> str:
    # I/İ
    return (s or "").strip().casefold()

def normalize_text(s: str) -> str:
    if not isinstance(s, str):
        return ""
    s = _casefold_tr(s)

    # kısaltmalar
    for pat, repl in _ABBR.items():
        s = re.sub(pat, repl, s)

    s = re.sub(r"\b(?:no|numara)\s*[:\-]?\s*([0-9]+(?:/[0-9a-z])?)\b", r"no \1", s)
    s = re.sub(r"\bno\s+(\d+)[\s/]*([a-z0-9])\b", r"no \1 d \2", s)
    s = re.sub(r"\bkat\s*([0-9]+)\b", r"kat \1", s)
    s = re.sub(r"\bd\.?\s*([0-9]+)\b", r"d \1", s)        
    s = re.sub(r"\bdr\.?\s+(?=[a-zğüşöçı])", "doktor ", s)
    s = re.sub(r"\bdaire\s*([0-9]+)\b", r"d \1", s)
    s = re.sub(r"\bblok\s*([a-z0-9]+)\b", r"blok \1", s)
    s = re.sub(r"([a-zğüşöçı])(\d)", r"\1 \2", s)
    s = re.sub(r"(\d)([a-zğüşöçı])", r"\1 \2", s)
    s = _punct_re.sub(" ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def normalize_text_ascii(s: str) -> str:
    """Return ``normalize_text`` applied to the ``unidecode`` transliteration of ``s``.

    A falsy ``s`` (``None``, "") is treated as the empty string.
    """
    transliterated = unidecode(s if s else "")
    return normalize_text(transliterated)

def fingerprint(s: str) -> str:
    """Build an order-insensitive key from the normalised address.

    The address is normalised and split into tokens. One-character tokens are
    dropped unless they are in the keep-list, duplicates are removed, and the
    remaining tokens are joined in sorted order.
    """
    always_keep = {"no", "kat", "d", "blok"}
    unique_tokens = {
        tok
        for tok in normalize_text(s).split()
        if len(tok) > 1 or tok in always_keep
    }
    return " ".join(sorted(unique_tokens))

def build_text(df: pd.DataFrame) -> pd.Series:
    """Concatenate the ``norm`` and ``norm_ascii`` columns as "<norm> || <norm_ascii>".

    Returns a Series of ``str`` values aligned to ``df``'s index.
    """
    native = df["norm"]
    transliterated = df["norm_ascii"]
    combined = native + " || " + transliterated
    return combined.astype(str)


In [None]:
# Load the labelled training set and the unlabelled test set.
train = pd.read_csv("data/train.csv")
test  = pd.read_csv("data/test.csv")

# The test file may ship without an explicit id column; derive one from the row index.
if "id" not in test.columns:
    test = test.reset_index().rename(columns={"index": "id"})

# Force text columns to str (missing values become the literal string "nan").
for frame, text_columns in ((train, ("label", "address")), (test, ("address",))):
    for text_column in text_columns:
        frame[text_column] = frame[text_column].astype(str)

print("train:", train.shape, "| test:", test.shape, "| sınıf sayısı:", train["label"].nunique())


In [None]:

def majority_map(df: pd.DataFrame, key_col: str, label_col: str,
                 min_count: int = 1, purity: float = 0.0) -> pd.Series:
    """Map each key to its most frequent label, subject to support and purity.

    Args:
        df: Frame holding ``key_col`` and ``label_col``.
        key_col: Column whose distinct values become the index of the result.
        label_col: Column holding the labels to vote over.
        min_count: Minimum number of rows the winning label needs for its key.
        purity: Minimum share of the key's rows that the winning label must hold.

    Returns:
        Series indexed by key with the winning label as value. Keys whose
        winner fails either threshold are omitted.
    """
    counts = (
        df.groupby(key_col)[label_col]
        .value_counts()
        .to_frame("cnt")
        .reset_index()
    )
    per_key_total = counts.groupby(key_col)["cnt"].transform("sum")
    counts["frac"] = counts["cnt"] / per_key_total

    # Highest count first, then keep only the first (i.e. top) row of every key.
    winners = counts.sort_values(["cnt"], ascending=False).drop_duplicates([key_col])
    passes = (winners["cnt"] >= min_count) & (winners["frac"] >= purity)
    accepted = winners.loc[passes]
    return pd.Series(accepted[label_col].values, index=accepted[key_col].values)


In [None]:
# normalize / ascii / fingerprint 
# Derive the three lookup keys (normalised, ASCII-normalised, fingerprint) on both frames.
for frame in (train, test):
    frame["norm"] = frame["address"].map(normalize_text)
    frame["norm_ascii"] = frame["address"].map(normalize_text_ascii)
    frame["fp"] = frame["address"].map(fingerprint)

# One key -> label table per key type, learned from the training set.
norm2label, norm_ascii2label, fp2label = (
    majority_map(train, key_name, "label", min_count=3, purity=0.75)
    for key_name in ("norm", "norm_ascii", "fp")
)

global_top_label = train["label"].value_counts().idxmax()

# Rule predictions start empty; each layer only fills rows that are still unresolved.
pred_rule = pd.Series([None]*len(test), index=test.index, dtype="object")

m_norm = test["norm"].isin(norm2label.index)
pred_rule[m_norm] = test.loc[m_norm, "norm"].map(norm2label)

m_ascii = test["norm_ascii"].isin(norm_ascii2label.index) & pred_rule.isna()
pred_rule[m_ascii] = test.loc[m_ascii, "norm_ascii"].map(norm_ascii2label)

m_fp = test["fp"].isin(fp2label.index) & pred_rule.isna()
pred_rule[m_fp] = test.loc[m_fp, "fp"].map(fp2label)

coverage = pred_rule.notna().mean() * 100
print(f"[RULE] kapsama: %{coverage:.2f}  (norm: {m_norm.mean()*100:.2f} / ascii: {m_ascii.mean()*100:.2f} / fp: {m_fp.mean()*100:.2f})")

# Whatever the rules could not decide is handed to the ML stage.
need_ml_idx = pred_rule[pred_rule.isna()].index
print("ML ile sınıflandırılacak test sayısı:", len(need_ml_idx))


In [None]:
# ===== LIBPOSTAL PARSER KATMANI =====
from postal.parser import parse_address
import re

def lp_parse_row(s: str, parser=None):
    """Parse an address with libpostal and return its main components.

    ``postal.parser.parse_address`` yields ``(value, label)`` tuples. Passing
    that list straight to ``dict()`` keys the mapping by *value*, so lookups
    such as ``parts.get("road")`` always missed and every component came back
    empty. The mapping is now built as label -> value, keeping the first value
    seen for a repeated label.

    Args:
        s: Raw address; ``None``/empty is parsed as "".
        parser: Optional callable with the same contract as ``parse_address``
            (returns an iterable of ``(value, label)`` pairs). Defaults to
            ``parse_address``; mainly useful for tests.

    Returns:
        Tuple ``(house, road, unit, neighbourhood, city, postcode)`` of strings,
        each "" when the component is missing. Any parser error yields six
        empty strings (best effort: one bad row must not abort the whole map).
    """
    if parser is None:
        parser = parse_address
    parts = {}
    try:
        for value, label in parser(s or ""):
            parts.setdefault(label, value)
    except Exception:
        parts = {}
    house = parts.get("house_number") or parts.get("house") or ""
    road = parts.get("road") or parts.get("pedestrian") or parts.get("footway") or ""
    # unit / floor / flat
    unit = (parts.get("unit") or parts.get("level") or parts.get("staircase")
            or parts.get("entrance") or parts.get("po_box") or "")
    # neighbourhood / district labels are mixed in practice, so fall through them
    neighbourhood = (parts.get("neighbourhood") or parts.get("suburb")
                     or parts.get("city_district") or parts.get("state_district") or "")
    city = parts.get("city") or parts.get("state") or ""   # for TR addresses 'state' sometimes holds the province
    postcode = parts.get("postcode") or ""
    return house, road, unit, neighbourhood, city, postcode


def lp_clean_road(road: str) -> str:
    """Normalise a parsed road name with the same rules as full addresses."""
    return normalize_text(road)

def make_lp_key(addr: str) -> str:
    """Build a lookup key "<house> <road>[ #<postcode>]" from the parsed address.

    Only the house, road and postcode components are used; the road is
    normalised first and whitespace in the final key is collapsed.
    """
    house_no, raw_road, _unit, _suburb, _city, postcode = lp_parse_row(addr)
    lookup = f"{house_no} {lp_clean_road(raw_road)}".strip()
    if postcode:
        # append the postcode when libpostal found one
        lookup = f"{lookup} #{postcode}"
    return re.sub(r"\s+", " ", lookup).strip()

train["lp_key"] = train["address"].map(make_lp_key)
test["lp_key"]  = test["address"].map(make_lp_key)


lp2label = majority_map(train, "lp_key", "label", min_count=3, purity=0.80)
# An empty key means libpostal extracted no house/road/postcode. It carries no
# information, so it must never act as a rule: otherwise every unparsed test
# address would get the same label.
lp2label = lp2label[lp2label.index != ""]

# Add the "lp_key" rule on top of the earlier rule layers (unresolved rows only).
m_lp = pred_rule.isna() & test["lp_key"].isin(lp2label.index)
pred_rule[m_lp] = test.loc[m_lp, "lp_key"].map(lp2label)

coverage = (~pred_rule.isna()).mean()*100
print(f"[RULE+LP] kapsama: %{coverage:.2f}  (+lp: {m_lp.mean()*100:.2f})")

# Recompute the set of rows that still need the ML stage.
need_ml_idx = pred_rule[pred_rule.isna()].index
print("ML ile sınıflandırılacak test sayısı (LP sonrası):", len(need_ml_idx))


In [None]:
import os, psutil
# Report available CPU parallelism.
# NOTE(review): os.cpu_count() is documented as the number of logical CPUs too,
# so both figures are expected to be equal — confirm whether a physical-core count was intended.
print("vCPU:", os.cpu_count(), "| logical:", psutil.cpu_count(logical=True))


In [None]:
# Dump the training set as one "__label__<label> <text>" line per row.
# NOTE(review): assumes labels contain no whitespace and texts no newlines — confirm.
ft_train_path = "ft_train.txt"
with open(ft_train_path, "w", encoding="utf-8") as f:
    for lbl, txt in zip(train["label"].tolist(), build_text(train).tolist()):
        f.write(f"__label__{lbl} {txt}\n")

# Train the supervised fastText classifier on the file written above.
# NOTE(review): no seed or thread count is passed, so runs may not be exactly reproducible — confirm.
model = fasttext.train_supervised(
    input=ft_train_path,
    lr=0.7,
    epoch=35,
    wordNgrams=4,
    dim=300,
    loss="softmax",
    minn=2, maxn=5
)
print("fastText modeli eğitildi.")

# Only the rows left unresolved by the rule layers are sent to the model.
texts_need = build_text(test.loc[need_ml_idx]).tolist()
ft_labels, ft_probs = model.predict(texts_need, k=5)  # keep the top-5 (used later for re-ranking)

# Strip the "__label__" prefix from the best label; if a row got no label at all,
# fall back to global_top_label.
top1 = [labs[0].replace("__label__", "") if len(labs)>0 else global_top_label for labs in ft_labels]
pred_ml_top1 = pd.Series(top1, index=need_ml_idx, dtype="object")


In [None]:
# ===== SBERT + FAISS RERANKER (TF-IDF yerine) =====
import torch, numpy as np, pandas as pd, faiss, gc
from sentence_transformers import SentenceTransformer

# Run the encoder on the GPU when torch reports one, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Model identifier handed to SentenceTransformer below.
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
emb = SentenceTransformer(MODEL_NAME, device=device)

def text_for_embed(df: pd.DataFrame) -> pd.Series:
    """Return the text fed to the sentence encoder: ``build_text(df)`` as ``str``."""
    combined = build_text(df)
    as_text = combined.astype(str)
    return as_text.fillna("")

# ---- thresholds for "weak" predictions and candidate settings ----
P1_THR, M_THR = 0.35, 0.03
TOPK_FT = 5
ALPHA = 0.5         # blend: 0.5*cos + 0.5*ft_prob
MIN_LABEL_COUNT = 5 # minimum number of training rows for a trusted centroid

# Best and second-best fastText probability per row (0.0 when missing).
ft_p1 = np.array([p[0] if len(p) > 0 else 0.0 for p in ft_probs], dtype=np.float32)
ft_p2 = np.array([p[1] if len(p) > 1 else 0.0 for p in ft_probs], dtype=np.float32)

# A row is weak when its top probability is low or the top-2 margin is narrow.
low_confidence = ft_p1 < P1_THR
narrow_margin = (ft_p1 - ft_p2) < M_THR
weak_mask = low_confidence | narrow_margin
weak_idx = np.flatnonzero(weak_mask)
print(f"[SBERT] zayıf örnek: {len(weak_idx)} / {len(need_ml_idx)}")

# fastText top-K candidate labels (prefix stripped) and their probabilities, per row.
ft_topk = [
    [l.replace("__label__","") for l in labs[:TOPK_FT]]
    for labs in ft_labels
]
ft_topk_prob = [
    [float(p) for p in probs[:TOPK_FT]]
    for probs in ft_probs
]

# ---- centroid embeddings only for labels that a weak row could actually receive ----
needed_labels = sorted({l for i in weak_idx for l in ft_topk[i]})
print(f"[SBERT] centroid hesaplanacak label sayısı: {len(needed_labels)}")

# Baseline for every ML row is the fastText top-1; the reranker below may
# overwrite weak rows only. Defined before the guard so it always exists.
final_ml = pred_ml_top1.loc[need_ml_idx].astype(str).to_numpy()
changed = 0

# With no weak rows (or no candidates) there is nothing to rerank. Skipping also
# avoids np.vstack([]) and a k=0 FAISS search, both of which raise.
if len(weak_idx) > 0 and len(needed_labels) > 0:
    # running sum / count of embeddings per needed label
    label_to_sum = {l: None for l in needed_labels}
    label_to_cnt = {l: 0    for l in needed_labels}

    BATCH = 2048  # rows per encode() call
    train_text = text_for_embed(train)
    y_series = train["label"].astype(str)
    y_lab = y_series.values
    # Encode only training rows whose label is needed; other rows never
    # contributed to any centroid, so encoding them was wasted work.
    needed_rows = np.flatnonzero(y_series.isin(needed_labels).to_numpy())

    for s in range(0, len(needed_rows), BATCH):
        rows = needed_rows[s:s+BATCH]
        t = train_text.iloc[rows].tolist()
        X = emb.encode(t, convert_to_numpy=True, normalize_embeddings=True, batch_size=128, show_progress_bar=False)
        yl = y_lab[rows]
        for l in np.unique(yl):
            idx = np.where(yl == l)[0]
            vec = X[idx].sum(axis=0, dtype=np.float32)
            if label_to_sum[l] is None:
                label_to_sum[l] = vec
            else:
                label_to_sum[l] += vec
            label_to_cnt[l] += len(idx)
        del X
        gc.collect()

    # centroid matrix (L x D), one row per needed label
    labels_ok = []
    centroids = []
    for l in needed_labels:
        cnt = label_to_cnt[l]
        if cnt >= 1:
            v = label_to_sum[l] / max(1, cnt)
            # a mean of unit vectors is not unit length; re-normalise for cosine
            v = v / (np.linalg.norm(v) + 1e-9)
        else:
            # label never seen in train: zero vector (cosine 0 with everything)
            v = np.zeros(emb.get_sentence_embedding_dimension(), dtype=np.float32)
        labels_ok.append(l)
        centroids.append(v.astype(np.float32))
    centroids = np.vstack(centroids).astype(np.float32)
    print("[SBERT] centroid matrix:", centroids.shape)

    # ---- FAISS index (inner product equals cosine on normalised vectors) ----
    index = faiss.IndexFlatIP(centroids.shape[1])
    index.add(centroids)

    # embed the weak test rows only
    need_text = text_for_embed(test.loc[need_ml_idx]).tolist()
    weak_texts = [need_text[i] for i in weak_idx]
    Z = emb.encode(weak_texts, convert_to_numpy=True, normalize_embeddings=True, batch_size=128, show_progress_bar=False)

    # nearest centroids per weak row (at most 50)
    K_F = min(50, len(labels_ok))
    D, I = index.search(Z, K_F)  # (Nweak x K_F)

    # label -> centroid row
    lab2row = {l:i for i,l in enumerate(labels_ok)}

    for pos, j in enumerate(weak_idx):
        # Cosine per retrieved label. Built from (row, score) pairs so that a
        # skipped -1 row can never shift scores onto the wrong label.
        cos_by_label = {labels_ok[r]: float(sc) for r, sc in zip(I[pos], D[pos]) if r != -1}

        cand_labs = ft_topk[j]
        cand_probs = ft_topk_prob[j]

        # no fastText candidates: keep the current prediction
        if not cand_labs:
            continue

        # Score every fastText candidate; a label outside the FAISS top-K gets
        # cosine 0. Candidates are visited in fastText rank order so that ties
        # resolve deterministically (a set's iteration order is not stable).
        best_lbl, best_score = final_ml[j], -1e9
        seen = set()
        for lbl, p_ft in zip(cand_labs, cand_probs):
            if lbl in seen:
                continue
            seen.add(lbl)
            cos = cos_by_label.get(lbl, 0.0)
            if label_to_cnt.get(lbl, 0) < MIN_LABEL_COUNT:
                cos *= 0.3  # penalise centroids built from very few rows
            score = ALPHA * float(cos) + (1-ALPHA) * float(p_ft)
            if score > best_score:
                best_lbl, best_score = lbl, score

        # Switch only when the blended score beats the fastText top-1 scored
        # with zero cosine credit.
        if best_lbl != final_ml[j] and best_score > (ALPHA*0 + (1-ALPHA)*ft_p1[j] + 1e-6):
            final_ml[j] = best_lbl
            changed += 1

print(f"[SBERT] değişen zayıf örnek: {changed}/{len(weak_idx)}")


In [None]:
# =========================
# SUBMISSION BUILDER (fastText/FAISS/SBERT → submission.csv)
# =========================
import pandas as pd
import numpy as np
import time, os, re

# Fail early if an earlier cell was skipped: this cell relies on notebook-level state.
assert "test" in globals(), "test DataFrame yok."
# Same id fallback as at load time: derive "id" from the row index when it is missing.
if "id" not in test.columns:
    test = test.reset_index().rename(columns={"index": "id"})
assert "pred_rule" in globals(), "pred_rule yok (kural tabanı oluşturulmalı)."
assert "need_ml_idx" in globals(), "need_ml_idx yok (kural sonrası ML'e gidecek indeksler)."

def ensure_global_top_label():
    """Return the fallback label as a string.

    Order of preference: an existing ``global_top_label`` global, then the most
    frequent value of ``train["label"]`` if ``train`` exists, otherwise "0".
    """
    namespace = globals()
    if "global_top_label" in namespace:
        return str(global_top_label)
    if "train" in namespace and "label" in train.columns:
        label_counts = train["label"].astype(str).value_counts()
        return str(label_counts.idxmax())
    return "0"

global_top_label = ensure_global_top_label()

# --- fastText predictions as a Series aligned to the rows they belong to ---
if "pred_ml_top1" not in globals():
    assert "ft_labels" in globals(), "Ne 'pred_ml_top1' ne de 'ft_labels' var."
    top1 = [(labs[0].replace("__label__","") if len(labs)>0 else global_top_label) for labs in ft_labels]
    # Work out which rows the raw predictions refer to from their count.
    if "need_ml_idx" in globals() and len(top1) == len(need_ml_idx):
        tgt_idx = need_ml_idx
    elif len(top1) == len(test):
        tgt_idx = test.index
    else:
        raise ValueError(f"fastText tahmin sayısı ({len(top1)}) ile hedef indeks uyuşmuyor.")
    ft_series = pd.Series(top1, index=tgt_idx, dtype="object")
else:
    ft_series = pred_ml_top1.astype(str)

# --- final_pred: rule predictions overlaid with ML predictions ---
final_pred = pred_rule.astype("object").copy()

# Prefer the reranked predictions when the SBERT/FAISS cell produced them,
# otherwise use the plain fastText top-1.
if "final_ml" not in globals():
    final_pred.loc[ft_series.index] = ft_series.values
else:
    final_pred.loc[need_ml_idx] = pd.Series(final_ml, index=need_ml_idx).astype(str).values

# Rows still empty fall back to the global most frequent label.
final_pred = final_pred.fillna(str(global_top_label)).astype(str)

# --- submission ---
submission = pd.DataFrame({"id": test["id"], "label": final_pred})
# Cast labels to int only when every one of them is purely digits.
if submission["label"].str.fullmatch(r"\d+").all():
    submission["label"] = submission["label"].astype(int)

submission = submission.sort_values("id").reset_index(drop=True)
out_path = "submission.csv"
submission.to_csv(out_path, index=False)

# Quick sanity figures: number of distinct labels and share of the most common one.
uniq = submission["label"].nunique()
vc = submission["label"].value_counts(normalize=True)
top_lab, top_share = vc.index[0], vc.iloc[0] * 100

print(f"✅ submission kaydedildi → {out_path}")
print(f"satır: {len(submission)} | tekil label: {uniq} | en sık label: {top_lab} (%{top_share:.2f})")
print(submission.head())


Tekrar iyileştirme için ekli kısımlar 