<a href="https://colab.research.google.com/github/1muyassar-kholova/surkhandarya_map/blob/main/Bak_15_COWB_mavzusi_amaliyoti.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import sys
print("Python versiyasi:", sys.version)
print("Colab tayyor! ✅")


Python versiyasi: 3.12.11 (main, Jun  4 2025, 08:56:18) [GCC 11.4.0]
Colab tayyor! ✅


In [None]:
# Kichik o'zbekcha korpus (oddiy gaplar)
raw = [
    "Men ertalab maktabga boraman",
    "U kechqurun uyga qaytdi",
    "Biz kutubxonaga borganmiz",
    "Talabalar ertalab darsga boradi",
    "Maktabga borishdan oldin nonushta qilaman",
    "Kutubxonada kitob o'qiyman",
    "Ular bugun universitetga bordi",
    "Men bugun ertalab tez uyg'ondim",
    "Kecha kechqurun uyga erta qaytdik"
]

def tok(s):
    return s.lower().replace("’","'").split()

sentences = [tok(s) for s in raw]
print("Tokenlangan gaplar:")
for s in sentences:
    print(s)


Tokenlangan gaplar:
['men', 'ertalab', 'maktabga', 'boraman']
['u', 'kechqurun', 'uyga', 'qaytdi']
['biz', 'kutubxonaga', 'borganmiz']
['talabalar', 'ertalab', 'darsga', 'boradi']
['maktabga', 'borishdan', 'oldin', 'nonushta', 'qilaman']
['kutubxonada', 'kitob', "o'qiyman"]
['ular', 'bugun', 'universitetga', 'bordi']
['men', 'bugun', 'ertalab', 'tez', "uyg'ondim"]
['kecha', 'kechqurun', 'uyga', 'erta', 'qaytdik']


In [None]:
# 4-qadam: Lug'at va co-occurrence jadvali

# Lug'at (so'z -> raqam)
vocab = sorted({w for sent in sentences for w in sent})
idx = {w:i for i,w in enumerate(vocab)}
itos = {i:w for w,i in idx.items()}

print("Lug'at:", vocab)
print("So'zlar soni:", len(vocab))

# Co-occurrence jadvali (oddiy)
window = 2  # chap-o'ngdan 2 ta so'zni kontekst sifatida olamiz
V = len(vocab)
C = [[0]*V for _ in range(V)]  # boshlang'ich 0 matritsa

for sent in sentences:
    ids = [idx[w] for w in sent]
    for i, center in enumerate(ids):
        ctx = ids[max(0,i-window):i] + ids[i+1:i+1+window]
        for u in ctx:
            C[center][u] += 1
            C[u][center] += 1  # simmetrik (oddiylik uchun)

print("\n'ertalab' so'zi indeks:", idx["ertalab"])
print("Uning co-occurrence qatori:", C[idx["ertalab"]])


Lug'at: ['biz', 'boradi', 'boraman', 'bordi', 'borganmiz', 'borishdan', 'bugun', 'darsga', 'erta', 'ertalab', 'kecha', 'kechqurun', 'kitob', 'kutubxonada', 'kutubxonaga', 'maktabga', 'men', 'nonushta', "o'qiyman", 'oldin', 'qaytdi', 'qaytdik', 'qilaman', 'talabalar', 'tez', 'u', 'ular', 'universitetga', "uyg'ondim", 'uyga']
So'zlar soni: 30

'ertalab' so'zi indeks: 9
Uning co-occurrence qatori: [0, 2, 2, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 2, 0]


In [3]:
# 5-qadam: xavfsiz CBOWcha bashorat (bitta katak)
from math import sqrt

# --- tekshiruvlar ---
assert 'sentences' in globals(), "sentences aniqlanmagan (3-qadamni ishlating)."
assert 'vocab' in globals() and 'idx' in globals() and 'C' in globals() and 'V' in globals(), \
       "Lug'at yoki C/V aniqlanmagan (4-qadamni ishlating)."

def dot(a,b): return sum(x*y for x,y in zip(a,b))
def norm(a):
    s = sum(x*x for x in a)
    return sqrt(s) if s > 0 else 1.0  # nol bo'lsa 1.0 qaytaramiz (bo'linish xatosini oldini oladi)

def cosine(a,b):
    na, nb = norm(a), norm(b)
    if na == 0 or nb == 0:
        return 0.0
    return dot(a,b)/(na*nb)

def avg_vectors(words):
    # lug'atda bo'lmagan so'zlarni e'tiborsiz qoldiramiz
    kept = [w for w in words if w in idx]
    vec = [0.0]*V
    if not kept:
        return vec
    for w in kept:
        row = C[idx[w]]
        # uzunlik mosligini kafolatlaymiz
        if len(row) != V:
            raise ValueError("C matritsasi o'lchami mos emas")
        vec = [v + row[j] for j,v in enumerate(vec)]
    return [v/len(kept) for v in vec]

def predict_center(context_words, forbid=set()):
    ctx_vec = avg_vectors(context_words)
    best, best_sim = None, -1.0
    for w in vocab:
        if w in context_words or w in forbid:
            continue
        sim = cosine(ctx_vec, C[idx[w]])
        if sim > best_sim:
            best, best_sim = w, sim
    return best, best_sim

# --- SINOV ---
context = ["men", "maktabga", "boraman"]   # "Men ___ maktabga boraman"
pred, score = predict_center(context)

print("Kontekst:", context)
print("Bashorat qilingan markaziy so'z:", pred, f"(cos={score:.3f})")

# diagnostika uchun: eng yuqori 5 ta nomzod
cvec = avg_vectors(context)
cands = []
for w in vocab:
    if w in context:
        continue
    cands.append((w, cosine(cvec, C[idx[w]])))
cands.sort(key=lambda x: x[1], reverse=True)
print("\nTop-5 nomzod:")
for w,sc in cands[:5]:
    print(f"{w:12s}  cos={sc:.3f}")


AssertionError: sentences aniqlanmagan (3-qadamni ishlating).

In [4]:
# 3-qadam: Kichik korpus va tokenizatsiya
raw = [
    "Men ertalab maktabga boraman",
    "U kechqurun uyga qaytdi",
    "Biz kutubxonaga borganmiz",
    "Talabalar ertalab darsga boradi",
    "Maktabga borishdan oldin nonushta qilaman",
    "Kutubxonada kitob o'qiyman",
    "Ular bugun universitetga bordi",
    "Men bugun ertalab tez uyg'ondim",
    "Kecha kechqurun uyga erta qaytdik"
]

def tok(s):
    return s.lower().replace("’","'").split()

sentences = [tok(s) for s in raw]
print("Tokenlangan gaplar:")
for s in sentences:
    print(s)


Tokenlangan gaplar:
['men', 'ertalab', 'maktabga', 'boraman']
['u', 'kechqurun', 'uyga', 'qaytdi']
['biz', 'kutubxonaga', 'borganmiz']
['talabalar', 'ertalab', 'darsga', 'boradi']
['maktabga', 'borishdan', 'oldin', 'nonushta', 'qilaman']
['kutubxonada', 'kitob', "o'qiyman"]
['ular', 'bugun', 'universitetga', 'bordi']
['men', 'bugun', 'ertalab', 'tez', "uyg'ondim"]
['kecha', 'kechqurun', 'uyga', 'erta', 'qaytdik']


In [5]:
# 4-qadam: Lug'at va co-occurrence jadvali

# Lug'at (so'z -> raqam)
vocab = sorted({w for sent in sentences for w in sent})
idx = {w:i for i,w in enumerate(vocab)}
itos = {i:w for w,i in idx.items()}

print("Lug'at:", vocab)
print("So'zlar soni:", len(vocab))

# Co-occurrence jadvali (oddiy)
window = 2  # chap-o'ngdan 2 ta so'zni kontekst sifatida olamiz
V = len(vocab)
C = [[0]*V for _ in range(V)]  # boshlang'ich 0 matritsa

for sent in sentences:
    ids = [idx[w] for w in sent]
    for i, center in enumerate(ids):
        ctx = ids[max(0,i-window):i] + ids[i+1:i+1+window]
        for u in ctx:
            C[center][u] += 1
            C[u][center] += 1  # simmetrik (oddiylik uchun)

print("\n'ertalab' so'zi indeks:", idx["ertalab"])
print("Uning co-occurrence qatori:", C[idx["ertalab"]])


Lug'at: ['biz', 'boradi', 'boraman', 'bordi', 'borganmiz', 'borishdan', 'bugun', 'darsga', 'erta', 'ertalab', 'kecha', 'kechqurun', 'kitob', 'kutubxonada', 'kutubxonaga', 'maktabga', 'men', 'nonushta', "o'qiyman", 'oldin', 'qaytdi', 'qaytdik', 'qilaman', 'talabalar', 'tez', 'u', 'ular', 'universitetga', "uyg'ondim", 'uyga']
So'zlar soni: 30

'ertalab' so'zi indeks: 9
Uning co-occurrence qatori: [0, 2, 2, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 2, 0]


In [6]:
# 5-qadam: xavfsiz CBOWcha bashorat (bitta katak)
from math import sqrt

# --- tekshiruvlar ---
assert 'sentences' in globals(), "sentences aniqlanmagan (3-qadamni ishlating)."
assert 'vocab' in globals() and 'idx' in globals() and 'C' in globals() and 'V' in globals(), \
       "Lug'at yoki C/V aniqlanmagan (4-qadamni ishlating)."

def dot(a,b): return sum(x*y for x,y in zip(a,b))
def norm(a):
    s = sum(x*x for x in a)
    return sqrt(s) if s > 0 else 1.0  # nol bo'lsa 1.0 qaytaramiz (bo'linish xatosini oldini oladi)

def cosine(a,b):
    na, nb = norm(a), norm(b)
    if na == 0 or nb == 0:
        return 0.0
    return dot(a,b)/(na*nb)

def avg_vectors(words):
    # lug'atda bo'lmagan so'zlarni e'tiborsiz qoldiramiz
    kept = [w for w in words if w in idx]
    vec = [0.0]*V
    if not kept:
        return vec
    for w in kept:
        row = C[idx[w]]
        # uzunlik mosligini kafolatlaymiz
        if len(row) != V:
            raise ValueError("C matritsasi o'lchami mos emas")
        vec = [v + row[j] for j,v in enumerate(vec)]
    return [v/len(kept) for v in vec]

def predict_center(context_words, forbid=set()):
    ctx_vec = avg_vectors(context_words)
    best, best_sim = None, -1.0
    for w in vocab:
        if w in context_words or w in forbid:
            continue
        sim = cosine(ctx_vec, C[idx[w]])
        if sim > best_sim:
            best, best_sim = w, sim
    return best, best_sim

# --- SINOV ---
context = ["men", "maktabga", "boraman"]   # "Men ___ maktabga boraman"
pred, score = predict_center(context)

print("Kontekst:", context)
print("Bashorat qilingan markaziy so'z:", pred, f"(cos={score:.3f})")

# diagnostika uchun: eng yuqori 5 ta nomzod
cvec = avg_vectors(context)
cands = []
for w in vocab:
    if w in context:
        continue
    cands.append((w, cosine(cvec, C[idx[w]])))
cands.sort(key=lambda x: x[1], reverse=True)
print("\nTop-5 nomzod:")
for w,sc in cands[:5]:
    print(f"{w:12s}  cos={sc:.3f}")

Kontekst: ['men', 'maktabga', 'boraman']
Bashorat qilingan markaziy so'z: tez (cos=0.577)

Top-5 nomzod:
tez           cos=0.577
boradi        cos=0.566
talabalar     cos=0.566
uyg'ondim     cos=0.566
darsga        cos=0.462


In [7]:
# 6-qadam: Eng yaqin so'zlar (neighbors) funksiyasi va tez sinovlar

from math import sqrt

# --- xavfsizlik ---
assert 'vocab' in globals() and 'idx' in globals() and 'C' in globals() and 'V' in globals(), \
       "Lug'at yoki C/V aniqlanmagan (4-qadamni ishlating)."

def dot(a,b): return sum(x*y for x,y in zip(a,b))
def norm(a):
    s = sum(x*x for x in a)
    return sqrt(s) if s > 0 else 1.0

def cosine(a,b):
    na, nb = norm(a), norm(b)
    if na == 0 or nb == 0:
        return 0.0
    return dot(a,b)/(na*nb)

def most_similar(word, topn=5):
    """Berilgan so'zga co-occurrence vektori bo'yicha eng yaqin qo'shnilarni qaytaradi."""
    if word not in idx:
        raise KeyError(f"‘{word}’ lug'atda yo'q. Mavjudlardan birini tanlang: {vocab[:10]} ...")
    base = C[idx[word]]
    sims = []
    for w in vocab:
        if w == word:
            continue
        sims.append((w, cosine(base, C[idx[w]])))
    sims.sort(key=lambda x: x[1], reverse=True)
    return sims[:topn]

# --- Sinovlar ---
probe_words = ["ertalab", "maktabga", "uyga", "men"]
for pw in probe_words:
    if pw in idx:
        print(f"\n‘{pw}’ ga eng yaqin so'zlar:")
        for w,sc in most_similar(pw, topn=5):
            print(f"{w:12s}  cos={sc:.3f}")



‘ertalab’ ga eng yaqin so'zlar:
maktabga      cos=0.387
bugun         cos=0.354
darsga        cos=0.333
tez           cos=0.333
men           cos=0.236

‘maktabga’ ga eng yaqin so'zlar:
nonushta      cos=0.516
ertalab       cos=0.387
bugun         cos=0.365
men           cos=0.365
boradi        cos=0.316

‘uyga’ ga eng yaqin so'zlar:
erta          cos=0.577
kecha         cos=0.471
kechqurun     cos=0.471
qaytdi        cos=0.471
u             cos=0.471

‘men’ ga eng yaqin so'zlar:
boraman       cos=0.866
tez           cos=0.707
boradi        cos=0.577
talabalar     cos=0.577
uyg'ondim     cos=0.577
