In [8]:
!pip install -q sentence-transformers

In [18]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from collections import defaultdict
from tqdm import tqdm
# Load the raw Connections data. The path below is a Kaggle input location;
# change it if the CSV lives somewhere else.
df = pd.read_csv("/kaggle/input/datasets/anuragkacholiya/connections-raw-data/Connections_Data.csv")

# Preview the first rows; later cells rely on the "Game ID", "Word",
# "Group Level", "Starting Row" and "Starting Column" columns.
df.head()
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Unnamed: 0,Game ID,Puzzle Date,Word,Group Name,Group Level,Starting Row,Starting Column
0,1,2023-06-12,SNOW,WET WEATHER,0,1,1
1,1,2023-06-12,LEVEL,PALINDROMES,3,1,2
2,1,2023-06-12,SHIFT,KEYBOARD KEYS,2,1,3
3,1,2023-06-12,KAYAK,PALINDROMES,3,1,4
4,1,2023-06-12,HEAT,NBA TEAMS,1,2,1


In [19]:
# Normalise every word: upper-case it, then trim surrounding whitespace.
df["Word"] = df["Word"].str.upper().str.strip()

# One entry per game: the words in board order plus their true group levels.
puzzles = []

for game_id, group in df.groupby("Game ID"):
    # Only games with exactly 16 rows are kept; anything else is skipped.
    if len(group) != 16:
        continue

    # Restore the original board order (row first, then column).
    group = group.sort_values(["Starting Row", "Starting Column"])

    puzzles.append({
        "game_id": game_id,
        "words": group["Word"].tolist(),
        "labels": group["Group Level"].tolist(),
    })

print("Total puzzles:", len(puzzles))

Total puzzles: 915


In [20]:
# Load the "all-MiniLM-L6-v2" sentence-transformers model once; later cells
# call model.encode(words) on each puzzle's word list.
model = SentenceTransformer("all-MiniLM-L6-v2")

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [21]:
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import confusion_matrix

def clustering_accuracy(true_labels, pred_labels):
    """Return the best-match accuracy between a true and a predicted partition.

    Cluster ids are arbitrary, so predicted clusters are first matched
    one-to-one to true groups with ``linear_sum_assignment`` such that the
    number of agreeing items is maximal. The accuracy is that count divided
    by the number of items.

    The contingency table is built from the two label sequences
    independently, so true and predicted labels do not have to share a type,
    a value range, or the same number of distinct values.

    Parameters
    ----------
    true_labels : sequence
        Ground-truth group label of each item.
    pred_labels : sequence
        Predicted cluster label of each item, in the same order.

    Returns
    -------
    float
        Fraction of items that agree under the best one-to-one matching.

    Raises
    ------
    ValueError
        If the inputs are empty or do not have the same length.
    """
    true_arr = np.asarray(true_labels).ravel()
    pred_arr = np.asarray(pred_labels).ravel()

    if true_arr.size != pred_arr.size:
        raise ValueError(
            "true_labels and pred_labels must have the same length "
            f"(got {true_arr.size} and {pred_arr.size})"
        )
    if true_arr.size == 0:
        raise ValueError("clustering accuracy is undefined for empty input")

    # Re-index each label set to 0..k-1 on its own.
    true_values, true_idx = np.unique(true_arr, return_inverse=True)
    pred_values, pred_idx = np.unique(pred_arr, return_inverse=True)

    # contingency[i, j] = number of items in true group i and predicted cluster j.
    contingency = np.zeros((true_values.size, pred_values.size), dtype=np.int64)
    np.add.at(contingency, (true_idx, pred_idx), 1)

    # linear_sum_assignment minimises cost, so negate the counts to maximise overlap.
    row_ind, col_ind = linear_sum_assignment(-contingency)
    matched = contingency[row_ind, col_ind].sum()
    return float(matched / true_arr.size)

In [22]:
# Per-puzzle scores, collected in the same order as `puzzles`.
all_ari, all_nmi, all_acc = [], [], []

for puzzle in tqdm(puzzles):
    words, true_labels = puzzle["words"], puzzle["labels"]

    # Every word is embedded on its own.
    embeddings = model.encode(words)

    # Average-linkage agglomerative clustering on cosine distance, cut at 4 clusters.
    clustering = AgglomerativeClustering(linkage="average", metric="cosine", n_clusters=4)
    pred_labels = clustering.fit_predict(embeddings)

    # Score the predicted partition against the true one.
    ari = adjusted_rand_score(true_labels, pred_labels)
    nmi = normalized_mutual_info_score(true_labels, pred_labels)
    acc = clustering_accuracy(true_labels, pred_labels)

    all_ari.append(ari)
    all_nmi.append(nmi)
    all_acc.append(acc)

# Report the mean of each metric over all puzzles.
print("===== BASELINE 2 RESULTS =====")
print("Mean ARI :", np.mean(all_ari))
print("Mean NMI :", np.mean(all_nmi))
print("Mean Accuracy :", np.mean(all_acc))

100%|██████████| 915/915 [00:32<00:00, 27.76it/s]

===== BASELINE 2 RESULTS =====
Mean ARI : 0.1535319646188354
Mean NMI : 0.41849089779334875
Mean Accuracy : 0.5352459016393443





In [23]:
# Inspect a single case: cluster the first puzzle and show it next to the truth.
p = puzzles[0]
words, true = p["words"], p["labels"]

embeddings = model.encode(words)
clustering = AgglomerativeClustering(
    n_clusters=4,
    metric="cosine",
    linkage="average",
)
pred = clustering.fit_predict(embeddings)

print("Words:", words)
print("True :", true)
print("Pred :", pred)

Words: ['SNOW', 'LEVEL', 'SHIFT', 'KAYAK', 'HEAT', 'TAB', 'BUCKS', 'RETURN', 'JAZZ', 'HAIL', 'OPTION', 'RAIN', 'SLEET', 'RACECAR', 'MOM', 'NETS']
True : [0, 3, 2, 3, 1, 2, 1, 2, 1, 0, 2, 0, 0, 3, 3, 1]
Pred : [0 0 0 2 0 0 1 3 1 0 0 0 0 2 0 1]


In [25]:
# Running count of puzzles for which is_perfect_clustering(...) is true;
# incremented in the loop further down in this cell.
perfect_puzzles = 0

def is_perfect_clustering(true_labels, pred_labels):
    """Return True when the predicted clusters reproduce the true groups exactly.

    Cluster ids are arbitrary, so a clustering is perfect when the true and
    predicted labels correspond one-to-one: no true group is split over
    several predicted clusters and no predicted cluster mixes several true
    groups. This is the same condition as "the best one-to-one matching
    covers every item", but it needs no assignment solver and works for
    labels of any hashable type.

    Parameters
    ----------
    true_labels : sequence
        Ground-truth group label of each item.
    pred_labels : sequence
        Predicted cluster label of each item, in the same order.

    Returns
    -------
    bool
        True if the two labelings describe the same partition of the items.

    Raises
    ------
    ValueError
        If the inputs are empty or do not have the same length.
    """
    true_seq = list(true_labels)
    pred_seq = list(pred_labels)

    if len(true_seq) != len(pred_seq):
        raise ValueError(
            "true_labels and pred_labels must have the same length "
            f"(got {len(true_seq)} and {len(pred_seq)})"
        )
    if not true_seq:
        raise ValueError("perfect-clustering check is undefined for empty input")

    # Distinct (true, predicted) combinations that actually occur.
    pairs = set(zip(true_seq, pred_seq))
    true_seen = {t for t, _ in pairs}
    pred_seen = {c for _, c in pairs}

    # One-to-one mapping <=> every true label and every predicted label
    # appears in exactly one combination.
    return len(pairs) == len(true_seen) == len(pred_seen)

# Re-cluster every puzzle and count those whose predicted clusters match the
# true groups exactly.
for puzzle in puzzles:
    words = puzzle["words"]
    true_labels = puzzle["labels"]

    embeddings = model.encode(words)

    clustering = AgglomerativeClustering(
        n_clusters=4,
        metric="cosine",
        linkage="average"
    )

    pred_labels = clustering.fit_predict(embeddings)

    # is_perfect_clustering gives an exact yes/no answer, so no float
    # tolerance is involved here.
    if is_perfect_clustering(true_labels, pred_labels):
        perfect_puzzles += 1

print("Perfectly solved puzzles:", perfect_puzzles)
print("Out of:", len(puzzles))
print("Solve Rate:", perfect_puzzles / len(puzzles))

Perfectly solved puzzles: 1
Out of: 915
Solve Rate: 0.001092896174863388


In [26]:
# Count the puzzles whose matched accuracy reaches each threshold
# (taken from the per-puzzle accuracies in all_acc).
high_acc_75 = sum(1 for acc in all_acc if acc >= 0.75)
high_acc_875 = sum(1 for acc in all_acc if acc >= 0.875)

print("Puzzles ≥75% correct:", high_acc_75)
print("Puzzles ≥87.5% correct:", high_acc_875)

Puzzles ≥75% correct: 51
Puzzles ≥87.5% correct: 5
