In [38]:
import numpy as np
import pandas as pd
from k_means_constrained import KMeansConstrained

In [2]:
GLOVE_FILE = "glove.6B.100d.txt"

# Build a word -> vector lookup from the GloVe text file. Each line is read as
# "<token> <number> <number> ...": the first whitespace-separated field is the
# key and the remainder is parsed into a float32 array.
embeddings_index = {}
# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's locale encoding, which can raise or mis-decode on machines where
# that default is not UTF-8.
with open(GLOVE_FILE, encoding="utf-8") as f:
    for line in f:
        # maxsplit=1 keeps all coefficients together in one string for parsing.
        word, coefs = line.split(maxsplit=1)
        embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")

In [3]:
# Path to the puzzle data; later cells treat the first four columns of each
# row as the words of one group (the displayed frame also has a `clue` column).
CONNECTIONS_DATA = "connections.csv"
conn = pd.read_csv(CONNECTIONS_DATA)
# Bare last expression so the notebook renders the frame.
conn

Unnamed: 0,0,1,2,3,clue
0,BIG,HOT,IN,POPULAR,OF-THE-MOMENT
1,CHARACTER,GLYPH,ICON,SYMBOL,PICTOGRAPH
2,ASSESS,CHARGE,FINE,LEVY,"IMPOSE, AS A PENALTY"
3,HANDSOME,HIPPO,LEGEND,LIPID,WORDS BEGINNING WITH BODY PARTS
4,CALCULATOR,CALENDAR,CAMERA,CLOCK,SMARTPHONE FEATURES BEGINNING WITH “C”
...,...,...,...,...,...
907,ARE,QUEUE,SEA,WHY,LETTER HOMOPHONES
908,HAIL,RAIN,SLEET,SNOW,WET WEATHER
909,BUCKS,HEAT,JAZZ,NETS,NBA TEAMS
910,OPTION,RETURN,SHIFT,TAB,KEYBOARD KEYS


In [4]:
# Gather every entry from the first four columns whose lower-cased string form
# has no key in the GloVe lookup; these cannot be embedded later on.
all_words = conn.iloc[:, 0:4].to_numpy().flatten()
unk_words = {
    entry
    for entry in all_words
    if str(entry).lower() not in embeddings_index
}
unk_words

{'BEACH BOY',
 'BLACK WIDOW',
 'CANDY CANE',
 'CAT EYE',
 'CLOUD NINE',
 'DOJA',
 'FAST ONE',
 'FIDDLESTICKS',
 'FIRE TRUCK',
 'FRESH AIR',
 'GET OUT',
 'GOLF COURSE',
 'HIGH FIVE',
 'I RAN',
 'ICE CUBE',
 'LOST BOYS',
 'MARY JANE',
 'MASHED POTATO',
 'MILKY WAY',
 'MOLE RAT',
 'NEAR DARK',
 'NEW YORK',
 'OLD SPICE',
 'ON BOARD',
 'PEPPERMINT PATTY',
 'RACE CAR',
 'REPLY ALL',
 "ROCK 'N ROLL",
 'ROFL',
 'ROLLING STONE',
 'SCHOOL BUS',
 'SELFIE',
 'SLEEPING BAG',
 'STOP SIGN',
 'TOP TEN',
 'TWEEZE',
 'UP FIRST'}

In [5]:
# Keep only the rows in which none of the first four columns holds a word
# that is missing from the embedding lookup.
has_unknown_word = conn.iloc[:, 0:4].isin(unk_words).any(axis=1)
valid = conn[~has_unknown_word]
valid

Unnamed: 0,0,1,2,3,clue
0,BIG,HOT,IN,POPULAR,OF-THE-MOMENT
1,CHARACTER,GLYPH,ICON,SYMBOL,PICTOGRAPH
2,ASSESS,CHARGE,FINE,LEVY,"IMPOSE, AS A PENALTY"
3,HANDSOME,HIPPO,LEGEND,LIPID,WORDS BEGINNING WITH BODY PARTS
4,CALCULATOR,CALENDAR,CAMERA,CLOCK,SMARTPHONE FEATURES BEGINNING WITH “C”
...,...,...,...,...,...
906,ESSENCE,PEOPLE,TIME,US,MAGAZINES
907,ARE,QUEUE,SEA,WHY,LETTER HOMOPHONES
908,HAIL,RAIN,SLEET,SNOW,WET WEATHER
909,BUCKS,HEAT,JAZZ,NETS,NBA TEAMS


In [54]:
# NOTE(review): `sets` is first assigned in a later cell, so on a fresh kernel
# (Restart & Run All) this line raises NameError. The recorded 4-D shape also
# does not match the 3-D word array that the later cell builds, which suggests
# it comes from an earlier definition of `sets` -- TODO confirm, then move this
# check below the cell that defines `sets` or remove it.
sets.shape

(219, 4, 4, 100)

In [95]:
# Shuffle the valid groups and pack every four of them into one 16-word board.
# The seed is fixed so that a fresh run draws the same boards each time.
SAMPLE_SEED = 42
# Round the row count down to a multiple of 4 so the groups split evenly.
n_groups = valid.shape[0] // 4 * 4
sample = valid.sample(n_groups, random_state=SAMPLE_SEED).iloc[:, :4].to_numpy()
# sets[b] holds the four original groups (rows of four words) of board b.
sets = np.array(np.array_split(sample, len(sample) // 4))
# Flatten each board to its 16 words, then look up one vector per word.
# The loop variable is `board` rather than `set` to avoid shadowing the builtin.
flat_boards = sets.reshape((-1, 16))
enc = np.array(
    [[embeddings_index[str(word).lower()] for word in board] for board in flat_boards]
)
(sets[0], enc[0])

(array([['DREAMCAST', 'GENESIS', 'SWITCH', 'WII'],
        ['LINE', 'POINT', 'RAY', 'SEGMENT'],
        ['FORD', 'GRANT', 'LINCOLN', 'WILSON'],
        ['GANDER', 'GLANCE', 'GLIMPSE', 'LOOK']], dtype=object),
 array([[ 0.42829  , -0.040562 ,  1.4194   , ...,  0.44882  ,  1.2629   ,
         -0.42215  ],
        [ 0.66575  ,  0.94694  ,  0.38909  , ...,  0.17085  ,  0.73485  ,
         -0.058583 ],
        [-0.0099418, -0.030939 ,  0.17056  , ...,  0.35289  ,  0.68786  ,
         -0.37449  ],
        ...,
        [-0.55429  ,  0.45861  ,  1.2993   , ..., -0.17074  , -0.18143  ,
          0.68832  ],
        [ 0.1986   ,  0.26173  ,  1.0303   , ..., -0.12858  , -0.13126  ,
          0.8605   ],
        [-0.23191  ,  0.61425  ,  0.72979  , ..., -0.30782  ,  0.24977  ,
          0.48365  ]], dtype=float32))

In [96]:
# Size-constrained k-means: four clusters, each forced to hold exactly four
# points, so the 16 word vectors of a board are split into four groups of four.
clf = KMeansConstrained(
    n_clusters=4,
    size_min=4,
    size_max=4,
    random_state=0,  # fixed seed so cluster assignments are reproducible across runs
)

# One cluster label per word vector of the first board.
example = clf.fit_predict(enc[0])
example

array([3, 3, 1, 3, 1, 1, 2, 0, 2, 1, 2, 2, 3, 0, 0, 0], dtype=int32)

In [97]:
# Turn the predicted labels back into words: flatten the first board to its
# 16 words, then pick out the words assigned to each cluster label 0..3.
first_board = sets[0].reshape(16)
ex_set = [first_board[example == label] for label in range(4)]
ex_set

[array(['SEGMENT', 'GLANCE', 'GLIMPSE', 'LOOK'], dtype=object),
 array(['SWITCH', 'LINE', 'POINT', 'GRANT'], dtype=object),
 array(['RAY', 'FORD', 'LINCOLN', 'WILSON'], dtype=object),
 array(['DREAMCAST', 'GENESIS', 'WII', 'GANDER'], dtype=object)]

In [98]:
# Show the first board's original rows (one data row per line) so they can be
# compared by eye with the clustered grouping in `ex_set`.
sets[0]

array([['DREAMCAST', 'GENESIS', 'SWITCH', 'WII'],
       ['LINE', 'POINT', 'RAY', 'SEGMENT'],
       ['FORD', 'GRANT', 'LINCOLN', 'WILSON'],
       ['GANDER', 'GLANCE', 'GLIMPSE', 'LOOK']], dtype=object)