In [8]:
import numpy as np
import pandas as pd
from collections import Counter
import re
import string
import operator


from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk import edit_distance


from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

lemmatizer = WordNetLemmatizer()



# Util

In [5]:
# Load the pre-trained GloVe embeddings into a {token: vector} dictionary.
# Each line of the file is read as a token followed by its whitespace-separated
# vector components.
word2vec = {}
with open('glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        try:
            word = values[0]
            vec = np.asarray(values[1:], dtype='float32')
        except (IndexError, ValueError):
            # Blank line (no token) or a non-numeric component: skip this line
            # only. Any other error (I/O failure, interrupt, ...) must surface
            # instead of being silently swallowed.
            continue
        word2vec[word] = vec

In [13]:
# Vectorize strings
def vectorize(s):
    """Embed a string as the mean of the word vectors of its known tokens.

    The string is split with `word_tokenize`; every token present in the
    module-level `word2vec` mapping contributes its vector to the average.
    Tokens missing from `word2vec` are printed (so they can be inspected)
    and otherwise ignored.

    Parameters
    ----------
    s : str
        Text to embed.

    Returns
    -------
    numpy.ndarray or None
        float32 vector of length 300 (the size is hard-coded here), or None
        when none of the tokens is found in `word2vec`.
    """
    total = np.zeros(300, dtype='float32')
    n_found = 0
    for w in word_tokenize(s):
        try:
            total += word2vec[w]
        except KeyError:
            # Only a missing word is expected here; other errors must not be
            # mistaken for "word not found". Print words not found in glove 6B.
            print(w)
        else:
            n_found += 1
    if n_found == 0:
        return None
    return total / n_found

In [11]:
# Distance between a word `cat` and an aspect (the word is named `cat` because
#    this function is mostly used to find aspects that contain the category name)
def dist(cat, aspect):
    """Return the smallest edit distance between `cat` and any token of `aspect`.

    Both `cat` and every token of `aspect` are lemmatized before being
    compared, so a result of 0 means that one token of the aspect has the
    same lemma as `cat`.

    Raises ValueError (from `min`) when `aspect` yields no token.
    """
    target = lemmatizer.lemmatize(cat)
    return min(
        edit_distance(lemmatizer.lemmatize(tok), target)
        for tok in word_tokenize(aspect)
    )

In [4]:
# Function to clean strings (remove stop words and keep only letters).
# Tokens to discard: English stop words, punctuation marks, the quote and
# bracket tokens, plus the extra forms 'doe', 'ha' and 'wa'.
stop_words = set(
    stopwords.words("english")
    + list(string.punctuation)
    + ['``', "''"]
    + ["]", "["]
    + ['doe', 'ha', 'wa']
)


def clean(s):
    """Return `s` lower-cased, without stop words and without non-letter characters.

    The string is tokenized with `word_tokenize`. A token is dropped when its
    lower-cased form is in `stop_words` (so capitalised stop words such as
    "The" are removed too), or when nothing is left of it once every character
    outside a-z/A-Z has been stripped. The remaining tokens are joined with
    single spaces, so the result never contains leading, trailing or doubled
    spaces.

    Parameters
    ----------
    s : str
        Raw text.

    Returns
    -------
    str
        The cleaned text; "" when no token survives.
    """
    kept = []
    for tok in word_tokenize(s):
        # Compare case-insensitively: the stop-word list is lower-case.
        if tok.lower() in stop_words:
            continue
        letters = re.sub('[^a-zA-Z]+', '', tok).lower()
        # Tokens made only of digits/symbols collapse to "" and are skipped.
        if letters:
            kept.append(letters)
    return " ".join(kept)

# Import data

In [3]:
# Load the product-aspects dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='product_aspects_electronics_and_games.csv')

In [6]:
# Normalize the category labels: keep only letters, '&' and spaces.
# Series.map is used instead of a row-wise DataFrame.apply because each
# transformation only reads a single column.
df['cats'] = df['cats'].map(lambda cats: re.sub('[^a-zA-Z& ]+', '', cats))

# Parse 'indiv_aspects' into a Python list: drop the first and last character
# of the string, split on ", ", then drop the first and last character of each
# item (presumably the brackets of a stringified list and the quotes around
# each aspect -- verify against the CSV).
df['indiv_aspects_l'] = df['indiv_aspects'].map(
    lambda raw: [item[1:-1] for item in raw[1:-1].split(", ")]
)

# I focus here on the Toys & Games category
df = df[df['cats'] == 'Toys & Games'].copy()

In [9]:
# Create a list of all aspects in the dataset.
# Flatten the per-row aspect lists into a single array of raw aspect strings.
l_a = np.concatenate(df['indiv_aspects_l'].values)

# Clean every aspect, then clean the non-empty results a second time and
# discard whatever ends up empty.
l_a_ = list(map(clean, l_a))
l_a__ = [clean(x) for x in l_a_ if x != '']
aspects = [x for x in l_a__ if x != '']

# Distinct aspects, and their frequencies sorted from most to least common.
set_aspects = set(aspects)
count_a = Counter(aspects)
sorted_a = count_a.most_common()

In [10]:
# aspects with frequencies
sorted_a

[('game', 1129),
 ('fun', 737),
 ('great', 553),
 ('easy', 359),
 ('worth', 319),
 ('good', 295),
 ('simple', 288),
 ('rules', 269),
 ('cards', 248),
 ('cute', 210),
 ('book', 183),
 ('nice', 178),
 ('hard', 149),
 ('different', 146),
 ('small', 140),
 ('easy learn', 135),
 ('better', 133),
 ('awesome', 127),
 ('difficult', 111),
 ('complicated', 106),
 ('perfect', 103),
 ('complex', 102),
 ('lot', 100),
 ('interesting', 97),
 ('durable', 92),
 ('board', 88),
 ('elf', 83),
 ('everyone', 81),
 ('one', 81),
 ('clear', 79),
 ('beautiful', 77),
 ('amazing', 76),
 ('artwork', 70),
 ('funny', 70),
 ('quality', 70),
 ('wonderful', 69),
 ('cool', 68),
 ('sturdy', 66),
 ('adorable', 66),
 ('players', 66),
 ('great time', 64),
 ('fantastic', 64),
 ('game play', 63),
 ('similar', 63),
 ('excellent', 63),
 ('instructions', 62),
 ('enjoyable', 61),
 ('pieces', 60),
 ('challenging', 60),
 ('bad', 60),
 ('art', 59),
 ('smaller', 58),
 ('price', 57),
 ('kids', 57),
 ('possible', 56),
 ('high', 55),
 (

# vectorize aspects

In [14]:
# Embed every distinct aspect and collect:
#   L   -> ('games', (aspect, frequency, vector)) tuples
#   X   -> the vectors, in the same order
#   idx -> the aspect strings, in the same order
L = []
X = []
idx = []
for a in set_aspects:
    v_ = vectorize(a)
    # Skip aspects without any known token (vectorize returned None) and
    # aspects that contain the word "game" (one of their tokens has the same
    # lemma as "game", i.e. an edit distance of 0).
    if v_ is not None and dist('game', a) != 0:
        L.append(('games', (a, count_a[a], v_)))
        X.append(v_)
        idx.append(a)
# The words printed by this cell are the ones not found in GloVe 6B
# -> could be improved by adding a spaCy spelling corrector

humungous
granparents
genestealers
spendy
vaue
artworkcons
toonish
boardgamers
manipulatable
strengthcraft
awsome
perfumyplasticy
wowfactor
discriptions
oldschool
allno
mendable
fragileish
slaphappy
magnifacant
gamebox
metallicpearled
immediatesatisfaction
levals
beatiful
georgeous
faceup
easilymastered
positionable
lesserskilled
peices
deckbuilding
gofish
interestingfun
titlevillain
injokes
moviegame
minitures
appearent
twowagon
playabilty
ironbreaker
unstreamlined
ffgs
flawlessness
catanesque
eatch
varities
awardwinning
unpredicatable
comperable
infantries
motiv
longthe
tieins
replayablility
veryartistic
pikids
abovepar
cardbasedfamily
hisher
headfacehair
minichallenges
wikkistix
twopart
gratzirachel
madeand
twoplayer
paida
lockin
simplethe
playerplayer
niceweight
expac
reallife
instrucion
bookthe
interestingwe
erratas
hitpoints
liveshealth
shinynew
booktoy
explanantions
illistration
supermag
hillarious
helpfull
packable
chanceluck
ordable
terraformers
instructionsrules
stratigy
deck

In [18]:
# Convert the collected labels and vectors to arrays, then build a frame with
# one row per aspect (indexed by the aspect string) and one column per
# embedding dimension.
idx = np.array(idx)
X = np.array(X)
df = pd.DataFrame(X, index=idx)

# Choosing the number of clusters (silhouette score)

In [15]:
# Takes time to run !!!

# scores = []
# ks = []
# stds = []
# for k in range (10, 30):
#     print(k)
#     ks.append(k)
#     scores_i = []
#     for i in range (20):
#         kmeans = KMeans(n_clusters = k)
#         y_pred = kmeans.fit_predict(X)
#         score = silhouette_score(X, y_pred)
#         scores_i.append(score)
#     mean_ = np.mean(scores_i)
#     std_ = np.std(scores_i)
#     scores.append(mean_)
#     stds.append(std_)
#     print (mean_, std_)

In [None]:
# scores_up = np.array(scores)+3*np.array(stds)
# scores_down = np.array(scores)-3*np.array(stds)

In [None]:
# plt.plot(ks, scores)
# plt.plot(ks, scores_up)
# plt.plot(ks, scores_down)

# the plot is not very nice...

In [16]:
# Number of clusters used for the final KMeans fit; it is hard-coded to 14
# because the automatic selection below is disabled.
# NOTE(review): the disabled line picks the k with the *lowest* mean silhouette
# score; a higher silhouette is normally better, so np.argmax may have been
# intended -- confirm before re-enabling.
# nb_cluster_opti = ks[np.argmin(scores)]
nb_cluster_opti = 14
nb_cluster_opti

14

# Fit KMeans with the optimal k

In [19]:
# Fit KMeans with the chosen number of clusters (fixed seed for repeatable
# results), then keep the centroids and tag every aspect with its cluster label.
kmeans = KMeans(n_clusters=nb_cluster_opti, random_state=0)
kmeans.fit(X)
centers = kmeans.cluster_centers_
df['cluster'] = kmeans.labels_

In [23]:
# Take the aspects closest to each centroid to represent the cluster:
# clusters[i] is the list of (at most) 10 aspects of cluster i with the
# smallest Euclidean distance to centers[i], closest first.
clusters = {}
for i in range(nb_cluster_opti):
    df_i = df[df['cluster'] == i].copy()
    # Select the embedding columns by name (everything except 'cluster')
    # instead of relying on 'cluster' being the last column. float64 matches
    # the dtype produced by `.values` on the mixed float/int frame.
    arr_i = df_i.drop(columns='cluster').to_numpy(dtype='float64')
    c = centers[i]
    # Vectorized row-wise Euclidean distance to the centroid; unlike a
    # per-row np.apply_along_axis call it also copes with zero rows.
    df_i['dist'] = np.sqrt(((arr_i - c) ** 2).sum(axis=1))
    # .iloc makes the "first 10 rows" slice explicitly positional.
    idx_close = list(df_i['dist'].sort_values().iloc[:10].index)
    clusters[i] = idx_close

In [24]:
clusters

{0: ['well many way',
  'way many rules',
  'particular set',
  'much various things',
  'instance',
  'one particular suspect',
  'way many advantages',
  'way many pieces',
  'several small expansion packs',
  'many moves'],
 1: ['fate point economy epiphany',
  'likely experience',
  'subtle long term effects',
  'pretty much effects',
  'three builtin difficulty levels',
  'major annoying differences',
  'much leadership synergy',
  'luck risk element',
  'wound success markers',
  'fact'],
 2: ['unique ruinous power cultist figures staves',
  'lastly heroes',
  'dark tales expansion pack',
  'crafty original dice mechanic',
  'even weakest mages',
  'warlock quest cards',
  'zombie survivor pieces',
  'childrens first instinct',
  'firefly universe money',
  'rafter several unsuccessful guesses'],
 3: ['one practice round',
  'least one new person',
  'first thing',
  'next day perfect condition',
  'last one',
  'one set',
  'first one',
  'first time',
  'every expansion set',
 

# Lab

In [22]:
# Print the aspects that contain the word "game": we could keep these aspects
# and just remove the word "game" from them!
dict_dist = {a: dist('game', a) for a in set_aspects}
for x, d in dict_dist.items():
    # A distance of 0 means one token of the aspect has the same lemma as "game".
    if d == 0:
        print(x)

unique game play
board game variation
mind games
word game
times game
x matrix game
excellent game
wanted game
strategy card game
ps video games
overall game
clever homogenous game design
old scrabble game
new game system
game mechanics
close game
another fantasy flight game
weekly game night
card games
original game
every game
game experience
game nights
complex game
person games
many cool board games
well thought game system
basic game functions
several wrestling games
one practice game
game learning curve
complexity game length
riotous game
twoplayer game
good game play
classic game
game board art
game
game adaptations
competitive game
game design
team building games
geography game
board game version
first games
quicklythe game pieces
many trading card games
party game
game game
regular game night
even quickstart game
game system
game maker
multiplayer games
game manual
game cards
unfinished game
interesting game
new game
great family game night
many games
four card games cases
many