In [1]:
import numpy as np
import matplotlib.pyplot as plt
import nltk, os
from utils.morpho_tagger import MorphoTagger
from gensim.models.fasttext import FastText
from functools import reduce
import argparse
from wordcloud import WordCloud
from time import time
from gensim.similarities import WmdSimilarity
from nltk.cluster import KMeansClusterer
from datetime import datetime
import random
from collections import Counter


class FastTextConfig:
    """Plain value holder for the FastText hyper-parameters used in this notebook.

    The four constructor arguments are stored unchanged as attributes of the
    same name; ``FastTextSimilarityModel.train_similarity`` forwards them to
    ``FastText`` as ``size``, ``window``, ``min_count`` and ``sample``.
    """

    def __init__(self, embedding_size, window_size, min_word, down_sampling):
        self.embedding_size, self.window_size = embedding_size, window_size
        self.min_word, self.down_sampling = min_word, down_sampling
        
class FastTextSimilarityModel:
    """Review sentences plus the FastText models trained on them.

    The constructor reads ``dataset_positive.txt`` and ``dataset_negative.txt``,
    keeps every distinct line once (first occurrence wins) and caps the positive
    set at ``max_pos`` sentences. Lemmatised sentences and trained models are
    attached afterwards through the ``set_*`` methods.
    """

    def __init__(self, file_path, conf, max_pos=500):
        """Load and de-duplicate both datasets.

        Args:
            file_path: Prefix prepended verbatim to the dataset file names
                (e.g. ``"./"``), so a directory must end with a path separator.
            conf: Object exposing ``embedding_size``, ``window_size``,
                ``min_word`` and ``down_sampling`` (see ``train_similarity``).
            max_pos: Maximum number of positive sentences to keep; ``None``
                keeps all of them. Defaults to the previously hard-coded 500.

        Raises:
            OSError: If either dataset file cannot be opened.
        """
        def _load(name, target):
            # A set makes the duplicate check O(1); `target` keeps file order.
            seen = set(target)
            with open(file_path + name, "r", encoding='utf-8') as handle:
                for raw in handle:
                    # Strip only the newline: slicing off the last character
                    # would truncate a final line that has no trailing newline.
                    line = raw.rstrip('\n')
                    # just different sentences
                    if line not in seen:
                        seen.add(line)
                        target.append(line)

        self.sentences_pos = []
        self.sentences_pos_processed = []
        self.sentences_neg = []
        self.sentences_neg_processed = []
        self.model_conf = conf
        self.model_pos = None
        self.model_neg = None

        _load('dataset_positive.txt', self.sentences_pos)
        _load('dataset_negative.txt', self.sentences_neg)

        if max_pos is not None:
            self.sentences_pos = self.sentences_pos[:max_pos]

        # Lengths always reflect what is actually stored, so loops written as
        # range(sentences_pos_len) cannot index past the end of a short dataset.
        self.sentences_pos_len = len(self.sentences_pos)
        self.sentences_neg_len = len(self.sentences_neg)

    def preprocess(self, tagger: "MorphoTagger", sentences):
        """Turn raw sentences into lists of lemmas.

        Args:
            tagger: Object whose ``pos_tagging(sentence, False)`` returns an
                iterable of token groups; every token must expose ``.lemma``.
                (Presumably one group per detected sentence - confirm against
                ``utils.morpho_tagger``.)
            sentences: Iterable of raw sentence strings.

        Returns:
            A list with one list of lemmas per input sentence. A sentence for
            which the tagger yields nothing becomes an empty list instead of
            raising, as ``reduce`` without an initial value used to.
        """
        return [
            [token.lemma
             for group in tagger.pos_tagging(sentence, False)
             for token in group]
            for sentence in sentences
        ]

    def train_similarity(self, sentences_processed):
        """Train a FastText model on tokenised sentences.

        Uses skip-gram (``sg=1``) and 100 training iterations; the remaining
        hyper-parameters come from ``self.model_conf``.

        NOTE(review): ``size=``, ``iter=`` and ``init_sims`` are the gensim 3.x
        spellings; a newer gensim would need different names - confirm the
        pinned version before upgrading.

        Args:
            sentences_processed: List of token lists.

        Returns:
            The trained model, after ``init_sims(replace=True)`` has been called.
        """
        model = FastText(sentences_processed,
                        size=self.model_conf.embedding_size,
                        window=self.model_conf.window_size,
                        min_count=self.model_conf.min_word,
                        sample=self.model_conf.down_sampling,
                        sg=1,
                        iter=100)
        model.init_sims(replace=True)
        return model

    def set_pos(self, sentences):
        """Store the preprocessed positive sentences."""
        self.sentences_pos_processed = sentences

    def set_neg(self, sentences):
        """Store the preprocessed negative sentences."""
        self.sentences_neg_processed = sentences

    def set_pos_model(self, model):
        """Store the model trained on the positive sentences."""
        self.model_pos = model

    def set_neg_model(self, model):
        """Store the model trained on the negative sentences."""
        self.model_neg = model

    def word_cloud(self, sentences, category, className):
        """Draw a word cloud of all tokens and save it as a JPEG.

        The image is shown on the current matplotlib axes and written to
        ``./tmp/<category>-<className>.jpg``.

        Args:
            sentences: List of token lists.
            category: First part of the output file name.
            className: Second part of the output file name.
        """
        tokens = [token for sentence in sentences for token in sentence]
        text = ' '.join(tokens)
        wordcloud = WordCloud(max_font_size=40, width=600,
                              height=400, background_color='white',
                              max_words=200, relative_scaling=1.0).generate_from_text(text)

        plt.imshow(wordcloud, interpolation="bilinear")
        plt.axis("off")
        # Saving used to fail on a fresh checkout because ./tmp did not exist.
        os.makedirs('./tmp', exist_ok=True)
        wordcloud.to_file('./tmp/' + category + '-' +className + '.jpg')

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
%%time
tagger = MorphoTagger()
tagger.load_tagger("external/morphodita/czech-morfflex-pdt-161115-no_dia-pos_only.tagger")

conf = FastTextConfig(100, 10, 5, 1e-2)

fastTextModel = FastTextSimilarityModel("./", conf)
fastTextModel.set_pos(fastTextModel.preprocess(tagger, fastTextModel.sentences_pos))
#fastTextModel.set_neg(fastTextModel.preprocess(tagger, fastTextModel.sentences_neg))

fastTextModel.set_pos_model(fastTextModel.train_similarity(fastTextModel.sentences_pos_processed))
#fastTextModel.set_neg_model(fastTextModel.train_similarity(fastTextModel.sentences_neg_processed))


CPU times: user 9.86 s, sys: 2.97 s, total: 12.8 s
Wall time: 12.8 s


In [3]:
# Number of positive sentences the model reports (the constructor caps it at 500).
print(fastTextModel.sentences_pos_len)
#print(fastTextModel.sentences_neg_len)

500


In [None]:
# Probe words for the nearest-neighbour lookup. 'kabel' appears twice; the
# second lookup simply overwrites the first entry under the same key.
probe_words = ['cena', 'kabel',
               'manipulace', 'hadice',
               'nádoba', 'kabel',
               'filtr', 'šnůra',
               'kvalita']

# Map every probe word to its five most similar vocabulary entries.
semantically_similar_words = {}
for probe in probe_words:
    neighbours = fastTextModel.model_pos.wv.most_similar([probe], topn=5)
    semantically_similar_words[probe] = [hit[0] for hit in neighbours]

for probe, neighbours in semantically_similar_words.items():
    print(probe + ":" + str(neighbours))

In [None]:
# Print the similarity of two hand-picked word pairs, one value per line.
for first, second in (('doba', 'nádoba'), ('kabel', 'hadice')):
    print(fastTextModel.model_pos.wv.similarity(w1=first, w2=second))

In [None]:
from sklearn.decomposition import PCA

# Flatten the mapping into one list: each probe word followed by its neighbours.
all_similar_words = [
    word
    for probe, neighbours in semantically_similar_words.items()
    for word in [probe] + neighbours
]

# Show the list itself, its type and its length.
for shown in (all_similar_words, type(all_similar_words), len(all_similar_words)):
    print(shown)

In [None]:
# Project the vectors of all probe/neighbour words onto two principal components.
word_vectors = fastTextModel.model_pos.wv[all_similar_words]

pca = PCA(n_components=2)

p_comps = pca.fit_transform(word_vectors)
word_names = all_similar_words

plt.figure(figsize=(18, 10))
plt.scatter(p_comps[:, 0], p_comps[:, 1], c='red')
plt.title('PCA projection of similar-word embeddings')
plt.xlabel('principal component 1')
plt.ylabel('principal component 2')

# A separate loop name keeps `word_names` bound to the list; the loop used to
# rebind it to the last word, leaving a string behind for later cells.
for word_name, x, y in zip(word_names, p_comps[:, 0], p_comps[:, 1]):
    plt.annotate(word_name, xy=(x+0.06, y+0.03), xytext=(0, 0), textcoords='offset points')

In [None]:
# Lemmatise two example reviews and compare them with Word Mover's Distance.
s1 = fastTextModel.preprocess(tagger,['Snadná obsluha, sestavení i čištění po použití.'])[0]
s2 = fastTextModel.preprocess(tagger,['Snadná obsluha, demontáž-montáž, výměna sáčku a filtru.'])[0]
distance = fastTextModel.model_pos.wv.wmdistance(s1, s2)
print('distance = %.4f' % distance)
# Same pair in the opposite order, reported as a similarity. The old line
# printed `1 - distance` under the label "distance"; WMD is not bounded by 1,
# so use the 1 / (1 + d) conversion that gensim's WmdSimilarity applies.
distance = fastTextModel.model_pos.wv.wmdistance(s2, s1)
print('similarity = %.4f' % (1.0 / (1.0 + distance)))


In [None]:
# Draw one word cloud per class. The negative cloud used to be saved under the
# class name 'positive', overwriting the positive image, and it was requested
# even though the negative sentences are never preprocessed in this notebook.
for class_name, processed in (('positive', fastTextModel.sentences_pos_processed),
                              ('negative', fastTextModel.sentences_neg_processed)):
    if not processed:
        print('Skipping %s word cloud: no preprocessed sentences.' % class_name)
        continue
    # A fresh figure per class so the second image does not paint over the first.
    plt.figure()
    fastTextModel.word_cloud(processed, 'vysavace', class_name)





In [14]:
# Build the WMD similarity index over all preprocessed positive sentences and
# report how long the construction took.
num_best = fastTextModel.sentences_pos_len
start = time()
wmd_sim = WmdSimilarity(
    fastTextModel.sentences_pos_processed,
    fastTextModel.model_pos,
    num_best=num_best,
)
elapsed = time() - start
print('It took {:02} seconds to run.'.format(elapsed))

It took 0.4076354503631592 seconds to run.


In [None]:
# Time ten identical queries against the index; `sims` keeps the last result.
start = time()
for i in range(10):
    sims = wmd_sim[s1]

elapsed = time() - start
print('It took {:02} seconds to run.'.format(elapsed))

In [17]:
start = time()
sims_multi = wmd_sim[fastTextModel.preprocess(tagger,['Malý (všude se vejde),turbo kartáč, bezsáčkový, příslušenství.'])[0]]
print('It took {:02} seconds to run.'.format(time() - start))

for i in range(num_best):
    print('sim = {:04}'.format(sims_multi[i][1]))
    print(fastTextModel.sentences_pos[sims_multi[i][0]])


It took 0.9344799518585205 seconds to run.
sim = 01.0
Malý (všude se vejde),turbo kartáč, bezsáčkový, příslušenství.
sim = 0.6447645302526329
Chtěla jsem bezsáčkový na vodu a vybrala jsem na základě recenzí.
sim = 0.6443571619589752
Turbo kartac-naprosta spokojenost.
sim = 0.6358200838242337
Příslušenství je prakticky řešené.
sim = 0.6351759234519027
Bezsáčkový provoz je prioritou.
sim = 0.6351668433891402
Parkovací brzda na rotační kartáč.
sim = 0.6346725437812631
Široká plejáda dokoupitelného příslušenství.
sim = 0.6341126125785577
Štěrbinová hubice s odjímatelným kartáčem.
sim = 0.6304872092607772
Možnost mytí oken v případě zakoupení dalšího příslušenství.
sim = 0.6284625679529029
Elektrický kartáč, který lze vypnout.
sim = 0.6256235327761042
Pohodlné čištění odjímatelného kartáče od vlasů a chlupů.
sim = 0.6236062694453651
Sedací soupravou opravdu vyčistilo nádherné i fleky co nadělala malá dcera.
sim = 0.6182303703979435
Dlouhý kabel,příslušenství včetně kartáčku na palubku.
sim 

In [None]:
# Query the index with another review and list every returned hit.
sims = wmd_sim[fastTextModel.preprocess(tagger,['Snadná obsluha a pohodové vysávání.'])[0]]

# Iterate over the hits that were actually returned. Indexing with
# range(num_best) raises IndexError whenever the index yields fewer results.
for doc_idx, score in sims:
    print('sim = {:04}'.format(score))
    print(fastTextModel.sentences_pos[doc_idx])

In [None]:
import concurrent.futures

def worker(sen, tokens=None):
    """Query the WMD index for one sentence and collect its hits.

    Args:
        sen: The raw sentence (stored in the result). If it is already a list
            of tokens and ``tokens`` is omitted, it is used as the query as-is.
        tokens: Lemmatised form of ``sen`` used as the query. When omitted and
            ``sen`` is a string, the sentence is lemmatised with the notebook's
            ``tagger`` first.

    Returns:
        ``{'sentence': sen, 'sim_list': [{'sim': ..., 'sentence': ...}, ...]}``
        with one entry per hit returned by ``wmd_sim``.
    """
    if tokens is None:
        tokens = sen if isinstance(sen, list) else fastTextModel.preprocess(tagger, [sen])[0]
    # Read this query's own result. The old body stored it in `sim` and then
    # looped over the unrelated global `sims` left behind by an earlier cell.
    hits = wmd_sim[tokens]
    return {
        'sentence': sen,
        'sim_list': [
            {'sim': score, 'sentence': fastTextModel.sentences_pos[doc_idx]}
            for doc_idx, score in hits
        ],
    }

start = time()
out = []

# Pair each raw sentence with its lemmas: the index was built on lemma lists,
# so querying it with raw strings compares the wrong kind of tokens.
queries = list(zip(fastTextModel.sentences_pos[:10],
                   fastTextModel.sentences_pos_processed[:10]))

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    future_to_sim_d = {executor.submit(worker, sentence, tokens): sentence
                       for sentence, tokens in queries}
    for future in concurrent.futures.as_completed(future_to_sim_d):
        # result() both collects the output and re-raises worker exceptions,
        # which were silently dropped before.
        out.append(future.result())


print('It took {:02} seconds to run.'.format(time() - start))


In [4]:
from fse.models import Average, SIF
from fse import IndexedList

# Sentence-embedding model built on top of the positive FastText model.
# Swap in the commented line to use the Average model instead of SIF.
#model = Average(fastTextModel.model_pos)
model = SIF(fastTextModel.model_pos)

indexed_sentences = IndexedList(fastTextModel.sentences_pos_processed)
model.train(indexed_sentences)

# Cell output: similarity between the first two sentences.
model.sv.similarity(0, 1)


-0.10448504

In [13]:
# Similarity and distance between the first two sentence vectors.
print(model.sv.similarity(0,1).round(3))
print(model.sv.distance(0,1).round(3))
# Last stored sentence, lemmatised and raw. With the full 500 sentences this is
# index 499, but [-1] also works when the dataset is shorter.
print(fastTextModel.sentences_pos_processed[-1])
print(fastTextModel.sentences_pos[-1])

# Explicit UTF-8: the platform default encoding may be unable to encode Czech text.
with open('file.txt', 'w', encoding='utf-8') as file:
    for line in fastTextModel.sentences_pos:
        file.write(line+'\t0\n')

-0.104
1.104
['onen', 'system', 'pokud', 'ostatni', 'naradi', 'ryobi']
One + systém (pokud máte i ostatní nářadi ryobi).


In [None]:
# Word Mover's Distance between the first two preprocessed positive sentences.
first_doc = fastTextModel.sentences_pos_processed[0]
second_doc = fastTextModel.sentences_pos_processed[1]
distance = fastTextModel.model_pos.wv.wmdistance(first_doc, second_doc)
print('distance = %.4f' % distance)

In [None]:
# Collect one vector per trained sentence. The count comes from the data
# instead of a hard-coded 500, and the per-item counter print (500 lines of
# noise) is gone.
# NOTE(review): the old loop iterated `model.sv` and needed a manual break,
# which suggests iteration goes through integer indexing - confirm that
# `model.sv[idx]` is the intended accessor.
n_sentences = len(fastTextModel.sentences_pos_processed)
sentences_vectors = [model.sv[idx] for idx in range(n_sentences)]


s = IndexedList(fastTextModel.sentences_pos_processed)
print(s[0])
# Cell output: the sentences most similar to sentence 0.
model.sv.most_similar(0, indexable=s.items)

In [None]:
# sentence2vec similarities
def _clamp_unit(value):
    """Return ``value`` limited to [0.0, 1.0]; in-range values pass through unchanged."""
    if value < 0.0:
        return 0.0
    if value > 1.0:
        return 1.0
    return value


n_sentences = fastTextModel.sentences_pos_len
sim_matrix, dist_matrix = [], []

# One row per sentence: clamped pairwise similarity and its complement.
for i in range(n_sentences):
    vec_sims = [_clamp_unit(model.sv.similarity(i, j)) for j in range(n_sentences)]
    vec_dists = [1.0 - val for val in vec_sims]
    sim_matrix.append(np.array(vec_sims))
    dist_matrix.append(np.array(vec_dists))

for rows in (sim_matrix, dist_matrix, sentences_vectors):
    print(len(rows))

dist_matrix = np.array(dist_matrix)
sim_matrix = np.array(sim_matrix)

In [None]:
# K-means (cosine distance) over the rows of the similarity matrix; every
# cluster is written to its own text file as "<sentence>\t<sentence index>".
num_clusters = 15
# random.Random() only accepts None/int/float/str/bytes/bytearray seeds; the
# previous datetime seed raises TypeError on Python 3.11+. A fixed integer also
# makes the clustering reproducible.
rng = random.Random(42)
kclusterer = KMeansClusterer(num_clusters, distance=nltk.cluster.util.cosine_distance, repeats=60,
                                 avoid_empty_clusters=True, rng=rng)

assigned_clusters = kclusterer.cluster(sim_matrix, assign_clusters=True)
output = {k: [] for k in range(num_clusters)}

for j, sen in enumerate(fastTextModel.sentences_pos):
    output[assigned_clusters[j]].append(sen+'\t'+str(j))


# Renamed from `dir`, which shadowed the builtin of the same name.
cluster_dir = "kmeans_clusters_sent2vec_wmd_similarity_cos_"+str(num_clusters)
os.makedirs(cluster_dir, exist_ok=True)
#f = open("clusters"+str(num_clusters)+".txt", "w", encoding='utf-8')
for key, value in output.items():
    with open(cluster_dir+"/"+str(key)+".txt",  "w", encoding='utf-8') as file:
        print("cluster: " +str(key) + " sentences: " +str(len(value)))
        for val in value:
            file.write(val + "\n")


In [None]:
import os
# sent2vec embeddings and cosine distance
num_clusters = 15
# random.Random() only accepts None/int/float/str/bytes/bytearray seeds; the
# previous datetime seed raises TypeError on Python 3.11+. A fixed integer also
# makes the clustering reproducible.
rng = random.Random(42)
kclusterer = KMeansClusterer(num_clusters, distance=nltk.cluster.util.cosine_distance, repeats=60,
                                 avoid_empty_clusters=True, rng=rng)

labels = kclusterer.cluster(sentences_vectors, assign_clusters=True)

# Cluster sizes.
cnt = Counter(labels)
print(cnt)

# Explicit UTF-8: the platform default encoding may be unable to encode Czech text.
with open('kmeans_sent2vec_cos_'+str(num_clusters)+'.tsv', 'w', encoding='utf-8') as file:
    for j, sen in enumerate(fastTextModel.sentences_pos):
        file.write(sen + '\t' + str(labels[j]) + '\n')

In [None]:
# K-means (cosine distance) over the rows of the distance matrix.
num_clusters = 15
# random.Random() only accepts None/int/float/str/bytes/bytearray seeds; the
# previous datetime seed raises TypeError on Python 3.11+. A fixed integer also
# makes the clustering reproducible.
rng = random.Random(42)
kclusterer = KMeansClusterer(num_clusters, distance=nltk.cluster.util.cosine_distance, repeats=60,
                                 avoid_empty_clusters=True, rng=rng)

labels = kclusterer.cluster(dist_matrix, assign_clusters=True)
# Cluster sizes.
cnt = Counter(labels)
print(cnt)

# Explicit UTF-8: the platform default encoding may be unable to encode Czech text.
with open('kmeans_wmd_distance_cos'+str(num_clusters)+'.tsv', 'w', encoding='utf-8') as file:
    for j, sen in enumerate(fastTextModel.sentences_pos):
        file.write(sen + '\t' + str(labels[j]) + '\n')
        

In [None]:
# K-means (cosine distance) over the rows of the similarity matrix.
num_clusters = 15
# random.Random() only accepts None/int/float/str/bytes/bytearray seeds; the
# previous datetime seed raises TypeError on Python 3.11+. A fixed integer also
# makes the clustering reproducible.
rng = random.Random(42)
kclusterer = KMeansClusterer(num_clusters, distance=nltk.cluster.util.cosine_distance, repeats=60,
                                 avoid_empty_clusters=True, rng=rng)

labels = kclusterer.cluster(sim_matrix, assign_clusters=True)
# Cluster sizes.
cnt = Counter(labels)
print(cnt)

# Explicit UTF-8: the platform default encoding may be unable to encode Czech text.
with open('kmeans_wmd_similarity_cos'+str(num_clusters)+'.tsv', 'w', encoding='utf-8') as file:
    for j, sen in enumerate(fastTextModel.sentences_pos):
        file.write(sen + '\t' + str(labels[j]) + '\n')
        

In [None]:
# Print the pairwise similarity matrix of the sentences inside selected clusters.
num_clusters = 15
# Read the directory that the per-cluster K-means cell writes; the previous
# name ("kmeans_wmd_similarity_cos_<n>") is not created anywhere in this notebook.
cluster_dir = "kmeans_clusters_sent2vec_wmd_similarity_cos_"+str(num_clusters)

# The old loop ran over every cluster but always opened file "2" and stopped
# after one pass. Name the inspected clusters explicitly instead; use
# range(num_clusters) to dump all of them.
clusters_to_inspect = [2]

for cluster in clusters_to_inspect:
    with open(cluster_dir+"/"+str(cluster)+".txt",  "r", encoding='utf-8') as file:
        # Each line is "<sentence>\t<index>"; split from the right so a tab
        # inside the sentence cannot shift the index column.
        cluster_sentences = [int(line.rstrip('\n').rsplit('\t', 1)[1])
                             for line in file if line.strip()]

    tmp_sim_matrix = [[model.sv.similarity(i, j) for j in cluster_sentences]
                      for i in cluster_sentences]
    print('\n'.join(['\t'.join([str(cell) for cell in row]) for row in tmp_sim_matrix]))
            

In [None]:
from sklearn.cluster import AffinityPropagation

# Affinity propagation on the precomputed similarity matrix.
clustering = AffinityPropagation(damping=0.7, affinity='precomputed', convergence_iter=20).fit(sim_matrix)

labels = clustering.labels_

# Number of distinct labels, not counting the -1 label.
no_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print(no_clusters)

# Explicit UTF-8: the platform default encoding may be unable to encode Czech text.
with open('AffinityPropagation_wmd_sim.tsv', 'w', encoding='utf-8') as file:
    for j, sen in enumerate(fastTextModel.sentences_pos):
        file.write(sen + '\t' + str(labels[j]) + '\n')

In [None]:
from sklearn.cluster import DBSCAN

# DBSCAN on the precomputed distance matrix.
clustering = DBSCAN(eps=0.3, metric='precomputed', min_samples=10, algorithm='brute').fit(dist_matrix)


labels = clustering.labels_
# Number of distinct labels, not counting the -1 (noise) label.
no_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print(no_clusters)

# NOTE(review): the file name says "sim" although the input is the distance
# matrix; kept as is so existing consumers of the file keep working.
# Explicit UTF-8: the platform default encoding may be unable to encode Czech text.
with open('DBSCAN_wmd_sim.tsv', 'w', encoding='utf-8') as file:
    for j, sen in enumerate(fastTextModel.sentences_pos):
        file.write(sen + '\t' + str(labels[j]) + '\n')

In [None]:
import hdbscan

# HDBSCAN on the precomputed distance matrix. The cast is a no-op when the
# matrix is already float64.
# NOTE(review): hdbscan's precomputed path is reported to reject float32
# input - confirm against the installed version.
clustering = hdbscan.HDBSCAN(min_cluster_size=10, metric='precomputed', min_samples=10).fit(
    np.asarray(dist_matrix, dtype=np.float64))

labels = clustering.labels_

# Number of distinct labels, not counting the -1 (noise) label.
no_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print(no_clusters)


# Explicit UTF-8: the platform default encoding may be unable to encode Czech text.
with open('HDBSCAN_wmd_dist.tsv', 'w', encoding='utf-8') as file:
    for j, sen in enumerate(fastTextModel.sentences_pos):
        file.write(sen + '\t' + str(labels[j]) + '\n')

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative clustering on the precomputed distance matrix. Ward linkage
# only works with Euclidean distances and raises ValueError when combined with
# a precomputed matrix, so average linkage is used instead.
# NOTE(review): newer scikit-learn releases rename `affinity` to `metric` -
# adjust the keyword if the installed version rejects it.
clustering = AgglomerativeClustering(affinity='precomputed', linkage='average', n_clusters=5)

labels = clustering.fit_predict(dist_matrix)

# Number of distinct labels, not counting a -1 label if one is present.
no_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print(no_clusters)

# Explicit UTF-8: the platform default encoding may be unable to encode Czech text.
with open('AgglomerativeClustering_wmd_sim.tsv', 'w', encoding='utf-8') as file:
    for j, sen in enumerate(fastTextModel.sentences_pos):
        file.write(sen + '\t' + str(labels[j]) + '\n')