# Exploratory clustering on doc2vec reinferred corpus vectors

In [1]:
import os
import gensim
import numpy as np
import scipy as sp
from smart_open import smart_open
import multiprocessing



from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.manifold import TSNE


## Load Corpus and Model

In [2]:
# Read the stop-word list and run it through the same preprocessing
# (simple_preprocess with deacc=True) that read_corpus applies to corpus lines,
# so that stop words and corpus tokens are compared in the same normalised form.
with open('/Users/eslt0101/Projects/EviDENce/ML/stopwords-nl-stopwords-iso_expanded.txt','r') as stopwords_in:
    raw_stopwords = [line.replace('\n','') for line in stopwords_in.readlines()]
    dutch_stopwords = gensim.utils.simple_preprocess(' '.join(raw_stopwords),deacc=True)

In [3]:
def read_corpus(corpus_token_file,labeled=False):
    """Stream a corpus file as preprocessed, stop-word-filtered token lists.

    Every line of ``corpus_token_file`` is treated as one document. The line is
    tokenised with ``gensim.utils.simple_preprocess(..., deacc=True)`` and any
    token present in the module-level ``dutch_stopwords`` is dropped.

    Parameters
    ----------
    corpus_token_file
        Path of the text file to read (opened with ``smart_open``).
    labeled : bool, default False
        If True, wrap each token list in a ``TaggedDocument`` whose single tag
        is the zero-based line index; otherwise yield the bare token list.

    Yields
    ------
    gensim.models.doc2vec.TaggedDocument or list of str
        One item per input line, in file order.
    """
    # Membership tests on a list are linear in the number of stop words and
    # happen once per token; a set makes each lookup constant-time.
    stopwords = set(dutch_stopwords)
    with smart_open(corpus_token_file,'r') as tf:
        for i,text_line in enumerate(tf):
            tokens = [
                token
                for token in gensim.utils.simple_preprocess(text_line,deacc=True)
                if token not in stopwords
            ]
            if labeled:
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])
            else:
                yield tokens

In [4]:
def read_corpus_lookup(corpus_filenames_file, corpus_token_file) :
    """Yield one lookup record per corpus line, paired with its source file name.

    The two files are read in lockstep, line by line.

    Parameters
    ----------
    corpus_filenames_file
        Path of the file holding one source file name per line.
    corpus_token_file
        Path of the corpus text file holding one document per line.

    Yields
    ------
    tuple
        ``([index], [filename], [raw_line])`` where ``index`` is the zero-based
        line number, ``filename`` has trailing whitespace stripped, and
        ``raw_line`` is the unmodified corpus line (trailing newline included).
    """
    with smart_open(corpus_filenames_file, 'r') as fnf, smart_open(corpus_token_file,'r') as tf:
        # zip stops at the shorter of the two files, so a length mismatch
        # silently drops the trailing lines of the longer one.
        for i,(fn_line,tf_line) in enumerate(zip(fnf,tf)):
            yield ([i],[fn_line.rstrip()],[tf_line])
        

In [5]:
# Directory holding the corpus export files.
corpus_dir = '/Users/eslt0101/Data/eScience/EviDENce/Data/NR-Teksts/EviDENce_NR_output/TargetSize150'

# File names inside corpus_dir. The token file and the text file currently
# point at the same export. Alternative test files used previously:
# 'test_GV_corpus_terms_para.txt' / 'test_GV_corpus_terms_para_filenames.txt'.
corpus_token_file = 'GV_corpus_terms_para_150.txt'
corpus_text_file = 'GV_corpus_terms_para_150.txt'
corpus_filenames = 'GV_corpus_terms_para_150_filenames.txt'

# Full paths, built in the same order as the names above.
full_ctokf, full_ctf, full_cff = (
    os.path.join(corpus_dir, file_name)
    for file_name in (corpus_token_file, corpus_text_file, corpus_filenames)
)

In [6]:
# Materialise the generator: one TaggedDocument per corpus line, tagged with its line index.
corp = list(read_corpus(full_ctokf,labeled=True))

In [7]:
# Parallel lookup table: one ([index], [source file name], [raw line]) tuple per corpus line.
corp_lookup = list(read_corpus_lookup(full_cff,full_ctokf))

In [8]:
corp_lookup[0]

([0],
 ['05_JKKV_2003_JACQUES_FURTH_conversation_clipped_150_paragraph_104-108_text.txt'],
 ["Op het werk , ja. [ ja ? ] En het was zo dat eh toen op een gegeven ogenblik die staking uitbrak , toen staakten wij ook , maar die twee die eh die eisten van de eh van de van de eh fabriek [ leiding ? ] nou ja dat was ook een eh eh [ Verwalter ? ] Neeee , nee nee , het het dat toen nog niet . En dat was ook geen Jood eh geen Joodse man , het was een christenman , die die eh fabriek dan leidde . En eh , die eiste dat eh dat hij eh dat ze dat heette in het vak ' je riem afgooien ' als je meestaakte , hè , ook als je protesteerde of zo. En zonder meer dan heette dat dat je de riem afgooide . En die eh wilden hun eh riem natuurlijk niet afgooien en die eisten van die eh fabrikant eh directeur zal ik maar zeggen , waarnemer , eiste ie dat ie eh dat eh de fabriek bleef draaien , want zij bleven werken . \n"])

In [9]:
corp[0]

TaggedDocument(words=['werk', 'ogenblik', 'staking', 'uitbrak', 'staakten', 'eisten', 'fabriek', 'leiding', 'verwalter', 'neeee', 'jood', 'joodse', 'man', 'christenman', 'fabriek', 'leidde', 'eiste', 'heette', 'vak', 'riem', 'afgooien', 'meestaakte', 'protesteerde', 'heette', 'riem', 'afgooide', 'riem', 'natuurlijk', 'afgooien', 'eisten', 'fabrikant', 'directeur', 'zeggen', 'waarnemer', 'eiste', 'ie', 'ie', 'fabriek', 'bleef', 'draaien', 'bleven', 'werken'], tags=[0])

In [10]:
# Load the previously trained model from disk. The ".d2v" suffix suggests a
# Doc2Vec model — TODO confirm.
# NOTE(review): gensim's SaveLoad.load is pickle-based, so only load model
# files that come from a trusted source.
model = gensim.utils.SaveLoad.load('/Users/eslt0101/Projects/EviDENce/ML/model_default_v50_mc2_e30_freeze.d2v')





# Bare expression: shows the loaded model's repr as the cell output.
model

In [11]:
# Occurrence count stored in the model vocabulary for the token 'dood'.
# NOTE(review): `wv.vocab` is the gensim 3.x API; gensim >= 4.0 replaces it with
# `model.wv.get_vecattr('dood', 'count')` — confirm which gensim version is pinned.
model.wv.vocab['dood'].count

967

## Load reinferred corpus vectors 

In [12]:
# Pre-computed array of re-inferred corpus vectors, loaded from disk.
# presumably row i corresponds to corp[i] / corp_lookup[i] — TODO confirm
reinf_corp_medvec= np.load('/Users/eslt0101/Projects/EviDENce/ML/reinferred_corpus_terms_para_150_medvec.npy')

In [13]:
reinf_corp_medvec

array([[-0.34251951,  0.52918094,  0.0650135 , ...,  0.31043735,
        -0.1035799 , -0.3434217 ],
       [-0.28651591,  0.3921694 ,  0.11584316, ...,  0.12913995,
        -0.12679765, -0.18873925],
       [-0.14764674,  0.30281365,  0.11900603, ...,  0.2351317 ,
         0.07376267, -0.15956078],
       ...,
       [-0.09300769,  0.43088813, -0.18835094, ..., -0.13740871,
        -0.04644159, -0.13512794],
       [-0.30387661,  0.31770231, -0.03013355, ...,  0.07130665,
        -0.00204581, -0.46188374],
       [-0.33581319,  0.05694224,  0.07295781, ...,  0.10489847,
        -0.11573624, -0.00200729]])

## Try clustering on reinferred vectors

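This doesn't seem to work: with `eps=0.05` (cosine distance) and `min_samples=5`, DBSCAN finds only three clusters, and the one inspected below (label 1) contains just 29 snippets. Its first member is a French-language passage, which suggests the cluster may reflect language rather than topic.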

In [14]:
# Density-based clustering configured to use cosine distance.
dbscan = DBSCAN(
    metric='cosine',
    eps=0.05,
    min_samples=5,
)

In [15]:
# Fit in place (timed); the results are read back from the fitted estimator's attributes.
%time dbscan.fit(reinf_corp_medvec)

CPU times: user 15.9 s, sys: 2.16 s, total: 18.1 s
Wall time: 6.7 s


DBSCAN(algorithm='auto', eps=0.05, leaf_size=30, metric='cosine',
    metric_params=None, min_samples=5, n_jobs=None, p=None)

In [16]:
# Pull the fitted results off the estimator in one step.
labels, components, core_sample_indices = (
    dbscan.labels_,
    dbscan.components_,
    dbscan.core_sample_indices_,
)
print(core_sample_indices)

[   60   248   291   359   393   395   541   972   980   994  1012  1038
  1048  1110  1112  1117  1126  1139  1142  1649  1800  1983  2016  2020
  2025  2041  2051  2056  2077  2085  2089  2092  2105  2214  2240  2242
  2244  2245  2250  2270  2277  2279  2282  2287  2300  2306  2821  2822
  2825  2830  2837  2845  2970  3138  3151  3161  3162  3317  3761  3766
  3769  3780  3792  3798  3810  3819  3828  3836  3849  3859  4062  4066
  4118  4211  4234  4239  4243  4254  4258  4259  4264  4292  4390  4404
  4410  4423  4424  4425  4491  4683  5736  5766  5771  5782  5792  5799
  5814  5836  5845  5848  5852  5855  6065  6139  6163  6498  6503  6523
  6534  6535  6538  6575  6578  6661  6662  6672  6681  6685  6689  6707
  7022  7196  7306  7365  7813  7827  8231  8261  8423  8431  8480  8527
  8973  8977  8979  8984  8995  9008  9014  9015  9016  9017  9018  9020
  9725 10053 10054 10056 10060 10061 10062 10067 10070 10077 10078 10079
 10371 10454 10461 10474 10487 10488 10491 10585 10

In [17]:
# Distinct labels assigned by DBSCAN; -1 marks noise and is not a cluster.
unique_labels = set(labels)
print(unique_labels)
n_clusters_ = len(unique_labels) - (1 if -1 in unique_labels else 0)
print(n_clusters_)

{0, 1, 2, -1}
3


In [18]:
print(len(labels[labels == 1]))

29


In [19]:
# Row indices of the samples assigned to cluster 1.
np.nonzero(labels == 1)

(array([10053, 10054, 10056, 10057, 10058, 10060, 10061, 10062, 10064,
        10066, 10067, 10069, 10070, 10071, 10072, 10073, 10076, 10077,
        10078, 10079, 14694, 14698, 14700, 14701, 14702, 14703, 14708,
        14711, 14712]),)

In [20]:
print(corp_lookup[10053])

([10053], ['GV_Wieberdink_dodenmars_02a_conversation_clipped_150_paragraph_107-114_text.txt'], ['Américains , mais vous savez que les Américains avançaient , peut-être ils seraient venus Nous ne savions pas combien de temps il faudrait pour les rejoindre . Mais nous avons marché comme ça vers les Américains pendant dix jours , et .. pendant ces dix jours on ne marchait pas tout le temps bien sûr , on marchait , on se reposait , et la nuit on essayait de trouver une grange , ou bien un endroit assez confortable pour dormir , jusqu ´ au lendemain matin avant de repartir marcher . Oui . On a marché pendant dix jours . Et c ´ étaient vous et Lon qui allaient chaque fois demander ... Chaque fois , oui , parce que , moi , je parlais quatre langues et Lon en parlait presque quatre aussi avec le hollandais , alors ... nous étions Et surtout vous parliez allemand comme une Allemande . \n'])


## Try t-SNE

In [157]:
%time medvec_embedded = TSNE(n_components=3).fit_transform(reinf_corp_medvec)

CPU times: user 32min 12s, sys: 24.5 s, total: 32min 36s
Wall time: 24min 18s


In [22]:
# Reload the t-SNE embedding from disk so the slow fit (about 24 minutes wall
# time) does not have to be repeated. It was presumably written once with the
# disabled line below; note that the save path is relative while the load path
# is absolute.
#np.save('TSNE_embedded_reinf_corp_medvec',medvec_embedded)
medvec_embedded = np.load('/Users/eslt0101/Projects/EviDENce/ML/TSNE_embedded_reinf_corp_medvec.npy')

In [43]:
print((medvec_embedded.shape))
print(medvec_embedded[0:2,0])
print(medvec_embedded[1][0])

(27065, 3)
[-11.83685   -6.883695]
-6.883695


In [52]:
import matplotlib.pyplot as plt
from plotly import __version__
import plotly.offline as py
from plotly.offline import download_plotlyjs, init_notebook_mode,plot, iplot
#import plotly.plotly as  py
import plotly.graph_objs as go
init_notebook_mode(connected=True)

In [66]:
#selection of relevant snippets (imported)
relevant =[20106, 26710, 7015, 15229, 14945, 19753, 14020, 19807, 10385, 20424, 20311, 17276]
relevant2 = [20106, 10415, 13946, 1263, 24390, 18340, 17276, 24378, 2, 4207, 20153, 17197, 25237, 5441, 8439, 26618]
print(type(relevant))
print(medvec_embedded[20106,0])

<class 'list'>
-8.856896


In [67]:
def _scatter3d_trace(x, y, z, name, outline_color, opacity, color=None):
    """Build one 3D marker trace.

    All traces share the marker size (12) and outline width (0.5); only the
    outline colour, opacity and optional fill colour differ. ``name`` is the
    legend label. When ``color`` is None, no marker colour is set.
    """
    marker = dict(
        size=12,
        line=dict(color=outline_color, width=0.5),
        opacity=opacity,
    )
    if color is not None:
        marker['color'] = color
    return go.Scatter3d(x=x, y=y, z=z, mode='markers', name=name, marker=marker)


# The three columns of the embedding are the three t-SNE components.
xx=medvec_embedded[:,0]
yy=medvec_embedded[:,1]
zz=medvec_embedded[:,2]

# Background layer: every snippet, almost transparent so the highlights stand out.
trace1 = _scatter3d_trace(
    xx, yy, zz,
    name='all snippets',
    outline_color='rgba(217,217,217,0.14)',
    opacity=0.1,
)

# First highlighted selection (grey).
trace2 = _scatter3d_trace(
    xx[relevant], yy[relevant], zz[relevant],
    name='relevant',
    outline_color='rgb(204,204,204)',
    opacity=0.9,
    color='rgb(127,127,127)',
)

# Second highlighted selection (red).
trace3 = _scatter3d_trace(
    xx[relevant2], yy[relevant2], zz[relevant2],
    name='relevant2',
    outline_color='rgb(204,204,204)',
    opacity=0.9,
    color='rgb(255,0,0)',
)

data=[trace1,trace2,trace3]
# Zero margins so the 3D scene fills the whole output area.
layout=go.Layout(
    margin=dict(l=0, r=0, b=0, t=0)
)
fig = go.Figure(data=data,layout=layout)
py.iplot(fig,filename='medvec_scatter_3d')