In [30]:
import pandas as pd
import numpy as np
import pymorphy2
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
import sklearn.manifold as sm
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from sklearn.cluster import MeanShift
from sklearn.decomposition import TruncatedSVD
import networkx as nx
from copy import deepcopy
import gc
from tqdm import tqdm_notebook
import cPickle as pickle
import random as r
import os
from keras.datasets import imdb
from keras.models import Sequential, Model
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Input, merge, dot
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

os.environ['KERAS_BACKEND'] = 'theano'
RusLem = pymorphy2.MorphAnalyzer()


def bprint(l, sep = " "):
    if len(l) == 0:
        print ""
    print sep.join(l)
    
def lematization(sent):
    tokens = re.findall('[\w]+', sent.strip().lower(), re.U)
    return " ".join([RusLem.parse(token)[0].normal_form for token in tokens])

In [7]:
train_data_wiki = pd.read_csv("./russe-wsi-kit/data/main/wiki-wiki/train.csv",
                              dtype={'gold_sense_id': str, 'predict_sense_id': str}, 
                              sep = '\t')
train_data_wiki.head()

Unnamed: 0,context_id,word,gold_sense_id,predict_sense_id,positions,context
0,1,замок,1,,"0-5, 339-344",замок владимира мономаха в любече . многочисле...
1,2,замок,1,,"11-16, 17-22, 188-193","шильонский замок замок шильйон ( ) , известный..."
2,3,замок,1,,299-304,проведения архитектурно - археологических рабо...
3,4,замок,1,,111-116,"топи с . , л . белокуров легенда о завещании м..."
4,5,замок,1,,"134-139, 262-267",великий князь литовский гедимин после успешной...


In [50]:
train_data_wiki.describe()

Unnamed: 0,context_id,word,gold_sense_id,predict_sense_id,positions,context
count,439,439,439,0.0,439,439
unique,439,4,2,0.0,381,439
top,408,замок,1,,"0-3, 122-125, 138-141, 152-155, 168-171","был главный инженер людовика xiv , маршал фран..."
freq,1,138,279,,5,1


In [51]:
train_data_wiki['context'] = np.asarray([row.decode('utf-8') for row in train_data_wiki['context']])
train_data_wiki['word'] = np.asarray([row.decode("utf-8") for row in train_data_wiki['word']])

In [6]:
grouped_train_data_wiki = train_data_wiki.groupby('word')

In [7]:
for g in grouped_train_data_wiki.groups:
    print g, RusLem.parse(g)[0].normal_form

суда суд
замок замок
бор бора
лук лук


In [8]:
for e, i in enumerate(train_data_wiki.itertuples()):
    print i, i.context_id

Pandas(Index=0, context_id='1', word=u'\u0437\u0430\u043c\u043e\u043a', gold_sense_id='1', predict_sense_id=nan, positions='0-5, 339-344', context=u'\u0437\u0430\u043c\u043e\u043a \u0432\u043b\u0430\u0434\u0438\u043c\u0438\u0440\u0430 \u043c\u043e\u043d\u043e\u043c\u0430\u0445\u0430 \u0432 \u043b\u044e\u0431\u0435\u0447\u0435 . \u043c\u043d\u043e\u0433\u043e\u0447\u0438\u0441\u043b\u0435\u043d\u043d\u044b\u0435 \u0443\u043a\u0440\u0435\u043f\u043b\u0435\u043d\u043d\u044b\u0435 \u043c\u043e\u043d\u0430\u0441\u0442\u044b\u0440\u0438 \u0442\u0430\u043a\u0436\u0435 \u043d\u0435 \u044f\u0432\u043b\u044f\u043b\u0438\u0441\u044c \u0437\u0430\u043c\u043a\u0430\u043c\u0438 \u043a\u0430\u043a \u0442\u0430\u043a\u043e\u0432\u044b\u043c\u0438\xa0\u2014 \u044d\u0442\u043e \u0431\u044b\u043b\u0438 \u043a\u0440\u0435\u043f\u043e\u0441\u0442\u0438 . \u0440\u0430\u043d\u043d\u0438\u0435 \u0435\u0432\u0440\u043e\u043f\u0435\u0439\u0441\u043a\u0438\u0435 \u0437\u0430\u043c\u043a\u0438 \u0441\u0442\u0440\

In [66]:
def normalize_dataset(dataset):
    #format: context_id, word, gold_sense_id, left_context, right_context
    new_dataset = []
    word_index = {}
    cur_word_idx = 1
#     new_dataset['context'] = np.asarray([lematization(row) for row in dataset['context']])
    
    for row in tqdm_notebook(dataset.itertuples()):
        positions = [(p.strip().split("-")[0], p.strip().split("-")[1]) 
                     for p in row.positions.strip().split(",")]
        context_id = row.context_id
        gold_sense_id = row.gold_sense_id
        word = row.word
        
#         for p in positions:
#             left_context = lematization(row.context[:int(p[0])])
#             right_context = lematization(row.context[int(p[1]):])
        context = lematization(row.context).split()
        encoding_context = []
        for w in context:
            if w not in word_index:
                word_index[w] = cur_word_idx
                cur_word_idx += 1
            encoding_context.append(word_index[w])    
        new_dataset.append([context_id, word, gold_sense_id, ",".join(map(str, encoding_context))])
        
    return np.asarray(new_dataset), word_index

In [67]:
train_data_wiki_norm, word_index = normalize_dataset(train_data_wiki)




In [69]:
len(word_index.keys())

4162

In [71]:
pickle.dump(word_index, open("word_index", "w"))

In [72]:
pd.DataFrame(train_data_wiki_norm)

Unnamed: 0,0,1,2,3
0,1,замок,1,"1,2,3,4,5,6,7,8,9,10,11,1,12,13,14,15,16,17,18..."
1,2,замок,1,"67,1,1,68,69,4,70,71,12,72,73,74,75,76,77,78,7..."
2,3,замок,1,"95,96,97,98,99,100,101,102,103,104,105,46,106,..."
3,4,замок,1,"124,118,125,126,127,128,129,130,118,131,132,13..."
4,5,замок,1,"138,40,139,140,141,142,143,144,145,146,147,85,..."
5,6,замок,1,"172,173,174,4,175,176,52,177,1,178,46,179,180,..."
6,7,замок,1,"213,214,215,216,217,46,218,219,4,57,220,221,22..."
7,8,замок,1,"166,140,167,40,168,169,119,4,14,1,170,171,4,11..."
8,9,замок,1,"268,21,269,270,1,4,271,77,272,273,274,275,276,..."
9,10,замок,1,"1,275,1,15,256,4,279,46,303,88,89,234,304,305,..."


In [13]:
pd.DataFrame(train_data_wiki_norm[:300, :])

Unnamed: 0,0,1,2,3
0,1,замок,1,"1,2,3,4,5,6,7,8,2,9,10,3,10,11,6,3,4,12,4,3,2,..."
1,2,замок,1,"30,10,8,25,4,12,20,5,10,18,6,1,2,3,4,5,6,1,2,3..."
2,3,замок,1,"23,11,4,7,16,9,16,12,10,16,6,2,11,13,10,24,16,..."
3,4,замок,1,"24,4,23,25,6,20,6,8,6,15,16,8,4,5,22,11,4,7,6,..."
4,5,замок,1,"7,16,8,10,5,10,18,6,5,12,27,1,25,6,8,10,24,4,7..."
5,6,замок,1,"4,12,6,15,16,1,6,15,4,18,6,7,6,12,2,17,2,8,4,6..."
6,7,замок,1,"5,4,8,4,9,16,31,6,13,4,1,27,18,20,24,7,16,12,1..."
7,8,замок,1,"20,21,12,6,19,16,9,10,3,10,12,6,24,11,4,5,20,5..."
8,9,замок,1,"4,9,10,12,6,10,1,6,5,28,3,23,15,16,8,8,6,4,24,..."
9,10,замок,1,"1,2,3,4,5,6,20,24,2,8,5,16,11,6,1,2,3,4,5,6,15..."


In [14]:
bprint(train_data_wiki['context'].values, sep='\n')

замок владимира мономаха в любече . многочисленные укрепленные монастыри также не являлись замками как таковыми — это были крепости . ранние европейские замки строились преимущественно из дерева они опоясывались деревянной оградой — палисадом уже тогда вокруг замков стали появляться рвы . примером такого замка может служить вышгородский замок киевских князей . каменное замковое строительство распространилось в западной и центральной европе лишь к xii веку . главной частью средневекового замка являлась центральная башня — донжон , выполнявшая функции цитадели . помимо своих оборонительных функций , донжон являлся непосредственным жилищем феодала . также в главной башне
шильонский замок замок шильйон ( ) , известный в русскоязычной литературе как шильо́нский за́мок , расположен на швейцарской ривьере , у кромки женевского озера , в  км от города монтре . замок представляет собой комплекс из элементов разного времени постройки .
проведения архитектурно - археологических работ эстонским ре

In [15]:
for v in train_data_wiki['context'].values:
    print len(v.split(" "))

91
41
45
30
74
86
92
70
82
40
80
80
70
81
76
78
67
63
85
88
91
75
74
92
82
79
78
77
72
72
87
87
52
64
93
85
88
93
36
82
80
41
34
28
88
83
78
45
63
77
81
89
91
83
71
78
81
74
90
91
81
34
84
43
75
73
93
85
74
54
82
89
74
90
81
35
74
83
34
37
81
68
74
35
81
47
94
47
92
85
73
74
80
89
92
78
78
41
74
82
70
83
80
84
70
78
72
53
72
76
50
68
80
86
86
70
72
83
83
90
76
71
72
44
78
81
68
84
87
79
73
76
2
87
78
67
80
72
79
45
91
51
91
72
91
45
85
53
77
73
91
42
83
87
39
44
93
90
90
47
93
87
75
60
84
74
72
84
90
90
91
74
87
92
77
84
89
94
94
92
84
93
89
86
83
89
58
75
42
77
75
70
47
59
76
90
68
78
78
37
87
55
93
62
49
48
44
46
95
31
56
44
92
48
43
90
78
52
43
81
64
95
88
81
42
89
93
89
80
87
92
55
47
46
92
74
42
68
49
53
41
55
66
43
50
88
40
64
80
84
47
64
54
55
35
68
87
92
51
38
57
37
71
73
41
80
59
90
53
64
76
89
49
36
82
80
84
82
49
79
86
39
84
32
50
89
87
76
85
46
53
45
83
46
83
87
73
97
63
75
72
44
70
90
86
84
54
74
52
78
48
57
43
86
80
83
90
56
87
87
82
36
66
75
77
91
81
91
43
89
48
66
85
66

In [73]:
pickle.dump(train_data_wiki_norm, open("train_data_wiki_norm_all", "w"))

# Label prepare: positive and negative mining

In [2]:
train_data_wiki_norm = pickle.load(open("train_data_wiki_norm_all", "r"))

In [3]:
word_index = pickle.load(open("word_index", "r"))

In [4]:
print len(word_index)

4162


In [5]:
pd_train_data_wiki_norm = pd.DataFrame(train_data_wiki_norm,
                                       columns=['context_id', 'word', 'gold_sense_id', 'context'])
pd_train_data_wiki_norm.head()

Unnamed: 0,context_id,word,gold_sense_id,context
0,1,замок,1,"1,2,3,4,5,6,7,8,9,10,11,1,12,13,14,15,16,17,18..."
1,2,замок,1,"67,1,1,68,69,4,70,71,12,72,73,74,75,76,77,78,7..."
2,3,замок,1,"95,96,97,98,99,100,101,102,103,104,105,46,106,..."
3,4,замок,1,"124,118,125,126,127,128,129,130,118,131,132,13..."
4,5,замок,1,"138,40,139,140,141,142,143,144,145,146,147,85,..."


In [6]:
#{word:list of context_id}
#{word: {sense_id: context_id}}
#{context_id: value vector}

# word2context_ids = {}
word2sense_ids = {}
context_id2context = {}

for row in train_data_wiki_norm:
    word = row[1]
    context_id = row[0]
    sense_id = row[2]
    context = row[3]
    
    if word not in word2sense_ids:
        word2sense_ids[word] = {}
    if sense_id not in word2sense_ids[word]:
        word2sense_ids[word][sense_id] = []
    word2sense_ids[word][sense_id].append(context_id)
    
    context_id2context[context_id] = context

In [7]:
def sampling(negative_ratio = 5):
    dataset_1 = []
    dataset_2 = []
    target = []
    for word in tqdm_notebook(word2sense_ids.keys()):
        for sense_id in word2sense_ids[word].keys():
             #add positive label - all pair in current sense
            for context_id_1 in word2sense_ids[word][sense_id]:
                for context_id_2 in word2sense_ids[word][sense_id]:
                    if context_id_1  == context_id_2:
                        continue
                    dataset_1.append(map(int, context_id2context[context_id_1].split(",")))
                    dataset_2.append(map(int, context_id2context[context_id_2].split(",")))
                    target.append([1])
            negative_sample_cnt = negative_ratio * len(word2sense_ids[word][sense_id])
            for context_id_1 in word2sense_ids[word][sense_id]:
                for another_sense_id in word2sense_ids[word].keys():
                    negative_samples = r.sample(word2sense_ids[word][another_sense_id], 
                                                min(len(word2sense_ids[word][another_sense_id]),
                                                    negative_sample_cnt))
                    for ns in negative_samples:
                        dataset_1.append(map(int, context_id2context[context_id_1].split(",")))
                        dataset_2.append(map(int, context_id2context[ns].split(",")))
                        target.append([0])
    return np.asarray(dataset_1), np.asarray(dataset_2), np.asarray(target)

In [8]:
dataset_1, dataset_2, target = sampling()




In [9]:
max(max(dataset_2))

4162

In [10]:
np.asarray(map(lambda x: len(x), dataset_1)).max()

86

In [11]:
target.ravel().sum()

30440

In [12]:
X_train_1 = sequence.pad_sequences(dataset_1, maxlen=np.asarray(map(lambda x: len(x), dataset_1)).max())

In [13]:
X_train_2 = sequence.pad_sequences(dataset_2, maxlen=np.asarray(map(lambda x: len(x), dataset_1)).max())

In [14]:
X_train_1_reshape = np.reshape(X_train_1, (X_train_1.shape[0], X_train_1.shape[1], 1))
X_train_2_reshape = np.reshape(X_train_2, (X_train_2.shape[0], X_train_2.shape[1], 1))

In [None]:
embedding_vecor_length = 32
maxlen=np.asarray(map(lambda x: len(x), dataset_1)).max()

In [33]:
print X_train_1_reshape.shape, X_train_2_reshape.shape, len(word_index)

(82945, 86, 1) (82945, 86, 1) 4162


In [19]:
target.shape

(82945, 1)

In [1]:
batch_size=64

input_shape = (86,)
left_input = Input(input_shape)
right_input = Input(input_shape)

side_net = Sequential()
side_net.add(Embedding(len(word_index) + 1, embedding_vecor_length, input_length=maxlen))
side_net.add(LSTM(10))

embedding_left = side_net(left_input)
embedding_right = side_net(right_input)

diff = dot(inputs=[embedding_left, embedding_right], axes=-1, normalize=True)

prediction = Dense(1,activation='sigmoid')(diff)
siamese_net = Model(input=[left_input,right_input], output=prediction)

siamese_net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(siamese_net.summary())
siamese_net.fit([X_train_1, X_train_2], target, epochs=10, batch_size=batch_size)

NameError: name 'Input' is not defined