# Sample code for word2vec model

### import libraries

In [1]:
import numpy as np
import pandas as pd
import nltk

### set environments

In [2]:
corpus_add = './sample.txt'
stpwrds = nltk.corpus.stopwords.words('english')
vec_dim = 100
n_window = 5
n_samples = 20  # negative sampling 개수
learning_rate = 0.01
def sigmoid(x):
    return 1 / (1 + np.round(np.exp(-x),10))


### Read sample corpus

In [3]:
corpus_file = open(corpus_add)
text = corpus_file.readline()
# tokenize the corpus
tokens = nltk.word_tokenize(text)

# Stopword 제거
tokens = [k for k in tokens if k not in stpwrds]

# 숫자제거
tokens = [k for k in tokens if not k.isdigit()]

# 처음 5개만 보여주기
tokens[:5] 

['anarchism', 'originated', 'term', 'abuse', 'first']

### Build word matrix

In [4]:
# get the number of unique words in the corpus
unique_words_n = len(set(tokens))

# build standard word matrix
# initialize the elements values
word_matrix = pd.DataFrame(data = np.random.normal(0,0.5,size = (unique_words_n,vec_dim)),
                           index = set(tokens), columns=range(vec_dim), dtype=np.float32)

# 처음 5개만 보여주기
word_matrix.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
organization,0.178017,0.01236,-0.503194,-0.459824,0.496037,0.119252,0.329428,-0.767484,0.207051,-0.192051,...,-0.114578,-0.917741,0.410301,-0.434145,-0.74053,-0.130829,0.092579,-0.116697,0.320323,0.990064
state,0.15427,0.70072,-0.108441,0.416652,0.194425,-0.358219,0.452353,0.8908,0.410773,0.017911,...,-0.077098,-0.513307,-1.016771,-0.379128,1.048174,-0.975451,0.37777,0.416691,0.42959,-0.162068
early,0.347029,0.255167,0.717321,-0.582404,-0.573712,0.136319,0.159591,-0.94041,0.160085,0.100121,...,-0.401748,0.35194,-0.308923,0.80672,-0.458799,-0.378255,0.116207,0.511778,-0.303354,0.428216
experiment,-0.781766,0.176941,-0.143949,-0.162064,-0.011218,0.4417,0.274888,1.031349,0.371849,0.732819,...,0.628236,0.478328,0.078996,0.946773,-0.288128,0.000134,0.911601,0.544589,-0.89271,0.840078
american,0.032122,-0.425582,0.474305,-0.960936,-0.497471,0.047829,-0.092854,-0.124465,-0.618539,-0.257411,...,0.026571,-0.313493,0.604312,-0.515188,-0.887631,-0.10832,0.135509,-0.154214,0.007211,-0.198859


### Settings for negative sampling

In [5]:
# Negative sampling을 위한 단어별 확률분포 생성
# 해당어 확률 = [해당어 빈도수^(3/4)] / [빈도수^(3/4)의 총합]
total_num_words = len(tokens)       # 중복 허용한 모든 token 갯수
freqdist = nltk.FreqDist(tokens)    # corpus 내 token의 빈도수
word_temp1 = pd.DataFrame([freqdist])
word_temp1 = word_temp1/ total_num_words
word_temp1 = pow(word_temp1, 0.75)
denom = word_temp1.sum(axis = 1)[0] # 모든 word_temp1들의 합을 분모로
word_prob = word_temp1 / denom   # 클래스 변수에 바로 할당
word_idx = word_prob.columns
word_prob = word_prob.values[0].tolist()

## Starts train (just for 1 iteration)

### Get input word

In [6]:
input_word = tokens[n_window]
input_word

'used'

### Positive sample

In [7]:
positive_sample = tokens[:n_window] + tokens[n_window + 1 : n_window + n_window + 1]
positive_sample

['anarchism',
 'originated',
 'term',
 'abuse',
 'first',
 'early',
 'working',
 'class',
 'radicals',
 'including']

### Negative sample

In [8]:
return_idx = np.random.choice(np.arange(unique_words_n), size=n_samples, replace=False, p=word_prob)
negative_sample = word_idx[return_idx].tolist()
negative_sample

['accepted',
 'septentrionale',
 'property',
 'concerning',
 'related',
 'relations',
 'proudhon',
 'including',
 'one',
 'rothbard',
 'moment',
 'anarchism',
 'organisation',
 'interest',
 'inclined',
 'description',
 'published',
 'many',
 'find',
 'power']

### Input word vector

##### which means hidden layer in the network

In [9]:
input_word_vector = word_matrix.ix[input_word]
input_word_vector.head()

0   -1.008624
1   -1.051884
2    0.186226
3    0.016142
4    0.522768
Name: used, dtype: float32

### Sample word vectors 

##### which mean W_2 in the network

In [10]:
sample_word = set(positive_sample + negative_sample)
sample_word_vector = word_matrix.ix[sample_word]
W_2 = sample_word_vector
W_2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
including,0.740121,0.326963,0.116851,-0.679121,0.909204,-0.300451,0.379016,0.03354,-0.689721,-0.816815,...,-0.118457,0.196268,-0.017542,0.406544,-0.04329,1.109424,-0.443299,0.423337,-0.539742,0.026163
early,0.347029,0.255167,0.717321,-0.582404,-0.573712,0.136319,0.159591,-0.94041,0.160085,0.100121,...,-0.401748,0.35194,-0.308923,0.80672,-0.458799,-0.378255,0.116207,0.511778,-0.303354,0.428216
interest,0.072003,-0.981782,-0.010239,0.043778,-0.11437,-0.364081,-0.480681,0.284645,-0.455918,-0.131446,...,-0.690226,-0.146965,-0.408784,-0.165564,0.118621,-0.062836,-0.468301,-0.296385,-0.03005,0.617558
concerning,-0.085401,-0.134851,0.429632,0.607253,-0.45864,-0.254063,0.089445,0.024281,0.202941,0.828504,...,-0.402616,0.330462,1.107138,0.298232,0.477581,0.92381,0.485569,-0.619266,0.207266,0.637589
accepted,-0.225716,0.064858,0.265654,-0.464554,-0.250714,1.257679,-0.73125,0.41124,0.218622,0.001807,...,0.297231,-0.77395,-0.386476,-0.322347,-0.878875,0.326871,-0.143735,0.417166,-0.628345,0.09759


### Label data expressed 't'

In [11]:
output_size = len(sample_word)
t = pd.DataFrame(data = np.zeros(output_size), index = sample_word)
t.ix[positive_sample] = 1
t = t.values[:,0]
t

array([ 1.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,
        1.,  0.,  1.,  0.,  0.,  0.,  1.,  1.,  0.,  1.,  0.,  1.,  0.,
        0.,  0.])

### Output layer

In [12]:
hidden_layer = input_word_vector.values
hidden_layer = hidden_layer.reshape([1,vec_dim])
output_layer = sigmoid(np.dot(W_2, hidden_layer.T))  
output_layer = output_layer.reshape([1, output_size])
output_layer

array([[ 0.43614423,  0.35265267,  0.41607535,  0.56135166,  0.26467645,
         0.67611516,  0.5656935 ,  0.83834314,  0.49247238,  0.75924212,
         0.94391167,  0.96142739,  0.90824676,  0.44844458,  0.84265763,
         0.04379153,  0.22700165,  0.05689625,  0.71179259,  0.9695062 ,
         0.58161747,  0.48007843,  0.04298136,  0.01124441,  0.97288418,
         0.49216807,  0.8620277 ,  0.76595682]], dtype=float32)

### Loss 1

In [13]:
loss1 = output_layer - t
loss1 = loss1.reshape([output_size,1])  # dot 계산을 위해서 reshape
loss1

array([[-0.56385577],
       [-0.64734733],
       [ 0.41607535],
       [ 0.56135166],
       [ 0.26467645],
       [ 0.67611516],
       [ 0.5656935 ],
       [-0.16165686],
       [ 0.49247238],
       [-0.24075788],
       [ 0.94391167],
       [ 0.96142739],
       [ 0.90824676],
       [-0.55155542],
       [ 0.84265763],
       [-0.95620847],
       [ 0.22700165],
       [ 0.05689625],
       [ 0.71179259],
       [-0.0304938 ],
       [-0.41838253],
       [ 0.48007843],
       [-0.95701864],
       [ 0.01124441],
       [-0.02711582],
       [ 0.49216807],
       [ 0.8620277 ],
       [ 0.76595682]])

### E

In [14]:
E = np.dot(loss1, hidden_layer)
E

array([[ 0.56871864,  0.59311089, -0.10500486, ..., -0.21230315,
        -0.6072509 , -0.01142167],
       [ 0.65293026,  0.68093434, -0.1205532 , ..., -0.24373942,
        -0.69716809, -0.0131129 ],
       [-0.41966371, -0.43766303,  0.07748424, ...,  0.15666082,
         0.44809709,  0.00842817],
       ..., 
       [-0.49641268, -0.51770374,  0.09165472, ...,  0.18531127,
         0.53004601,  0.00996953],
       [-0.8694621 , -0.9067532 ,  0.16053237, ...,  0.32457094,
         0.92837056,  0.01746154],
       [-0.77256267, -0.80569776,  0.14264142, ...,  0.2883983 ,
         0.82490593,  0.0155155 ]])

### Updated W_2

In [15]:
W_2_updated = W_2 - (learning_rate * E )
W_2_updated

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
including,0.734434,0.321032,0.117901,-0.67903,0.912152,-0.299862,0.383608,0.033802,-0.692735,-0.816438,...,-0.119073,0.196886,-0.01987,0.410775,-0.041514,1.113084,-0.442858,0.425461,-0.533669,0.026278
early,0.3405,0.248357,0.718527,-0.5823,-0.570328,0.136995,0.164863,-0.94011,0.156625,0.100554,...,-0.402456,0.352649,-0.311596,0.811578,-0.45676,-0.374054,0.116713,0.514215,-0.296382,0.428347
interest,0.0762,-0.977406,-0.011014,0.043711,-0.116545,-0.364516,-0.48407,0.284452,-0.453694,-0.131724,...,-0.689771,-0.147421,-0.407066,-0.168686,0.11731,-0.065537,-0.468626,-0.297951,-0.034531,0.617474
concerning,-0.079739,-0.128946,0.428587,0.607162,-0.461575,-0.254649,0.084873,0.024021,0.205942,0.828129,...,-0.402003,0.329847,1.109456,0.29402,0.475812,0.920166,0.48513,-0.62138,0.20122,0.637476
accepted,-0.223046,0.067642,0.265161,-0.464596,-0.252098,1.257402,-0.733405,0.411117,0.220037,0.00163,...,0.29752,-0.774239,-0.385383,-0.324333,-0.879709,0.325153,-0.143942,0.416169,-0.631195,0.097537
relations,0.561545,-0.366048,-0.941935,0.148983,0.281081,0.319473,0.194546,0.681491,0.081489,0.316217,...,-0.017412,-0.364949,-0.014031,-0.931108,-0.017125,-0.470063,-0.103121,-0.339655,1.65509,0.78831
related,0.712442,-0.743974,0.837177,-0.037657,-0.173902,0.579249,-0.368772,-0.118076,0.166707,0.905029,...,0.161729,0.014328,-1.41226,0.130445,0.193942,-0.25296,0.106263,0.917256,-0.378621,-0.405202
class,0.170868,-0.479523,-0.159625,-0.910371,0.720062,-0.687383,0.706359,0.432734,-0.463275,0.282448,...,0.009837,1.043092,-1.113404,-0.07307,-0.205836,-0.651583,0.20032,0.346566,0.175106,0.379246
organisation,0.020319,0.273812,0.215488,0.709826,-1.085477,-0.345429,-0.153258,0.144962,-0.353311,0.211769,...,-0.53152,-0.73678,-0.315313,0.920881,0.597342,-1.104765,-0.06777,0.023965,-0.873267,-0.370129
originated,0.200388,0.8259,0.248848,-0.100319,0.551914,-0.331153,0.118834,-0.54285,0.550593,-0.085135,...,0.063964,-0.025433,0.544401,-0.059867,-0.651718,0.554068,-0.382881,0.36517,-0.460378,-0.506772


### EH

In [16]:
EH = np.sum(np.dot(loss1, hidden_layer), axis = 0)
EH

array([ -5.73443403e+00,  -5.98038301e+00,   1.05877215e+00,
         9.17715958e-02,   2.97214634e+00,   5.93716323e-01,
         4.63030176e+00,   2.64220559e-01,  -3.03906967e+00,
         3.80124862e-01,  -1.63573773e+00,   4.02039443e-01,
         3.75374222e+00,   3.82587645e+00,   7.68113094e-01,
         8.11074141e-01,   1.30169056e+00,   6.49754122e-03,
         3.26352407e+00,  -1.27272951e+00,  -1.60448401e+00,
         1.44600884e+00,  -1.06122934e+00,   2.10430067e+00,
        -3.09392392e+00,  -3.24218740e+00,   3.65527171e-01,
         3.78359079e+00,  -6.81368464e+00,  -2.66663878e-01,
         1.51146181e-01,   6.34437524e-01,  -2.28006701e+00,
         1.86471985e+00,  -1.22901853e+00,   4.96351817e-01,
        -3.22436928e+00,   1.35935384e+00,   1.77922508e+00,
        -3.30203838e+00,   5.40425225e-01,   3.07789100e+00,
         1.02265607e+00,  -7.07132596e-01,  -1.25753260e+00,
        -3.54663389e+00,  -2.56373395e+00,   2.55871011e+00,
        -2.43825817e-01,

### Updated input word vector

In [17]:
input_word_vector_updated = input_word_vector - learning_rate * EH.T
input_word_vector_updated

0    -0.951280
1    -0.992080
2     0.175639
3     0.015224
4     0.493047
5     0.098491
6     0.768116
7     0.043831
8    -0.504148
9     0.063059
10   -0.271351
11    0.066694
12    0.622705
13    0.634671
14    0.127422
15    0.134548
16    0.215936
17    0.001078
18    0.541383
19   -0.211132
20   -0.266166
21    0.239877
22   -0.176046
23    0.349080
24   -0.513248
25   -0.537843
26    0.060637
27    0.627656
28   -1.130316
29   -0.044237
        ...   
70   -0.362986
71   -0.993482
72    0.035586
73   -0.653232
74    0.187010
75    0.329066
76    0.202640
77    1.805390
78   -0.262456
79   -0.738214
80    0.333580
81   -0.132500
82    0.291289
83    0.570336
84   -0.200879
85   -0.252269
86   -0.119447
87    0.062601
88   -0.101042
89    0.276966
90   -0.103132
91    0.103311
92   -0.389464
93    0.707730
94    0.297154
95    0.612143
96    0.073668
97    0.355114
98    1.015732
99    0.019105
Name: used, dtype: float64

### Update word matrix

In [18]:
word_matrix.ix[input_word] = input_word_vector_updated
word_matrix.ix[sample_word] = W_2_updated

word_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
organization,0.178017,0.012360,-0.503194,-0.459824,0.496037,0.119252,0.329428,-0.767484,0.207051,-0.192051,...,-0.114578,-0.917741,0.410301,-0.434145,-0.740530,-0.130829,0.092579,-0.116697,0.320323,0.990064
state,0.154270,0.700720,-0.108441,0.416652,0.194425,-0.358219,0.452353,0.890800,0.410773,0.017911,...,-0.077098,-0.513307,-1.016771,-0.379128,1.048174,-0.975451,0.377770,0.416691,0.429590,-0.162068
early,0.340500,0.248357,0.718527,-0.582300,-0.570328,0.136995,0.164863,-0.940110,0.156625,0.100554,...,-0.402456,0.352649,-0.311596,0.811578,-0.456760,-0.374054,0.116713,0.514215,-0.296382,0.428347
experiment,-0.781766,0.176941,-0.143949,-0.162064,-0.011218,0.441700,0.274888,1.031349,0.371849,0.732819,...,0.628236,0.478328,0.078996,0.946773,-0.288128,0.000134,0.911601,0.544589,-0.892710,0.840078
american,0.032122,-0.425582,0.474305,-0.960936,-0.497471,0.047829,-0.092854,-0.124465,-0.618539,-0.257411,...,0.026571,-0.313493,0.604312,-0.515188,-0.887631,-0.108320,0.135509,-0.154214,0.007211,-0.198859
refers,0.643002,0.240671,0.243144,-0.007076,-0.452451,-0.025668,-1.007756,-1.266100,-1.076234,0.435269,...,-0.475868,0.138739,-0.407778,-0.472809,-0.347058,-0.284329,0.507907,0.450244,-0.912864,-0.411998
works,0.048489,-0.460001,-0.673984,-0.184851,0.559281,-0.834754,0.759632,-0.414641,0.546397,-0.787098,...,-0.317517,-0.796270,0.272211,-0.049600,-0.222002,1.133706,0.330690,-0.368745,-0.363998,0.619318
elements,-0.495332,0.118620,-0.102118,-0.194777,0.097554,-0.782855,-0.508893,0.718485,0.174574,-0.140182,...,-0.992329,-0.067197,-0.350958,-0.131273,-1.101565,0.421922,0.370904,-0.450559,-0.394048,0.280111
zero,-0.899467,0.762909,0.804438,-0.163444,0.405143,-0.069352,0.594824,0.294959,-0.128104,0.789006,...,-0.833783,0.499133,0.217589,-0.507802,-0.401358,-0.170136,1.312550,-0.616037,-0.904264,-0.089283
predecessors,-0.054446,0.318987,-0.020159,-0.412162,-0.996863,0.259839,-1.012385,-0.347314,0.083482,0.955007,...,-0.057760,-0.091509,0.396677,1.298418,0.092405,-0.080861,-0.109151,0.061561,-0.608312,-0.332783
