In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import gensim
from nltk.tokenize import word_tokenize
import xml.etree.ElementTree as ET
from lxml import etree
parser = etree.XMLParser(recover=True)
import os
data_path = 't2-doc/'

In [2]:
#Get information of t2-doc/
data = {}
for xml in os.listdir(data_path):
    with open(data_path + xml) as f:
        index = int(xml.split('.xml')[0])
        data[index] = {}
        #print(index)
        tmp = f.read().split('\n\n')
        title = etree.fromstringlist(['<title>', tmp[0], '</title>'], parser=parser)[0].text
        try:
            abstract = etree.fromstringlist(['<abstract>', tmp[1], '</abstract>'], parser=parser)[0].text
        except:
            abstract = etree.fromstringlist(['<abstract>', tmp[2], '</abstract>'], parser=parser)[0].text
        #print(title)
        #print(abstract)
        data[index]['title'] = title
        data[index]['abstract'] = abstract
        

alltitle = [data[i]['title'].strip() for i in sorted(data)]
allabs = [data[i]['abstract'].strip() for i in sorted(data)]

#tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(alldata)]
tagged_title = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(alltitle)]
tagged_abs = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(allabs)]

In [3]:
model_title = Doc2Vec.load('d2v_t.model')
model_abs = Doc2Vec.load('d2v_a.model')

In [None]:
#D2V
max_epochs = 100
vec_size = 64
alpha = 0.025

model_title = Doc2Vec(size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=5,
                workers=10,
                dm =1)
  
model_title.build_vocab(tagged_title)

model_abs = Doc2Vec(size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=5,
                workers=10,
                dm =1)
  
model_abs.build_vocab(tagged_abs)



for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model_title.train(tagged_title,
                total_examples=model_title.corpus_count,
                epochs=model_title.iter)
    model_abs.train(tagged_abs,
                total_examples=model_abs.corpus_count,
                epochs=model_abs.iter)
    # decrease the learning rate
    model_title.alpha -= 0.0002
    model_abs.alpha -= 0.0002
    # fix the learning rate, no decay
    model_title.min_alpha = model_title.alpha
    model_abs.min_alpha = model_abs.alpha

model_title.save("d2v_t.model")
model_abs.save("d2v_a.model")
print("Model Saved")

In [4]:
#Negative Sampling
import networkx as nx
G = nx.read_edgelist('t2-train.txt', nodetype=int, create_using=nx.DiGraph())

#negative sampling
neg = []
for node in G:
    for nbr, datadict in G.adj[node].items():
        for nnbr, datadict in G.adj[nbr].items():
            if nnbr not in G.adj[node]:
                neg.append(np.array([node, nnbr]))
neg = np.array(neg)
print(neg.shape)
np.random.shuffle(neg)
pos_num = sum([1 for line in open('t2-train.txt')])
neg_sample = neg[:pos_num]

(735227, 2)


In [5]:
#build embeddings dict
embeddings = []
for i in range(len(tagged_title)):
    title_emb = model_title.infer_vector(alltitle[i])
    abs_emb = model_abs.infer_vector(allabs[i])
    embeddings.append(np.hstack((title_emb, abs_emb)))
    #embeddings.append(model.infer_vector(alldata[i]))
embeddings = np.array(embeddings)
print(embeddings.shape)

(17500, 128)


In [6]:
from sklearn.cross_validation import train_test_split
from sklearn.svm import SVC
import xgboost as xgb
from xgboost import XGBClassifier



In [7]:
#Split training/testing data
X = []
y = []
with open('t2-train.txt') as f:
    for line in f:
        id1, id2 = map(int, line.split())
        X.append(np.hstack((embeddings[id1-1], embeddings[id2-1])))
        y.append(1)
    for id1, id2 in neg_sample:
        X.append(np.hstack((embeddings[id1-1], embeddings[id2-1])))
        y.append(0)
X = np.array(X)
y = np.array(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=33)

In [8]:
#XGboost
params = {
    'eta': 0.4,
    'max_depth': 4,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'seed': i,
    'silent': False,
    'nthreads':8
}
dtrain = xgb.DMatrix(X_train, label=y_train)
watchlist = [(xgb.DMatrix(X_train, y_train), 'train'), (xgb.DMatrix(X_test, y_test), 'val')]
model = xgb.train(params, dtrain, 1000, watchlist, verbose_eval=True, early_stopping_rounds=10)

[0]	train-logloss:0.684872	val-logloss:0.68635
Multiple eval metrics have been passed: 'val-logloss' will be used for early stopping.

Will train until val-logloss hasn't improved in 100 rounds.
[1]	train-logloss:0.679031	val-logloss:0.681266
[2]	train-logloss:0.674677	val-logloss:0.677153
[3]	train-logloss:0.672033	val-logloss:0.675128
[4]	train-logloss:0.669731	val-logloss:0.67274
[5]	train-logloss:0.66662	val-logloss:0.669932
[6]	train-logloss:0.664597	val-logloss:0.668265
[7]	train-logloss:0.6625	val-logloss:0.666457
[8]	train-logloss:0.660732	val-logloss:0.665321
[9]	train-logloss:0.659594	val-logloss:0.664396
[10]	train-logloss:0.657937	val-logloss:0.663097
[11]	train-logloss:0.656895	val-logloss:0.662439
[12]	train-logloss:0.655277	val-logloss:0.661476
[13]	train-logloss:0.653278	val-logloss:0.659945
[14]	train-logloss:0.651716	val-logloss:0.658665
[15]	train-logloss:0.650155	val-logloss:0.657363
[16]	train-logloss:0.648359	val-logloss:0.655647
[17]	train-logloss:0.647115	val-lo

[164]	train-logloss:0.571905	val-logloss:0.617985
[165]	train-logloss:0.571515	val-logloss:0.617923
[166]	train-logloss:0.571113	val-logloss:0.617891
[167]	train-logloss:0.570678	val-logloss:0.617844
[168]	train-logloss:0.570234	val-logloss:0.617809
[169]	train-logloss:0.569854	val-logloss:0.617744
[170]	train-logloss:0.569472	val-logloss:0.6176
[171]	train-logloss:0.569127	val-logloss:0.61777
[172]	train-logloss:0.56885	val-logloss:0.617765
[173]	train-logloss:0.568541	val-logloss:0.617525
[174]	train-logloss:0.568312	val-logloss:0.617509
[175]	train-logloss:0.56803	val-logloss:0.617483
[176]	train-logloss:0.56771	val-logloss:0.617453
[177]	train-logloss:0.567335	val-logloss:0.617427
[178]	train-logloss:0.56694	val-logloss:0.617088
[179]	train-logloss:0.566629	val-logloss:0.617093
[180]	train-logloss:0.56622	val-logloss:0.617095
[181]	train-logloss:0.565804	val-logloss:0.617009
[182]	train-logloss:0.565489	val-logloss:0.616988
[183]	train-logloss:0.565039	val-logloss:0.616634
[184]	tr

[330]	train-logloss:0.522534	val-logloss:0.613365
[331]	train-logloss:0.522353	val-logloss:0.613462
[332]	train-logloss:0.522048	val-logloss:0.613459
[333]	train-logloss:0.521767	val-logloss:0.613462
[334]	train-logloss:0.521586	val-logloss:0.613519
[335]	train-logloss:0.52141	val-logloss:0.613589
[336]	train-logloss:0.521149	val-logloss:0.613446
[337]	train-logloss:0.520912	val-logloss:0.613361
[338]	train-logloss:0.520567	val-logloss:0.613325
[339]	train-logloss:0.520275	val-logloss:0.613383
[340]	train-logloss:0.520073	val-logloss:0.61322
[341]	train-logloss:0.519849	val-logloss:0.61324
[342]	train-logloss:0.519599	val-logloss:0.613276
[343]	train-logloss:0.519357	val-logloss:0.613276
[344]	train-logloss:0.519051	val-logloss:0.61317
[345]	train-logloss:0.518767	val-logloss:0.613132
[346]	train-logloss:0.518603	val-logloss:0.61319
[347]	train-logloss:0.518344	val-logloss:0.613111
[348]	train-logloss:0.518048	val-logloss:0.613308
[349]	train-logloss:0.517728	val-logloss:0.613195
[350]

[495]	train-logloss:0.483145	val-logloss:0.61287
[496]	train-logloss:0.482898	val-logloss:0.612912
[497]	train-logloss:0.48265	val-logloss:0.61302
[498]	train-logloss:0.482397	val-logloss:0.613091
[499]	train-logloss:0.482142	val-logloss:0.613204
[500]	train-logloss:0.481941	val-logloss:0.613037
[501]	train-logloss:0.481652	val-logloss:0.612996
[502]	train-logloss:0.481482	val-logloss:0.613049
[503]	train-logloss:0.48121	val-logloss:0.612916
[504]	train-logloss:0.481031	val-logloss:0.612916
[505]	train-logloss:0.480851	val-logloss:0.612985
[506]	train-logloss:0.480669	val-logloss:0.61288
[507]	train-logloss:0.480564	val-logloss:0.612877
[508]	train-logloss:0.480268	val-logloss:0.612887
[509]	train-logloss:0.480028	val-logloss:0.613014
[510]	train-logloss:0.479782	val-logloss:0.613009
[511]	train-logloss:0.479593	val-logloss:0.612767
[512]	train-logloss:0.479439	val-logloss:0.61273
[513]	train-logloss:0.47915	val-logloss:0.612819
[514]	train-logloss:0.478933	val-logloss:0.612983
[515]	t

In [9]:
#Result
testing_data = []
with open('t2-test.txt') as f:
    for line in f:
        id1, id2 = map(int, line.split())
        testing_data.append(np.hstack((embeddings[id1-1], embeddings[id2-1])))
testing_data = np.array(testing_data)
pred = model.predict(xgb.DMatrix(testing_data))
mean = np.median(pred)
print(pred)
pred[pred>=mean] = 1
pred[pred<mean] = 0
np.savetxt('pred.txt', pred.reshape(-1, 1), fmt='%d')

[0.7111117  0.52830935 0.50367373 ... 0.13654596 0.7100221  0.7023309 ]


In [10]:
!python3 pred-txt-to-csv.py pred.txt