In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import gensim
from nltk.tokenize import word_tokenize
import xml.etree.ElementTree as ET
from lxml import etree
parser = etree.XMLParser(recover=True)
import os
from sklearn.preprocessing import StandardScaler
data_path = 't3-doc/'
dirs = os.listdir()

In [2]:
#Get information of t3-doc/
data = {}
for xml in os.listdir(data_path):
    with open(data_path + xml) as f:
        index = int(xml.split('.xml')[0])
        data[index] = {}
        tmp = f.read().split('\n\n')
        date = etree.fromstringlist(['<date>', tmp[0], '</date>'], parser=parser)[0].text
        title = etree.fromstringlist(['<title>', tmp[1], '</title>'], parser=parser)[0].text
        try:
            abstract = etree.fromstringlist(['<abstract>', tmp[2], '</abstract>'], parser=parser)[0].text
        except:
            abstract = etree.fromstringlist(['<abstract>', tmp[3], '</abstract>'], parser=parser)[0].text
        data[index]['date'] = date
        data[index]['title'] = title
        data[index]['abstract'] = abstract
        

alltitle = [data[i]['title'].strip() for i in sorted(data)]
allabs = [data[i]['abstract'].strip() for i in sorted(data)]
alldate = [data[i]['date'].strip() for i in sorted(data)]
#tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(alldata)]
tagged_title = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(alltitle)]
tagged_abs = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(allabs)]
doc_n = len(alltitle)

#Get date information (int)
from datetime import datetime

for i in range(len(alldate)):
    date = alldate[i]
    #print(date)
    for fmt in ['%a, %d %b %Y', '%a, %d %b %y']:
        try:
            tmp = datetime.strptime(date, fmt)
        except ValueError as v:
            if 'unconverted' in v.args[0]:
                ulr = len(v.args[0].partition('unconverted data remains: ')[2])
                if ulr:
                    date = date[:-ulr]
                    try:
                        tmp = datetime.strptime(date, fmt)
                        break
                    except ValueError:
                        continue
                else:
                    raise v
            else:
                pass
    alldate[i] = int(datetime.strftime(tmp, '%Y%m%d'))
    #print(alldate[i])


In [4]:
#Calculate Doc Sims

if "sims_title.npy" not in dirs or "sims_abs.npy" not in dirs:
    gen_titles = [[w.lower() for w in word_tokenize(t)] for t in alltitle]
    gen_abss = [[w.lower() for w in word_tokenize(t)] for t in allabs]
    dict_title = gensim.corpora.dictionary.Dictionary(gen_titles)
    dict_abs = gensim.corpora.dictionary.Dictionary(gen_abss)
    corpus_title = [dict_title.doc2bow(gen_title) for gen_title in gen_titles]
    corpus_abs = [dict_title.doc2bow(gen_abs) for gen_abs in gen_abss]
    tf_idf_title = gensim.models.TfidfModel(corpus_title)
    tf_idf_abs = gensim.models.TfidfModel(corpus_abs)
    sims_title = gensim.similarities.Similarity('sim',tf_idf_title[corpus_title], 
                                            num_features=len(dict_title))
    sims_abs = gensim.similarities.Similarity('sim',tf_idf_abs[corpus_abs], 
                                            num_features=len(dict_abs))
    tmp1 = []
    tmp2 = []
    for line in sims_title:
        tmp1.append(line)
    for line in sims_abs:
        tmp2.append(line)
    del sims_title, sims_abs
    sims_title = np.array(tmp1)
    sims_abs = np.array(tmp2)
    del tmp1, tmp2
    np.save('sims_title.npy', sims_title)
    np.save('sims_abs.npy', sims_abs)
else:
    sims_title = np.load('sims_title.npy')
    sims_abs = np.load('sims_abs.npy')


In [5]:
#D2V

if "d2v_t.model" not in dirs or "d2v_a.model" not in dirs:
    max_epochs = 100
    vec_size = 64
    alpha = 0.025

    model_title = Doc2Vec(size=vec_size,
                    alpha=alpha, 
                    min_alpha=0.00025,
                    min_count=5,
                    workers=10,
                    dm =1)

    model_title.build_vocab(tagged_title)

    model_abs = Doc2Vec(size=vec_size,
                    alpha=alpha, 
                    min_alpha=0.00025,
                    min_count=5,
                    workers=10,
                    dm =1)

    model_abs.build_vocab(tagged_abs)



    for epoch in range(max_epochs):
        print('iteration {0}'.format(epoch))
        model_title.train(tagged_title,
                    total_examples=model_title.corpus_count,
                    epochs=model_title.iter)
        model_abs.train(tagged_abs,
                    total_examples=model_abs.corpus_count,
                    epochs=model_abs.iter)
        # decrease the learning rate
        model_title.alpha -= 0.0002
        model_abs.alpha -= 0.0002
        # fix the learning rate, no decay
        model_title.min_alpha = model_title.alpha
        model_abs.min_alpha = model_abs.alpha

    model_title.save("d2v_t.model")
    model_abs.save("d2v_a.model")
    print("Model Saved")
else:
    model_title = Doc2Vec.load('d2v_t.model')
    model_abs = Doc2Vec.load('d2v_a.model')

In [6]:
#Negative Sampling
import networkx as nx
G = nx.read_edgelist('t3-train.txt', nodetype=int, create_using=nx.DiGraph())

#negative sampling
neg = []
for node in G:
    for nbr, datadict in G.adj[node].items():
        for nnbr, datadict in G.adj[nbr].items():
            if nnbr not in G.adj[node]:
                neg.append(np.array([node, nnbr]))
neg = np.array(neg)
print(neg.shape)
np.random.shuffle(neg)
pos_num = sum([1 for line in open('t3-train.txt')])
neg_sample = neg[:pos_num]

(815719, 2)


In [7]:
#build embeddings dict
embeddings = []
for i in range(len(tagged_title)):
    title_emb = model_title.infer_vector(alltitle[i])
    abs_emb = model_abs.infer_vector(allabs[i])
    embeddings.append(np.hstack((title_emb, abs_emb)))
    #embeddings.append(model.infer_vector(alldata[i]))
embeddings = np.array(embeddings)
print(embeddings.shape)

(17500, 128)


In [8]:
from sklearn.cross_validation import train_test_split
from sklearn.svm import SVC
import xgboost as xgb
from xgboost import XGBClassifier



In [9]:
#Split training/testing data
#scaler = StandardScaler()
X = []
y = []
with open('t3-train.txt') as f:
    for line in f:
        id1, id2 = map(int, line.split())
        emb1 = embeddings[id1-1]
        emb2 = embeddings[id2-1]
        X.append(np.hstack((emb1, emb2, alldate[id1-1], alldate[id2-1],
                            sims_title[id1-1][id2-1], sims_abs[id1-1][id2-1])))
        y.append(1)
    for id1, id2 in neg_sample:
        emb1 = embeddings[id1-1]
        emb2 = embeddings[id2-1]
        X.append(np.hstack((emb1, emb2, alldate[id1-1], alldate[id2-1],
                            sims_title[id1-1][id2-1], sims_abs[id1-1][id2-1])))
        y.append(0)
#scaler.fit(X)
X = np.array(X)

y = np.array(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=33)

In [11]:
#XGboost
params = {
    'eta': 0.4,
    'max_depth': 3,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'seed': i,
    'silent': False,
    'nthreads':8
}
dtrain = xgb.DMatrix(X_train, label=y_train)
watchlist = [(xgb.DMatrix(X_train, y_train), 'train'), (xgb.DMatrix(X_test, y_test), 'val')]
model = xgb.train(params, dtrain, 1000, watchlist, verbose_eval=True, early_stopping_rounds=10)

[0]	train-logloss:0.644239	val-logloss:0.6436
Multiple eval metrics have been passed: 'val-logloss' will be used for early stopping.

Will train until val-logloss hasn't improved in 10 rounds.
[1]	train-logloss:0.617355	val-logloss:0.617322
[2]	train-logloss:0.603824	val-logloss:0.603682
[3]	train-logloss:0.594511	val-logloss:0.594986
[4]	train-logloss:0.584159	val-logloss:0.585384
[5]	train-logloss:0.577993	val-logloss:0.579356
[6]	train-logloss:0.574453	val-logloss:0.576494
[7]	train-logloss:0.571396	val-logloss:0.573583
[8]	train-logloss:0.568035	val-logloss:0.570097
[9]	train-logloss:0.565637	val-logloss:0.567599
[10]	train-logloss:0.564095	val-logloss:0.566293
[11]	train-logloss:0.56304	val-logloss:0.565381
[12]	train-logloss:0.561412	val-logloss:0.564269
[13]	train-logloss:0.558828	val-logloss:0.561755
[14]	train-logloss:0.55729	val-logloss:0.560557
[15]	train-logloss:0.555953	val-logloss:0.559459
[16]	train-logloss:0.554937	val-logloss:0.558666
[17]	train-logloss:0.553723	val-lo

[164]	train-logloss:0.501022	val-logloss:0.522981
[165]	train-logloss:0.500956	val-logloss:0.522973
[166]	train-logloss:0.500702	val-logloss:0.522812
[167]	train-logloss:0.5005	val-logloss:0.522682
[168]	train-logloss:0.500272	val-logloss:0.52251
[169]	train-logloss:0.500131	val-logloss:0.522429
[170]	train-logloss:0.500017	val-logloss:0.522452
[171]	train-logloss:0.499855	val-logloss:0.522398
[172]	train-logloss:0.499674	val-logloss:0.522217
[173]	train-logloss:0.499559	val-logloss:0.522141
[174]	train-logloss:0.499354	val-logloss:0.522241
[175]	train-logloss:0.499122	val-logloss:0.522116
[176]	train-logloss:0.49892	val-logloss:0.52201
[177]	train-logloss:0.498699	val-logloss:0.521911
[178]	train-logloss:0.498451	val-logloss:0.52181
[179]	train-logloss:0.498239	val-logloss:0.521575
[180]	train-logloss:0.497997	val-logloss:0.52144
[181]	train-logloss:0.497807	val-logloss:0.521394
[182]	train-logloss:0.497651	val-logloss:0.521362
[183]	train-logloss:0.497454	val-logloss:0.521414
[184]	t

[329]	train-logloss:0.473992	val-logloss:0.512761
[330]	train-logloss:0.473801	val-logloss:0.512782
[331]	train-logloss:0.473658	val-logloss:0.51276
[332]	train-logloss:0.473529	val-logloss:0.512732
[333]	train-logloss:0.473401	val-logloss:0.512735
[334]	train-logloss:0.473251	val-logloss:0.512724
[335]	train-logloss:0.4732	val-logloss:0.512762
[336]	train-logloss:0.473129	val-logloss:0.512743
[337]	train-logloss:0.473028	val-logloss:0.512779
[338]	train-logloss:0.47288	val-logloss:0.512637
[339]	train-logloss:0.472734	val-logloss:0.512628
[340]	train-logloss:0.472579	val-logloss:0.512675
[341]	train-logloss:0.472531	val-logloss:0.512643
[342]	train-logloss:0.47244	val-logloss:0.512663
[343]	train-logloss:0.472306	val-logloss:0.512758
[344]	train-logloss:0.472213	val-logloss:0.512821
[345]	train-logloss:0.472068	val-logloss:0.512718
[346]	train-logloss:0.471981	val-logloss:0.512778
[347]	train-logloss:0.471813	val-logloss:0.512787
[348]	train-logloss:0.471665	val-logloss:0.512778
[349]

In [12]:
#Result
testing_data = []
with open('t3-test.txt') as f:
    for line in f:
        id1, id2 = map(int, line.split())
        emb1 = embeddings[id1-1]
        emb2 = embeddings[id2-1]
        testing_data.append(np.hstack((emb1, emb2, alldate[id1-1], alldate[id2-1],
                                      sims_title[id1-1][id2-1], sims_abs[id1-1][id2-1])))
testing_data = np.array(testing_data)
pred = model.predict(xgb.DMatrix(testing_data))
mean = np.median(pred)
print(mean)
print(pred)
pred[pred>=mean] = 1
pred[pred<mean] = 0
np.savetxt('pred.txt', pred.reshape(-1, 1), fmt='%d')

0.6219919
[0.8838251  0.17228323 0.70242697 ... 0.82114273 0.9347503  0.52999765]


In [13]:
!python3 pred-txt-to-csv.py pred.txt