IstexDoc2Vec.py
# -*- coding: utf-8 -*-
#
# This file is part of Beard.
# Copyright (C) 2016 3ST ERIC Laboratory.
#
# This is a free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
# more details.
# Load and transform Istex abstracts into a doc2vec representation.
# Author: Lucie Martinet <lucie.martinet@univ-lorraine.fr>
# Co-author: Hussein AL-NATSHEH <hussein.al-natsheh@ish-lyon.cnrs.fr>
# Affiliation: University of Lyon, ERIC Laboratory, Lyon2
# Thanks to the ISTEX project for the funding.
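#
# Example invocation, using the sample-data defaults declared in the argparse
# section below (the directory layout is an assumption taken from those defaults):
#   python IstexDoc2Vec.py --istex_dir sample_data/ISTEX/ \
#     --ucbl_file sample_data/sportArticlesAsIstex.json --output_file UcblIstex_matrix_model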
import os
import argparse
import json

from gensim.models.doc2vec import LabeledSentence, Doc2Vec
class LoadFileJson(object):
    def __init__(self):
        self.index = dict()
        self.count = 0
        self.wiki_count = 0

    # Yields one tagged document per UCBL and ISTEX record and, optionally,
    # per Wikipedia paragraph; ISTEX years are read from most recent to oldest.
    def LoadDocumentsIstexAndUCBL(self, directory, ucbl, wiki=None, max_nb_wiki_paragraphs=None, paragraphs_per_article=None):
        first_year = 1990
        last_year = 2016
        ## UCBL data loading
        with open(ucbl, 'r') as r:
            data = json.load(r)
        for doc in data:
            line = doc["title"] + " __ " + doc["abstract"]
            wordsl = self.UnicodAndTokenize(line)
            try:
                doi = doc["doi"]
            except KeyError:  # skip documents without a doi, so that tags stay unique
                continue
            yield LabeledSentence(words=wordsl, tags=['DOC_%s' % str(self.count)])
            self.index["UCBL" + doi] = str(self.count)
            self.count += 1
        ## ISTEX data loading
        # Years are read in reverse order so that older files can complete the
        # list of documents when the most recent ones do not contain enough.
        for y in sorted(range(first_year, last_year), reverse=True):
            js = []  # reset each year, otherwise a failed read would reuse the previous year's documents
            try:
                f = open(os.path.join(directory, str(y) + "json_perfect.json"), "r")
                js = json.load(f)
                f.close()
            except (IOError, ValueError):
                print os.path.join(directory, str(y) + "json_perfect.json")
                print "Year failed ", y
            try:
                f = open(os.path.join(directory, str(y) + "articles_json_perfect.json"), "r")
                js += json.load(f)
                f.close()
            except (IOError, ValueError):
                print "Year article failed ", y
            for doc in js:
                abstract = ""
                if "abstract" in doc:
                    abstract = doc["abstract"]
                line = doc["title"] + " __ " + abstract
                wordsl = self.UnicodAndTokenize(line)
                try:
                    doi = doc["doi"]
                except KeyError:  # skip documents without a doi
                    continue
                yield LabeledSentence(words=wordsl, tags=['DOC_%s' % str(self.count)])
                self.index["ISTEX" + doi] = str(self.count)
                self.count += 1
        ## Wikipedia data iteration
        if wiki is not None:
            for sub in os.listdir(wiki):
                subdir = os.path.join(wiki, sub)
                for fname in os.listdir(subdir):
                    paragraphs_per_article_count = 0
                    for line in open(os.path.join(subdir, fname)):  # each line represents one paragraph of a wiki article
                        if max_nb_wiki_paragraphs is not None and self.wiki_count >= max_nb_wiki_paragraphs:
                            break
                        if len(line.split()) > 2 and line[:8] != '<doc id=':  # skip article headers and too-short lines
                            lst = self.UnicodAndTokenize(line)
                            yield LabeledSentence(words=lst, tags=['WIKI_%s' % str(self.wiki_count)])
                            self.wiki_count += 1
                            if paragraphs_per_article is not None:
                                paragraphs_per_article_count += 1
                                if paragraphs_per_article_count >= paragraphs_per_article:
                                    break
        print 'number of wikipedia documents: ', self.wiki_count
        print 'number of ISTEX documents: ', self.count
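    # Each item yielded above is a LabeledSentence, e.g. (hypothetical values)
    # LabeledSentence(words=['deep', 'learning', 'for', 'nlp'], tags=['DOC_0']);
    # ISTEX/UCBL documents are tagged 'DOC_<n>' and Wikipedia paragraphs 'WIKI_<n>'.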
    # Normalize a line to utf-8, lowercase it, tokenize on whitespace
    # and drop purely numeric tokens.
    def UnicodAndTokenize(self, line):
        try:
            line = line.encode('utf-8', 'ignore').decode('utf-8')
        except UnicodeError:
            # fall back to word-by-word cleaning, keeping only the final
            # punctuation mark of words that cannot be encoded
            line2 = ""
            for w in line.split():
                try:
                    line2 += w.encode('utf-8', 'ignore').decode('utf-8') + " "
                except UnicodeError:
                    if w[-1] in ['?', '.', '!']:
                        line2 += w[-1] + " "
            line = line2.rstrip()  # remove the trailing space if it exists
        lst = line.lower().split()
        lst = [i for i in lst if not i.isdigit()]
        return lst
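    # For instance (hypothetical input), UnicodAndTokenize(u"The 3 results of 2016")
    # returns ['the', 'results', 'of'] -- purely numeric tokens are dropped,
    # while mixed tokens such as '2016!' would be kept.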
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--istex_dir", default='sample_data/ISTEX/', type=str)  # directory containing the .json files
    parser.add_argument("--ucbl_file", default='sample_data/sportArticlesAsIstex.json', type=str)  # a single .json file
    parser.add_argument("--wiki_dir", default=None, type=str)  # directory containing wikipedia text files
    parser.add_argument("--max_nb_wiki", default=None, type=int)  # maximum number of Wikipedia paragraphs to use
    parser.add_argument("--paragraphs_per_article", default=1, type=int)
    parser.add_argument("--output_file", default='UcblIstex_matrix_model', type=str)  # the saved model; reload it with Doc2Vec.load
    parser.add_argument("--vec_size", default=300, type=int)
    parser.add_argument("--window", default=8, type=int)
    parser.add_argument("--min_count", default=10, type=int)
    parser.add_argument("--workers", default=4, type=int)
    parser.add_argument("--iter", default=5, type=int)
    args = parser.parse_args()

    istex = args.istex_dir
    wiki = args.wiki_dir
    max_nb_wiki_paragraphs = args.max_nb_wiki
    paragraphs_per_article = args.paragraphs_per_article
    ucbl = args.ucbl_file
    output = args.output_file
    v_size = args.vec_size
    window_size = args.window
    min_count = args.min_count
    workers = args.workers
    n_iter = args.iter
    corpus = LoadFileJson()
    if paragraphs_per_article < 1 and max_nb_wiki_paragraphs is not None and max_nb_wiki_paragraphs > 0:
        print 'You need to set "paragraphs_per_article" to a number bigger than 0 if you want to load wikipedia articles'
    else:
        # gensim cannot train from a plain generator (it needs several passes
        # over the corpus), so the documents are materialized into a list
        data = list(corpus.LoadDocumentsIstexAndUCBL(istex, ucbl, wiki, max_nb_wiki_paragraphs, paragraphs_per_article))
        model = Doc2Vec(data, min_count=min_count, size=v_size, workers=workers, iter=n_iter, window=window_size)
        print "Vocabulary size after training: ", len(model.vocab)
        print "Count of documents: ", corpus.count + corpus.wiki_count
        with open(output + "keysIndex", "w") as f:
            json.dump(corpus.index, f)
        model.save(output)
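
# A minimal sketch of how the saved artifacts could be reused afterwards
# ('some_doi' is a hypothetical identifier; the same gensim version is assumed):
#   model = Doc2Vec.load('UcblIstex_matrix_model')
#   index = json.load(open('UcblIstex_matrix_model' + 'keysIndex'))
#   vector = model.docvecs['DOC_' + index['UCBL' + some_doi]]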