In [1]:
import json
from elasticsearch import Elasticsearch
import requests
from tqdm import tqdm_notebook as tqdm

In [3]:
# Smoke-test the local Elasticsearch node and print its raw response body.
# The timeout keeps the cell from hanging forever when nothing answers on the
# port, and raise_for_status() turns an HTTP error response into an exception
# instead of printing an error body as if it were the node's info banner.
res = requests.get('http://localhost:9200', timeout=10)
res.raise_for_status()
print(res.content)

b'{\n  "name" : "cpusrv-xeon-101",\n  "cluster_name" : "elasticsearch",\n  "cluster_uuid" : "uHSOXgwSQoe6tNgXTGU5kA",\n  "version" : {\n    "number" : "7.6.1",\n    "build_flavor" : "default",\n    "build_type" : "tar",\n    "build_hash" : "aa751e09be0a5072e8570670309b1f12348f023b",\n    "build_date" : "2020-02-29T00:15:25.529771Z",\n    "build_snapshot" : false,\n    "lucene_version" : "8.4.0",\n    "minimum_wire_compatibility_version" : "6.8.0",\n    "minimum_index_compatibility_version" : "6.0.0-beta1"\n  },\n  "tagline" : "You Know, for Search"\n}\n'


In [4]:
# Client handle used by the indexing and search cells below; it targets the
# Elasticsearch node on localhost:9200.
es = Elasticsearch(
    [
        {'host': 'localhost', 'port': 9200},
    ]
)

## IDs under Consideration for the project

In [34]:
# Assemble the path of the paper-ID JSON file (the value 2 is spliced into the
# file name), load the ID list from it and display how many IDs it holds.
IDoutfileName = "".join(["./data/dblpPaperIDs_", str(2), "Thresholded.json"])
with open(IDoutfileName, 'r') as f:
    paperIdList = json.loads(f.read())
len(paperIdList)

475839

In [35]:
# Field-of-study names that are always kept in a paper's 'fos' list by the
# export cell, regardless of the weight attached to them.
AITopLevelTopics = {
    'Artificial intelligence',
    'Computer vision',
    'Data mining',
    'Data science',
    'Machine learning',
    'Natural language processing',
    'Pattern recognition',
    'Speech recognition',
}

In [36]:
def _buildPaperRecord(data, topLevelTopics):
    """Project one raw paper dict onto the fields written to the bulk file.

    Parameters
    ----------
    data : dict
        One parsed JSON line from the DBLP dump.
    topLevelTopics : set of str
        Field-of-study names that are kept even when their weight is not
        positive.

    Returns
    -------
    dict
        Keys, in insertion order: 'id', 'title', 'venue' (only when the source
        has a raw venue name), 'authors', 'year', 'abstract', 'fos'.

    Optional source fields are looked up tolerantly so that a single
    incomplete record cannot abort the whole export with a KeyError.
    """
    record = dict()
    record['id'] = data.get('id', '')
    record['title'] = data.get('title', '')
    # NOTE: 'references' is not exported (the reference filter was disabled).

    venue = data.get('venue')
    if isinstance(venue, dict) and 'raw' in venue:
        record['venue'] = venue['raw']

    record['authors'] = [
        author['name']
        for author in (data.get('authors') or [])
        if 'name' in author
    ]

    # A missing year is written as JSON null instead of raising.
    record['year'] = data.get('year')

    # The abstract is rebuilt from the *keys* of the inverted index only:
    # every distinct token longer than one character appears once, in the
    # order the keys occur in the input; positions are not used.
    # NOTE(review): this drops repeated words -- confirm that is intended.
    invertedIndex = (data.get('indexed_abstract') or {}).get('InvertedIndex') or {}
    abstractWords = [word for word in invertedIndex if len(word) > 1]
    record['abstract'] = ' '.join(abstractWords).replace('\n', ' ').replace('\r', '')

    # Keep a topic when it has a positive weight or is one of the top-level
    # topics.
    record['fos'] = [
        topic['name']
        for topic in (data.get('fos') or [])
        if 'name' in topic
        and (topic.get('w', 0) > 0 or topic['name'] in topLevelTopics)
    ]
    return record


# Write the papers under consideration as an Elasticsearch bulk file: for
# every kept paper, one action line ({'index': {'_id': ...}}) followed by one
# document line.
papersUnderConsideration = set(paperIdList)
PapersOutFileName = './data/es/dblp_AIpapers_v1.json'
# The input is opened first, so a missing dump does not truncate an existing
# output file. The encoding is pinned so reading does not depend on the locale.
with open('dblp_papers_v11.txt', 'r', encoding='utf-8') as file, \
        open(PapersOutFileName, 'w', encoding='utf-8') as outfile:
    for line in file:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        data = json.loads(line)
        paperID = data.get('id', '')
        if paperID not in papersUnderConsideration:
            continue

        dataDict = _buildPaperRecord(data, AITopLevelTopics)
        indexDict = {'index': {'_id': dataDict['id']}}

        json.dump(indexDict, outfile)
        outfile.write('\n')
        json.dump(dataDict, outfile)
        outfile.write('\n')

In [48]:
# Parse every line of the bulk file back into Python objects, keeping the
# file's line order.
with open(PapersOutFileName, 'r') as file:
    body = [json.loads(line) for line in file]

In [69]:
# Keep only the entries at odd positions of `body` (1, 3, 5, ...). In the bulk
# file each action line is followed by its document line, so these are the
# documents. Then display how many there are.
records = body[1::2]
len(records)

475839

In [74]:
# Send each paper document to the 'dblp_v1' index in its own request, using
# the paper's 'id' field as the Elasticsearch document id; tqdm shows progress.
for record in tqdm(records):
    res = es.index(
        body=record,
        id=record['id'],
        index='dblp_v1',
        doc_type='paper',
    )

HBox(children=(IntProgress(value=0, max=475839), HTML(value='')))




In [76]:
# Names of the fields that every exported paper document may carry.
fields = [
    'id',
    'title',
    'venue',
    'authors',
    'year',
    'abstract',
    'fos',
]

In [139]:
# Search request body: a multi_match query for the text "sentence embeddings"
# over the 'title' and 'abstract' fields.
queryBody = dict(
    query=dict(
        multi_match=dict(
            query="sentence embeddings",
            fields=['title', 'abstract'],
        ),
    ),
)

In [140]:
# Run the prepared query against the 'dblp_v1' index and keep the response.
res = es.search(
    body=queryBody,
    index='dblp_v1',
)

In [141]:
# Display the raw search response (timing, shard info, total hits and the
# matching documents with their scores).
res

{'took': 17,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 6584, 'relation': 'eq'},
  'max_score': 16.353527,
  'hits': [{'_index': 'dblp_v1',
    '_type': 'paper',
    '_id': '2175723921',
    '_score': 16.353527,
    '_source': {'id': '2175723921',
     'title': 'Towards Universal Paraphrastic Sentence Embeddings',
     'venue': 'international conference on learning representations',
     'authors': ['John Wieting',
      'Mohit Bansal',
      'Kevin Gimpel',
      'Karen Livescu'],
     'year': 2016,
     'abstract': 'Abstract: We consider the problem of learning general-purpose, paraphrastic sentence embeddings based on supervision from Paraphrase Database (Ganitkevitch et al., 2013). compare six compositional architectures, evaluating them annotated textual similarity datasets drawn both same distribution as training data and wide range other domains. find that most complex such long short-term memory (LSTM)

In [39]:
!curl -XPOST localhost:9200/dblp_v1/paper/_bulk --data-binary  @/home/du0/15CS30003/nairp2/ontology/data/es/dblp_AIpapers_v1.json

In [5]:
# List the node's indices with a header row (?v): health, status, document
# counts and store size per index.
!curl 'localhost:9200/_cat/indices?v'

health status index   uuid                   pri rep docs.count docs.deleted store.size pri.store.size
yellow open   test    bT3mFwXHSvimc6ztC8q7Lw   1   1          4            0     15.7kb         15.7kb
yellow open   dblp_v1 xbTu48DzRoijqo1DTXQQlw   1   1     475839            0    629.4mb        629.4mb


In [123]:
from clio_lite import clio_search
from clio_lite import clio_keywords

# Settings for the clio_lite call: node URL, index name and the query term.
url, index, query = "http://localhost:9200", "dblp_v1", "finance"

# Ask clio_keywords for keywords for `query`, restricted to the 'title' and
# 'abstract' fields of the index.
keywords = clio_keywords(
    url=url,
    index=index,
    query=query,
    fields=['title', 'abstract'],
)

In [124]:
# Display the result of clio_keywords: a list of {'key', 'score'} entries.
keywords

[{'key': 'finance', 'score': 1715.9172190632926},
 {'key': 'economics', 'score': 13.099794878825975},
 {'key': 'financial', 'score': 12.132793995808761},
 {'key': 'applications', 'score': 8.201792215672679},
 {'key': 'portfolio', 'score': 6.861277926869297},
 {'key': 'stock', 'score': 6.691882871534194},
 {'key': 'bankruptcy', 'score': 5.276549542559988},
 {'key': 'cvar', 'score': 4.96973724661665},
 {'key': 'intraday', 'score': 3.2293785689523222},
 {'key': 'series', 'score': 1.85564460243314},
 {'key': 'systems', 'score': 1.5042725679228746}]

In [138]:
# NOTE(review): scratch cell -- prints the type of a string literal and does
# not use anything from the rest of the notebook; candidate for removal.
print(type('hello'))

<class 'str'>
