#  FIR Final Project

In [1]:
!pip uninstall -y gensim
!pip3 install gensim
!pip install nltk

Found existing installation: gensim 4.3.2
Uninstalling gensim-4.3.2:
  Successfully uninstalled gensim-4.3.2
Defaulting to user installation because normal site-packages is not writeable
Collecting gensim
  Using cached gensim-4.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
Installing collected packages: gensim
Successfully installed gensim-4.3.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable
Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: nltk
[0mSuccessfully installed nltk-3.8.1

[1m[

In [1]:
elastic = True
training = False

In [2]:
import re
import json
import nltk
if elastic:
    import elasticsearch
    import elasticsearch.helpers

from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.models.word2vec import Word2Vec
from tqdm import tqdm
from itertools import islice

nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ut-student/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Elastic Search - Text similarity search with vector fields
In Elasticsearch 7.0, we introduced experimental field types for high-dimensional vectors, and now the 7.3 release brings support for using these vectors in document scoring.
We could use text embeddings to allow for retrieving similar questions:
- During indexing, each question is run through a sentence embedding model to produce a numeric vector.
- When a user enters a query, it is run through the same sentence embedding model to produce a vector. To rank the responses, we calculate the vector similarity between each question and the query vector. When comparing embedding vectors, it is common to use cosine similarity.

For this we need to create the Elasticsearch index, which includes mappings for the properties (title, question, tags, etc) encoded as a vector.

**source:** https://www.elastic.co/search-labs/text-similarity-search-with-vectors-in-elasticsearch

In [3]:
def read_documents(file_name):
    """
    Returns a generator of documents to be indexed by elastic, read from file_name
    """
    with open(file_name, 'r') as documents:
        for index in range(2000):  # change this to retrive more than 1000 docs
            line = next(documents)
            doc_line = json.loads(line)
            if ('index' in doc_line):
                id = doc_line['index']['_id']
            elif ('PMID' in doc_line):
                doc_line['_id'] = id
                yield doc_line
            else:
                raise ValueError('Woops, error in index file')


def create_index(es, index_name, body={}):
    # delete index when it already exists
    es.indices.delete(index=index_name, ignore=[400, 404])
    # create the index 
    es.indices.create(index=index_name, body=body)


def index_documents(es, collection_file_name, index_name, body={}):
    create_index(es, index_name, body)
    # bulk index the documents from file_name
    return elasticsearch.helpers.bulk(
        es, 
        read_documents(collection_file_name),
        index=index_name,
        chunk_size=2000,
        request_timeout=30
    )

In [5]:
# Connect to the ElasticSearch server
if elastic:
    es = elasticsearch.Elasticsearch(host='localhost')
    body = {
               "mappings": {
                    "properties": {
                        "title-abstract-vector": {
                            "type": "dense_vector",
                            "dims": 300,
                            "index": "true",
                            "similarity": "cosine"
                        }
                    }
                }
            }
    # Index the collection into the index called 'genomics-word2vec'
    index_name = 'genomics-word2vec'
    index_documents(es, 'data01/FIR-s05-medline.json', 'genomics-word2vec', body)



### We want to user Word2Vec representation to retrive the documents for every query. 
We need to have an embedding for every title/abstract from our dataset. For this, we need to train the word2vec model using our title/abstract data from the dataset. That's why we retrieved all of them from elastic and our sentences for training the word2vec model are concatenated title and abstract for every item.

Word2Vec documentation: https://rare-technologies.com/word2vec-tutorial/

In [6]:
def get_all_titles_abstract_from_elastic():
    query = {
              "size": 1000,
              "_source": {
                "include": [
                  "AB", "TI", "title-abstract-vector"
                ]
              },
              "query": {
                "match_all": {}
              }
        }

    resp = es.search(index=index_name, body=query, scroll='1m')

    print('Total %d hits found.' % resp["hits"]["total"]["value"])

    results = resp['hits']['hits']
    max_length = resp['hits']["total"]["value"]
    while len(results) < max_length:
        resp = es.scroll(scroll_id=resp['_scroll_id'], scroll="1m")
        results += resp['hits']['hits']

    return results

In [7]:
if elastic:
    sentences = get_all_titles_abstract_from_elastic()

Total 1000 hits found.




### This is the retriving from disk version of the above code.

In [8]:
def get_all_titles_abstract_from_disk(input_file_path):
    data = []
    ids = []

    with open(input_file_path) as f:
        for line in tqdm(f):
            # json_line = json.loads(line)
            if '_id' in line:
                ids.append(json.loads(line)['index']['_id'])
            if 'TI' in line:
                data.append(json.loads(line))

    data = [elem['AB']+' '+elem['TI'] for elem in data if 'AB' in list(elem.keys())]

    return data, ids

In [9]:
if not elastic:
    input_file_path = 'FIR-s05-medline.json'
    sentences, ids = get_all_titles_abstract_from_disk(input_file_path)

In [10]:
if elastic:
    documents = {elem['_id'] : elem['_source']['AB']+' '+elem['_source']['TI'] for elem in sentences if 'AB' in list(elem['_source'].keys())}
else:
    documents = {ids[i] : elem for i,elem in enumerate(sentences)}
print(documents['3'])

The global fold of maltose binding protein in complex with beta-cyclodextrin has been determined using a CNS-based torsion angle molecular dynamics protocol involving direct refinement against dipolar couplings and carbonyl chemical shift changes that occur upon alignment. The shift changes have been included as structural restraints using a new module, CANI, that has been incorporated into CNS. Force constants and timesteps have been determined that are particularly effective in structure refinement applications involving high molecular weight proteins with small to moderate numbers of NOE restraints. Solution structures of the N Direct structure refinement of high molecular weight proteins against residual dipolar couplings and carbonyl chemical shift changes upon alignment: an application to maltose binding protein.


## Preprocess Our Data
Prepare your text data by tokenizing and cleaning it.We need only the abstract and the title, so only the 'AB' and 'TI' keys are retrieved.

In [11]:
if elastic:
    sentences = [elem['_source']['AB']+' '+elem['_source']['TI'] for elem in sentences if 'AB' in list(elem['_source'].keys())]

In [12]:
if training:
    w2v_sentences = []
    for s in sentences:
        for sent in sent_tokenize(s):
            w2v_sentences.append(word_tokenize(sent.lower()))
    print(w2v_sentences[0])

### Training Word2Vec model using **Gensim**
We will save the model in 'w2v_genomics_model.bin' so we can skip this step and use the loaded version of it. We are training the model with the parameters from this paper: Section 4.1.4 - https://dl.acm.org/doi/10.1145/3476415.3476433 

In [13]:
# Train a word2vec model
if training:
    w2v_genomics = Word2Vec(workers=8, min_count=10, window=10, vector_size=300)
    w2v_genomics.build_vocab(w2v_sentences)
    w2v_genomics.train(w2v_sentences, total_examples=w2v_genomics.corpus_count, epochs=10)

    w2v_genomics.save('w2v_genomics_model.bin')

In [14]:
if not training:
    w2v_genomics = Word2Vec.load('w2v_genomics_model.bin')

In [15]:
w2v_genomics.wv['dipolar'][0]

0.16212283

###  Every document embbeding
We calculate this as the average of embbedings of every word of a document

In [16]:
doc_embeddings = {}
for (key, value) in documents.items():
    word_vector = [0] * 300
    words_length = 0

    for word in word_tokenize(value):
        if word in w2v_genomics.wv:
            words_length += 1
            embd = w2v_genomics.wv[word]
            for index in range(300):
                word_vector[index] += embd[index]

    for index in range(300):
        word_vector[index] /= words_length

    doc_embeddings[key] = word_vector

len(doc_embeddings['3'])

300

### This is a step only for elastic approach
Now we need to update the **title-abstract-vector** field with the embbeding values we computed in the above cell.

In [17]:
if elastic:
    for key, value in doc_embeddings.items():
        document = {"doc": {"title-abstract-vector": value}}
        es.update(index=index_name, id=int(key), body=document)



In [18]:
def get_first_item(index_name, es, doc_id):
    result = es.get(index=index_name, id=doc_id)
    return result['_source']

In [20]:
if elastic:
    print(get_first_item(index_name, es, 3))

{'AB': 'The global fold of maltose binding protein in complex with beta-cyclodextrin has been determined using a CNS-based torsion angle molecular dynamics protocol involving direct refinement against dipolar couplings and carbonyl chemical shift changes that occur upon alignment. The shift changes have been included as structural restraints using a new module, CANI, that has been incorporated into CNS. Force constants and timesteps have been determined that are particularly effective in structure refinement applications involving high molecular weight proteins with small to moderate numbers of NOE restraints. Solution structures of the N', 'AD': 'Protein Engineering Network Center of Excellence and Department of Medical Genetics and Microbiology, University of Toronto, Ontario, Canada.', 'CY': 'Netherlands', 'DA': '20011105', 'DCOM': '20020401', 'DP': '2001 Sep', 'EDAT': '2001/11/06 10:00', 'IP': '1', 'IS': '0925-2738', 'JID': '9110829', 'LA': 'eng', 'LR': '20021101', 'MHDA': '2002/04

### The final step
Based on the **title-abstract-vector** we can calculate similarity using cosineSImilarity. The source property contains a *painless* script that check if the document has a dense vector field and computes the similarity between the doc embbeding and the query embbeding. 

We also can use for calculating the similarity these metrics (https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html):
##### Elastic approach
- dot product
- l2 norm

##### Disk approach
- any metric that make sense :)

In [21]:
def calculate_query_embbeding(query):
    query_vector = [0] * 300

    for word in word_tokenize(query):
        word_vector = w2v_genomics.wv[word]
        for i in range(300):
            query_vector += word_vector[i]
    query_vector = [x/len(query) for x in query_vector]

    return query_vector

In [22]:
def find_similar_doc_with_query(query, index_name, es, top):  # elastic approach
    query_vector = calculate_query_embbeding(query)

    script_query = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
              "source": "doc['title-abstract-vector'].size() == 0 ? 0 : cosineSimilarity(params.query_vector, 'title-abstract-vector') + 1",
              "params": {"query_vector": query_vector}
            }
        }
    }
    body = {
        "size": top,
        "query": script_query
    }
    res = es.search(index=index_name, body=body)
    # print(len(res['hits']['hits']))
    return res['hits']['hits'][:top]

In [23]:
def find_similar_doc_with_query_disk(query, top):
    query_vector = calculate_query_embbeding(query)
    embbeding_similarity = {}
    for key, value in doc_embeddings.items():
        embbeding_similarity[key] = cosine_similarity([query_vector], [value])
    similar_docs = dict(sorted(embbeding_similarity.items(), key=lambda item: item[1]), reversed=True)
    return dict(islice(similar_docs.items(), top))

In [24]:
if elastic:
    most_similar_docs = find_similar_doc_with_query('molecule', index_name, es, 5)
else:
    most_similar_docs = find_similar_doc_with_query_disk('molecule', 5)

In [25]:
most_similar_docs

[{'_index': 'genomics-word2vec',
  '_type': '_doc',
  '_id': '872',
  '_score': 1.1436908,
  '_ignored': ['AB.keyword'],
  '_source': {'AB': 'To elucidate genomic organization of BmTXKbeta and BmTXKS2, two scorpion venom peptides from Chinese scorpion Buthus martensii Karsch(BmK) were first isolated and their genomic regions characterized using the PCR method. Analysis of nucleotide sequence shows that there exists different intron location in the venom genes. The region encoding mature peptide of BmTXKbeta is disrupted by an intron with 886 bp, whereas the intron of BmTXKS2 is located within its propeptide coding region, which is different from other scorpion toxin genes with their introns within the signal peptide coding region.',
   'AD': 'Department of Biotechnology, College of Life Sciences, Wuhan University, China.',
   'CY': 'England',
   'DA': '20011108',
   'DCOM': '20020417',
   'DP': '2001 May',
   'EDAT': '2001/11/09 10:00',
   'IP': '5',
   'IS': '1521-6543',
   'JID': '10