In [1]:
import sys
from pathlib import Path, PurePath as PPath

# Report the interpreter version, the environment name (last component of
# sys.prefix) and the current working directory.
print('Python ver: {}\nPython env: {}'.format(sys.version, Path(sys.prefix).name))
print('Currrent dir: {}\n'.format(Path.cwd()))

def add_to_sys_path(this_path, up=False):
    """
    Insert this_path into sys.path at index 1, unless it is already listed.

    this_path: str or path-like location to register.
    up: when True, the parent folder of this_path (1 level up) is used instead.

    Prints a confirmation line when an entry is actually added; returns None.
    """
    target = PPath(this_path)
    # sys.path entries are compared as plain strings, so convert the pure path first.
    entry = str(target.parent if up else target)

    if entry in sys.path:
        return

    sys.path.insert(1, entry)
    print('Path added to sys.path: {}'.format(entry))

# Put the project folder on sys.path: when the current folder's name starts with
# `nb_folder` (e.g. ./notebooks), its parent is added; otherwise the current folder itself.
nb_folder = 'notebooks'
add_to_sys_path(Path.cwd(), Path.cwd().name.startswith(nb_folder))


def get_project_dirs(which=['data', 'images'], nb_folder='notebooks'):
    """
    Return the project folders named in `which` as Path objects, creating missing ones.

    which: iterable of folder names (or relative sub-paths) under the project root.
        The default list is only read, never mutated.
    nb_folder: if the current folder's name starts with this prefix, the project
        root is taken to be the parent of the current folder; otherwise it is the
        current folder itself.

    Returns a list of Path objects in the same order as `which`.
    """
    cwd = Path.cwd()
    root = cwd.parent if cwd.name.startswith(nb_folder) else cwd

    dir_lst = []
    for d in which:
        project_dir = root.joinpath(d)
        if not project_dir.exists():
            # parents=True supports nested entries such as 'data/raw';
            # exist_ok=True tolerates the folder appearing between the check and the call.
            project_dir.mkdir(parents=True, exist_ok=True)
        dir_lst.append(project_dir)
    return dir_lst

# Project folders ('data' and 'images' by default) plus the three data-stage
# subfolder paths; the subfolder paths are only built here, not created.
DIR_DATA, DIR_IMG = get_project_dirs()
DATA_RAW = DIR_DATA.joinpath('raw')
DATA_INTERIM = DIR_DATA.joinpath('intermediate')
DATA_READY = DIR_DATA.joinpath('production')

def data_subfolders(folders=[DATA_RAW, DATA_INTERIM, DATA_READY]):
    """
    Create every folder in `folders` that does not exist yet, printing each one created.

    folders: iterable of Path objects, or None to do nothing. The default list is
        built once, when the function is defined, from the module-level
        DATA_RAW / DATA_INTERIM / DATA_READY paths; it is only read, never mutated.

    Returns None.
    """
    if folders is None:
        return
    for d in folders:
        if not d.exists():
            # parents=True also creates a missing parent folder;
            # exist_ok=True tolerates the folder appearing between the check and the call.
            d.mkdir(parents=True, exist_ok=True)
            print(f'Created: {d}')

data_subfolders()

import numpy as np
import scipy as sp
from scipy import stats as sps
import pandas as pd
#pd.set_option("display.max_colwidth", 200)

import matplotlib as mpl
from matplotlib import pyplot as plt
plt.ion()
# 'seaborn-muted' was renamed 'seaborn-v0_8-muted' in newer matplotlib releases;
# an unknown style name raises OSError, so fall back to the new name.
try:
    plt.style.use('seaborn-muted')
except OSError:
    plt.style.use('seaborn-v0_8-muted')

from pprint import pprint as pp

# Filtered dir() for method discovery:
def filter_dir(obj, start_with_str='_', exclude=True):
    """
    Return the names from dir(obj), filtered on a prefix.

    start_with_str: prefix to test each name against.
    exclude: with True (default) names starting with the prefix are dropped;
        with False only names starting with the prefix are kept.
    """
    kept = []
    for name in dir(obj):
        has_prefix = name.startswith(start_with_str)
        # Keep the name when its prefix test differs from `exclude`.
        if not (has_prefix == exclude):
            kept.append(name)
    return kept


from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import HTML, Markdown #, IFrame
# for presentations:
#display(HTML("<style>.container { width:100% !important; }</style>"))


def new_section(title='New section'):
    """Return an IPython HTML object that shows `title` inside a centered, styled banner <div>."""
    css = "text-align:center;background:#c2d3ef;padding:16px;color:#ffffff;font-size:2em;width:98%"
    banner = '<div style="{}">{}</div>'.format(css, title)
    return HTML(banner)


def add_div(div_class, div_start, div_text, output_string=True):
    """
    Build an alert div: `<div class="alert {div_class}"><b>{div_start}</b>{div_text}</div>`.

    div_class: one of 'alert-info', 'alert-warning', 'alert-danger'; any other
        value returns an HTML "Wrong class" notice instead.
    div_start: bold lead-in text, e.g. 'Tip: '.
    div_text: the message that follows the lead-in.
    output_string: selects how the div is delivered (see below).

    Behaviour with default `output_string=True`:
    The cell is overwritten with the output string, but the cell mode is still in 'code' not 'markdown':
    ```
    [x]
    add_div('alert-warning', 'Tip: ', 'some tip here', output_string=True)
    [x]
    <div class="alert alert-warning"><b>Tip: </b>some tip here</div>
    ```
    The only thing to do is change the cell mode to Markdown.
    In this mode the function returns whatever `set_next_input` returns.

    If `output_string=False`, the div is returned wrapped in a Markdown display
    object, so it is rendered in an output cell.
    """
    # Function-scope imports keep the helper usable when copied on its own.
    from IPython import get_ipython
    from IPython.display import HTML, Markdown

    accepted = ['alert-info', 'alert-warning', 'alert-danger']
    if div_class not in accepted:
        return HTML(f"""<div class="alert"><b>Wrong class:</b> `div_class` is one of {accepted}.
                    </div>""")
    div = f"""<div class="alert {div_class}"><b>{div_start}</b>{div_text}</div>"""
    if output_string:
        # The second parameter of set_next_input is the `replace` flag: the
        # current cell's content is replaced by the div text.
        return get_ipython().set_next_input(div, replace=True)
    else:
        return Markdown(div) #HTML(div)

# autoreload extension: load it once (skipped when the extension manager already lists it).
from IPython import get_ipython
ipython = get_ipython()

if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

# Mode 2: presumably so edits to imported modules (e.g. postera_smiles) are picked up
# without restarting the kernel; see the IPython autoreload documentation.
%autoreload 2

Python ver: 3.6.7 (default, Feb 28 2019, 07:28:18) [MSC v.1900 64 bit (AMD64)]
Python env: dsml
Currrent dir: C:\Users\catch\Documents\GitHub\Bioinformatics\PostEraLeadsSmiles

Path added to sys.path: C:\Users\catch\Documents\GitHub\Bioinformatics\PostEraLeadsSmiles


---
---
# Refactored SMILES-code extraction function for PostEra's submission page of lead compounds against COVID-19.
url = "https://covid.postera.ai/covid/submissions"
  
The output of the main function `get_submissions_dict()` is now a dictionary:
```
dict(submitter_id: [smiles code of each submission])
```

## Note:
The extraction is a slow process because each submitter's details page must be visited, and hence parsed, even when it holds only a single entry (in which case that entry is already listed on the main submission page).
---

In [2]:
import postera_smiles as psmiles

In [3]:
# Render the module's docstring as Markdown.
Markdown(psmiles.__doc__)


# Module: postera_smiles.py
The main function `get_submissions_dict()` performs two extractions:
* First extraction on https://covid.postera.ai/covid/submissions:
  The submitter's details page address from the anchor.href of the h5 (class="mb-1") tag.
* Second extraction on the the submitter's details page:
  All the alt.string from the img (class="card-img") tags.

Submission page entry example on https://covid.postera.ai/covid/submissions:
```
<div class="card h-100">
    <div class="card-header">
      <h5 class="mb-1"><a href="submissions/fedd1f79-9ed6-4487-a801-e6f14abf8e11">PET-SGC-fed</a></h5>
    </div>
    <img src="/synthesize/CC1C=CN=CC=1NC(=O)CCCCN1CCN(C(=O)COC2C=CC(C)=CC=2)CC1" class="card-img" alt="CC1C=CN=CC=1NC(=O)CCCCN1CCN(C(=O)COC2C=CC(C)=CC=2)CC1">
    <div class="card-body">
      <a style="width: 100%" href="/covid/submissions/fedd1f79-9ed6-4487-a801-e6f14abf8e11" class="mb-2 btn btn-primary">View</a>
    </div>
</div>
```
Details page example (here with single entry): 
```
<div id="smiles_list" class="row row-cols-1 row-cols-md-4">  
    <div class="col col-md-3 mb-4">
      <div class="card h-100">
        <img src="/synthesize/CC1C=CN=CC=1NC(=O)CCCCN1CCN(C(=O)COC2C=CC(C)=CC=2)CC1" class="card-img" alt="CC1C=CN=CC=1NC(=O)CCCCN1CCN(C(=O)COC2C=CC(C)=CC=2)CC1">
        <div class="card-body text-left">
          <p><strong>PET-SGC-fed-1</strong></p>
          <p>CC1C=CN=CC=1NC(=O)CCCCN1CCN(C(=O)COC2C=CC(C)=CC=2)CC1</p>
        </div>
      </div>
    </div>
</div>
```
Call examples:
--------------
## 1. Command line: Optional argument is a folder path. 
The json filename has pattern <2020_12_31_23_59>_postera_smiles.json.
Command:
```
python postera_smiles.py ./data/intermediate
```
Output:
```
Getting PostEra.ai COVID submissions SMILES codes (Note: slow process!)...
Submissions SMILES json file saved as:
    data/intermediate/2020_03_27_15_38_postera_smiles.json
```

## 2. Using the postera_smiles.py module:
```
import postera_smiles as psmiles
from pathlib import Path

submitters_dict = psmiles.get_submissions_dict()

fname = Path.cwd().joinpath('data', 'intermediate', psmiles.get_stamped_filename())
psmiles.save_as_json(fname, submitters_dict)

# to load:

jdict = psmiles.load_json(fname)
list(jdict.keys())[:5]
```


# Example (the extraction takes almost 3 minutes!):

In [4]:
# Scrape all submissions (slow: each submitter's details page is visited).
submitters_dict = psmiles.get_submissions_dict()

# Save under the project's intermediate-data folder (DATA_INTERIM, created in the
# setup cell) so the path also resolves when the notebook runs from a
# 'notebooks' subfolder instead of the project root.
fname = DATA_INTERIM.joinpath(psmiles.get_stamped_filename())

psmiles.save_as_json(fname, submitters_dict)

## Load a saved file:

In [5]:
# Reload the JSON file saved above and peek at the first five submitter ids.
jdict = psmiles.load_json(fname)
list(jdict.keys())[:5]

['PET-SGC-fed', 'JAN-GHE-fd8', 'ALE-UNK-fca', 'DAR-DIA-fc9', 'PET-SGC-fc6']

In [6]:
# Both lookups are displayed (ast_node_interactivity is set to "all" in the setup cell).
jdict['PET-SGC-fed']
jdict['JAN-GHE-fd8']

['CC1C=CN=CC=1NC(=O)CCCCN1CCN(C(=O)COC2C=CC(C)=CC=2)CC1']

['C(CN1CCN(C(=O)C)CC1)1=C(NCC)N=CC(C#N)=C1',
 'C1(C=C(C#CC)C=NC=1NC1=CC=CC=C1)CN1CCN(C(C)=O)CC1',
 'C1(C(NCCO)=NC=C(Cl)C=1)CN1CCN(C(=O)C)CC1',
 'C1(C=C(C#N)C=NC=1NCC)CN1CC(C1)1CN(C1)C(C)=O',
 'C1(C(NCC)=NC=C(C#N)C=1)CN1C[C@]([H])2[C@@]([H])(C1)CN(C(=O)C)C2',
 'C1(C=C(C#N)C=NC=1NCC)CN1CCN(C(C)=O)CCC1',
 'C1(C(NCC)=CC=C(C#N)C=1)C(=O)N1CCN(C(=O)C)CC1',
 'C1(C=C(C#N)C=CC=1NC1CC1)CN1CCN(C(C)=O)CC1',
 'C1(C(NCC)=CC=C(C#CC)C=1)CN1CCN(C(=O)C)CC1',
 'C1(C=C(F)C=CC=1NCC)CN1CCN(C(C)=O)CC1',
 'C1(C=C(C#N)C=NC=1NC1CC1)CNCCNC(C)=O',
 'C1(C(CNC(=O)N2CCOCC2)=CC=C(C#N)C=1)CN1CCN(C(=O)C)CC1']

## Check tiny molecules:

In [7]:
# Submitters having at least one SMILES string shorter than 3 characters
# (each submitter is listed once).
out = [
    submitter
    for submitter, codes in jdict.items()
    if any(len(code) < 3 for code in codes)
]

out

['ELE-UNK-bfe']

In [8]:
jdict['ELE-UNK-bfe']  # a lone 'C' is the SMILES for methane; presumably a placeholder or test submission (TODO confirm)

['C']

## Retrieve all codes

In [9]:
# Flatten the per-submitter lists into one list of SMILES codes.
all_smiles = [code for codes in jdict.values() for code in codes]
len(all_smiles)

2063