In [58]:
import sys
from pathlib import Path, PurePath as PPath

# Report the interpreter version, the active environment (name of the sys.prefix
# folder) and the working directory, so the saved output records where the
# notebook was last run.
print('Python ver: {}\nPython env: {}'.format(sys.version, Path(sys.prefix).name))
print('Current dir: {}\n'.format(Path.cwd()))

def add_to_sys_path(this_path, up=False):
    """
    Make a folder importable by registering it in sys.path.

    The folder is inserted at index 1 (i.e. right after the first entry, not
    at the very front) and only when it is not listed yet; a confirmation
    line is printed when an insertion happens.

    Parameters
    ----------
    this_path : str or path-like
        Folder to register.
    up : bool, default False
        When true, the parent of `this_path` (one level up) is registered
        instead of `this_path` itself.
    """
    target = PPath(this_path)
    if up:
        target = target.parent
    # sys.path holds plain strings, hence the explicit conversion.
    newp = str(target)

    if newp in sys.path:
        return
    sys.path.insert(1, newp)
    print('Path added to sys.path: {}'.format(newp))

# A notebook kept in a sub-folder (e.g. ./notebooks) needs the parent of the
# working directory on sys.path; otherwise the working directory itself is
# registered.
nb_folder = 'notebooks'
add_to_sys_path(this_path=Path.cwd(), up=Path.cwd().name.startswith(nb_folder))


def get_project_dirs(which=('data', 'images'), nb_folder='notebooks'):
    """
    Return the project sub-folders named in `which`, creating any that are missing.

    The project root is the current working directory, or its parent when the
    working directory's name starts with `nb_folder` (notebook kept in a
    sub-folder of the project).

    Parameters
    ----------
    which : str or iterable of str, default ('data', 'images')
        Folder names relative to the project root. A single name may be
        given as a plain string; nested names such as 'data/raw' are allowed.
    nb_folder : str, default 'notebooks'
        Name prefix identifying a notebook sub-folder.

    Returns
    -------
    list of pathlib.Path
        One path per entry of `which`, in the same order.
    """
    cwd = Path.cwd()
    root = cwd.parent if cwd.name.startswith(nb_folder) else cwd

    if isinstance(which, str):
        # A bare string would otherwise be iterated character by character.
        which = [which]

    dir_lst = []
    for d in which:
        target = root.joinpath(d)
        if not target.exists():
            # parents=True lets nested names work; exist_ok=True tolerates the
            # folder appearing between the check above and this call.
            target.mkdir(parents=True, exist_ok=True)
        dir_lst.append(target)
    return dir_lst

# Project folders for data files and images (created by get_project_dirs if missing).
# The names are spelled out because the unpacking below depends on their order.
DIR_DATA, DIR_IMG = get_project_dirs(which=['data', 'images'])
    
import numpy as np
import scipy as sp
from scipy import stats as sps
import pandas as pd
#pd.set_option("display.max_colwidth", 200)

import matplotlib as mpl
from matplotlib import pyplot as plt
plt.ion()
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old aliases; use the new name when this
# installation provides it, otherwise keep the original one.
plt.style.use('seaborn-v0_8-muted' if 'seaborn-v0_8-muted' in plt.style.available
              else 'seaborn-muted')

from pprint import pprint as pp

def filter_dir(obj, start_with_str='_', exclude=True):
    """
    Filtered dir() for method discovery.

    With `exclude=True` (the default) the names of dir(obj) that start with
    `start_with_str` are dropped; with `exclude=False` only those names are kept.
    """
    kept = []
    for name in dir(obj):
        if name.startswith(start_with_str) == exclude:
            continue
        kept.append(name)
    return kept


from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one
# (the cell that reads the two CSV snapshots relies on this to show both row counts).
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import HTML, Markdown #, IFrame
# for presentations:
#display(HTML("<style>.container { width:100% !important; }</style>"))


def new_section(title='New section'):
    """
    Return an HTML banner (centred text on a coloured background) showing `title`.

    `title` is inserted into the markup as-is, so it may itself contain HTML.
    """
    css = "text-align:center;background:#c2d3ef;padding:16px;color:#ffffff;font-size:2em;width:98%"
    banner = '<div style="{}">{}</div>'.format(css, title)
    return HTML(banner)


def add_div(div_class, div_start, div_text, output_string=True):
    """
    Build an alert <div> (`<div class="alert {div_class}">`) with a bold lead-in.

    Behaviour with default `output_string=True`:
    The cell is overwritten with the output string, but the cell mode is still in 'code' not 'markdown':
    ```
    [x]
    add_div('alert-warning', 'Tip: ', 'some tip here', output_string=True)
    [x]
    <div class="alert alert-warning"><b>Tip: </b>some tip here</div>
    ```
    The only thing to do is change the cell mode to Markdown.
    If `output_string=False`, the HTML output is displayed in an output cell.

    Parameters
    ----------
    div_class : str
        One of 'alert-info', 'alert-warning', 'alert-danger'.
    div_start : str
        Lead-in rendered in bold, e.g. 'Tip: '.
    div_text : str
        Body text of the alert.
    output_string : bool, default True
        See above.

    Returns
    -------
    The result of `set_next_input` when `output_string` is true, a `Markdown`
    object when it is false, or an `HTML` error notice when `div_class` is
    not an accepted value.
    """
    # Local imports keep the function self-contained.
    from IPython import get_ipython
    from IPython.display import HTML, Markdown

    accepted = ['alert-info', 'alert-warning', 'alert-danger']
    if div_class not in accepted:
        return HTML(f"""<div class="alert"><b>Wrong class:</b> `div_class` must be one of {accepted}.</div>""")

    div = f"""<div class="alert {div_class}"><b>{div_start}</b>{div_text}</div>"""
    if output_string:
        # The second parameter of set_next_input is `replace`. The former call
        # passed the string 'markdown' there, which only acted as a truthy flag
        # (it never switched the cell type), so the intent is spelled out.
        return get_ipython().set_next_input(div, replace=True)
    return Markdown(div)

# autoreload extension: lets edits to imported local modules take effect
# without restarting the kernel.
from IPython import get_ipython
# Handle on the running shell.
# NOTE(review): presumably None when run outside IPython, in which case the
# membership test below would fail — confirm if this cell is ever run as a script.
ipython = get_ipython()

# Load the extension only when the shell does not list it as loaded yet.
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

# Mode 2. NOTE(review): per the IPython docs this reloads all modules before
# each execution — confirm against the installed IPython version.
%autoreload 2

Python ver: 3.6.7 (default, Feb 28 2019, 07:28:18) [MSC v.1900 64 bit (AMD64)]
Python env: dsml
Currrent dir: C:\Users\catch\Documents\GitHub\Bioinformatics\PostEraLeadsSmiles



---
---
# Extracting SMILES codes from PostEra's submission page of lead compounds against COVID-19
url = "https://covid.postera.ai/covid/submissions"

There is no submission timestamp, so newly added entries are appended (my understanding).  
**Update** (2020-03-26): fixed the write mode, from 'w+' to 'w'; the file is now overwritten when the same name is reused (before, entries were appended, which produced duplicates). 

---

In [None]:
import postera_smiles as psmiles

# Run the extraction and save the result in the data folder.
# NOTE(review): the file name presumably encodes year-month-day-hour
# (cf. get_YmdH_filename) — confirm in postera_smiles.
fn = DIR_DATA / psmiles.get_YmdH_filename()
psmiles.extract_postera_smiles(save_as=fn)

---
---
# TODO:
# Correcting the `extract_postera_smiles` function:

* The value extracted in the 'submission_id' column is actually the id of the submission-detail page. Since there is no submission count (per submitter), I must visit the details page and extract the (id, SMILES code) tuple from there, at a &mdash; hopefully &mdash; small cost.
* Possible redesign (& one that could accommodate incremental updates): save/cache submitters and details-page ids from the submission page; append if new ones are found; visit the details page; extract.


---
---
# TODO:

# Automating the updates:

* There is no submission timestamp  
* There is no assurance that the submission page listing has the same order 


In [35]:
# Saved snapshots to compare, oldest first
# (file names presumably follow year_month_day_hour — confirm).
fname1, fname2, fname3 = (
    DIR_DATA / snapshot
    for snapshot in (
        '2020_03_25_15_postera_smiles.csv',
        '2020_03_26_11_postera_smiles.csv',
        '2020_03_26_14_postera_smiles.csv',
    )
)

In [36]:
# Load the two snapshots keyed by submission id; each bare expression echoes
# the row count of the frame just read.
df1 = pd.read_csv(fname1, index_col='submission_id')
len(df1)
df2 = pd.read_csv(fname2, index_col='submission_id')
len(df2)

136

195

In [28]:
# new entries: rows of the newer snapshot (df2) whose submission_id is absent
# from the older one (df1).

df = df2.merge(df1, how='left', left_index=True, right_index=True, suffixes=('', '_old'))
# NOTE(review): merge itself does not alter column names; the stray leading
# space presumably comes from the CSV header (", "-separated) — confirm, and
# consider pd.read_csv(..., skipinitialspace=True) at load time instead.
df.columns = [c.strip() for c in df.columns.tolist()]
# Select by index membership rather than by NaN in `submission_smiles_old`:
# a row that exists in df1 with a missing SMILES value would otherwise be
# reported as new.
dfnew = df[~df.index.isin(df1.index)]

Index(['submission_smiles', 'submission_smiles_old'], dtype='object')

Unnamed: 0_level_0,submission_smiles,submission_smiles_old
submission_id,Unnamed: 1_level_1,Unnamed: 2_level_1
fd8d85a5-db4a-4ecf-93d1-416eee50b961,C1(C(NCC)=CC=C(C#N)C=1)C(=O)N1CCN(C(=O)C)CC1,C1(C=C(C#N)C=NC=1NCC)CN1CCN(C(C)=O)CCC1
fc970077-6f76-4de1-8de8-e2a37381b22b,O=C(NC1C=CC=CC=1)N(C1C=CC=NC=1)C(=O)C,N1(CCN(C2C=CC=CC=2)C(=O)C1)C1C=NC=CC=1
fad6815c-0470-4c07-89ff-303cd8ea635d,CC(C1C=CN=C(N(C(NC2C(CN3CCOCC3)=CC=NC=2C)=O)C...,CC(C1C=CN=C(N(C(NC2C(CN3CCOCC3)=CC=NC=2C)=O)C...
fa25ac7f-b7ca-4ac6-8a32-5070858535ad,C[C@H]1C2C(=CC([S@](=O)(=N)C)=CC=2)N(CC1(F)F)...,C[C@H]1C2C(=CC([S@](=O)(=N)C)=NC=2)N(C2CCOCC2...
f9b12666-ae78-4cf8-8d60-dc19988e42d3,ClCC(=O)N1C(NC(C(NC(C)N)CC(=O)N)=O)CCC(C2=NC3...,


In [29]:
# `df3` is not defined by any visible cell (a stale name from an earlier
# session, so this line raises NameError on Restart & Run All). `dfnew` from
# the previous cell holds the rows that are new in the later snapshot.
dfnew


Unnamed: 0_level_0,submission_smiles,submission_smiles_old
submission_id,Unnamed: 1_level_1,Unnamed: 2_level_1
f9b12666-ae78-4cf8-8d60-dc19988e42d3,ClCC(=O)N1C(NC(C(NC(C)N)CC(=O)N)=O)CCC(C2=NC3...,
f818f65b-ef01-460e-b019-b83f7b7c8031,CS(=O)(=O)NCCC1C=C(F)C=C2C(CCNC(=O)C)=CNC=12,
f4772df7-f5a5-4208-a16e-3d41483cc4c7,ClCC(=O)N1CC(C2C=CC=CC=2)C2C=C(C)C=CC=2C1C1CS...,
eab5efcb-b199-4c70-afb4-8b9adb33206d,CC(C1=CC(Cl)=CC2C1CCN2)NC(CCl)=O,
e4c3c928-4740-468d-990f-257c36b38d62,C1(C(CNC(CS)C(O)=O)=C(O)C=NC=1C)COP(O)(O)=O,
e1c2b579-8004-42d8-997e-3336eabd382a,C(O)C(O)N1C(C(NC(=O)C)CC(N)=O)CN(C(O)CCl)CC1,
de95a6f4-6e93-450e-b4ec-f63e90d17c67,ClCC(O)N1CCN(CC2CCC(C3C=CC(S(N)(=O)=O)=C(O)C=...,
d8fd1356-48a3-47db-b12f-ee2f1a630081,ClC1C(F)=CC(OC2CCN(C(C)=O)CC2)=C2NC(=C(C=12)C...,
d5b502b1-312b-42b7-9521-adcd68725450,CS(=O)(=O)NCC(C1C=CC=CC=1)CC(=O)NC1C(C)=CC=NC=1,
d0156db4-922c-45d6-ad51-9d2b524cbad9,CS(=O)(=O)NCC(C1C=CC=CC=1)C(=O)NC1C=CC=NC=1,
