## Import everything we need

In [1]:
import os
import sys
import json
import urllib3
import requests
import pandas as pd
from io import StringIO
from tqdm import notebook
urllib3.disable_warnings()

## File Paths


<table><tr><td bgcolor=skyblue><font size=24> Change these paths to match your machine! </font></td></tr></table>

In [2]:
#########################################################
# Output folders used by the rest of the notebook:
#   def_path - folder the downloaded .def files are saved into
#   url_path - folder that receives api__urls.txt
def_path, url_path = '/home/jingxin/data/def', '/home/jingxin/data/url'
#########################################################

# Read Master File

Read the master file 'ExoMol All'. The URL is http://www.exomol.com/db/exomol.all.

In [3]:
def master_file():
    """Download the ExoMol master file and parse it into a DataFrame.

    Every '#' is removed from the downloaded text, so each line is split
    on whitespace into up to seven columns named 'c0' .. 'c6'.

    Returns:
        pandas.DataFrame: one row per line of ``exomol.all``.

    Raises:
        requests.exceptions.RequestException: if the request fails, times
            out, or the server answers with an HTTP error status.
    """
    exomol_all_url = 'http://www.exomol.com/db/exomol.all'
    # A timeout keeps the notebook from hanging forever on a dead connection.
    response = requests.get(exomol_all_url, timeout=60)
    # Fail loudly instead of parsing an HTML error page as if it were data.
    response.raise_for_status()
    content = response.text.replace('#', '')
    exomol_col_name = ['c0', 'c1', 'c2', 'c3', 'c4', 'c5', 'c6']
    exomol_all = pd.read_csv(StringIO(content), sep='\\s+', names=exomol_col_name, header=None)
    return exomol_all

Get all molecules, their iso-slugs, isoFormula and isotopologue dataset names. 

In [4]:
def mol_iso_names():
    """Extract molecule, isotopologue and dataset names from the master file.

    Rows of the master file are selected by the label words found in
    columns 'c1', 'c2' and 'c3'; the value of interest is always taken
    from column 'c0'.

    Returns:
        tuple: seven single-column DataFrames, in this order:
            iso_slug, iso_formula, dataset, molecule_single,
            num_isotopologues, iso_drop_duplicates, dataset_drop_duplicates.
    """
    exomol_all = master_file()
    first = exomol_all['c1']
    second = exomol_all['c2']
    third = exomol_all['c3']

    def _select(label_column, label):
        # 'c0' values of every row whose label column equals `label`,
        # re-indexed from 0.
        return pd.DataFrame(exomol_all[label_column.isin([label])]['c0'].values)

    # The selections do not depend on a row index, so they are computed once
    # rather than being repeated for every row of the master file.
    iso_slug = _select(first, 'Iso-slug')
    iso_formula = _select(first, 'IsoFormula')
    dataset = _select(second, 'dataset')
    molecule_single = _select(first, 'Molecule')
    # The first 'isotopologues' match is discarded; the remaining rows keep
    # their labels (1, 2, ...).
    # NOTE(review): presumably the first match is the database-wide total
    # rather than a per-molecule count - confirm against exomol.all.
    num_isotopologues = _select(third, 'isotopologues').iloc[1:]

    iso_drop_duplicates = iso_slug.drop_duplicates()
    dataset_drop_duplicates = dataset.drop_duplicates()
    print('Molecule:', len(molecule_single), ', isotopologue:', len(iso_drop_duplicates), ', dataset:', len(dataset_drop_duplicates))
    return (iso_slug, iso_formula, dataset, molecule_single, num_isotopologues,
            iso_drop_duplicates, dataset_drop_duplicates)

In [5]:
# Unpack the seven tables returned by mol_iso_names() into notebook-level
# names; later cells read several of these as globals.
(iso_slug, iso_formula, dataset, molecule_single, num_isotopologues,
 iso_drop_duplicates, dataset_drop_duplicates) = mol_iso_names()

  0%|          | 0/1611 [00:00<?, ?it/s]

Molecule: 85 , isotopologue: 203 , dataset: 94


In [6]:
# Display the iso-slug table returned by mol_iso_names().
iso_slug

Unnamed: 0,0
0,1H2-16O
1,1H2-17O
2,1H2-18O
3,1H-2H-16O
4,1H-2H-16O
...,...
235,29Si-14N
236,30Si-14N
237,40Ca-16O-1H
238,139La-16O


In [7]:
# Display the isoFormula table returned by mol_iso_names().
iso_formula

Unnamed: 0,0
0,(1H)2(16O)
1,(1H)2(17O)
2,(1H)2(18O)
3,(1H)(2H)(16O)
4,(1H)(2H)(16O)
...,...
235,(29Si)(14N)
236,(30Si)(14N)
237,(40Ca)(16O)(1H)
238,(139La)(16O)


In [8]:
# Display the dataset-name table returned by mol_iso_names().
dataset

Unnamed: 0,0
0,POKAZATEL
1,HotWat78
2,HotWat78
3,VTT
4,Hewitt
...,...
235,SiNfull
236,SiNfull
237,OYT6
238,BDL


In [9]:
# Display the molecule-name table returned by mol_iso_names().
molecule_single

Unnamed: 0,0
0,H2O
1,CO2
2,CO
3,CH4
4,NO
...,...
80,NaO
81,SiN
82,CaOH
83,LaO


In [10]:
# Display the isotopologue counts returned by mol_iso_names()
# (the first matched row was dropped there, so labels start at 1).
num_isotopologues

Unnamed: 0,0
1,6
2,14
3,6
4,2
5,7
...,...
81,1
82,4
83,1
84,1


Repeat each molecule name once per isotopologue, so that the molecule list has the same length as the lists of iso-slugs and isotopologue dataset names used in the following loop.

In [11]:
def molecules(molecule_single, num_isotopologues):
    """Repeat every molecule name once per isotopologue.

    The names are paired with the counts by position (not by index label),
    and every '+' in a name is replaced by '_p'.

    Args:
        molecule_single: DataFrame whose first column holds molecule names.
        num_isotopologues: DataFrame whose first column holds, in the same
            order, the number of repetitions for each molecule (values must
            be convertible with ``int``).

    Returns:
        pandas.DataFrame: a single row whose columns 0..N-1 hold the
        repeated names, so ``molecule[i]`` is a one-element Series.

    Raises:
        IndexError: if there are fewer counts than molecule names.
    """
    names = molecule_single.iloc[:, 0].tolist()
    counts = num_isotopologues.iloc[:, 0].tolist()
    if len(counts) < len(names):
        raise IndexError('num_isotopologues has %d rows but molecule_single has %d'
                         % (len(counts), len(names)))

    # Build the list directly instead of round-tripping through the printed
    # form of a numpy array, which numpy abbreviates with '...' once the
    # array exceeds its print threshold.
    repeated = []
    for name, count in zip(names, counts):
        repeated.extend([str(name).replace('+', '_p')] * int(count))
    return pd.DataFrame([repeated])

In [12]:
# Expand the molecule names to one entry per isotopologue and display the result.
molecule = molecules(molecule_single, num_isotopologues)
molecule

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,230,231,232,233,234,235,236,237,238,239
0,H2O,H2O,H2O,H2O,H2O,H2O,CO2,CO2,CO2,CO2,...,KOH,NaOH,NaO,SiN,SiN,SiN,SiN,CaOH,LaO,H2CS


# Read Def File

Get the URLs of all def files. The number of def files should be the same as the number of isotopologue datasets. Each URL contains the names of the molecule, the iso-slug and the isotopologue dataset. We save the corresponding isoFormula names as another column.

In [13]:
def read_deffile(iso_slug, molecule, dataset):
    """Build the def-file URL for every row of ``iso_slug``.

    Each URL has the form
    ``http://www.exomol.com/db/<molecule>/<iso-slug>/<dataset>/<iso-slug>__<dataset>.def``.
    ``molecule[idx]`` is a column lookup, and the concatenation keeps its
    container type, so every list entry is whatever ``molecule[idx]`` is
    (e.g. a one-element Series), not a plain string.

    Returns:
        list: one URL container per row of ``iso_slug``.
    """
    base = 'http://www.exomol.com/db/'
    slug_rows = iso_slug.values
    dataset_rows = dataset.values

    def_url = []
    for idx in notebook.tqdm(range(len(iso_slug))):
        slug = slug_rows[idx]
        name = dataset_rows[idx]
        folder = base + molecule[idx] + '/' + slug + '/' + name + '/'
        def_url.append(folder + slug + '__' + name + '.def')
    return def_url

Download the def files and save them into the `def_path` folder. Each file is saved under a name that carries the information collected so far, namely 'molecule__iso-slug__dataset.def'. This makes later processing more convenient.

In [14]:
def download_deffile(def_path):
    """Download every def file into ``def_path``.

    The URLs come from ``read_deffile`` and are built from the notebook-level
    tables ``iso_slug``, ``molecule_single``, ``num_isotopologues`` and
    ``dataset``. Each file is saved as ``<molecule>__<iso-slug>__<dataset>.def``.
    A file whose download fails is recorded and skipped, and the list of
    failed file names is printed at the end.

    Args:
        def_path: folder the def files are written to; created if missing.
    """
    failed_list = []
    molecule = molecules(molecule_single, num_isotopologues)
    def_url = read_deffile(iso_slug, molecule, dataset)

    # Create the folder for the downloaded files once, before the loop.
    os.makedirs(def_path, exist_ok=True)

    for _link in notebook.tqdm(def_url):
        link = _link[0]
        url_parts = link.split('/')
        new_def_filename = url_parts[-4] + '__' + url_parts[-1]
        print("Downloading file: %s" % new_def_filename)
        print(link)
        filename = os.path.join(def_path, new_def_filename)

        started_writing = False
        try:
            with requests.get(link, stream=True, verify=False, timeout=60) as r:
                # Do not save an HTTP error page (e.g. 404) as a def file.
                r.raise_for_status()

                # The header may be absent; 0 means the size is unknown.
                total_size = int(r.headers.get('Content-Length', 0))
                temp_size = 0

                started_writing = True
                with open(filename, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=1024 * 1024):
                        if not chunk:
                            continue
                        temp_size += len(chunk)
                        f.write(chunk)
                        if total_size > 0:
                            # Clamp: decoded data can exceed Content-Length.
                            fraction = min(1.0, temp_size / total_size)
                            done = int(50 * fraction)
                            sys.stdout.write("\r[%s%s] %d%%" % ('█' * done, ' ' * (50 - done),
                                                                100 * fraction))
                        else:
                            sys.stdout.write("\r%d bytes" % temp_size)
                        sys.stdout.flush()
        except requests.exceptions.RequestException:
            failed_list.append(new_def_filename)
            print(' download failed. Go to download next one\n')
            # Drop a partially written file so it is not parsed later.
            if started_writing and os.path.exists(filename):
                os.remove(filename)
            # Skip to the next URL; there is no usable response for this one.
            continue

        print(" Downloaded!\n")
    print("All def files downloaded!\n")
    print("The files which are failed to download: \n")
    print(failed_list) # Record which file is failed to download.

Extract the def files whose uncertainty availability row shows YES (value 1).

In [15]:
def unc_def(def_path, iso_formula):
    """Find the def files whose 'Uncertainty' row has the value 1.

    Every file under ``def_path`` is read as whitespace-separated text and
    its first three fields are named 'c0', '#', 'c1'. A file is selected
    when a row with c1 == 'Uncertainty' has c0 equal to '1'.

    The isoFormula reported for a selected file is read from that file's own
    'IsoFormula' row, so it does not depend on the order in which os.walk
    lists the files. Only when the file has no such row does the function
    fall back to the positional lookup ``iso_formula[0][i]``.
    NOTE(review): the 'IsoFormula' row label is assumed to match the one
    used in the master file - confirm against the ExoMol def-file format.

    Args:
        def_path: folder that is walked recursively for def files.
        iso_formula: DataFrame with the formulas in column 0 (fallback only).

    Returns:
        tuple: (unc_def_filename, unc_iso_formula), two lists of equal
        length with the selected file names and their isoFormula strings.
    """
    def_col_name = ['c0', '#', 'c1']
    tot = 0
    unc_def_filename = []
    unc_iso_formula = []
    for dirpath, _dirnames, files in os.walk(def_path):
        for i, def_name in enumerate(files):
            filepath = os.path.join(dirpath, def_name)
            tot += 1
            def_df = pd.read_csv(filepath, sep=r'\s+', usecols=[0, 1, 2],
                                 names=def_col_name, header=None)
            label = def_df['c1']

            # Compare as text so the flag matches whether it was parsed as
            # the string '1' or as the number 1.
            unc_flags = def_df[label.isin(['Uncertainty'])]['c0'].astype(str).str.strip()
            if not (unc_flags == '1').any():
                continue

            own_formula = def_df[label.isin(['IsoFormula'])]['c0']
            if len(own_formula) > 0:
                formula = str(own_formula.iloc[0])
            else:
                # Order-dependent fallback: i is the position in os.walk's
                # file list for this folder.
                formula = iso_formula[0][i]

            unc_def_filename.append(def_name)
            unc_iso_formula.append(formula)

    # Print the summary once, after all folders have been walked.
    count = len(unc_def_filename)
    print('There are ', tot, ' def files.\n')
    print('The uncertainty availability does not exit or shows NO in other ', tot - count, 'def files.\n')
    print('The uncertainty availability shows YES in the following ', count, ' def files:\n', unc_def_filename)

    return (unc_def_filename, unc_iso_formula)

# Get Download Links with API

Get the API URLs of the molecules whose def files report that uncertainties are available.

In [16]:
def get_api(def_path):
    """Build the ExoMol API URL for every def file selected by ``unc_def``.

    The molecule name is the part of the def file name before the first
    '__', and the dataset name is the part after the last '__' up to the
    first '.'. The formula comes from ``unc_def`` with '_p' turned back
    into '+'. ``unc_def`` is called with the notebook-level ``iso_formula``.

    Returns:
        tuple: (api_url, iso), where ``api_url`` is a list of API URLs and
        ``iso`` is a list of (formula, dataset) pairs in the same order.
    """
    api_template = 'http://exomol.com/api/?molecule=*&datatype=linelist'
    unc_def_filename, unc_iso_formula = unc_def(def_path, iso_formula)

    iso = []
    api_url = []
    for def_name, formula in zip(unc_def_filename, unc_iso_formula):
        molecule_name = def_name.replace('_p', '+').split('__')[0].replace('+', '_p')
        formula_key = formula.replace('_p', '+')
        dataset_name = def_name.split('__')[-1].split('.')[0]

        iso.append((formula_key, dataset_name))
        api_url.append(api_template.replace('*', molecule_name))

    print('\nThe iso_slug and dataset names which are considered:\n', iso)
    return (api_url, iso)

Get the download links of states.bz2 files and trans.bz2 files from API.

In [17]:
def get_target_url(def_path):
    """Collect the states.bz2 / trans.bz2 download URLs from the ExoMol API.

    For every (formula, dataset) pair returned by ``get_api`` the matching
    API URL is queried, and the 'files' list found under
    ``json[formula]['linelist'][dataset]`` is scanned for URLs ending in
    'states.bz2' or 'trans.bz2'. A pair whose request fails, or whose entry
    is missing from the response, contributes no URLs and the loop carries on.

    Args:
        def_path: folder holding the def files (passed on to ``get_api``).

    Returns:
        list: download URLs, each prefixed with 'http://www.'.
    """
    file_url = []
    api_url, iso = get_api(def_path)
    for i in range(len(iso)):
        # Define these up front so the summary below always refers to the
        # current pair, even when the request fails.
        iso_key, dataset_name = iso[i][0], iso[i][1]
        url_show = []

        response = requests.get(api_url[i], timeout=60)
        if response.status_code != 200:
            print('ExoMol API Error' + str(response.status_code))
        else:
            # Status code 200: convert the JSON text into a dictionary.
            json_dict = json.loads(response.text)

            # The pair may be absent from the response; treat that as
            # "no files" rather than aborting the whole run.
            try:
                json_list = json_dict[iso_key]['linelist'][dataset_name]['files']
            except KeyError:
                print('Keyerror, keep going!')
                json_list = []

            for entry in json_list:
                link = entry.get('url')
                # Entries without a 'url' are skipped.
                if link and link.endswith(('states.bz2', 'trans.bz2')):
                    file_url.append("http://www." + link)
                    url_show.append("http://www." + link)

        print('\nThe number of downloading files for', iso_key, dataset_name, ': ', len(url_show))
        print("Download links:")
        for k in url_show:
            print(k)
    return file_url

# Download States and Trans Files

We write all the download URLs into a text file, name it as api__urls.txt. 

In Linux, we use command 

```
wget -d -r -i /.../save_path/.../api__urls.txt
```

Download the states.bz2 and trans.bz2 files using the download links. Save these files into the corresponding folders.


In [18]:
def download_files(url_path):
    """Write all states/trans download URLs to ``<url_path>/api__urls.txt``.

    The folder is created when it does not exist yet. The URLs come from
    ``get_target_url``, called with the notebook-level ``def_path``, and are
    written one per line.
    """
    url_filename = url_path + '/api__urls.txt'

    if not os.path.exists(url_path):
        os.makedirs(url_path, exist_ok=True)

    target_link = get_target_url(def_path)
    with open(url_filename, 'w') as url_file:
        url_file.write('\n'.join(target_link))

In [None]:
# Run the def-file download; the URL-list step below is left disabled.
download_deffile(def_path)
#download_files(url_path)