### Import everything we need

In [1]:
import os
import sys 
import bz2
import json
import requests
from tqdm import tqdm_notebook as tqdm

## Get Download Links

Download files of molecule $H_2O$.

Find all <font color="green"> Iso-slugs </font> of <font color="skyblue"> Molecule $H_2O$ </font>.

<font color="green"> Iso-slugs: 
    $\sideset{_{}^{1}}{_{2}^{}}{H}{_{}^{17}}{_{}^{}}{O}$, 
    $\sideset{_{}^{1}}{_{2}^{}}{H}{_{}^{18}}{_{}^{}}{O}$, 
    $\sideset{_{}^{2}}{_{2}^{}}{H}{_{}^{16}}{_{}^{}}{O}$, 
    $\sideset{_{}^{1}}{_{}^{}}{H}{_{}^{2}}{_{}^{}}{H}{_{}^{17}}{_{}^{}}{O}$, 
    $\sideset{_{}^{1}}{_{}^{}}{H}{_{}^{2}}{_{}^{}}{H}{_{}^{18}}{_{}^{}}{O}$, 
    $\sideset{_{}^{1}}{_{2}^{}}{H}{_{}^{16}}{_{}^{}}{O}$, 
    $\sideset{_{}^{1}}{_{}^{}}{H}{_{}^{2}}{_{}^{}}{H}{_{}^{16}}{_{}^{}}{O}$ </font>

In [2]:
# Ask the ExoMol API for all H2O line-list entries.
api_url = 'http://exomol.com/api/?molecule=H2O&datatype=linelist'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(api_url, timeout=60)
# Fail here, with the HTTP status, rather than later with an opaque JSON error.
response.raise_for_status()
content = response.text
json_dict = json.loads(content)    # The top-level keys are the IsoFormulas.

print('IsoFormula: ', json_dict.keys())

IsoFormula:  dict_keys(['(1H)2(17O)', '(1H)(2H)(18O)', '(1H)2(16O)', '(1H)(2H)(17O)', '(1H)(2H)(16O)', '(2H)2(16O)', '(1H)2(18O)'])


Find all <font color="orange"> Isotopologue dataset names </font> for each <font color="green"> Iso-slug </font>.


<font color="green"> Iso-slug: $\sideset{_{}^{1}}{_{2}^{}}{H}{_{}^{17}}{_{}^{}}{O}$ </font> $\longrightarrow$ <font color="orange"> Isotopologue: HotWat78 </font>

<font color="green"> Iso-slug: $\sideset{_{}^{1}}{_{2}^{}}{H}{_{}^{18}}{_{}^{}}{O}$ </font> $\longrightarrow$ <font color="orange"> Isotopologue: HotWat78 </font>

<font color="green"> Iso-slug: $\sideset{_{}^{2}}{_{2}^{}}{H}{_{}^{16}}{_{}^{}}{O}$ </font> $\longrightarrow$ <font color="orange"> Isotopologue </font> None

<font color="green"> Iso-slug: $\sideset{_{}^{1}}{_{}^{}}{H}{_{}^{2}}{_{}^{}}{H}{_{}^{17}}{_{}^{}}{O}$ </font> $\longrightarrow$ <font color="orange"> Isotopologue </font> None

<font color="green"> Iso-slug: $\sideset{_{}^{1}}{_{}^{}}{H}{_{}^{2}}{_{}^{}}{H}{_{}^{18}}{_{}^{}}{O}$ </font> $\longrightarrow$ <font color="orange"> Isotopologue </font> None

<font color="green"> Iso-slug: $\sideset{_{}^{1}}{_{2}^{}}{H}{_{}^{16}}{_{}^{}}{O}$ </font> $\longrightarrow$ <font color="orange"> Isotopologue: BT2, CKYKKY, POKAZATEL </font>

<font color="green"> Iso-slug: $\sideset{_{}^{1}}{_{}^{}}{H}{_{}^{2}}{_{}^{}}{H}{_{}^{16}}{_{}^{}}{O}$ </font> $\longrightarrow$ <font color="orange"> Isotopologue: Janka, VTT </font>
    


In [3]:
# IsoFormulas in the order they are reported below.
iso_formulas = [
    '(1H)2(17O)',
    '(1H)2(18O)',
    '(2H)2(16O)',
    '(1H)(2H)(17O)',
    '(1H)(2H)(18O)',
    '(1H)2(16O)',
    '(1H)(2H)(16O)',
]

print('Isotopologues for each IsoFormula:\n')
for position, iso_formula in enumerate(iso_formulas):
    # Pad the formula to a fixed width so the second column lines up.
    label = f'IsoFormula: {iso_formula:<17}Isotopologue dataset names: '
    dataset_names = json_dict[iso_formula]['linelist'].keys()
    if position < len(iso_formulas) - 1:
        # A blank line separates consecutive entries.
        print(label, dataset_names, '\n')
    else:
        print(label, dataset_names)

Isotopologues for each IsoFormula:

IsoFormula: (1H)2(17O)       Isotopologue dataset names:  dict_keys(['data type', 'HotWat78']) 

IsoFormula: (1H)2(18O)       Isotopologue dataset names:  dict_keys(['data type', 'HotWat78']) 

IsoFormula: (2H)2(16O)       Isotopologue dataset names:  dict_keys(['data type']) 

IsoFormula: (1H)(2H)(17O)    Isotopologue dataset names:  dict_keys(['data type']) 

IsoFormula: (1H)(2H)(18O)    Isotopologue dataset names:  dict_keys(['data type']) 

IsoFormula: (1H)2(16O)       Isotopologue dataset names:  dict_keys(['data type', 'BT2', 'CKYKKY', 'POKAZATEL']) 

IsoFormula: (1H)(2H)(16O)    Isotopologue dataset names:  dict_keys(['data type', 'Janka', 'VTT'])


Collect each <font color="green"> IsoFormula </font> together with its <font color="orange"> Isotopologue dataset names </font> into a list of (IsoFormula, dataset name) pairs.

In [None]:
molecule = 'H2O'
# Build the query URL from `molecule` so the two can never drift apart.
api_url = 'http://exomol.com/api/?molecule=' + molecule + '&datatype=linelist'


# (IsoFormula, dataset name) pairs to download. IsoFormulas whose 'linelist'
# entry holds no dataset (only the 'data type' key) are left out.
iso = [
    ('(1H)2(17O)', 'HotWat78'),
    ('(1H)2(18O)', 'HotWat78'),
    ('(1H)2(16O)', 'BT2'),
    ('(1H)2(16O)', 'CKYKKY'),
    ('(1H)2(16O)', 'POKAZATEL'),
    ('(1H)(2H)(16O)', 'Janka'),
    ('(1H)(2H)(16O)', 'VTT'),
]


Get the download links of the states.bz2 and trans.bz2 files from the ExoMol API.

In [None]:
def get_target_url():
    """
    Collect the download links of the states.bz2 and trans.bz2 files.

    Queries the module-level ``api_url`` and, for every
    ``(iso_formula, isotopologue)`` pair in the module-level ``iso`` list,
    gathers the file entries whose ``url`` ends with ``states.bz2`` or
    ``trans.bz2``. Every collected link is printed.

    Returns
    -------
    list of str
        The links, each prefixed with ``http://www.``. The list is empty when
        the API does not answer with status 200 (the status is printed).
    """

    # A timeout keeps the call from hanging forever on a stalled connection.
    response = requests.get(api_url, timeout=60)

    if response.status_code != 200:
        print('ExoMol API Error' + str(response.status_code))
        # Always hand back a list so callers can iterate without a None check.
        return []

    content = response.text            # Get the relevant content.
    json_dict = json.loads(content)    # Convert json into dictionary.

    print("Download links:")

    file_url = []
    for iso_formula, isotopologue in iso:
        # The lookup is the only step that can raise KeyError, so guard it
        # here and skip pairs the API does not know about.
        try:
            file_entries = json_dict[iso_formula]['linelist'][isotopologue]['files']
        except KeyError:
            print('Keyerror, keep going!')
            continue

        for entry in file_entries:
            link = entry.get('url')
            # `link` is None when an entry has no 'url'; skip it instead of
            # failing on None.endswith.
            if link and link.endswith(('states.bz2', 'trans.bz2')):
                file_url.append("http://www." + link)

    for link in file_url:
        print(link)

    return file_url


In [None]:
# Resolve the download links for every (IsoFormula, dataset) pair listed in
# `iso`; the links are also printed as a side effect of the call.
target_link = get_target_url()

## Download Files

Download the states.bz2 and trans.bz2 files from the download links. Each file is saved under its original file name, in a folder whose path is derived from its download link.

In [None]:
def download_target_series(file_url):
    """
    Download every link in ``file_url`` into the ``./data/`` tree.

    Each file is written to ``./data/<link without its scheme>``, so the host
    and URL path of the link are mirrored on disk. A text progress bar is
    printed while a file is streamed. Links that cannot be downloaded are
    skipped; their file names are collected and printed at the end.

    Parameters
    ----------
    file_url : list of str
        Download links, e.g. the list returned by ``get_target_url``.

    Returns
    -------
    None
    """

    failed_list = []
    for link in tqdm(file_url):

        file_name = link.split('/')[-1]
        print("Downloading file:%s" % file_name)
        print(link)

        # Make folders for saving downloaded files. The path must come from
        # the current link, otherwise every file lands on the same path.
        save_name = "./data/" + link.split('//')[-1]
        os.makedirs(os.path.dirname(save_name), exist_ok=True)

        try:
            # NOTE(review): verify=False turns off TLS certificate checks;
            # kept from the original, confirm it is really needed.
            with requests.get(link, stream=True, verify=False, timeout=60) as r:
                # Do not save an HTTP error page as if it were the data file.
                r.raise_for_status()

                # For computing the progress; 0 means the size is unknown.
                total_size = int(r.headers.get('Content-Length', 0))
                temp_size = 0

                # Download started.
                with open(save_name, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=1024 * 1024):
                        if not chunk:
                            continue
                        temp_size += len(chunk)
                        f.write(chunk)
                        if total_size:
                            done = int(50 * temp_size / total_size)
                            sys.stdout.write("\r[%s%s] %d%%" % ('█' * done, ' ' * (50 - done), 100 * temp_size / total_size))
                        else:
                            # No Content-Length: show the byte count instead
                            # of dividing by zero.
                            sys.stdout.write("\r%d bytes" % temp_size)
                        sys.stdout.flush()
        except (requests.exceptions.RequestException, OSError):
            failed_list.append(file_name.split('\\')[-1])
            print(' download failed. Go to download next one\n')
            # Move on; there is no usable response for this link.
            continue

        print(" Downloaded!\n")

    if not failed_list:
        print("All files downloaded!")

    print(failed_list) # Record which file is failed to download.

    return


## Decompress bz2 Files

In [None]:
path = "./data/www.exomol.com/db/H2O"

# Folder for saving decompressed files.
de_path = "./data/decompress/H2O"


def decompress_bz2_tree(src_root, dst_dir):
    """
    Decompress every ``.bz2`` file found below ``src_root`` into ``dst_dir``.

    The output name is built from the first two dot-separated parts of the
    source name: ``<name>.<kind>.bz2`` becomes ``<name>_<kind>.inp``. Files
    that do not end with ``.bz2`` are skipped. A running count of the files
    handled in the current directory is printed.

    Parameters
    ----------
    src_root : str
        Directory that is walked recursively.
    dst_dir : str
        Directory that receives the ``.inp`` files; created if missing.

    Returns
    -------
    list of str
        Paths of the files written, in processing order.
    """
    os.makedirs(dst_dir, exist_ok=True)

    written = []
    for dirpath, _dirnames, files in os.walk(src_root):
        count = 0
        # Sorted so the processing order does not depend on the file system.
        for filename in sorted(files):
            # Anything that is not a .bz2 archive would make bz2 fail.
            if not filename.endswith('.bz2'):
                continue

            filepath = os.path.join(dirpath, filename)
            name_parts = filename.split('.')
            de_filepath = os.path.join(dst_dir, name_parts[0] + '_' + name_parts[1] + '.inp')
            count += 1
            print(count)

            # Archives under 100 KiB are read in 50 KiB pieces, larger ones
            # in 100 KiB pieces; the decompressed bytes are the same.
            if os.path.getsize(filepath) < 100 * 1024:
                chunk_size = 50 * 1024
            else:
                chunk_size = 100 * 1024

            with bz2.BZ2File(filepath, 'rb') as file, open(de_filepath, 'wb') as de_file:
                for data in iter(lambda: file.read(chunk_size), b''):
                    de_file.write(data)

            written.append(de_filepath)

    return written


decompress_bz2_tree(path, de_path)

print("Finished decompression")
