## Import everything we need

In [1]:
import os
import math
import glob
import requests
import numpy as np
import pandas as pd
from io import StringIO
from tqdm import tqdm_notebook as tqdm

# Part 1: HITRAN Online Information

Get the names of the molecules, iso-slugs and isotopologue datasets from `api__urls.txt`, which stores the URLs containing the molecule, iso-slug and isotopologue dataset names. Join them with '/' so that reading files from the folders is more convenient later.

In [2]:
# Collect the molecule, iso-slug and dataset name of every URL in the list.
molecule = []
iso_slug = []
isotopologue = []
path_mol_iso = []
# A context manager closes the file deterministically.
with open('./data/url/api__urls.txt') as url_file:
    for line in url_file:
        # Skip blank lines (e.g. a trailing newline at the end of the file).
        if not line.strip():
            continue
        # The 4th-, 3rd- and 2nd-to-last '/'-separated fields are the
        # molecule, the iso-slug and the dataset name; split only once.
        mol, slug, dataset = line.split('/')[-4:-1]
        molecule.append(mol)
        iso_slug.append(slug)
        isotopologue.append(dataset)
        path_mol_iso.append('/'.join((mol, slug, dataset)))

# dict.fromkeys removes duplicates while keeping first-occurrence order,
# which is what set() followed by sort(key=list.index) produced.
molecule_list = list(dict.fromkeys(molecule))
iso_slug_list = list(dict.fromkeys(iso_slug))
isotopologue_list = list(dict.fromkeys(isotopologue))
path_mol_iso_list = list(dict.fromkeys(path_mol_iso))

print('Molecule:', molecule_list)
print('Iso-slug:', iso_slug_list)
print('Isotopologue:', isotopologue_list)
print('Total:', path_mol_iso_list)


Molecule: ['AlH', 'C2H2', 'C2', 'CO2', 'H2O', 'H3O_p', 'NH3', 'TiO']
Iso-slug: ['27Al-1H', '12C2-1H2', '12C2', '12C-16O2', '1H2-16O', '1H3-16O_p', '14N-1H3', '46Ti-16O', '47Ti-16O', '48Ti-16O', '49Ti-16O', '50Ti-16O']
Isotopologue: ['AlHambra', 'aCeTY', '8states', 'UCL-4000', 'POKAZATEL', 'eXeL', 'CoYuTe', 'Toto']
Total: ['AlH/27Al-1H/AlHambra', 'C2H2/12C2-1H2/aCeTY', 'C2/12C2/8states', 'CO2/12C-16O2/UCL-4000', 'H2O/1H2-16O/POKAZATEL', 'H3O_p/1H3-16O_p/eXeL', 'NH3/14N-1H3/CoYuTe', 'TiO/46Ti-16O/Toto', 'TiO/47Ti-16O/Toto', 'TiO/48Ti-16O/Toto', 'TiO/49Ti-16O/Toto', 'TiO/50Ti-16O/Toto']


Convert the iso-slug names into the formulas shown in the table on the HITRAN online website. This helps us look up the corresponding molecule numbers, isotopologue numbers and fractional abundances.

The HITRAN online URL is: https://hitran.org/docs/iso-meta/.

In [19]:
# Build the HITRAN-style formula of every iso-slug with plain string
# replacements: drop the '1' of '1H', remove the '-' separators and turn the
# '_p' suffix into '+'. This avoids round-tripping the list through
# str()/eval().
unc_formula = pd.DataFrame(
    [slug.replace('1H', 'H').replace('-', '').replace('_p', '+')
     for slug in iso_slug_list],
    columns=['exomol formula'],
)
unc_formula

Unnamed: 0,exomol formula
0,27AlH
1,12C2H2
2,12C2
3,12C16O2
4,H216O
5,H316O+
6,14NH3
7,46Ti16O
8,47Ti16O
9,48Ti16O


$C_2H_2$ information for HITRAN format.

In [20]:
# Restrict the run to the C2H2 aCeTY dataset.
path_mol_iso_list = ['C2H2/12C2-1H2/aCeTY']

# One-row table with the HITRAN metadata of this isotopologue.
# All values are kept as strings and converted where they are used.
hitran_online = pd.DataFrame({
    'molecule ID': ['26'],
    'isotopologue ID': ['1'],
    'exomol formula': ['12C2H2'],
    'fractional abundance': ['0.977599'],
    'Q(296K)': ['412.45'],
})
hitran_online


Unnamed: 0,molecule ID,isotopologue ID,exomol formula,fractional abundance,Q(296K)
0,26,1,12C2H2,0.977599,412.45


In [21]:
# Dataset path of the species being processed.
path_mol_iso = path_mol_iso_list[0]
# Name of the molecule-number column in the result CSV; it carries the
# dataset path with '/' replaced by '__'.
M_mol_iso = 'M of ' + '__'.join(path_mol_iso.split('/'))
# Convert the string entries of the first metadata row to numbers.
molecule_id = int(hitran_online.loc[0, 'molecule ID'])
isotopologue_id = int(hitran_online.loc[0, 'isotopologue ID'])
fractional_abundance = float(hitran_online.loc[0, 'fractional abundance'])

In [22]:
# Folder the ExoMol input files are read from.
read_path = './data/www.exomol.com/db/'

# Folder for the result files. makedirs(..., exist_ok=True) creates it when it
# is missing and does nothing when the directory already exists, so no
# separate existence check is needed. It also fails early if the path exists
# but is not a directory.
result_path = './data/result/'
os.makedirs(result_path, exist_ok=True)

# Part 2: Process Data

## 2.1 Read States File

Determine the column names of the states file from the def file.

In [23]:
path_mol_iso = path_mol_iso_list[0]
# The def file is named '<anything><iso-slug>__<dataset>.def'.
def_path = glob.glob('./data/def/' + '*'
                     + '__'.join(path_mol_iso.split('/')[1:3]) + '.def')
# Read the whitespace-separated def file into five columns named '1'..'5'.
def_reader = pd.read_csv(def_path[0], sep='\\s+', header=None,
                         names=['1', '2', '3', '4', '5'])
# Show the first field of every row whose fourth field is 'label'.
def_reader.loc[def_reader['4'] == 'label', '1'].tolist()

['totalSym', 'v1', 'v2', 'v3', 'v4', 'v5', 'v5', 'v7', 'vibSym', 'K', 'rotSym']

In [24]:
# Column names used when reading the states file: five leading columns
# (ID, energy, total degeneracy, total J, uncertainty) followed by the
# quantum-number labels.
states_col_name = [
    'ID', 'energy', 'g_tot', 'J_tot', 'Unc',
    'Gtot', 'v1', 'v2', 'v3', 'v4', 'v5', 'v6', 'v7',
    'Gvib', 'J', 'K', 'tau', 'Grot', 'W',
    'n1', 'n2', 'n3', 'n4', 'l4', 'n5', 'l5', 'Ecalc',
]

Read the compressed states file directly in chunks. Keep only the rows of the states file whose uncertainty is smaller than 0.001; these rows form the considered states file.

In [26]:
# Chunked readers, keyed by the states file they read.
s_df = dict()
states_filenames = glob.glob(read_path + path_mol_iso + '/' + path_mol_iso.split('/')[1]
                             + '__' + path_mol_iso.split('/')[2] + '.states.bz2')
# Fail with a clear message instead of a KeyError on an empty frame later on.
if not states_filenames:
    raise FileNotFoundError('No .states.bz2 file found in ' + read_path + path_mol_iso)

# Collect the chunks and concatenate them once: DataFrame.append was removed
# in pandas 2.0 and re-copied the whole frame on every call.
states_chunks = []
for states_filename in states_filenames:
    # Raw string so that '\s' reaches pandas as a regex, not a string escape.
    s_df[states_filename] = pd.read_csv(states_filename, compression='bz2', sep=r'\s+',
                                        header=None, names=states_col_name,
                                        chunksize=100_000_000, iterator=True,
                                        low_memory=False)
    states_chunks.extend(s_df[states_filename])
states_df = pd.concat(states_chunks)

# Extract rows of states file whose uncertainty indices are smaller than 0.001.
unc_states_df = states_df[states_df['Unc'] < 0.001]
unc_states_df

Unnamed: 0,ID,energy,g_tot,J_tot,Unc,Gtot,v1,v2,v3,v4,...,Grot,W,n1,n2,n3,n4,l4,n5,l5,Ecalc
0,1,0.000000,1,0,0.0009,A1g,0,0,0,0,...,A1g,1.00,0,0,0,0,0,0,0,-1.000000
1,2,1230.390303,1,0,0.0006,A1g,0,0,0,1,...,A1g,1.00,0,0,0,1,0,1,0,1230.381731
23,24,4800.137287,1,0,0.0006,A1g,0,0,1,0,...,A1g,1.00,0,1,0,0,0,0,2,4800.130696
4790,4791,1328.073466,3,0,0.0003,A2u,0,0,0,0,...,A1g,1.00,0,0,0,0,2,0,0,1328.070858
4824,4825,6556.464783,3,0,0.0001,A2u,0,2,0,0,...,A1g,1.00,0,2,0,0,0,0,0,6556.457311
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2827804,2827805,7027.395462,79,39,0.0005,A1g,0,1,0,3,...,E1g,0.93,0,1,0,3,0,0,0,7030.005164
2845111,2845112,7262.207098,237,39,0.0007,A2g,0,1,0,0,...,E1g,0.95,0,1,0,0,0,3,0,7262.961259
2967785,2967786,6637.994238,83,41,0.0009,A1g,0,1,0,1,...,E2g,-0.97,0,1,0,1,1,0,0,6639.427076
4042389,4042390,7878.770213,351,58,0.0002,A2u,0,1,0,0,...,E1g,0.88,0,1,0,0,0,0,1,7880.271856


## 2.2 Read Partition Function File

In [27]:
pf_col_name = ['T', 'Q']
pf_path = read_path + path_mol_iso + '/' + path_mol_iso.replace('/','__') + '_pf.csv'

# Download the partition function file of this dataset from ExoMol.
pf_url = ('http://www.exomol.com/db/' + path_mol_iso + '/'
          + path_mol_iso.split('/')[1] + '__' + path_mol_iso.split('/')[2] + '.pf')
# A timeout keeps the cell from hanging forever; raise_for_status stops an
# HTTP error page from being parsed as partition function data.
response = requests.get(pf_url, timeout=60)
response.raise_for_status()
content = response.text
pf_data = pd.read_csv(StringIO(content), sep=r'\s+', names=pf_col_name, header=None, engine='python')
# Write only the T and Q columns so the file reads back with exactly the
# two named columns (no implicit index column).
pf_data.to_csv(pf_path, header=False, index=False)

pf_df = pd.read_csv(pf_path, header=None, names=pf_col_name)
# Partition function defined as sum over states at standard 296K.
# Select the row by its temperature instead of by its position, so the
# lookup does not depend on the temperature grid of the file.
ref_rows = pf_df.loc[pf_df['T'] == 296, 'Q']
if ref_rows.empty:
    raise ValueError('No partition function value at T = 296 K in ' + pf_path)
Q = ref_rows.iloc[0]
Q

412.45

## 2.3 Read Transitions Files

Extract the rows of the transitions files whose upper-state ID and lower-state ID are both present in the considered states file.

In [29]:
def extract_unc_trans(trans_df, states=None):
    """Keep the transitions whose upper and lower states are both considered.

    Parameters
    ----------
    trans_df : pandas.DataFrame
        Transitions with the upper-state ID in column 'i' and the lower-state
        ID in column 'f'.
    states : pandas.DataFrame, optional
        Frame whose 'ID' column lists the considered states. When omitted,
        the notebook-level ``unc_states_df`` is used.

    Returns
    -------
    pandas.DataFrame
        The rows of ``trans_df`` (original index labels kept) whose 'i' and
        'f' are both found in the state IDs. Rows are ordered by the position
        of 'f' in the ID list, then by the position of 'i', then by their
        original order; this is the order the former per-ID loops produced.
        Repeated state IDs are counted once.
    """
    if states is None:
        states = unc_states_df
    state_ids = pd.Index(states['ID']).unique()

    # One vectorized membership test replaces one full scan per state ID
    # (and the DataFrame.append calls that pandas 2.0 removed).
    both_known = trans_df['i'].isin(state_ids) & trans_df['f'].isin(state_ids)
    kept = trans_df[both_known]

    # np.lexsort is stable and treats the LAST key as the primary one.
    lower_rank = state_ids.get_indexer(kept['f'])
    upper_rank = state_ids.get_indexer(kept['i'])
    row_order = np.lexsort((upper_rank, lower_rank))
    return kept.iloc[row_order]

## 2.4 Calculating

HITRAN parameters used in the calculation.

In [28]:
# Reference temperature (K).
T = 296
# Planck's constant (J s), speed of light (m s-1), Boltzmann's constant (J K-1).
h, c, kB = 6.62607015e-34, 299792458, 1.380649e-23

# Second radiation constant h*c/kB; the factor 100 converts m to cm (cm K).
c2 = h * c * 100 / kB
# 1 / (8 * pi * c) with c expressed in cm s-1 (s cm-1).
pi_c_8 = 1 / (8 * np.pi * c * 100)
# c2 divided by the reference temperature (cm).
c2_T = c2 / T


Process data for CSV format.

In [30]:
def calculate_csv(unc_states_df, unc_trans_df):
    """Compute the HITRAN-style line parameters of every transition.

    Reads the notebook-level names ``Q``, ``c2_T``, ``pi_c_8``, ``M_mol_iso``,
    ``molecule_id`` and ``isotopologue_id``.

    Parameters
    ----------
    unc_states_df : pandas.DataFrame
        Considered states with unique 'ID' values and the columns 'energy',
        'g_tot', 'Unc', 'n1', 'n2', 'n3', 'n4', 'l4', 'n5', 'l5', 'J', 'K',
        'Gvib', 'Gtot' and 'Grot'.
    unc_trans_df : pandas.DataFrame
        Transitions with upper-state ID 'i', lower-state ID 'f' and
        Einstein A-coefficient 'A_if'. Every ID must occur in unc_states_df.

    Returns
    -------
    pandas.DataFrame
        One row per transition, in the order of unc_trans_df, with a fresh
        0..n-1 index and the columns v, S, A, E_f, Ierr, g_i, g_f,
        <M_mol_iso>, I, gm_a, gm_s, n_a, dt_a, V_i, V_f, Q_i, Q_f, Iref, '*'.

    Raises
    ------
    ValueError
        If the state IDs are not unique.
    KeyError
        If a transition refers to a state ID that is not in unc_states_df.
    """
    states = unc_states_df.set_index('ID')
    if not states.index.is_unique:
        raise ValueError('State IDs in unc_states_df must be unique.')

    def _rows_for(ids):
        # Look up all state rows in one pass instead of one scan per field.
        positions = states.index.get_indexer(ids)
        if (positions < 0).any():
            raise KeyError('A transition refers to a state ID missing from unc_states_df.')
        return states.iloc[positions]

    def _global_quanta(rows):
        # 'Global' quanta string: n1 n2 n3 n4 l4 n5 l5 followed by Gvib.
        fields = rows[['n1', 'n2', 'n3', 'n4', 'l4', 'n5', 'l5', 'Gvib']]
        return ['%2d%2d%2d%2d%2d%2d%2d%3s' % (n1, n2, n3, n4, l4, n5, l5, str(gvib))
                for n1, n2, n3, n4, l4, n5, l5, gvib
                in fields.itertuples(index=False, name=None)]

    def _local_quanta(rows):
        # 'Local' quanta string: J, Gtot, K, Grot.
        fields = rows[['J', 'Gtot', 'K', 'Grot']]
        return [' %3d%3s%3d%3s' % (j, str(gtot), k, str(grot))
                for j, gtot, k, grot in fields.itertuples(index=False, name=None)]

    upper = _rows_for(unc_trans_df['i'].to_numpy())
    lower = _rows_for(unc_trans_df['f'].to_numpy())

    A = unc_trans_df['A_if'].to_numpy()       # Einstein-A coefficient (s-1)
    g_i = upper['g_tot'].to_numpy()           # Total degeneracy of upper state
    g_f = lower['g_tot'].to_numpy()           # Total degeneracy of lower state
    E_i = upper['energy'].to_numpy()          # Upper state energy
    E_f = lower['energy'].to_numpy()          # Lower state energy

    # Uncertainties of both states combined in quadrature.
    unc = np.sqrt(upper['Unc'].to_numpy() ** 2 + lower['Unc'].to_numpy() ** 2)
    # Vacuum wavenumber (cm-1).
    v = np.abs(E_i - E_f)
    # Intensities (cm-1/molecule cm-2) at standard 296K.
    S = g_i * A * np.exp(- c2_T * E_f) * (1 - np.exp(- c2_T * v)) * pi_c_8 / (v ** 2) / Q

    # Dict order fixes the column order; scalars are broadcast to every row.
    iso_csv_df = pd.DataFrame({
        'v': v,                              # Vacuum wavenumber (cm-1)
        'S': S,                              # Intensities at standard 296K
        'A': A,                              # Einstein A-coefficient
        'E_f': E_f,                          # Lower state energy
        'Ierr': unc,                         # Uncertainty indices
        'g_i': g_i,                          # Statistical weight of upper state
        'g_f': g_f,                          # Statistical weight of lower state
        M_mol_iso: molecule_id,              # Molecule number
        'I': isotopologue_id,                # Isotopologue number
        'gm_a': np.nan,                      # Air-broadened half-width
        'gm_s': np.nan,                      # Self-broadened half-width
        'n_a': np.nan,                       # Temperature-dependence exponent for gamma_air
        'dt_a': np.nan,                      # Air pressure-induced line shift
        'V_i': _global_quanta(upper),        # Upper-state 'global' quanta
        'V_f': _global_quanta(lower),        # Lower-state 'global' quanta
        'Q_i': _local_quanta(upper),         # Upper-state 'local' quanta
        'Q_f': _local_quanta(lower),         # Lower-state 'local' quanta
        'Iref': np.nan,                      # Reference indices
        '*': np.nan,                         # Flag
    })

    return iso_csv_df
    

# Part 3: Save as CSV Format

In [31]:
trans_col_name = ['i', 'f', 'A_if']
# Chunked readers, keyed by the transitions file they read.
t_df = dict()
# One result frame per non-empty chunk. Every chunk is kept: previously a
# later chunk of the same file overwrote the result of an earlier one.
csv_frames = []
trans_filenames = glob.glob(read_path + path_mol_iso + '/' + '*trans.bz2')
for trans_filename in tqdm(trans_filenames):
    # Raw string so that '\s' reaches pandas as a regex, not a string escape.
    t_df[trans_filename] = pd.read_csv(trans_filename, compression='bz2', sep=r'\s+',
                                       usecols=[0,1,2], header=None, names=trans_col_name,
                                       chunksize=100_000_000, iterator=True, low_memory=False)
    for trans_df in t_df[trans_filename]:
        unc_trans_df = extract_unc_trans(trans_df)
        # Skip chunks in which no transition connects two considered states.
        if len(unc_trans_df) != 0:
            csv_frames.append(calculate_csv(unc_states_df, unc_trans_df))

order = [M_mol_iso, 'I', 'v', 'S', 'A', 'gm_a', 'gm_s', 'E_f', 'n_a', 'dt_a',
         'V_i', 'V_f', 'Q_i', 'Q_f', 'Ierr', 'Iref', '*', 'g_i', 'g_f']
# Concatenate once (DataFrame.append was removed in pandas 2.0). When no
# transition survived, write an empty table that still has the header.
if csv_frames:
    species_csv_df = pd.concat(csv_frames)[order]
else:
    species_csv_df = pd.DataFrame(columns=order)
# Sort by increasing wavenumber
species_csv_df = species_csv_df.sort_values(['v'], ascending = True).reset_index(drop=True)
# Save into a CSV file with column names which contain molecule name.
species_csv_df.to_csv(result_path + path_mol_iso.replace('/','__') + '.csv', header=True, index=False)

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1161), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1161), HTML(value='')))




HBox(children=(IntProgress(value=0, max=62), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1161), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1161), HTML(value='')))




HBox(children=(IntProgress(value=0, max=817), HTML(value='')))





In [32]:
# Display the final line list as the cell output.
species_csv_df

Unnamed: 0,M of C2H2__12C2-1H2__aCeTY,I,v,S,A,gm_a,gm_s,E_f,n_a,dt_a,V_i,V_f,Q_i,Q_f,Ierr,Iref,*,g_i,g_f
0,26,1,10.296416,9.475024e-37,1.087900e-11,,,4051.649572,,,2 0 0 0 0 0 0A1g,0 1 0 0 0 0 1E1u,10A1g 0A1g,11A1u 1E1g,0.000361,,,21,23
1,26,1,19.349552,3.011057e-37,1.335100e-09,,,5368.795869,,,1 1 0 0 0 0 0A2u,0 1 0 0 2 1 0E1g,10A2u 0A1g,11A2g 1E1g,0.000849,,,63,69
2,26,1,26.222544,2.346822e-34,6.598100e-09,,,4035.723444,,,2 0 0 0 0 0 0A1g,1 0 0 3 0 0 0E1u,10A1g 0A1g,11A1u 1E1g,0.000424,,,21,23
3,26,1,27.554793,2.387035e-36,8.581300e-10,,,4775.177141,,,0 1 0 0 2 0 0A2u,0 1 0 1 1 0 0E2g,10A2u 0A1g,11A2g 2E2g,0.000361,,,63,69
4,26,1,38.306563,5.259989e-33,2.559200e-06,,,4764.425371,,,0 1 0 0 2 0 0A2u,0 1 0 0 2 0 0A1g,10A2u 0A1g,11A2g 0A2g,0.000361,,,63,69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
874,26,1,201.061380,6.979440e-39,4.021800e-03,,,8846.067174,,,1 1 0 4 0 0 0A1g,3 0 0 0 0 3 0E1u,27A2g 0A2g,26A2u 1E1g,0.000361,,,165,159
875,26,1,201.083395,2.770243e-41,6.622800e-05,,,9123.218857,,,2 1 0 0 1 1 0A1u,0 2 0 3 0 0 0E1g,25A2u 0A2g,24A2g 1E1g,0.000412,,,153,147
876,26,1,201.345554,1.837834e-44,2.980300e-08,,,8998.085846,,,2 1 0 0 1 0 1E2g,1 2 0 0 0 0 0A2u,20A2g 2E2g,20A2u 0A1g,0.000361,,,123,123
877,26,1,201.679633,9.015327e-45,2.165000e-08,,,9097.534654,,,2 1 0 0 1 0 1E2g,1 2 0 0 0 0 0A2u,22A2g 2E2g,22A2u 0A1g,0.000539,,,135,135
