In [1]:
import os
#import vaex
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
from io import StringIO
from tqdm import tqdm_notebook as tqdm_nb

In [None]:
molecule = []
iso_slug = []
isotopologue = []
path_mol_iso = []
for line in open('./data/url/api__urls.txt'):
    molecule.append(line.split('/')[-4])
    iso_slug.append(line.split('/')[-3])
    isotopologue.append(line.split('/')[-2])
    path_mol_iso.append(line.split('/')[-4] + '/' + line.split('/')[-3] + '/' + line.split('/')[-2])

molecule_list = list(set(molecule))
molecule_list.sort(key=molecule.index)

iso_slug_list = list(set(iso_slug))
iso_slug_list.sort(key=iso_slug.index)

isotopologue_list = list(set(isotopologue))
isotopologue_list.sort(key=isotopologue.index)

path_mol_iso_list = list(set(path_mol_iso))
path_mol_iso_list.sort(key=path_mol_iso.index)

print('Molecule:', molecule_list)
print('Iso-slug:', iso_slug_list)
print('Isotopologue:', isotopologue_list)
print('Total:', path_mol_iso_list)


In [2]:
path_mol_iso_list = ['AlH/27Al-1H/AlHambra', 'TiO/48Ti-16O/Toto']

In [3]:
def chunk(reader):    
    loop = True
    chunk_size = 100000000
    chunks = []
    while loop:
        try:
            chunk = reader.get_chunk(chunk_size)
            chunks.append(chunk)
        except StopIteration:
            loop = False
            print('Iteration is stopped.')
    df = pd.concat(chunks, ignore_index=True)
    return df


In [4]:
T = 296                      # Reference temperature k
h = 6.62607015e-34           # Planck's const (J s)
c = 299792458                # Velocity of light (m s-1)
kB = 1.380649e-23            # Boltzmann's const (J K-1)

c2 = h * c * 100 / kB                  # Second radiation constant (cm K)
pi_c_8 = 1 / (8 * np.pi * c * 100)     # 8 * pi * c (cm-1 s)
c2_T = c2 / T                          # c2 / T  (cm)


In [5]:
def extract_unc_states(states_df):
    unc = states_df['Unc'].values
    unc_states_num = states_df['Unc'].count()
    unc_states_list = []
    for i in tqdm(range(unc_states_num)):
        if (unc[i] < float(0.001)):
            unc_states_list.append(states_df.loc[i].values)

    unc_states_df = pd.DataFrame(unc_states_list, columns=states_col_name)
    return unc_states_df


In [6]:
def extract_unc_trans(trans_df):
    upper_id = trans_df['i'].values
    lower_id = trans_df['f'].values
    state_id = unc_states_df['N'].values
    
    # Extract the same upper states ID from states_df
    unc_trans_i_df = pd.DataFrame()
    for id_1 in tqdm(state_id):
        unc_trans_i_df = unc_trans_i_df.append(trans_df[trans_df['f'].isin([id_1])])
        
    # Extract the same lower states ID from states_df
    unc_trans_df = pd.DataFrame()
    for id_2 in tqdm(state_id):
        unc_trans_df = unc_trans_df.append(unc_trans_i_df[unc_trans_i_df['i'].isin([id_2])])
        
    return unc_trans_df

In [7]:
def calculate_csv(unc_states_df, unc_trans_df, pf_df):
    Q = pf_df['Q'].values[296+1]         # Partition function defined as sum over states at standard 296K
    
    unc_upper_id = unc_trans_df['i'].values
    unc_lower_id = unc_trans_df['f'].values 
    state_id = unc_states_df['N']
    unc_trans_num = unc_trans_df['i'].count()

    wavenumber = pd.DataFrame()             # Vacuum wavenumber (cm−1)
    intensity = pd.DataFrame()              # Intensities (cm-1/molecule cm-2) at standard 296K         
    A_coefficient = []                      # Einstein A-coefficient
    lower_state_energy = pd.DataFrame()     # lower state energy
    uncertainty = pd.DataFrame()            # Uncertainty indices
    weight_upper_state = pd.DataFrame()     # Nuclear-spin statistical weight of upper state
    weight_lower_state = pd.DataFrame()     # Nuclear-spin statistical weight of lower state


    for i in tqdm(range(unc_trans_num)):
        id_i = unc_upper_id[i]
        id_f = unc_lower_id[i]
        A = unc_trans_df['A_if'].values[i]                               # Einstein-A coefficient (s−1)
        J_f = unc_states_df[state_id.isin([id_f])]['J'].values           # Corresponding total angular momentum
        g_i_ns = unc_states_df[state_id.isin([id_i])]['g'].values        # Nuclear-spin statistical weight of upper state
        g_f_ns = unc_states_df[state_id.isin([id_f])]['g'].values        # Nuclear-spin statistical weight of lower state
        E_i = unc_states_df[state_id.isin([id_i])]['E'].values           # Upper state energy
        E_f = unc_states_df[state_id.isin([id_f])]['E'].values           # Lower state energy
        unc = unc_states_df[state_id.isin([id_i])]['Unc'].values         # Uncertainty indices

        g_f_tot = g_f_ns * (2 * J_f + 1)                # Total degeneracy
        v = (E_i ** 2 + E_f ** 2) ** 0.5                # Vacuum wavenumber (cm−1)
        I = g_f_tot * A * np.exp(- c2_T * E_i) * (1 - np.exp(- c2_T * v)) * pi_c_8 / (v ** 2) / Q    # Intensities

        wavenumber = wavenumber.append(pd.DataFrame(v))
        intensity = intensity.append(pd.DataFrame(I))
        A_coefficient.append(A)
        lower_state_energy = lower_state_energy.append(pd.DataFrame(E_f))
        uncertainty = uncertainty.append(pd.DataFrame(unc))
        weight_upper_state = weight_upper_state.append(pd.DataFrame(g_i_ns))
        weight_lower_state = weight_lower_state.append(pd.DataFrame(g_f_ns))

    iso_csv_df = unc_trans_df
    
    iso_csv_df[M_mol_iso] = '**'                        # Molecule number
    iso_csv_df['I'] = '*'                               # Isotopologue number
    iso_csv_df['v'] = wavenumber.values                 # Vacuum wavenumber (cm−1)
    iso_csv_df['S'] = intensity.values                  # Intensities (cm-1/molecule cm-2) at standard 296K  
    #iso_csv_df['S'].describe().round(6)
    iso_csv_df['A'] = A_coefficient                     # Einstein A-coefficient
    iso_csv_df['gm_a'] = np.nan                         # Air-broadened half-width
    iso_csv_df['gm_s'] = np.nan                         # Self-broadened half-width
    iso_csv_df['E_f'] = lower_state_energy.values       # lower state energy
    iso_csv_df['n_a'] = np.nan                          # Temperature-dependence exponent for gamma_air
    iso_csv_df['dt_a'] = np.nan                         # Air pressure-induced line shift
    iso_csv_df['V_i'] = np.nan                          # Upper-state 'global' quanta
    iso_csv_df['V_f'] = np.nan                          # Lower-state 'global' quanta
    iso_csv_df['Q_i'] = np.nan                          # Upper-state 'local' quanta
    iso_csv_df['Q_f'] = np.nan                          # Lower-state 'local' quanta
    iso_csv_df['Ierr'] = uncertainty.values             # Uncertainty indices
    iso_csv_df['Iref'] = np.nan                         # Reference indices
    iso_csv_df['*'] = np.nan                            # Flag
    iso_csv_df['g_i'] = weight_upper_state.values       # Nuclear-spin statistical weight of upper state
    iso_csv_df['g_f'] = weight_lower_state.values       # Nuclear-spin statistical weight of lower state
            
    order = [M_mol_iso, 'I', 'v', 'S', 'A', 'gm_a', 'gm_s', 'E_f', 'n_a', 'dt_a',
             'V_i', 'V_f', 'Q_i', 'Q_f', 'Ierr', 'Iref', '*', 'g_i', 'g_f']
    iso_csv_df = iso_csv_df[order]
    
    # Sort by increasing wavenumber
    iso_csv_df = iso_csv_df.sort_values(['v'], ascending = True).reset_index(drop=True)

    return iso_csv_df
    

In [8]:
def convert_uncertainty_code(iso_HITRAN_df):
    iso_HITRAN_num = iso_HITRAN_df['Ierr'].count()
    Ierr = []
    for i in range(iso_HITRAN_num):
        uncertainty = iso_HITRAN_df['Ierr'].values[i]
        uncertainty_value = float(uncertainty)
        if (0.0001 <= uncertainty_value < 0.001):
            uncertainty_code = '{:_>6}'.format(4)
        elif (0.00001 <= uncertainty_value < 0.0001):
            uncertainty_code = '{:_>6}'.format(5)
        elif (uncertainty_value < 0.00001):
            uncertainty_code = '{:_>6}'.format(6)
        Ierr.append(uncertainty_code)
    return Ierr

In [9]:
def convert_csv_to_HITRAN(iso_csv_df):
    iso_HITRAN_df = iso_csv_df
    Ierr = convert_uncertainty_code(iso_HITRAN_df)
    
    iso_HITRAN_df[M_mol_iso] = '**'
    iso_HITRAN_df['I'] = '*'
    iso_HITRAN_df['v'] = iso_csv_df['v'].map('{:_>12.6F}'.format)
    iso_HITRAN_df['S'] = iso_csv_df['S'].map('{:_>10.3E}'.format)
    iso_HITRAN_df['A'] = iso_csv_df['A'].map('{:_>10.3E}'.format)
    iso_HITRAN_df['gm_a'] = '_' * 5
    iso_HITRAN_df['gm_s'] = '_' * 5
    iso_HITRAN_df['E_f'] = iso_csv_df['E_f'].map('{:_>10.4F}'.format)
    iso_HITRAN_df['n_a'] = '_' * 4
    iso_HITRAN_df['dt_a'] = '_' * 8
    iso_HITRAN_df['V_i'] = '_' * 15
    iso_HITRAN_df['V_f'] = '_' * 15
    iso_HITRAN_df['Q_i'] = '_' * 15
    iso_HITRAN_df['Q_f'] = '_' * 15
    iso_HITRAN_df['Ierr'] = Ierr
    iso_HITRAN_df['Iref'] = '_' * 12
    iso_HITRAN_df['*'] = '_'
    iso_HITRAN_df['g_i'] = iso_csv_df['g_i'].map('{:_>7.1F}'.format)
    iso_HITRAN_df['g_f'] = iso_csv_df['g_f'].map('{:_>7.1F}'.format)

    return iso_HITRAN_df


In [10]:
read_path = './data/states_trans_pf/'

# Create a folder for saving result files.
# If the folder exists, delete (empty) the folder then create it.
result_path = './data/result/'
if os.path.exists(result_path):                    # Determine whether the folder exists or not.
    for root, dirs, files in os.walk(result_path, topdown=False):
        for name in files:
            os.remove(os.path.join(root, name))    # Delete files in the folder.
        for name in dirs:
            os.rmdir(os.path.join(root, name))     # Delete the sub-folder.
    os.rmdir(result_path)                          # Delete the folder.
os.mkdir(result_path)                              # Create the folder.


In [11]:
states_col_name = ['N', 'E', 'g', 'J', 'Unc', 'c5', 'c6', 'c7', 'c8', 'c9', 'c10',
                   'c11', 'c12', 'c13', 'c14','c15', 'c16', 'c17', 'c18', 'c19',
                   'c20', 'c21', 'c22', 'c23', 'c24', 'c25', 'c26']
trans_col_name = ['i', 'f', 'A_if', 'v_if']
pf_col_name = ['T', 'Q', 'W']
result_col_name = ['M', 'I', 'v', 'S', 'A', 'gamma_air', 'gamma_self',
                   'E_f', 'n_air', 'delta_air', 'V_i', 'V_f', 'Q_i', 'Q_f',
                   'Ierr', 'Iref', '*', 'g_i', 'g_f']


for path_mol_iso in tqdm(path_mol_iso_list):
    M_mol_iso = 'M of ' + path_mol_iso.replace('/','__')
    
    states_path = read_path + path_mol_iso.replace('/','__') + '_states.csv'
    trans_path = read_path + path_mol_iso.replace('/','__') + '_trans.csv'
    pf_path = read_path + path_mol_iso.replace('/','__') + '_pf.csv'

    states_reader = pd.read_csv(states_path, names=states_col_name, header=None, iterator=True, engine='python')
    trans_reader = pd.read_csv(trans_path, names=trans_col_name, header=None, iterator=True, engine='python')
    pf_reader = pd.read_csv(pf_path, names=pf_col_name, header=None, iterator=True, engine='python')
    
    states_df = chunk(states_reader)
    trans_df = chunk(trans_reader)
    pf_df = chunk(pf_reader)
    
    unc_states_df = extract_unc_states(states_df)
    unc_trans_df = extract_unc_trans(trans_df)
    iso_csv_df = calculate_csv(unc_states_df, unc_trans_df, pf_df)
    
    iso_csv_df.to_csv(result_path + path_mol_iso.replace('/','__') + '.csv', header=True)   
    csv_df = iso_csv_df.to_csv(result_path + 'csv_result.csv', header=True, mode='a')
    
    iso_HITRAN_df = convert_csv_to_HITRAN(iso_csv_df)
    HITRAN_df = iso_HITRAN_df.to_csv(result_path + 'demo.txt', sep=' ', index=False, header=True, mode='a')
    

  0%|                                                                                            | 0/2 [00:00<?, ?it/s]
100%|███████████████████████████████████████████████████████████████████████████| 1551/1551 [00:00<00:00, 40923.77it/s]

  0%|                                                                                          | 0/143 [00:00<?, ?it/s]
 43%|██████████████████████████████████▏                                             | 61/143 [00:00<00:00, 605.57it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 143/143 [00:00<00:00, 485.99it/s]

  0%|                                                                                          | 0/143 [00:00<?, ?it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 143/143 [00:00<00:00, 775.04it/s]

  0%|                                                                                          | 0/613 [00:00<?, ?it/s]
  2%|█▋                              

# Read result

In [14]:
csv_reader = pd.read_csv(result_path + 'csv_result.csv', names = result_col_name, header=None, iterator=True, engine='python')
csv_df = chunk(csv_reader)

In [15]:
csv_df

Unnamed: 0,M,I,v,S,A,gamma_air,gamma_self,E_f,n_air,delta_air,V_i,V_f,Q_i,Q_f,Ierr,Iref,*,g_i,g_f
0,M of AlH__27Al-1H__AlHambra,I,v,S,A,gm_a,gm_s,E_f,n_a,dt_a,V_i,V_f,Q_i,Q_f,Ierr,Iref,*,g_i,g_f
1,**,*,12.591906,1.8045168069331727e-22,1.2873e-05,,,0.0,,,,,,,0.00027400000000000005,,,36,12
2,**,*,39.83393696166815,4.117106246786245e-21,0.00012450000000000002,,,12.591906,,,,,,,0.00013700000000000002,,,60,36
3,**,*,84.47289488060437,1.4827062307889854e-20,0.00045588,,,37.791354,,,,,,,0.000166,,,84,60
4,**,*,146.80026486255178,2.855622357990793e-20,0.0011406,,,75.547889,,,,,,,6.9e-05,,,108,84
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
717,**,*,27582.242750200126,4.426369946524666e-114,1.1223999999999996e-51,,,19503.590889,,,,,,,0.0007,,,23,23
718,**,*,27598.829008898316,1.1219118039132067e-112,2.552199999999999e-50,,,19515.319145,,,,,,,0.00099,,,25,25
719,**,*,27642.203894864662,2.4924113706126884e-115,3.4101999999999996e-52,,,19545.989821,,,,,,,0.00063,,,11,11
720,**,*,27671.569415819158,1.002403328058264e-113,6.365699999999998e-51,,,19566.75438,,,,,,,0.00075,,,17,17


In [16]:
csv_df[607:618]

Unnamed: 0,M,I,v,S,A,gamma_air,gamma_self,E_f,n_air,delta_air,V_i,V_f,Q_i,Q_f,Ierr,Iref,*,g_i,g_f
607,**,*,12999.9947966916,6.699832385557435e-45,0.0075253,,,6262.862706,,,,,,,0.0003,,,756,780
608,**,*,13122.104097020909,2.2355562969276543e-37,106.78,,,8750.8075,,,,,,,0.000212,,,732,756
609,**,*,13378.637382908593,2.5384297882524987e-42,3.4287,,,7015.0925,,,,,,,0.0003,,,756,732
610,**,*,13422.257638789635,4.2621538293504993e-38,426.01,,,8680.640831,,,,,,,0.0005,,,540,516
611,**,*,14167.298063585173,5.13568025469715e-41,77.788,,,8422.346406,,,,,,,0.0003,,,756,732
612,**,*,14769.985495949735,4.184592395529345e-40,243.75,,,9554.707806,,,,,,,0.0002,,,924,900
613,**,*,15012.974371216374,2.2610040971287043e-40,384.57,,,9778.1892,,,,,,,0.0003,,,756,732
614,M of TiO__48Ti-16O__Toto,I,v,S,A,gm_a,gm_s,E_f,n_a,dt_a,V_i,V_f,Q_i,Q_f,Ierr,Iref,*,g_i,g_f
615,**,*,13.386746177311423,7.921646772119165e-43,3.4575e-26,,,9.465859,,,,,,,0.0002,,,9,9
616,**,*,20.846356437516846,1.6305481264446265e-42,7.7488e-26,,,14.7406,,,,,,,0.00019,,,11,11


In [17]:
iso_HITRAN_df

Unnamed: 0,M of TiO__48Ti-16O__Toto,I,v,S,A,gm_a,gm_s,E_f,n_a,dt_a,V_i,V_f,Q_i,Q_f,Ierr,Iref,*,g_i,g_f
0,**,*,___13.386746,_7.922E-43,_3.457E-26,_____,_____,____9.4659,____,________,_______________,_______________,_______________,_______________,_____4,____________,_,____9.0,____9.0
1,**,*,___20.846356,_1.631E-42,_7.749E-26,_____,_____,___14.7406,____,________,_______________,_______________,_______________,_______________,_____4,____________,_,___11.0,___11.0
2,**,*,___29.793243,_2.951E-42,_1.512E-25,_____,_____,___21.0670,____,________,_______________,_______________,_______________,_______________,_____6,____________,_,___13.0,___13.0
3,**,*,__139.672550,_1.324E-54,_4.022E-36,_____,_____,___98.7634,____,________,_______________,_______________,_______________,_______________,_____4,____________,_,____5.0,____5.0
4,**,*,__144.202206,_1.694E-52,_2.780E-34,_____,_____,__101.9664,____,________,_______________,_______________,_______________,_______________,_____4,____________,_,____7.0,____7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,**,*,27582.242750,4.426E-114,_1.122E-51,_____,_____,19503.5909,____,________,_______________,_______________,_______________,_______________,_____4,____________,_,___23.0,___23.0
103,**,*,27598.829009,1.122E-112,_2.552E-50,_____,_____,19515.3191,____,________,_______________,_______________,_______________,_______________,_____4,____________,_,___25.0,___25.0
104,**,*,27642.203895,2.492E-115,_3.410E-52,_____,_____,19545.9898,____,________,_______________,_______________,_______________,_______________,_____4,____________,_,___11.0,___11.0
105,**,*,27671.569416,1.002E-113,_6.366E-51,_____,_____,19566.7544,____,________,_______________,_______________,_______________,_______________,_____4,____________,_,___17.0,___17.0


In [18]:
def read_txt_in_chunks(path, chunk_size=1024*1024):
    file = open(path, 'r')
    while True:
        chunk_data = file.read(chunk_size)
        if not chunk_data:
            break
        yield chunk_data

HITRAN_path = result_path + 'demo.txt'
with open(result_path + 'HITRAN_result.txt', 'a') as save_file:
    for chunk in read_txt_in_chunks(HITRAN_path):
        string = str(chunk).replace(' ','').replace('"Mof','').replace('"IvSAgm_agm_sE_fn_adt_aV_iV_fQ_iQ_fIerrIref*g_ig_f','').replace('_',' ')
        save_file.write(string)
