## Import all what we need

In [1]:
import os
import math
import glob
import requests
import numpy as np
import pandas as pd
from io import StringIO
from tqdm import tqdm_notebook as tqdm

# Part 1: HITRAN Online Information

Get the names of molecules, iso-slugs and isotopoluge datasets from the api__urls.txt which saved the URLs with molecule, iso-slug and isotopologue. Combine them with '/' for reading files from folders more convenient later.

In [2]:
molecule = []
iso_slug = []
isotopologue = []
path_mol_iso = []
for line in open('./data/url/api__urls.txt'):
    molecule.append(line.split('/')[-4])
    iso_slug.append(line.split('/')[-3])
    isotopologue.append(line.split('/')[-2])
    path_mol_iso.append(line.split('/')[-4] + '/' + line.split('/')[-3] + '/' + line.split('/')[-2])

molecule_list = list(set(molecule))
molecule_list.sort(key=molecule.index)

iso_slug_list = list(set(iso_slug))
iso_slug_list.sort(key=iso_slug.index)

isotopologue_list = list(set(isotopologue))
isotopologue_list.sort(key=isotopologue.index)

path_mol_iso_list = list(set(path_mol_iso))
path_mol_iso_list.sort(key=path_mol_iso.index)

print('Molecule:', molecule_list)
print('Iso-slug:', iso_slug_list)
print('Isotopologue:', isotopologue_list)
print('Total:', path_mol_iso_list)


Molecule: ['AlH', 'C2H2', 'C2', 'CO2', 'H2O', 'H3O_p', 'NH3', 'TiO']
Iso-slug: ['27Al-1H', '12C2-1H2', '12C2', '12C-16O2', '1H2-16O', '1H3-16O_p', '14N-1H3', '46Ti-16O', '47Ti-16O', '48Ti-16O', '49Ti-16O', '50Ti-16O']
Isotopologue: ['AlHambra', 'aCeTY', '8states', 'UCL-4000', 'POKAZATEL', 'eXeL', 'CoYuTe', 'Toto']
Total: ['AlH/27Al-1H/AlHambra', 'C2H2/12C2-1H2/aCeTY', 'C2/12C2/8states', 'CO2/12C-16O2/UCL-4000', 'H2O/1H2-16O/POKAZATEL', 'H3O_p/1H3-16O_p/eXeL', 'NH3/14N-1H3/CoYuTe', 'TiO/46Ti-16O/Toto', 'TiO/47Ti-16O/Toto', 'TiO/48Ti-16O/Toto', 'TiO/49Ti-16O/Toto', 'TiO/50Ti-16O/Toto']


Convert the iso-slug names into the ones which are shown in the table of HITRAN online website. It will help us to get their corresponding molecule numbers, isotopologue numbers and fractional abundances. 

The HITRAN online URL is: https://hitran.org/docs/iso-meta/.

In [3]:
unc_formula = pd.DataFrame(eval(str(iso_slug_list).replace('1H','H').replace('-','').replace('_p','+')))
unc_formula.columns = ['exomol formula']
unc_formula

Unnamed: 0,exomol formula
0,27AlH
1,12C2H2
2,12C2
3,12C16O2
4,H216O
5,H316O+
6,14NH3
7,46Ti16O
8,47Ti16O
9,48Ti16O


Information for calculations to obtain the HITRAN format data.

In [4]:
hitran_online = pd.DataFrame()
hitran_online['exomol formula'] = unc_formula['exomol formula']
hitran_online['molecule ID'] = ['50','26','51','2','1','52','11','53','53','53','53','53']
hitran_online['isotopologue ID'] = ['1','1','1','1','1','1','1','1','2','3','4','5']
hitran_online['fractional abundance'] = ['1','0.977599','1','0.984204','0.997317','1','0.995872','1','1','1','1','1']
hitran_online


Unnamed: 0,exomol formula,molecule ID,isotopologue ID,fractional abundance
0,27AlH,50,1,1.0
1,12C2H2,26,1,0.977599
2,12C2,51,1,1.0
3,12C16O2,2,1,0.984204
4,H216O,1,1,0.997317
5,H316O+,52,1,1.0
6,14NH3,11,1,0.995872
7,46Ti16O,53,1,1.0
8,47Ti16O,53,2,1.0
9,48Ti16O,53,3,1.0


# Part 2: Process Data to Satisfy HITRAN Format

Convert uncertainty values which we calculated in each molecule codes into uncertainty code. See information from https://hitran.org/docs/uncertainties/.

In [5]:
def convert_uncertainty_code(HITRAN_df):
    HITRAN_num = HITRAN_df['Ierr'].count()
    uncertainty_code = 0
    Ierr = []
    for i in range(HITRAN_num):
        uncertainty = HITRAN_df['Ierr'].values[i]
        uncertainty_value = float(uncertainty)
        if (0.1 <= uncertainty_value < 1):
            uncertainty_code = '{:>1}'.format(1) + '40000'
        elif (0.01 <= uncertainty_value < 0.1):
            uncertainty_code = '{:>1}'.format(2) + '40000'
        elif (0.001 <= uncertainty_value < 0.01):
            uncertainty_code = '{:>1}'.format(3) + '40000'
        elif (0.0001 <= uncertainty_value < 0.001):
            uncertainty_code = '{:>1}'.format(4) + '40000'
        elif (0.00001 <= uncertainty_value < 0.0001):
            uncertainty_code = '{:>1}'.format(5) + '40000'
        elif (0.000001 <= uncertainty_value < 0.00001):
            uncertainty_code = '{:>1}'.format(6) + '40000'
        elif (0.0000001 <= uncertainty_value < 0.000001):
            uncertainty_code = '{:>1}'.format(7) + '40000'
        elif (0.00000001 <= uncertainty_value < 0.0000001):
            uncertainty_code = '{:>1}'.format(8) + '40000'
        elif (uncertainty_value < 0.00000001):
            uncertainty_code = '{:>1}'.format(9) + '40000'
        else:
            uncertainty_code = '{:>1}'.format(0) + '40000'
        Ierr.append(uncertainty_code)
    return Ierr

Convert CSV format into HITRAN format. All intensities less than $1.0 \times e^{-30}$ can be ignored. We only extract those rows whose intensity is larger than $1.0 \times e^{-30}$ .

To save data as HITRAN format, we just use _ to instead of blanks and save as a demo result. Then we will use this demo result to convert it into HITRAN format result.

In [6]:
def convert_csv_to_HITRAN(csv_df):
    HITRAN_df = csv_df[csv_df['S'] > 1.0E-30]
    Ierr = convert_uncertainty_code(HITRAN_df)
    
    HITRAN_df['M'] = HITRAN_df['M'].map('{:_>2}'.format)
    HITRAN_df['I'] = HITRAN_df['I'].map('{:>1}'.format)
    HITRAN_df['v'] = HITRAN_df['v']
    HITRAN_df['S'] = HITRAN_df['S'] * fractional_abundance
    HITRAN_df['S'] = HITRAN_df['S'].map('{:_>10.3E}'.format)
    HITRAN_df['A'] = HITRAN_df['A'].map('{:_>10.3E}'.format)
    HITRAN_df['gm_a'] = '_' * 5
    HITRAN_df['gm_s'] = '_' * 5
    HITRAN_df['E_f'] = HITRAN_df['E_f'].map('{:_>10.4F}'.format)
    HITRAN_df['n_a'] = '_' * 4
    HITRAN_df['dt_a'] = '_' * 8
    HITRAN_df['V_i'] = HITRAN_df['V_i']
    HITRAN_df['V_f'] = HITRAN_df['V_f']
    HITRAN_df['Q_i'] = HITRAN_df['Q_i']
    HITRAN_df['Q_f'] = HITRAN_df['Q_f']
    HITRAN_df['Ierr'] = Ierr
    HITRAN_df['Iref'] = '_' * 12
    HITRAN_df['*'] = '_'
    HITRAN_df['g_i'] = HITRAN_df['g_i'].map('{:_>7.1F}'.format)
    HITRAN_df['g_f'] = HITRAN_df['g_f'].map('{:_>7.1F}'.format)

    return HITRAN_df


Read data in chunks.

In [7]:
def read_txt_in_chunks(path, chunk_size=1024*1024):
    file = open(path, 'r')
    while True:
        chunk_data = file.read(chunk_size)
        if not chunk_data:
            break
        yield chunk_data

Sort HITRAN format data by increasing wavenumbers and then convert wavenumbers format to be similar as other columns which is using _ instead of blank.


Since if there is " in a value of DataFrame, then when we save data into a text file, there will be two more " at the begining and the end of this value. To avoid this problem, we replace ' to be upper and replace " to be lower. We will then convert upper and lower back into ' and " later when we write data into a HITRAN format result text file.


Change the order to Change the column order to satisfy the HITRAN format. Save a demo file as a text file for convering into HITRAN format result text file later.

Read the demo file and replace string upper and lower back to ' and ". Then write data into a text file. After all we obtain the HITRAN format result text file.

In [8]:
def save_hitran(df, demo_path, save_path):
    # Sort HITRAN format data by increasing wavenumbers.
    df = df.sort_values(['v'], ascending = True).reset_index(drop=True)
    # Convert wavenumbers format to be similar as other columns which is using _ instead of blank.
    df['v'] = df['v'].map('{:_>12.6F}'.format)
    # To avoid the changes of ' and " when converting from csv to txt.
    df['V_i'] = df['V_i'].str.replace("'","upper")
    df['V_i'] = df['V_i'].str.replace('"','lower')
    df['V_f'] = df['V_f'].str.replace("'","upper")
    df['V_f'] = df['V_f'].str.replace('"','lower')
    df['Q_i'] = df['Q_i'].str.replace("'","upper")
    df['Q_i'] = df['Q_i'].str.replace('"','lower')
    df['Q_f'] = df['Q_f'].str.replace("'","upper")
    df['Q_f'] = df['Q_f'].str.replace('"','lower')
    # Change the column order to satisfy the HITRAN format.
    order = ['M', 'I', 'v', 'S', 'A', 'gm_a', 'gm_s', 'E_f', 'n_a', 'dt_a',
             'V_i', 'V_f', 'Q_i', 'Q_f', 'Ierr', 'Iref', '*', 'g_i', 'g_f']
    df = df[order]
    # Save a demo file for converting into HITRAN format.
    df.to_csv(demo_path, header=None, index=False)
    
    with open(save_path, 'w') as save_file:
        for chunk in read_txt_in_chunks(demo_path):
            # Replace back to ' and " to satisfy the HITRAN format.
            string = str(chunk).replace(',','').replace('_',' ').replace("upper","'").replace('lower','"')
            save_file.write(string)

A folder for save demo files, the files which are named with demo are not the results.

In [9]:
# Create a folder for saving demo files.
# If the folder exists, save files directory,otherwise, create it.
demo_file_path = './data/result/demo/'
if os.path.exists(demo_file_path):                   # Determine whether the folder exists or not.
    pass
else:
    os.makedirs(demo_file_path, exist_ok=True)       # Create the folder.

A folder for save HITRAN format files, the files which are named as $molecule__iso-slug__isotopologue_HITRAN.txt$.

In [10]:
# Create a folder for saving HITRAN format files.
# If the folder exists, save files directory,otherwise, create it.
hitran_file_path = './data/result/hitran/'
if os.path.exists(hitran_file_path):                   # Determine whether the folder exists or not.
    pass
else:
    os.makedirs(hitran_file_path, exist_ok=True)       # Create the folder.

#### Save HITRAN format files 


Read all molecule CSV format results. 

Concatenate these CSV files into a large CSV file. In this CSV file, data are sorted by wavenumbers and grouped by different molecule__iso-slug__isotopologue names.

Convert this large CSV format result into HITRAN format. Thre are columns names in CSV format result DataFrame, however, we need to calculate intensity $\times$ fractional abundance. Therefore, we remove those rows which have column names. We just use the column isotopologue number (I), extract those rows whose isotopologue numbers are not the string I. Then we reset the type of the none empty column values. After these, we convert CSV format into HITRAN format.

Save HITRAN format files in a loop and save the filenames as $molecule__iso-slug__isotopologue_HITRAN.txt$.

Concatenate HITRAN dataframes for saving as one final HITRAN format text with the whole data.

In [11]:
col_name = ['M', 'I', 'v', 'S', 'A', 'gamma_air', 'gamma_self',
            'E_f', 'n_air', 'delta_air', 'V_i', 'V_f', 'Q_i', 'Q_f',
            'Ierr', 'Iref', '*', 'g_i', 'g_f']

df = dict()
one_csv_df = pd.DataFrame()
HITRAN_df = pd.DataFrame()
csv_filenames = glob.glob('./data/result/csv/' + '*csv')
for csv_filename in csv_filenames:
    df[csv_filename] = pd.read_csv(csv_filename, header=None, names=col_name,
                                   chunksize=100_000_000, iterator=True, low_memory=False)
    formula = csv_filename.replace('_p','+').split('_')[2].replace('1H','H').replace('-','')
    fractional_abundance = float(hitran_online[hitran_online['exomol formula'].isin([formula])]['fractional abundance'].values)
    for chunk in df[csv_filename]:
        # Concatenate CSV files.
        one_csv_df = one_csv_df.append(chunk)
        
        # For converting HITRAN format.
        csv_chunk = chunk
        # Remove the rows which has column names.
        csv_df = csv_chunk[~csv_chunk['I'].isin(['I'])]
        # Reset type of each column values.
        csv_df[['M', 'I']] = csv_df[['M', 'I']].astype(int)
        csv_df[['v', 'S', 'A', 'E_f','Ierr', 'g_i', 'g_f']] = csv_df[['v', 'S', 'A', 'E_f','Ierr', 'g_i', 'g_f']].astype(float)
        # Convert CSV format into HITRAN format.
        hitran_df = convert_csv_to_HITRAN(csv_df)
        
        # Save as HITRAN format per species
        hitran_dfs = hitran_df
        save_hitran_filename = csv_filename.replace('\\','/').split('/')[-1].split('.')[0]
        demo_paths = demo_file_path + save_hitran_filename + '_demo_hitran.txt'
        save_paths = hitran_file_path + save_hitran_filename + '_hitran.txt'
        save_hitrans = save_hitran(hitran_df, demo_paths, save_paths)
        
        # For saving all data into one HITRAN format file.
        HITRAN_df = HITRAN_df.append(hitran_df)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/panda

## 2.1 Save CSV Format Result

This csv format result includes whole data (all molecules) and is sorted by increasing wavenumbers.

In [12]:
one_csv_df

Unnamed: 0,M,I,v,S,A,gamma_air,gamma_self,E_f,n_air,delta_air,V_i,V_f,Q_i,Q_f,Ierr,Iref,*,g_i,g_f
0,M of AlH__27Al-1H__AlHambra,I,v,S,A,gm_a,gm_s,E_f,n_a,dt_a,V_i,V_f,Q_i,Q_f,Ierr,Iref,*,g_i,g_f
1,50,1,11.529100000000653,5.7017330029911105e-31,0.00010082,,,4708.808865999999,,,X1Sigma+ 3 0,X1Sigma+ 3 0,1 e,0 e,0.0004752946454568997,,,36,12
2,50,1,11.865315999999893,5.652813698704096e-28,6.5365e-05,,,3194.21355,,,X1Sigma+ 2 0,X1Sigma+ 2 0,1 e,0 e,0.0003754583865090777,,,36,12
3,50,1,12.240991000000122,6.054205480631024e-25,3.5205e-05,,,1625.061501,,,X1Sigma+ 1 0,X1Sigma+ 1 0,1 e,0 e,0.0006028805851907988,,,36,12
4,50,1,12.591906,5.794001478941898e-22,1.2873e-05,,,0.0,,,X1Sigma+ 0 0,X1Sigma+ 0 0,1 e,0 e,0.0002913365751154496,,,36,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43251,53,3,23974.384584,8.790238870476749e-27,0.00056213,,,315.54267400000003,,,c1Phi 3-3,X3Delta 0 1,24 f,24 e,0.0008417244204607586,,,49,49
43252,53,3,23999.704662,4.845633397066255e-25,0.027457,,,290.222596,,,c1Phi 3 3,X3Delta 0-1,24 e,23 e,0.0008440379138403677,,,49,47
43253,53,3,23999.704662,4.845456916263653e-25,0.027456,,,290.222596,,,c1Phi 3-3,X3Delta 0 1,24 f,23 f,0.0008440379138403677,,,49,47
43254,53,3,24000.117462,6.6339723425466795e-25,0.044682,,,341.914598,,,c1Phi 3 3,X3Delta 0-1,26 e,25 e,0.0009982484660644362,,,53,51


Save the large concatenated CSV format result file.

In [13]:
# Create a folder for saving result files.
# If the folder exists, save files directory,otherwise, create it.
two_files_path = './data/result/two_files/'
if os.path.exists(two_files_path):                   # Determine whether the folder exists or not.
    pass
else:
    os.makedirs(two_files_path, exist_ok=True)       # Create the folder.
    
save_csv = one_csv_df.to_csv(two_files_path + 'CSV_format.csv', header=True, index=False)

## 2.2 Save HITRAN Format Result

This HITRAN format result includes whole data (all molecules) and is sorted by increasing wavenumbers.

In [14]:
demo_path = demo_file_path + 'demo_hitran.txt'
save_path = two_files_path + 'HITRAN_format.txt'
save_HITRAN = save_hitran(HITRAN_df, demo_path, save_path)