In [1]:
import os
import math
import glob
import requests
import numpy as np
import pandas as pd
from io import StringIO
from tqdm import tqdm_notebook as tqdm

In [2]:
molecule = []
iso_slug = []
isotopologue = []
path_mol_iso = []
for line in open('./data/url/api__urls.txt'):
    molecule.append(line.split('/')[-4])
    iso_slug.append(line.split('/')[-3])
    isotopologue.append(line.split('/')[-2])
    path_mol_iso.append(line.split('/')[-4] + '/' + line.split('/')[-3] + '/' + line.split('/')[-2])

molecule_list = list(set(molecule))
molecule_list.sort(key=molecule.index)

iso_slug_list = list(set(iso_slug))
iso_slug_list.sort(key=iso_slug.index)

isotopologue_list = list(set(isotopologue))
isotopologue_list.sort(key=isotopologue.index)

path_mol_iso_list = list(set(path_mol_iso))
path_mol_iso_list.sort(key=path_mol_iso.index)

#print('Molecule:', molecule_list)
#print('Iso-slug:', iso_slug_list)
#print('Isotopologue:', isotopologue_list)
print('Total:', path_mol_iso_list)


Total: ['AlH/27Al-1H/AlHambra', 'C2H2/12C2-1H2/aCeTY', 'C2/12C2/8states', 'CO2/12C-16O2/UCL-4000', 'H2O/1H2-16O/POKAZATEL', 'H3O_p/1H3-16O_p/eXeL', 'NH3/14N-1H3/CoYuTe', 'TiO/46Ti-16O/Toto', 'TiO/47Ti-16O/Toto', 'TiO/48Ti-16O/Toto', 'TiO/49Ti-16O/Toto', 'TiO/50Ti-16O/Toto']


In [3]:
unc_formula = pd.DataFrame(eval(str(iso_slug_list).replace('1H','H').replace('-','').replace('_p','+')))
unc_formula.columns = ['exomol formula']
unc_formula

Unnamed: 0,exomol formula
0,27AlH
1,12C2H2
2,12C2
3,12C16O2
4,H216O
5,H316O+
6,14NH3
7,46Ti16O
8,47Ti16O
9,48Ti16O


In [4]:

hitran_online = pd.DataFrame()
hitran_online['exomol formula'] = unc_formula['exomol formula']
hitran_online['molecule ID'] = ['50','26','51','2','1','52','11','53','53','53','53','53']
hitran_online['isotopologue ID'] = ['1','1','1','1','1','1','1','1','2','3','4','5']
hitran_online['fractional abundance'] = ['1','0.977599','1','0.984204','0.997317','1','0.995872','0.2','0.2','0.2','0.2','0.2']
hitran_online


Unnamed: 0,exomol formula,molecule ID,isotopologue ID,fractional abundance
0,27AlH,50,1,1.0
1,12C2H2,26,1,0.977599
2,12C2,51,1,1.0
3,12C16O2,2,1,0.984204
4,H216O,1,1,0.997317
5,H316O+,52,1,1.0
6,14NH3,11,1,0.995872
7,46Ti16O,53,1,0.2
8,47Ti16O,53,2,0.2
9,48Ti16O,53,3,0.2


In [5]:
def convert_uncertainty_code(HITRAN_df):
    HITRAN_num = HITRAN_df['Ierr'].count()
    uncertainty_code = 0
    Ierr = []
    for i in range(HITRAN_num):
        uncertainty = HITRAN_df['Ierr'].values[i]
        uncertainty_value = float(uncertainty)
        if (1 <= uncertainty_value):
            uncertainty_code = '{:>1}'.format(0) + '40000'
        elif (0.1 <= uncertainty_value < 1):
            uncertainty_code = '{:>1}'.format(1) + '40000'
        elif (0.01 <= uncertainty_value < 0.1):
            uncertainty_code = '{:>1}'.format(2) + '40000'
        elif (0.001 <= uncertainty_value < 0.01):
            uncertainty_code = '{:>1}'.format(3) + '40000'
        elif (0.0001 <= uncertainty_value < 0.001):
            uncertainty_code = '{:>1}'.format(4) + '40000'
        elif (0.00001 <= uncertainty_value < 0.0001):
            uncertainty_code = '{:>1}'.format(5) + '40000'
        elif (0.000001 <= uncertainty_value < 0.00001):
            uncertainty_code = '{:>1}'.format(6) + '40000'
        elif (0.0000001 <= uncertainty_value < 0.000001):
            uncertainty_code = '{:>1}'.format(7) + '40000'
        elif (uncertainty_value < 0.0000001):
            uncertainty_code = '{:>1}'.format(8) + '40000'
        Ierr.append(uncertainty_code)
    return Ierr

In [6]:
def convert_csv_to_HITRAN(csv_df):
    HITRAN_df = csv_df[csv_df.S > 1.0E-30]
    Ierr = convert_uncertainty_code(HITRAN_df)
    
    HITRAN_df['M'] = HITRAN_df.M.map('{:_>2}'.format)
    HITRAN_df['I'] = HITRAN_df.I.map('{:>1}'.format)
    HITRAN_df['v'] = HITRAN_df.v
    HITRAN_df['S'] = HITRAN_df.S * fractional_abundance
    HITRAN_df['S'] = HITRAN_df.S.map('{:_>10.3E}'.format)
    HITRAN_df['A'] = HITRAN_df.A.map('{:_>10.3E}'.format)
    HITRAN_df['gm_a'] = '_' * 5
    HITRAN_df['gm_s'] = '_' * 5
    HITRAN_df['E_f'] = HITRAN_df.E_f.map('{:_>10.4F}'.format)
    HITRAN_df['n_a'] = '_' * 4
    HITRAN_df['dt_a'] = '_' * 8
    HITRAN_df['V_i'] = HITRAN_df.V_i
    HITRAN_df['V_f'] = HITRAN_df.V_f
    HITRAN_df['Q_i'] = HITRAN_df.Q_i
    HITRAN_df['Q_f'] = HITRAN_df.Q_f
    HITRAN_df['Ierr'] = Ierr
    HITRAN_df['Iref'] = '_' * 12
    HITRAN_df['*'] = '_'
    HITRAN_df['g_i'] = HITRAN_df.g_i.map('{:_>7.1F}'.format)
    HITRAN_df['g_f'] = HITRAN_df.g_f.map('{:_>7.1F}'.format)

    return HITRAN_df


In [7]:
col_name = ['M', 'I', 'v', 'S', 'A', 'gamma_air', 'gamma_self',
            'E_f', 'n_air', 'delta_air', 'V_i', 'V_f', 'Q_i', 'Q_f',
            'Ierr', 'Iref', '*', 'g_i', 'g_f']

df = dict()
one_csv_df = pd.DataFrame()
HITRAN_df = pd.DataFrame()
csv_filenames = glob.glob('./data/result/' + '*csv')
for csv_filename in csv_filenames:
    df[csv_filename] = pd.read_csv(csv_filename, header=None, names=col_name,
                                   chunksize=100_000_000, iterator=True, low_memory=False)
    formula = csv_filename.replace('_p','+').split('_')[2].replace('1H','H').replace('-','')
    fractional_abundance = float(hitran_online[hitran_online['exomol formula'].isin([formula])]['fractional abundance'].values)
    for chunk in df[csv_filename]:
        one_csv_df = one_csv_df.append(chunk)
        csv_chunk = chunk
        csv_df = csv_chunk[~csv_chunk['I'].isin(['I'])]
        csv_df[['M', 'I']] = csv_df[['M', 'I']].astype(int)
        csv_df[['v', 'S', 'A', 'E_f','Ierr', 'g_i', 'g_f']] = csv_df[['v', 'S', 'A', 'E_f','Ierr', 'g_i', 'g_f']].astype(float)
        hitran_df = convert_csv_to_HITRAN(csv_df)
        HITRAN_df = HITRAN_df.append(hitran_df)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata

In [8]:
one_csv_df

Unnamed: 0,M,I,v,S,A,gamma_air,gamma_self,E_f,n_air,delta_air,V_i,V_f,Q_i,Q_f,Ierr,Iref,*,g_i,g_f
0,M of AlH__27Al-1H__AlHambra,I,v,S,A,gm_a,gm_s,E_f,n_a,dt_a,V_i,V_f,Q_i,Q_f,Ierr,Iref,*,g_i,g_f
1,50,1,11.529100000000653,5.7017330029911105e-31,0.00010082,,,4708.808865999999,,,X1Sigma+ 3 0,X1Sigma+ 3 0,1 e,0 e,0.0004752946454568997,,,36,12
2,50,1,11.865315999999893,5.652813698704096e-28,6.5365e-05,,,3194.21355,,,X1Sigma+ 2 0,X1Sigma+ 2 0,1 e,0 e,0.0003754583865090777,,,36,12
3,50,1,12.240991000000122,6.054205480631024e-25,3.5205e-05,,,1625.061501,,,X1Sigma+ 1 0,X1Sigma+ 1 0,1 e,0 e,0.0006028805851907988,,,36,12
4,50,1,12.591906,5.794001478941898e-22,1.2873e-05,,,0.0,,,X1Sigma+ 0 0,X1Sigma+ 0 0,1 e,0 e,0.0002913365751154496,,,36,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18789,11,1,100.00168899999971,1.9678105748381323e-34,9.2055e-06,,,5151.114544,,,0 0 1 1 1 1 s E',"1 0 0 1 0 1 a E""","E"" 4 1 E""","E' 4 1 E""",0.000928245657140393,,,54,54
18790,11,1,100.01794699999982,1.0950057335684122e-26,0.051754999999999995,,,3442.231307,,,0 0 0 2 0 2 s E',0 0 0 2 0 2 s E',"A2"" 5 1 E""",A2' 4 2 E',0.0009175625319290234,,,132,108
18791,11,1,100.03034000000025,6.825591286025868e-41,4.3845e-13,,,4847.660741,,,0 1 1 0 1 0 s E',"0 1 1 0 1 0 a E""",E' 7 2 E',"E"" 7 6A2'",0.000329802971484491,,,90,90
18792,11,1,100.05063699999937,5.040321921886126e-41,2.3553e-12,,,5150.7658200000005,,,0 0 1 1 1 1 s E',1 0 0 1 0 1 s E',E' 4 0A1',"E"" 4 1 E""",0.0007410533044255319,,,54,54


In [9]:
# Create a folder for saving result files.
# If the folder exists, save files directory,otherwise, create it.
one_file_path = './data/result/one_file/'
if os.path.exists(one_file_path):                   # Determine whether the folder exists or not.
    pass
else:
    os.makedirs(one_file_path, exist_ok=True)       # Create the folder.
    
save_csv = one_csv_df.to_csv(one_file_path + 'csv_format.csv', header=True, index=False)

In [10]:
HITRAN_df = HITRAN_df.sort_values(['v'], ascending = True).reset_index(drop=True)
HITRAN_df['v'] = HITRAN_df['v'].map('{:_>12.6F}'.format)

In [11]:
HITRAN_df['V_i'] = HITRAN_df['V_i'].str.replace("'","upper")
HITRAN_df['V_i'] = HITRAN_df['V_i'].str.replace('"','lower')
HITRAN_df['V_f'] = HITRAN_df['V_f'].str.replace("'","upper")
HITRAN_df['V_f'] = HITRAN_df['V_f'].str.replace('"','lower')
HITRAN_df['Q_i'] = HITRAN_df['Q_i'].str.replace("'","upper")
HITRAN_df['Q_i'] = HITRAN_df['Q_i'].str.replace('"','lower')
HITRAN_df['Q_f'] = HITRAN_df['Q_f'].str.replace("'","upper")
HITRAN_df['Q_f'] = HITRAN_df['Q_f'].str.replace('"','lower')


In [12]:
order = ['M', 'I', 'v', 'S', 'A', 'gm_a', 'gm_s', 'E_f', 'n_a', 'dt_a',
         'V_i', 'V_f', 'Q_i', 'Q_f', 'Ierr', 'Iref', '*', 'g_i', 'g_f']
HITRAN_df = HITRAN_df[order]
HITRAN_df

Unnamed: 0,M,I,v,S,A,gm_a,gm_s,E_f,n_a,dt_a,V_i,V_f,Q_i,Q_f,Ierr,Iref,*,g_i,g_f
0,11,1,____0.058320,_1.556E-30,_4.954E-12,_____,_____,_2311.0266,____,________,0 0 0 0 0 0 aA2lower,0 0 0 0 0 0 sA1upper,A2upper 15 3A1lower,A2lower 15 3A2lower,440000,____________,_,__372.0,__372.0
1,11,1,____0.167508,_2.816E-30,_1.911E-10,_____,_____,_2593.5309,____,________,0 0 0 0 0 0 aA2lower,0 0 0 0 0 0 sA1upper,Elower 16 4 Eupper,Eupper 16 4 Eupper,440000,____________,_,__198.0,__198.0
2,11,1,____0.181234,_5.955E-30,_3.771E-10,_____,_____,_2563.1442,____,________,0 0 0 0 0 0 aA2lower,0 0 0 0 0 0 sA1upper,Eupper 16 5 Elower,Elower 16 5 Elower,440000,____________,_,__198.0,__198.0
3,11,1,____0.183802,_3.294E-30,_7.178E-11,_____,_____,_2327.9395,____,________,0 0 0 0 0 0 aA2lower,0 0 0 0 0 0 sA1upper,Elower 15 2 Eupper,Eupper 15 2 Eupper,440000,____________,_,__186.0,__186.0
4,11,1,____0.190942,_3.694E-30,_1.129E-09,_____,_____,_3042.3525,____,________,0 0 0 0 0 0 aA2lower,0 0 0 0 0 0 sA1upper,A2upper 18 9A1lower,A2lower 18 9A2lower,340000,____________,_,__444.0,__444.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11383,51,1,38816.845590,_4.938E-30,_7.295E-07,_____,_____,__637.6866,____,________,b3Sigmag-29 1F1 5,a3Piu 0-2F1 4,6 e,5 e,440000,____________,_,___13.0,___11.0
11384,51,1,38817.659259,_1.462E-29,_2.450E-06,_____,_____,__663.6266,____,________,b3Sigmag-29 0F3 7,a3Piu 0-1F2 5,6 e,5 e,440000,____________,_,___13.0,___11.0
11385,51,1,38822.293385,_9.384E-30,_1.978E-06,_____,_____,__635.1225,____,________,b3Sigmag-29 0F3 5,a3Piu 0-1F2 3,4 e,3 e,440000,____________,_,____9.0,____7.0
11386,51,1,38823.835324,_9.184E-30,_1.732E-06,_____,_____,__612.1865,____,________,b3Sigmag-29 1F1 3,a3Piu 0-2F1 2,4 e,3 e,440000,____________,_,____9.0,____7.0


In [13]:
HITRAN_df.to_csv(one_file_path + 'demo_hitran.txt', header=None, index=False)

In [14]:
def read_txt_in_chunks(path, chunk_size=1024*1024):
    file = open(path, 'r')
    while True:
        chunk_data = file.read(chunk_size)
        if not chunk_data:
            break
        yield chunk_data

HITRAN_path = one_file_path + 'demo_hitran.txt'
with open(one_file_path + 'HITRAN_format.txt', 'w') as save_file:
    for chunk in read_txt_in_chunks(HITRAN_path):
        string = str(chunk).replace(',','').replace('_',' ').replace("upper","'").replace('lower','"')
        save_file.write(string)
