In [1]:
import glob
import numpy as np
import pandas as pd
from spec_io import get_OF8619_data, get_OF8619_meta

In [2]:
%load_ext autoreload
%autoreload 2

First, let's get list of all the data files, and a list of all the PDF documents containing meta data for those data files.

In [3]:
data_dir = '../input_data/of_8619/'
min_folders = glob.glob(data_dir +'Minerals/*')
min_folders = [folder.split('/')[-1] for folder in min_folders]
spectra_files = glob.glob(data_dir + 'Minerals/**/*.txt', recursive=True)
spectra_meta_files = glob.glob(data_dir + 'Minerals/**/*Description.pdf', recursive=True)

data_and_meta_files = []
for d, m in zip(spectra_files, spectra_meta_files):
    data_and_meta_files.append([d, m])

For each sample, make row we can put into a data frame.

The follow cell loops over the dat file and meta file for each specimen, and creates a row containing the meta data and the reflectance values. This can take a while because it takes a while to parse the meta data from the PDF files.

In [4]:
# build_data_frame --- this can take a while
data = []
count = 0
for i, (datafile, metafile) in enumerate(data_and_meta_files):
    meta = get_OF8619_meta(metafile)
    try:
        header, wvls, refl = get_OF8619_data(datafile, print_fname=False)
        data.append(list(meta.values()) + list(refl))
    except:
        print(i, 'didnt work',  datafile.strip('/')[-1])
        pass
    count += 1

These are the samples that need special treatment

```365 ../input_data/of_8619/Minerals/Kaolinite/1020_Kga1_Kaolinite/1020_KGa-1_Kaolinite_Data.txt
366 ../input_data/of_8619/Minerals/Kaolinite/1013_NMC068179_Kaolinite/1013_NMC068179_Kaolinite_Data.txt
369 ../input_data/of_8619/Minerals/Kaolinite/1021_Macon_Ga_Kaolinite/1021_MaconGa_Kaolinite_Data.txt
371 ../input_data/of_8619/Minerals/Kaolinite/1018_GOTOMINE_Kaolinite/1018_GOTOMINE_Kaolinite_Data.txt
372 ../input_data/of_8619/Minerals/Kaolinite/1014_NMC068188_Kaolinite/1014_NMC068188_Kaolinite_Data.txt
373 ../input_data/of_8619/Minerals/Kaolinite/1011_NMC068173_Kaolinite/1011_NMC68173_Kaolinite_Data.txt
374 ../input_data/of_8619/Minerals/Kaolinite/1007_NMC068122_Kaolinite/1007_NMC068122_Kaolinite_Data.txt
375 ../input_data/of_8619/Minerals/Kaolinite/1012_NMC068177_Kaolinite/1012_NMC068177_Kaolinite_Data.txt
376 ../input_data/of_8619/Minerals/Kaolinite/1022_FSP_Mg_NRProvins_Kaolinite/1022_Mg_NRProvins_Kaolinite_Data.txt
379 ../input_data/of_8619/Minerals/Kaolinite/1015_NMC068191_Kaolinite/1015_NMC068191_Kaolinite_Data.txt
380 ../input_data/of_8619/Minerals/Kaolinite/1019_BuffaloChinaMine_Kaolinite/1019_BuffaloChinaMine_Kaolinite_Data.txt
382 ../input_data/of_8619/Minerals/Dickite/1001_NMC068170_Dickite/1001_NMC068170_Dickite_Data.txt
383 ../input_data/of_8619/Minerals/Dickite/1003_NMC068170_Dickite/1003_NMC068170_Dickite_Data.txt
385 ../input_data/of_8619/Minerals/Dickite/1002_NMC068170_Dickite/1002_NMC068170_Dickite_Data.txt
401 ../input_data/of_8619/Minerals/Phengite/1039_Tibouron_Phengite/1039_Tibouron_Phengite_Data.txt
460 ../input_data/of_8619/Minerals/Talc/1034_NMC068172_Talc/1034_NMC068172_Talc_Data.txt
461 .```

# Make DataFrame

In [5]:
# build the column names from the last file.
df_columns = list(meta.keys()) + list(wvls.astype(int))
data = np.array(data)
data.shape
df = pd.DataFrame(data=data, columns=df_columns)
df.shape

(488, 2166)

In [6]:
df.head()

Unnamed: 0,ID,Dana_Classification,Mineral_Name,Mineral_Group,Sample_Number,Formula,Locality,Source_Type,Source_Name,Date_Acquired,...,2491,2492,2493,2494,2495,2496,2497,2498,2499,2500
0,2155a,"16b.1.1.(1,5)",Ancylite,Ancylite Group,NMC017464,"(Ce,La)Sr(CO3)2(OH) · H2O","Oka, Deux-Montagnes, Quebec, Canada",Purchase,L.I. Cowan,1970,...,0.286647578,0.285574128,0.284534037,0.283488174,0.282631596,0.281855047,0.281145761,0.280419668,0.279789746,0.279236058
1,2156,"16b.1.1.(1,5)",Ancylite,Ancylite Group,NMC064962,"(Ce,La)Sr(CO3)2(OH) · H2O","Mont Saint-Hilaire, Quebec, Canada",Exchange,NMNS Mineral Sciences,1984,...,0.438991497,0.438028644,0.437181443,0.436326524,0.435410609,0.434512655,0.433636963,0.432825367,0.431990574,0.431214889
2,2155b,"16b.1.1.(1,5)",Ancylite,Ancylite Group,NMC017464,"(Ce,La)Sr(CO3)2(OH) · H2O","Oka, Deux-Montagnes, Quebec, Canada",Purchase,L.I. Cowan,1970,...,0.360665103,0.359890252,0.359077324,0.358303537,0.357597838,0.35694931,0.356367946,0.355753859,0.355214953,0.354707142
3,2157,"16b.1.1.(1,5)",Ancylite-?,Ancylite Group,NMC066958,"(Ce,La)Sr(CO3)2(OH) · H2O","Narsarsuk, Greenland",Purchase,The Adit,1980,...,0.11366342,0.113128675,0.112579751,0.112111491,0.111643309,0.11120368,0.110811234,0.110418131,0.110173382,0.109888767
4,2370,55.2.1b.2,Thalenite-(Y),,NMC067407,Y3Si3O10F,"Pickens Mine, Haliburton Co., Ontario, Canada",Purchase,Nature's Window,1994,...,0.108602899,0.108043646,0.107536939,0.107070391,0.106585957,0.106207776,0.105825633,0.105548484,0.105195073,0.10488041


# Assign mineral labels

Create a new column called `cm_ree_labels` by mapping the mineral name to the REE class value contained in `mineral_to_label_dict.csv`. 

If you come across and exception and this fails. Edit the `.csv` file to include the new entry.

This is rather hacky because there can be so many variations to the mineral names.

In [7]:
label_dict = {}
with open('../input_data/mineral_to_label_dict.csv') as f:
    for line in f:
        try:
            k, v = line.strip().split('\t')
            label_dict[k] = v
        except:
            # 
            print(line)
            k, v = line.strip().split(';')
            print(k, v)
            label_dict[k.strip()] = v.strip()
        
df['cm_ree_labels'] = df['Mineral_Name'].map(label_dict, -1)

In [8]:
# Move the last column to the first position
cols = df.columns
cols = [cols[-1]] + list(cols[:-1])
df = df[cols]

In [9]:
df.to_csv('OF8619_with_labels.csv', index=False)