In [123]:
from pathlib import Path
import numpy as np
import pandas as pd
from nptdms import TdmsFile

In [124]:
root = Path("/Users/dinusha_senarathna/Desktop/experimental_data/CARS")
# get all files in all subdirectories
excluded_exts = {".ini", ".png", ".zip", ".tdms_index", ".tdms", ".jpg", ""}

files = [
    p for p in root.rglob("*")
    if p.is_file() and p.suffix.lower() not in excluded_exts
]

excluded_files = [
    p for p in root.rglob("*")
    if p.is_file() and p.suffix.lower() in excluded_exts
]

# get alist of unique extensions in the files with their counts
extensions, counts = np.unique([np.array([f.suffix.lower() for f in files])], return_counts=True)

for ext, count in zip(extensions, counts):
    print(f"{ext}: {count} files")

tdms_files = [file for file in excluded_files if file.suffix.lower() == ".tdms"]

.dat: 735 files
.txt: 240 files


In [125]:
len(files)

975

In [126]:
len(tdms_files)

75

In [127]:
# process each file and extract metadata
file = tdms_files[0]

for file in tdms_files:#[0:1]:
    with TdmsFile.open(file) as tdms:
        try:
            tdms.groups() #Raw Data, Floor Data, Signals
            tds = [ch.name for ch in tdms["Raw Data"].channels()]
            #numpy empty 2d array to hold data
            spectra = np.empty((len(tds), len(tdms["Raw Data"][tds[0]][:])))
            floor = np.empty((len(tdms["Floor Data"].channels()), len(tdms["Floor Data"][tdms["Floor Data"].channels()[0].name][:])))
            for td in tds:
                spectra[tds.index(td), :] = tdms["Raw Data"][td][:]

            floor_chs = [ch.name for ch in tdms["Floor Data"].channels()]
            for fc in floor_chs:
                floor[floor_chs.index(fc), :] = tdms["Floor Data"][fc][:]

            signals_ch = [ch.name for ch in tdms["Signals"].channels()]
            signal = tdms["Signals"][signals_ch[0]][:]
            int_signal = tdms["Signals"][signals_ch[1]][:]
            notes = tdms["Raw Data"].properties
            # notes keys = 'Sample', 'P1 (mW)', 'P2 (mW)', 'P3 (mW)', 'OPO1 (nm)', 'OPO2 (nm)', 'Polarizations', 'Notes', 'Start (px)', 'End (px)'
            sample_name = notes.get('Sample', 'Unknown')
            power1 = notes.get('P1 (mW)', 'Unknown')
            power2 = notes.get('P2 (mW)', 'Unknown')
            power3 = notes.get('P3 (mW)', 'Unknown')
            opo1 = notes.get('OPO1 (nm)', 'Unknown')
            opo2 = notes.get('OPO2 (nm)', 'Unknown')
            polarizations = notes.get('Polarizations', 'Unknown')
            additional_notes = notes.get('Notes', 'Unknown')
            start_px = notes.get('Start (px)', 'Unknown')
            end_px = notes.get('End (px)', 'Unknown')
            print(f"Raw channels: {len(tds)}, Floor channels: {len(floor_chs)}, Signals channels: {len(signal)}")
        except Exception as e:
            print(f"Error processing {file}: {e}")

Raw channels: 253, Floor channels: 251, Signals channels: 253
Raw channels: 190, Floor channels: 188, Signals channels: 190
Raw channels: 307, Floor channels: 3, Signals channels: 307
Raw channels: 297, Floor channels: 3, Signals channels: 297
Raw channels: 300, Floor channels: 3, Signals channels: 300
Raw channels: 194, Floor channels: 1, Signals channels: 194
Raw channels: 218, Floor channels: 3, Signals channels: 218
Raw channels: 222, Floor channels: 3, Signals channels: 222
Raw channels: 239, Floor channels: 3, Signals channels: 239
Raw channels: 252, Floor channels: 3, Signals channels: 252
Raw channels: 295, Floor channels: 3, Signals channels: 295
Raw channels: 191, Floor channels: 3, Signals channels: 191
Raw channels: 240, Floor channels: 3, Signals channels: 240
Raw channels: 159, Floor channels: 3, Signals channels: 159
Raw channels: 215, Floor channels: 3, Signals channels: 215
Raw channels: 231, Floor channels: 3, Signals channels: 231
Raw channels: 219, Floor channels: 3

In [128]:
#pandas datafframe with Year, Month, Day, File name, number of columns
df = pd.DataFrame(columns=["Year", "Month", "Day", "File Name", "Num Columns", "Num Rows", "Type", "Notes", "Spectra", "Floor"])

for file in files:
    with open(file, "r") as f:
        content = f.read()
        try:
            lines = content.splitlines()
            first_line = lines[0]
            num_columns = len(first_line.split())        
            num_rows = len(lines)  # +1 to account for the first line already read

            # extract date from file name
            if len(file.parent.name.split('_')) == 2:
                year = int(file.parent.parent.name)
                month_str = file.parent.name.split('_')[0]
                date = int(file.parent.name.split('_')[1])
                notes_found = False
                spetra_found = False
                floor_found = False
                # check if file contains all numeric data except for the first line
                if all(char.isdigit() or char.isspace() or char == '.' or char == '-' or char == '\t' or char == '\n' for char in "".join(lines[1:])):
                    data_type = "Numeric"
                    if not all(char.isdigit() or char.isspace() or char == '.' or char == '-' or char == '\t' or char == '\n' for char in "".join(lines[0])):
                        #print(f"Numeric file with non-numeric header found: {file}")
                        pass
                    if Path(str(file).split('.')[0]+"_Notes"+file.suffix) in files:
                        notes_found = True
                        pass
                    elif Path(str(file).split('.')[0]+"_notes.txt") in files:
                        notes_found = True
                        pass
                    if Path(str(file).split('.')[0]+"_spectra.txt") in files:
                        spetra_found = True
                        pass
                    elif Path(str(file).split('.')[0]+"_Spectra.dat") in files:
                        spetra_found = True
                        pass
                    if Path(str(file).split('.')[0]+"_Floor.dat") in files:
                        floor_found = True
                        pass
                    if not (notes_found or spetra_found or floor_found):
                        pass#print(f"Numeric file without associated notes/spectra/floor files: {file}")
                else:
                    data_type = "Non-numeric"
                    if first_line.startswith("DET"):
                        pass
                    elif first_line.startswith("td : CARS with Syncerity CCD"):
                        pass
                    elif first_line.startswith("td-CARS with Syncerity CCD"):
                        pass
                    elif first_line.startswith("laser power"):
                        pass
                    elif first_line.startswith("P1"):
                        pass
                    else:
                        print(first_line)
                    if num_rows > 50:
                        #print(f"Non-numeric file with more than 50 rows: {file}")
                        pass
                if notes_found:
                    data = {
                        "Year": year,
                        "Month": month_str,
                        "Day": date,
                        "File Name": str(file.name),
                        "Num Columns": num_columns,
                        "Num Rows": num_rows,
                        "Type": data_type,
                        "Notes": notes_found,
                        "Spectra": spetra_found,
                        "Floor": floor_found
                    }
                    df = pd.concat([df, pd.DataFrame([data])], ignore_index=True)
                elif not ("_notes" in file.name or "_Notes" in file.name or "_spectra" in file.name or "_Spectra" in file.name or "_Floor" in file.name):
                    if "_CARS.dat" in file.name:
                        print(f"File without notes/spectra/floor files: {file}")
            else:
                print(f"Unexpected folder name format for file: {file}")
        except Exception as e:
            print(f"Error reading file {file}: {e}")         

Error reading file /Users/dinusha_senarathna/Desktop/experimental_data/CARS/2021/Jul_07/OPO 2 NO PRISMS.txt: list index out of range
Error reading file /Users/dinusha_senarathna/Desktop/experimental_data/CARS/2021/Jul_06/OPO 2 NO PRISMS.txt: list index out of range
File without notes/spectra/floor files: /Users/dinusha_senarathna/Desktop/experimental_data/CARS/2022/Jun_08/KTP_test_2_CARS.dat
File without notes/spectra/floor files: /Users/dinusha_senarathna/Desktop/experimental_data/CARS/2022/Jun_08/test1_CARS.dat
File without notes/spectra/floor files: /Users/dinusha_senarathna/Desktop/experimental_data/CARS/2022/Jun_08/KTP_1_CARS.dat
File without notes/spectra/floor files: /Users/dinusha_senarathna/Desktop/experimental_data/CARS/2022/Jun_09/STO_1_CARS.dat
File without notes/spectra/floor files: /Users/dinusha_senarathna/Desktop/experimental_data/CARS/2022/Jun_09/STO_6_CARS.dat
File without notes/spectra/floor files: /Users/dinusha_senarathna/Desktop/experimental_data/CARS/2022/Jun_09/

In [129]:
# Display items in the dataframe where Notes is not empty
df[df["Floor"] == True]

Unnamed: 0,Year,Month,Day,File Name,Num Columns,Num Rows,Type,Notes,Spectra,Floor
194,2024,Aug,5,LNB_2.dat,4,260,Numeric,True,True,True
195,2024,Aug,5,LNB_3.dat,4,255,Numeric,True,True,True
196,2024,Aug,5,LNB_1.dat,4,266,Numeric,True,True,True
197,2024,Jul,24,LNB_872_test_1.dat,4,311,Numeric,True,True,True
198,2024,Jul,25,NLB_1.dat,4,241,Numeric,True,True,True
...,...,...,...,...,...,...,...,...,...,...
312,2022,Jun,21,STO_5.dat,4,223,Numeric,True,True,True
313,2022,Jun,20,STO_1.dat,4,238,Numeric,True,True,True
314,2022,Jun,20,STO_2.dat,4,321,Numeric,True,True,True
315,2022,Jun,20,STO_3.dat,4,321,Numeric,True,True,True


In [138]:
file = files[905]
with open(file, "r") as f:
    content = f.read()
    print(content)

DET	Syncerity CCD
SMP	BTS
P3 (TiS)	170
P1 (OPO1)	235
P2 (OPO2)	235
Polarizations	all parallel
Ti-Sa	813
OPO1	998.4
OPO2	1097
MONO	756
Slit	0.4
ATTN_0 (initial)	lowest
SP&BP filters	750-50
Spectral bandwidths and distortion	~10 nm OPO1 ~15 nm OPO2-symm, OPO2-assym
Spectral shift at the end of the measurement	NO
Power increase/decrease (>20%) at the end of the measurement	NO
Other	
Spectral window (px)	600.0 to 1400.0
Version	4
	



In [130]:
len(files)-(2+317+317+124+123+13*2+3+24+10+6)

23

In [None]:
# FINAL DATA STRUCTURE
# Files: SMP_1.dat, SMP_1_Notes.dat, SMP_1.Spectra.dat, SMP_1_Floor.dat
# SMP_1.dat: [td/sig/int_sig/att/floor_idx] [len(td)]
# SPM_1_Notes.dat: Date/DET/SMP/P3(TiS)/P2(OPO2)/P1(OPO1)/Polarizations/Ti-Sa/OPO1/OPO2/MONO/Slit/ATTN_0 (initial)/SP&BP filters/
#                  Spectral bandwidth and distortion/Spectral shift at the end of the measurement/
#                  Power increase/decrease (>20%) at the end of the measurement/Other/Spectral window (px)
# SMP_1_Spectra.dat: [px1/.../px2048] [len(td)]
# SMP_1_Floor.dat: [px1/.../px2048] [max(floor_idx)+1]