# Parsing IDMS schema syntax

* [primer](https://www.sfu.ca/sasdoc/sashtml/idms/z0979730.htm)
* [IDMS schema and subschema syntax](https://manualzz.com/doc/o/rziew/ca-idms-database-administration-guide-schema-and-subschema-compilers)


## syntax components
* areas
* records - table
* elements - field
* sets

In [1]:
!head IDMS_schema_source/ASL300.SCHEMA.FARMS.TXT

     ADD                                                                        
     SCHEMA NAME IS FARMS VERSION IS 1                                          
*+       DATE CREATED IS      11/25/87                                          
*+       TIME CREATED IS      09310537                                          
*+       DATE LAST UPDATED IS 07/17/19                                          
*+       TIME LAST UPDATED IS 16423642                                          
*+       PREPARED BY ASL066                                                     
*+       REVISED  BY ASL70R                                                     
         MEMO DATE IS 01/06/86                                                  
         ASSIGN RECORD IDS FROM 1001                                            


In [2]:
pwd

'/Users/ccoletta/projects/gsa_coe/usda/copybooks'

In [3]:
!wc -l  IDMS_schema_source_ALL/*

    2375 IDMS_schema_source_ALL/EMPSCHM-V100-SCHEMA-020623.txt
   16234 IDMS_schema_source_ALL/FARMS-V1-SCHEMA-020623.formatted.txt
   16234 IDMS_schema_source_ALL/FARMS-V1-SCHEMA-020623.txt
   16642 IDMS_schema_source_ALL/FARMS-V10-SCHEMA-020623.txt
   25112 IDMS_schema_source_ALL/MCMMF01-SCHEMA-020623.txt
      47 IDMS_schema_source_ALL/RDUSE01-SCHEMA-020623.txt
    1599 IDMS_schema_source_ALL/RMS-SCHEMA-020623.txt
    3387 IDMS_schema_source_ALL/SCHEMA-LISTING-020623.zip
    1779 IDMS_schema_source_ALL/SCMAC01-SCHEMA-020623.txt
      50 IDMS_schema_source_ALL/SCMACCT-SCHEMA-020623.txt
    1538 IDMS_schema_source_ALL/SCMFA01-SCHEMA-020623.txt
     263 IDMS_schema_source_ALL/SCMHN01-SCHEMA-020623.txt
   26281 IDMS_schema_source_ALL/SCMMF01-SCHEMA-020623.txt
   26039 IDMS_schema_source_ALL/SCMMF01L-SCHEMA-0206023.txt
   26041 IDMS_schema_source_ALL/SCMMF01M-SCHEMA-020623.txt
   26177 IDMS_schema_source_ALL/SCMMF01P-SCHEMA-020623.txt
   25988 IDMS_schema_source_ALL/SCMMF02-SCHEMA-020623

In [4]:
import pandas as pd
from pathlib import Path
import re
from collections import Counter

In [5]:
#test_path = 'IDMS_schema_source/ASL300.SCHEMA.FARMS.TXT'
test_path = 'IDMS_schema_source_ALL/RMS-SCHEMA-020623.txt'

In [6]:
lines_df = pd.read_csv( test_path, header=None )

In [7]:
lines_df.columns = [ 'raw_line' ]

In [8]:
lines_df[ 'stripped' ] = lines_df[ 'raw_line' ].str.slice( start = 5 )

In [9]:
lines_df[ 'stripped' ] = lines_df[ 'stripped' ].str.rstrip()

In [10]:
lines_df[ 'stripped' ]

0                                          ADD
1              SCHEMA NAME IS RMS VERSION IS 1
2                DATE LAST UPDATED IS 10/28/94
3                TIME LAST UPDATED IS 10114932
4                           REVISED  BY ASL066
                         ...                  
1594                       MANDATORY AUTOMATIC
1595                                  KEY IS (
1596                    YR-MO-101 DESCENDING )
1597                DUPLICATES ARE NOT ALLOWED
1598                                         .
Name: stripped, Length: 1599, dtype: object

In [11]:
stripped_whitespace_file = "2023-02-03_IDMS_schema_source_stripped.txt"

In [12]:
lines_df[ 'stripped' ].to_csv( stripped_whitespace_file, header=False, index=False )

In [13]:
raw_text = Path( stripped_whitespace_file ).read_text()

In [14]:
raw_text[:4]

'ADD\n'

In [15]:
p = re.compile( r'\nADD\n' )

In [16]:
components = p.split(raw_text )

In [17]:
len( components )

15

In [18]:
# remove the initial ADD
components[0] = components[0][4:]

In [19]:
p = re.compile( r'^(\S+)' )

In [20]:
p.match( components[0] ).group(1)

'SCHEMA'

In [21]:
component_categories = [ p.match( _ ).group(1) for _ in components ]

In [22]:
c = Counter( component_categories )

In [23]:
c.most_common()

[('RECORD', 8), ('AREA', 5), ('SCHEMA', 1), ('SET', 1)]

# Analyze Record components

In [24]:
record_components = [ c for t, c in zip( component_categories, components ) if t == "RECORD" ]

In [25]:
#print( record_components[0] )

In [26]:
redefines_pattern = re.compile( r'REDEFINES' )
occurs_pattern = re.compile( r'OCCURS' )

In [27]:
column_names = [ 'indent', 'data_level', 'element_name', 'raw_element_descriptors' ]
element_search_pat = re.compile( r'^(\s+)?(\d\d) (\S+)\n\s+(.*?)\n\s+\.', flags=re.MULTILINE | re.DOTALL )

In [28]:
record_name_pat = re.compile( r'RECORD NAME IS (\S+)' )

In [29]:
IS_pat = re.compile( r' IS ' )

In [30]:
def DescriptorsSplitter( raw_descriptor_string ):
    
    descriptors = [ _.strip() for _ in raw_descriptor_string.split( '\n' ) ]
    
    key_value_pairs = [ IS_pat.split( _ ) for _ in descriptors ]
    try:
        index, values = zip( *key_value_pairs )
    except:
        # empty series
        print( "\t\tproblem splitting these characteristics" )
        print( key_value_pairs )
        retval = pd.Series( dtype='object')
    else:
        retval = pd.Series( values, index=index )
    #print( retval )
    return retval

In [31]:
def FormatRecord( component_text, debug=False ):
    
    if debug:
        print( "*" * 50 )
    record_name = record_name_pat.match( component_text ).group(1)
    data_elements = element_search_pat.findall( component_text )
    if debug:
        print( "record", record_name, "has", len( data_elements ), "elements." )
    data_elements = pd.DataFrame( data_elements, columns = column_names )
    data_elements['record'] = record_name
    data_elements['data_step'] = [ (1+int(_)) * 100 for _ in data_elements.index ]
    data_elements['indent'] = data_elements['indent'].apply( len )
    data_elements['raw_element_descriptors'] = \
        data_elements['raw_element_descriptors'].str.replace( redefines_pattern, 'REDEFINES IS' )

    data_elements['raw_element_descriptors'] = \
        data_elements['raw_element_descriptors'].str.replace( occurs_pattern, 'OCCURS IS' )
    
    modifiers_df = data_elements['raw_element_descriptors'].apply( DescriptorsSplitter )
    
    data_elements = pd.concat( (data_elements, modifiers_df), axis=1 )
    
    data_elements = data_elements.set_index( 'record', append=True )
    return data_elements

In [32]:
pd.set_option( 'display.max_rows', 100 )

In [33]:
#FormatRecord( record_components[0] )

In [34]:
pivoted_record_data = pd.concat( [ FormatRecord(_) for _ in record_components ] )

		problem splitting these characteristics
[['PICTURE', ' X(110)'], ['USAGE', 'DISPLAY'], ['ELEMENT LENGTH', '110'], ['POSITION', '3'], ['OLQ HEADER'], ["'COMMENT DATA'"]]
		problem splitting these characteristics
[['PICTURE', ' 9(3)'], ['USAGE', 'DISPLAY'], ['ELEMENT LENGTH', '3'], ['POSITION', '113'], ['OLQ HEADER'], ["'OPERATOR'"], ["-     'NUMBER'"]]


In [35]:
pivoted_record_data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 235 entries, (0, 'RMSADRS') to (26, 'RDADATA')
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   indent                   235 non-null    int64 
 1   data_level               235 non-null    object
 2   element_name             235 non-null    object
 3   raw_element_descriptors  235 non-null    object
 4   data_step                235 non-null    int64 
 5   USAGE                    233 non-null    object
 6   ELEMENT LENGTH           225 non-null    object
 7   POSITION                 233 non-null    object
 8   PICTURE                  199 non-null    object
 9   VALUE                    8 non-null      object
 10  OCCURS                   86 non-null     object
 11  REDEFINES                6 non-null      object
dtypes: int64(2), object(10)
memory usage: 28.3+ KB


In [36]:
#pivoted_record_data.sample(100)

In [37]:
p = re.compile( r'SCHEMA NAME IS (\S+) VERSION IS (\d+)' )

In [38]:
p.search( components[0] ).groups()

('RMS', '1')

In [39]:
def ScrapeRecordsAndElements( schema_source_path ):

    print( "=" * 50 )
    lines_df = pd.read_csv( schema_source_path, header=None )
    lines_df.columns = [ 'raw_line' ]
    lines_df[ 'stripped' ] = lines_df[ 'raw_line' ].str.slice( start = 5 )
    lines_df[ 'stripped' ] = lines_df[ 'stripped' ].str.rstrip()

    stripped_whitespace_file = Path( schema_source_path ).with_suffix( ".formatted.txt" )
    print( "writing", f'"{ str(stripped_whitespace_file) }"' )
    lines_df[ 'stripped' ].to_csv( stripped_whitespace_file, header=False, index=False )
    raw_text = stripped_whitespace_file.read_text()

    add_pat = re.compile( r'\nADD\n' )
    components = add_pat.split( raw_text )

    #len( components )
    # remove the initial ADD
    components[0] = components[0][4:]
    
    schema_name_ver_pat = re.compile( r'SCHEMA NAME IS (\S+) VERSION IS (\d+)' )
    schema_info = schema_name_ver_pat.search( components[0] ).groups()
    schema_name, schema_version = schema_info

    first_word_pat = re.compile( r'^(\S+)' )
    component_categories = [ first_word_pat.match( _ ).group(1) for _ in components ]
    c = Counter( component_categories )

    print( schema_info, "\n", c.most_common() )

    # Analyze Record components

    record_components = [ c for t, c in zip( component_categories, components ) if t == "RECORD" ]
    
    pivoted_record_data = pd.concat( [ FormatRecord(_) for _ in record_components ] )
    pivoted_record_data = pivoted_record_data.swaplevel().sort_index()
    return pivoted_record_data
    

In [40]:
retval = ScrapeRecordsAndElements( 'IDMS_schema_source_ALL/FARMS-V1-SCHEMA-020623.txt' )

writing "IDMS_schema_source_ALL/FARMS-V1-SCHEMA-020623.formatted.txt"
('FARMS', '1') 
 [('RECORD', 113), ('SET', 85), ('AREA', 20), ('SCHEMA', 1)]


In [41]:
retval['VALUE'].value_counts()

( 'S' )      10
( 'D' )      10
( 'C' )      10
( 'T' )       8
( 'U' )       8
( 'V' )       8
( '0' )       4
( '1' )       4
( SPACE )     2
( 'R' )       1
( 'A' )       1
Name: VALUE, dtype: int64

In [42]:
retval['VALUE'] = retval['VALUE'].str.extract( r' (\S+) ' )

In [43]:
retval['VALUE'].value_counts()

'S'      10
'D'      10
'C'      10
'T'       8
'U'       8
'V'       8
'0'       4
'1'       4
SPACE     2
'R'       1
'A'       1
Name: VALUE, dtype: int64

In [44]:
retval['data_level'].value_counts()

05    1295
10     626
15     220
88      64
20      13
Name: data_level, dtype: int64

In [45]:
retval['indent'].value_counts()

0     1295
4      626
8      220
16      64
12      13
Name: indent, dtype: int64

In [46]:
retval['indent'] = (retval['data_level'].astype(int) // 5 )

In [47]:
retval.loc[ retval['indent'] >= 5, 'indent' ] = 5

In [48]:
retval['indent'] = retval['indent'] * 2

In [49]:
retval['indent'].value_counts()

2     1295
4      626
6      220
10      64
8       13
Name: indent, dtype: int64

In [50]:
sorted( retval.reset_index()['record'].unique() )[:5]

['ACCT-DATA', 'ACQD-PROP', 'ADPS-CNTRL', 'ADVANCE', 'AID']

In [51]:
retval.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,indent,data_level,element_name,raw_element_descriptors,data_step,USAGE,ELEMENT LENGTH,POSITION,PICTURE,REDEFINES,VALUE,OCCURS
record,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
ACCT-DATA,0,2,5,LN-NBR,PICTURE IS 9(2)\n USAGE IS DISPLAY\n EL...,100,DISPLAY,2,1,9(2),,,
ACCT-DATA,1,2,5,FD-CDE,USAGE IS DISPLAY\n ELEMENT LENGTH IS 4\n ...,200,DISPLAY,4,3,,,,
ACCT-DATA,2,4,10,FD-CDE-3,USAGE IS DISPLAY\n ELEMENT LENGTH IS 3\...,300,DISPLAY,3,3,,,,
ACCT-DATA,3,6,15,FD-CDE-2,PICTURE IS 9(2)\n USAGE IS DISPLAY...,400,DISPLAY,2,3,9(2),,,
ACCT-DATA,4,6,15,FD-CDE-3RD,PICTURE IS 9(1)\n USAGE IS DISPLAY...,500,DISPLAY,1,5,9(1),,,


In [52]:
retval['PICTURE'] = retval['PICTURE'].str.strip()

# Add data formatting to the csv output to get it ready for ingestion by 2023-01-24_CreateCopyBooks-v2

In [53]:
table_index_dict = { table_name: i for i, table_name in enumerate( retval.index.levels[0] ) }

In [54]:
retval = retval.reset_index( 'record', drop=False )

In [55]:
retval['table_index'] = \
    [ table_index_dict[n] for n in retval['record'].values ]

In [56]:
retval['table_vers'] = 1

In [57]:
retval.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2218 entries, 0 to 12
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   record                   2218 non-null   object
 1   indent                   2218 non-null   int64 
 2   data_level               2218 non-null   object
 3   element_name             2218 non-null   object
 4   raw_element_descriptors  2218 non-null   object
 5   data_step                2218 non-null   int64 
 6   USAGE                    2218 non-null   object
 7   ELEMENT LENGTH           2154 non-null   object
 8   POSITION                 2218 non-null   object
 9   PICTURE                  1961 non-null   object
 10  REDEFINES                23 non-null     object
 11  VALUE                    66 non-null     object
 12  OCCURS                   31 non-null     object
 13  table_index              2218 non-null   int64 
 14  table_vers               2218 non-null   i

In [58]:
retval.head()

Unnamed: 0,record,indent,data_level,element_name,raw_element_descriptors,data_step,USAGE,ELEMENT LENGTH,POSITION,PICTURE,REDEFINES,VALUE,OCCURS,table_index,table_vers
0,ACCT-DATA,2,5,LN-NBR,PICTURE IS 9(2)\n USAGE IS DISPLAY\n EL...,100,DISPLAY,2,1,9(2),,,,0,1
1,ACCT-DATA,2,5,FD-CDE,USAGE IS DISPLAY\n ELEMENT LENGTH IS 4\n ...,200,DISPLAY,4,3,,,,,0,1
2,ACCT-DATA,4,10,FD-CDE-3,USAGE IS DISPLAY\n ELEMENT LENGTH IS 3\...,300,DISPLAY,3,3,,,,,0,1
3,ACCT-DATA,6,15,FD-CDE-2,PICTURE IS 9(2)\n USAGE IS DISPLAY...,400,DISPLAY,2,3,9(2),,,,0,1
4,ACCT-DATA,6,15,FD-CDE-3RD,PICTURE IS 9(1)\n USAGE IS DISPLAY...,500,DISPLAY,1,5,9(1),,,,0,1


In [59]:
reformatted_retval = retval.rename( 
    columns={ 
        'record': 'table_name',
        'element_name' : 'field_name',
        'USAGE' : 'end',
        'PICTURE' : 'data_type',
        'indent' : 'indent_space_count',
        'data_step' : 'declaration_step'
    } )

In [60]:
reformatted_retval['BLANK ON'] = ''
reformatted_retval['INDEXED BY'] = ''
reformatted_retval['OLQ'] = ''

In [61]:
reformatted_retval = reformatted_retval.drop( columns=['raw_element_descriptors'] )

In [62]:
reformatted_retval.to_csv( '2023-02-14_FSA_FARMS_schema_from_source.csv' )

In [63]:
#retval.loc[ 'ACCT-DATA' ]
#retval.loc[ ('ACCT-DATA', slice(None)), : ]

In [64]:
!head -30 2023-02-14_FSA_FARMS_schema_from_source.csv

,table_name,indent_space_count,data_level,field_name,declaration_step,end,ELEMENT LENGTH,POSITION,data_type,REDEFINES,VALUE,OCCURS,table_index,table_vers,BLANK ON,INDEXED BY,OLQ
0,ACCT-DATA,2,05,LN-NBR,100,DISPLAY,2,1,9(2),,,,0,1,,,
1,ACCT-DATA,2,05,FD-CDE,200,DISPLAY,4,3,,,,,0,1,,,
2,ACCT-DATA,4,10,FD-CDE-3,300,DISPLAY,3,3,,,,,0,1,,,
3,ACCT-DATA,6,15,FD-CDE-2,400,DISPLAY,2,3,9(2),,,,0,1,,,
4,ACCT-DATA,6,15,FD-CDE-3RD,500,DISPLAY,1,5,9(1),,,,0,1,,,
5,ACCT-DATA,4,10,FD-CDE-4TH,600,DISPLAY,1,6,9(1),,,,0,1,,,
6,ACCT-DATA,2,05,KIND-CDE-LN,700,DISPLAY,2,7,9(2),,,,0,1,,,
7,ACCT-DATA,2,05,INT-RATE-NOTE,800,DISPLAY,6,9,9(2)V9(4),,,,0,1,,,
8,ACCT-DATA,2,05,INT-RATE-NOTE-1ST,900,DISPLAY,6,9,V9(6),INT-RATE-NOTE,,,0,1,,,
9,ACCT-DATA,2,05,PYMT-TYP-CDE,1000,DISPLAY,1,15,9(1),,,,0,1,,,
10,ACCT-DATA,2,05,DIR-PYMT-CDE,1100,DISPLAY,1,16,9(1),,,,0,1,,,
11,ACCT-DATA,2,05,DTE-AMORTN-EFCTV,1200,DISPLAY,6,17,9(06),,,,0,1,,,
12,ACCT-DATA,2,05,DSTR-DCLRD-CDE,1300,DISPLAY,5,23,,,,,0,1,,,
13,ACCT-DATA,4,10,DSTR-

# Analyze sets

In [65]:
set_components = [ c for t, c in zip( component_categories, components ) if t == "SET" ]

In [66]:
len( set_components)

1

In [67]:
print( set_components[0] )

SET NAME IS RMS-SDC-ADRS-SET
    ORDER IS SORTED
    MODE IS CHAIN LINKED TO PRIOR
    OWNER IS RMSADRS
        WITHIN AREA RMS-SDC-AREA
        NEXT DBKEY POSITION IS 1
        PRIOR DBKEY POSITION IS 2
    MEMBER IS RMSDATA
        WITHIN AREA RMS-SDC-AREA
        NEXT DBKEY POSITION IS 1
        PRIOR DBKEY POSITION IS 2
        LINKED TO OWNER
        OWNER DBKEY POSITION IS 3
        MANDATORY AUTOMATIC
        KEY IS (
            YR-MO-101 DESCENDING )
            DUPLICATES ARE NOT ALLOWED
    .



In [69]:
print( set_components)

['SET NAME IS RMS-SDC-ADRS-SET\n    ORDER IS SORTED\n    MODE IS CHAIN LINKED TO PRIOR\n    OWNER IS RMSADRS\n        WITHIN AREA RMS-SDC-AREA\n        NEXT DBKEY POSITION IS 1\n        PRIOR DBKEY POSITION IS 2\n    MEMBER IS RMSDATA\n        WITHIN AREA RMS-SDC-AREA\n        NEXT DBKEY POSITION IS 1\n        PRIOR DBKEY POSITION IS 2\n        LINKED TO OWNER\n        OWNER DBKEY POSITION IS 3\n        MANDATORY AUTOMATIC\n        KEY IS (\n            YR-MO-101 DESCENDING )\n            DUPLICATES ARE NOT ALLOWED\n    .\n']


In [70]:
test_str = """        KEY IS (
            CTRY-DIGIT DESCENDING
            DTE-CK-SORT DESCENDING )"""

In [71]:
import numpy as np

In [72]:
key_search_pat = re.compile( r'KEY IS \(\s+(.*) \)', flags=re.MULTILINE | re.DOTALL )

In [73]:
whitespace_search_pat = re.compile( r'\s+', flags=re.MULTILINE )

In [74]:
print( test_str)

        KEY IS (
            CTRY-DIGIT DESCENDING
            DTE-CK-SORT DESCENDING )


In [75]:
m = key_search_pat.search( test_str )

In [76]:
m.group(1)

'CTRY-DIGIT DESCENDING\n            DTE-CK-SORT DESCENDING'

In [77]:
def FormatSet( component_text ):
    
    key_clause = None
    m = key_search_pat.search( component_text )
    if m:
        key_clause = " ".join( whitespace_search_pat.split( m.group(1) ) )
    
    lines_df = pd.DataFrame( { 'raw_line' : component_text.split( '\n' ) } )
    lines_df['indent'] = lines_df['raw_line'].str.extract( r'^(\s+)' )
    lines_df['indent'] = lines_df['indent'].fillna( "" ).apply( len )

    lines_df[ ['attribute', 'value' ] ] = lines_df['raw_line'].str.extract( r'^\s*(.+) IS (.+)' )
    
    lines_df.index.name = 'line'

    set_name = lines_df.loc[ lines_df[ 'attribute' ] == 'SET NAME', 'value' ].values[0]
    
    lines_df['set_name'] = set_name
    #lines_df = lines_df.set_index( 'set_name', append=True )
    if key_clause:
        lines_df.loc[ lines_df[ 'attribute' ] == 'KEY', 'value' ] = key_clause
    
    lines_df['group'] = np.nan
    lines_df.loc[ lines_df['indent'] == 4, 'group'] = lines_df.loc[ lines_df['indent'] == 4, 'attribute' ]
    lines_df['group'] = lines_df['group'].ffill()
    
    #print( lines_df )
    # label multiple members
    member_index = 0
    new_group = []
    new_attribute = []
    new_value = []
    for raw_line, group, indent, attribute, value in lines_df[ [ 'raw_line', 'group', 'indent', 'attribute', 'value' ] ].values:

        if "WITHIN AREA" in raw_line:
            attribute, value = raw_line.strip().rsplit( " ", 1 )
        else:
            try:
                if np.isnan( attribute ):
                    attribute = raw_line
                    value = True
            except:
                pass

        if group != "MEMBER":
            new_group.append( group )
            new_attribute.append( attribute ) 
            new_value.append( value )
            continue
        if indent == 4 and attribute == "MEMBER":
            member_index += 1
            new_attribute.append( f"MEMBER { str(member_index).zfill(2) }" )
            new_group.append( f"MEMBER {str(member_index).zfill(2)}" )
            new_value.append( value )
            continue
        if indent == 12:
            if ')' in raw_line:
                attribute = value = None
            else:
                attribute = f"MEMBER { str(member_index).zfill(2) } KEY {raw_line.strip()}"
                value = True
                
        new_group.append( f"MEMBER {str(member_index).zfill(2)}" )
        new_attribute.append( attribute )
        new_value.append( value )

        
    lines_df[ 'attribute' ] = new_attribute
    lines_df[ 'group' ] = new_group
    lines_df[ 'value' ] = new_value
    
    # label attributes as owner or member
    relevant_rows = lines_df['indent'] == 8
    lines_df.loc[ relevant_rows, 'attribute' ]  = lines_df.loc[ relevant_rows, 'group' ] + " " + lines_df.loc[ relevant_rows, 'attribute' ]
    
    wanted_rows = (lines_df['indent'] <= 8) & ( ~lines_df['attribute'].isna() )
    
    #try:
    #    lines_df = lines_df.loc[ wanted_rows ].pivot( columns = 'attribute', values = 'value', index = 'set_name' )
    #except ValueError:
    #    print( lines_df )
    #    raise
                   
    return lines_df
    

In [None]:
print( set_components[7] )

In [None]:
FormatSet( set_components[7] )

In [None]:
pivoted_data = pd.concat( [ FormatSet(_) for _ in set_components ] )

In [None]:
pivoted_data

In [None]:
#pivoted_data = pivoted_data[ sorted( pivoted_data.columns ) ]

In [None]:
pivoted_data.to_excel( '2023-02-07_Farms_Set_Data.xlsx' )

In [None]:
pwd