* [Reading COBOL Layouts](http://www.3480-3590-data-conversion.com/article-reading-cobol-layouts-1.html)

In [1]:
import pandas as pd

In [2]:
from pathlib import Path

In [3]:
from treelib import Tree

In [4]:
#copybook_data[ ['leading_whitespace','new_leading_whitespace'] ]
#metadata = pd.read_excel( '2022-12-15_IDMS_table_descriptions.xlsx', index_col=0 )
#metadata.index.name = "table_index"
#wanted_subsystem = "DISCREPANCY PROCESSING AND ACCOUNT INFORMATION INQUIRY"
#metadata[ metadata[ wanted_subsystem ] == 1 ].copy()

In [5]:
data = pd.read_csv('2023-02-14_FSA_FARMS_schema_from_source.csv', index_col=0)
#data = pd.read_excel( "2022-12-08_PLAS_IDMS_data_structure_WITH_VALID_VALUES.xlsx", index_col=0 )

In [6]:
# Start off with all lines commented out and bring them back in
# Only if they turn out to be un-redefined leaf nodes:
data['commented_out'] = True

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2218 entries, 0 to 12
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   table_name          2218 non-null   object 
 1   indent_space_count  2218 non-null   int64  
 2   data_level          2218 non-null   int64  
 3   field_name          2218 non-null   object 
 4   declaration_step    2218 non-null   int64  
 5   end                 2218 non-null   object 
 6   ELEMENT LENGTH      2154 non-null   float64
 7   POSITION            2218 non-null   int64  
 8   data_type           1961 non-null   object 
 9   REDEFINES           23 non-null     object 
 10  VALUE               66 non-null     object 
 11  OCCURS              31 non-null     object 
 12  table_index         2218 non-null   int64  
 13  table_vers          2218 non-null   int64  
 14  BLANK ON            0 non-null      float64
 15  INDEXED BY          0 non-null      float64
 16  OLQ     

In [8]:
data['data_level'].value_counts()

5     1295
10     626
15     220
88      64
20      13
Name: data_level, dtype: int64

In [9]:
pd.set_option( 'display.max_rows', None )

In [10]:
#data.groupby( 'table_name')[ 'data_level' ].value_counts().to_frame()

In [11]:
#test_df = data[ data['table_name'] == 'CLIENT' ]

In [12]:
def TreeifyElements( grp_df ):
    """Arrange one table's copybook fields into a treelib Tree by data level.

    The root node (identifier 'root') is tagged with the table name taken from
    the 'table_name' column.  Rows are visited in their existing order and each
    becomes a node with:

    * identifier = the row's index label
    * tag        = the row's field_name
    * data       = the row's data_level

    Parent selection: when a row's data level is greater than the previous
    row's, the previous row is recorded as the parent for that level; rows at
    an unchanged level reuse the recorded parent; when the level drops, the
    records for all deeper levels are discarded.  Level 5 is pre-seeded to hang
    off the root.

    Parameters
    ----------
    grp_df : pandas.DataFrame
        Rows of a single table.  Reads 'table_name', 'data_level' and
        'field_name'.  Index labels are used as node identifiers, so they are
        presumably expected to be unique within the group.

    Returns
    -------
    treelib.Tree
    """
    ROOT_ID = 'root'

    field_tree = Tree()
    field_tree.create_node(
        tag = grp_df['table_name'].unique().squeeze(),
        identifier = ROOT_ID,
    )

    # data level -> identifier of the node that fields of that level attach to
    parent_for_level = { 5 : ROOT_ID }
    last_level = 0
    last_node_id = ROOT_ID

    for row in grp_df.itertuples():
        level = row.data_level
        node_id = row.Index

        if level > last_level:
            # One step deeper: the row just before this one owns the new level.
            parent_for_level[ level ] = last_node_id
        elif level < last_level:
            # Back up the hierarchy: deeper levels are closed, drop their parents.
            for closed_level in [ lvl for lvl in parent_for_level if lvl > level ]:
                del parent_for_level[ closed_level ]

        field_tree.create_node(
            tag = row.field_name,
            identifier = node_id,
            parent = parent_for_level[ level ],
            data = level,
        )
        last_level = level
        last_node_id = node_id

    return field_tree

In [13]:
def UncommentLeafElements( grp_df ):
    """Mark which fields of one table stay active in the flattened copybook.

    Operates on a copy of ``grp_df`` and returns it; the caller's frame is not
    modified.  Steps:

    1. Build the field hierarchy with ``TreeifyElements``.
    2. Remove from the tree every field whose name appears in the REDEFINES
       column (the fields being redefined), together with its subtree.
    3. Set ``commented_out`` to False for the remaining leaf fields and for
       the parents of level-88 fields.
    4. Set ``data_level`` to 5 for every row selected in step 3 that is not
       itself a level-88 field.

    Parameters
    ----------
    grp_df : pandas.DataFrame
        Rows of a single table.  Reads 'REDEFINES' and 'field_name' (plus the
        columns ``TreeifyElements`` needs) and writes 'commented_out' and
        'data_level'.

    Returns
    -------
    pandas.DataFrame
        The modified copy.
    """
    ROOT_ID = 'root'   # identifier TreeifyElements gives the synthetic table node

    grp_df = grp_df.copy()
    tree = TreeifyElements( grp_df )

    # Names that some other field REDEFINES, and the rows that carry those names.
    overwrite_these = grp_df['REDEFINES'].dropna().values
    delete_these = grp_df.index[ grp_df['field_name'].isin( overwrite_these ) ]

    for delete_this_node in delete_these:
        # A redefined field may already be gone because a redefined ancestor
        # took its whole subtree with it; remove_node would raise in that case.
        if tree.contains( delete_this_node ):
            tree.remove_node( delete_this_node )

    # The synthetic root is not a row of grp_df, so it must never reach .loc
    # (it shows up as a "leaf" when every field under it has been removed).
    leaf_indices = [
        node.identifier for node in tree.leaves() if node.identifier != ROOT_ID
    ]

    # Add any non-leaf elements that have 88 valid values underneath them
    all_nodes = [ tree[ nodeid ] for nodeid in tree.expand_tree() ]
    named_value_nodes = [ node for node in all_nodes if node.data == 88 ]
    parent_nodes_of_named_values = {
        tree.parent( node.identifier ).identifier for node in named_value_nodes
    }
    parent_nodes_of_named_values.discard( ROOT_ID )

    wanted_indices = leaf_indices + list( parent_nodes_of_named_values )

    # Every row arrives with commented_out == True; only the wanted rows are
    # switched back on, so no fill of the remaining rows is needed.
    grp_df.loc[ wanted_indices, 'commented_out' ] = False

    named_value_node_indices = [ node.identifier for node in named_value_nodes ]
    non_88_leaf_node_indices = set( wanted_indices ) - set( named_value_node_indices )

    # .loc needs a list here: a set indexer is deprecated in pandas 1.5 and
    # rejected by later versions.
    grp_df.loc[ list( non_88_leaf_node_indices ), 'data_level' ] = 5

    return grp_df

In [14]:
#test_tree = TreeifyElements( test_df )

In [15]:
#mod_test_df = UncommentLeafElements( test_df )

In [16]:
#mod_test_df

In [17]:
data.shape

(2218, 18)

In [18]:
farms_data = data

In [19]:
farms_data['table_name'].unique()

array(['ACCT-DATA', 'ACQD-PROP', 'ADPS-CNTRL', 'ADVANCE', 'AID',
       'ALTMADJ', 'ALTMADJ-NEW', 'ALTMT', 'AMORTD-CST', 'APROPTN-LOOKUP',
       'ASSISTANCE', 'ASSOC-PRIN-BOND', 'CASE', 'CDEJUNC', 'CDESTR',
       'CHECKS', 'CK-CNTRL', 'CK-INFO', 'CK-INFO-FRADS', 'CLIENT',
       'CLIENT-SFSI', 'COHORT', 'CRBUR', 'CRCLAIMS', 'CRRATE',
       'CTY-LOOKUP', 'DALLOT', 'DALLOT-DTL', 'DALLOT-OBLGN', 'DALLOT-OTH',
       'DAPROC', 'DISCRP', 'DISCRP-MISC', 'DSTR-SETASD', 'DTEREC',
       'DTL-RATE', 'EASMNT', 'EQTYRVRS', 'EQUITY', 'FD-SIDE', 'FDCDE',
       'GLBORR', 'GLLNDR', 'IMCASE', 'INITMAN', 'INSRNC-AUTHY',
       'INSTALLMENT', 'INSURANCE', 'INT-ASSTNC', 'INT-BDWN', 'INVSTR-DTL',
       'INVSTR-INFO', 'INVSTR-INFO-MISC', 'JDGMT-3RD-PARTY',
       'JOB-RESTART', 'JURDCTN', 'LESSEE', 'LN-AID', 'LN-NO-INT',
       'LNRATE', 'LOAN', 'LOAN-DRE', 'LOAN-OTC', 'LOAN-SFSI',
       'LOCTN-LOOKUP', 'LSE-INFO', 'MALLOT', 'MALLOT-OBLGN', 'MALLOT-OTH',
       'MSTR-RATE', 'NOTIFY', 'NOTIFY-CNTRL', 

In [20]:
data = farms_data

In [21]:
len( data )

2218

In [22]:
data['data_level'].value_counts().sum()

2218

In [23]:
data['indent_space_count'].value_counts()

2     1295
4      626
6      220
10      64
8       13
Name: indent_space_count, dtype: int64

In [24]:
import numpy as np

In [25]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2218 entries, 0 to 12
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   table_name          2218 non-null   object 
 1   indent_space_count  2218 non-null   int64  
 2   data_level          2218 non-null   int64  
 3   field_name          2218 non-null   object 
 4   declaration_step    2218 non-null   int64  
 5   end                 2218 non-null   object 
 6   ELEMENT LENGTH      2154 non-null   float64
 7   POSITION            2218 non-null   int64  
 8   data_type           1961 non-null   object 
 9   REDEFINES           23 non-null     object 
 10  VALUE               66 non-null     object 
 11  OCCURS              31 non-null     object 
 12  table_index         2218 non-null   int64  
 13  table_vers          2218 non-null   int64  
 14  BLANK ON            0 non-null      float64
 15  INDEXED BY          0 non-null      float64
 16  OLQ     

In [26]:
data.sample(10)

Unnamed: 0,table_name,indent_space_count,data_level,field_name,declaration_step,end,ELEMENT LENGTH,POSITION,data_type,REDEFINES,VALUE,OCCURS,table_index,table_vers,BLANK ON,INDEXED BY,OLQ,commented_out
21,RENTL-DTL,2,5,DTE-LST-RGSTR-DTL-RENTL,2200,DISPLAY,6.0,92,9(06),,,,93,1,,,,True
2,DTEREC,6,15,EFCTV-DTE-CTRY,300,DISPLAY,2.0,1,9(02),,,,34,1,,,,True
13,CLIENT-SFSI,2,5,DTE-TRNG-COMPLTN,1400,DISPLAY,6.0,51,9(6),,,,20,1,,,,True
10,SITE-LOOKUP,4,10,ZIP-CDE-152,1100,DISPLAY,5.0,148,9(5),,,,98,1,,,,True
182,GLBORR,4,10,SRVC-FEE-PCT,18300,COMP-3,4.0,177,S99V9999,,,,41,1,,,,True
16,PLEREP,4,10,RH-INIT-LNS-6-PCT-CNT,1700,COMP-3,3.0,67,S9(04),,,,77,1,,,,True
17,EASMNT,2,5,LOC-EASMNT-APPRSL-AMT,1800,COMP-3,5.0,67,S9(07)V99,,,,36,1,,,,True
207,GLBORR,4,10,INT-ACCRL-TRMNTN-CDE,20800,DISPLAY,1.0,316,X(01),,,,41,1,,,,True
92,GLBORR,6,15,TRF-FEE-GUARNTD,9300,COMP-3,6.0,240,S9(08)V99,,,,41,1,,,,True
8,USERS,2,5,SRVCG-OFC-CDE-CTY,900,DISPLAY,5.0,7,9(5),,,,112,1,,,,True


In [27]:
def Create_Copybook_Parts( grp ):
    """Turn one table's schema rows into the string pieces of its copybook lines.

    The group is first passed through ``UncommentLeafElements``.  One output
    row is produced per input row, in the order the rows arrive, preceded by a
    single level-01 header row carrying the table name.  A one-line progress
    message is printed for each table.

    Parameters
    ----------
    grp : pandas.DataFrame
        Rows of a single table (one ``groupby('table_index')`` group).

    Returns
    -------
    pandas.DataFrame
        Columns: step_numbers, comment_column, indent_spaces, data_level, sep,
        field_name, pic_clauses, comp_clause, value_clauses, occurs_clauses,
        redefines_clauses, blank_on_clauses, indexed_by_clauses, olq_clauses.
        Clause columns hold ``'<CLAUSE> <value>'`` where the source value is
        present and are left missing otherwise.
    """
    grp = UncommentLeafElements( grp )
    table_index = grp['table_index'].iloc[0]
    table_ver = grp['table_vers'].iloc[0]
    table_n_fields = len(grp)
    table_name = grp['table_name'].iloc[0]

    print( 'table', table_index,
          'ver', table_ver,
          'n fields =', table_n_fields,
          'name =', table_name
    )

    row_index = grp.index

    step_numbers = grp['declaration_step'].astype(str).str.zfill( 6 )
    # '*' in the comment column disables the line, ' ' keeps it active.
    comment_column = np.where( grp['commented_out'], '*', ' ' )
    indent_spaces = grp['indent_space_count'].apply( lambda i: " " * i )
    sep = pd.Series( "  ", index=row_index )

    # COBOL clause keyword -> source column holding that clause's value.
    clause_columns = {
        'PIC': 'data_type',
        'BLANK ON': 'BLANK ON',
        'INDEXED BY': 'INDEXED BY',
        'OCCURS': 'OCCURS',
        'OLQ': 'OLQ',
        'REDEFINES': 'REDEFINES',
        'VALUE': 'VALUE',
    }
    formatted_cols = {}
    for clause, col_name in clause_columns.items():
        # Work on an explicit copy so grp itself is never written through a
        # column view (chained assignment).
        col = grp[ col_name ].copy()
        present = col.notna()
        if present.any():
            # Cast first so string results can be stored in a numeric column.
            col = col.astype( object )
            col[ present ] = [ f'{clause} {value}' for value in col[ present ] ]
        formatted_cols[ col_name ] = col

    formatted_data = dict(
        step_numbers=step_numbers,
        comment_column=comment_column,
        indent_spaces=indent_spaces,
        data_level = grp['data_level'].astype(str).str.zfill( 2 ),
        sep = sep,
        field_name = grp['field_name'],
        pic_clauses= formatted_cols[ 'data_type' ],
        comp_clause = grp[ 'end' ],
        value_clauses = formatted_cols[ 'VALUE' ],
        occurs_clauses = formatted_cols[ 'OCCURS' ],
        # REDEFINES clauses are deliberately left blank in the output
        # (presumably because the redefined fields stay commented out).
        #redefines_clauses = formatted_cols[ 'REDEFINES' ],
        redefines_clauses = [ "" ] * table_n_fields,
        blank_on_clauses = formatted_cols[ 'BLANK ON' ],
        indexed_by_clauses = formatted_cols[ 'INDEXED BY' ],
        olq_clauses = formatted_cols[ 'OLQ' ],
    )

    # Every piece must line up one-to-one with the input rows.
    lengths = { len(_) for _ in formatted_data.values() }
    assert len( lengths ) == 1
    assert lengths.pop() == table_n_fields

    df = pd.DataFrame( formatted_data )

    assert len( df ) == table_n_fields, f'len( df ) = {len( df )}, table_n_fields = {table_n_fields}\n\n{formatted_data}'

    # Level-01 header line naming the table; every other column is blank.
    header = dict.fromkeys( df.columns, '' )
    header.update(
        step_numbers = str( 50 ).zfill( 6 ),
        comment_column = " ",
        indent_spaces = "",
        data_level = '01',
        sep = "  ",
        field_name = table_name,
    )
    table_name_line = pd.DataFrame( [ header ], columns=df.columns )

    df = pd.concat( ( table_name_line, df ), axis=0 )
    return df

In [28]:
data.shape

(2218, 18)

In [29]:
# Build the copybook line pieces table by table: Create_Copybook_Parts is
# called once per table_index group and returns that table's rows preceded by
# its 01-level header row.
formatted_df = data.groupby( 'table_index' ).apply( Create_Copybook_Parts )

table 0 ver 1 n fields = 29 name = ACCT-DATA
table 1 ver 1 n fields = 114 name = ACQD-PROP
table 2 ver 1 n fields = 5 name = ADPS-CNTRL
table 3 ver 1 n fields = 6 name = ADVANCE
table 4 ver 1 n fields = 16 name = AID
table 5 ver 1 n fields = 36 name = ALTMADJ
table 6 ver 1 n fields = 37 name = ALTMADJ-NEW
table 7 ver 1 n fields = 5 name = ALTMT
table 8 ver 1 n fields = 7 name = AMORTD-CST
table 9 ver 1 n fields = 10 name = APROPTN-LOOKUP
table 10 ver 1 n fields = 12 name = ASSISTANCE
table 11 ver 1 n fields = 7 name = ASSOC-PRIN-BOND
table 12 ver 1 n fields = 6 name = CASE
table 13 ver 1 n fields = 4 name = CDEJUNC
table 14 ver 1 n fields = 24 name = CDESTR
table 15 ver 1 n fields = 9 name = CHECKS
table 16 ver 1 n fields = 8 name = CK-CNTRL
table 17 ver 1 n fields = 28 name = CK-INFO
table 18 ver 1 n fields = 28 name = CK-INFO-FRADS
table 19 ver 1 n fields = 76 name = CLIENT
table 20 ver 1 n fields = 16 name = CLIENT-SFSI
table 21 ver 1 n fields = 2 name = COHORT
table 22 ver 1 n fiel

  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] = 5
  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] = 5
  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] = 5
  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] = 5
  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] = 5
  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] = 5
  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] = 5
  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] = 5
  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] = 5
  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] = 5
  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] = 5
  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] = 5
  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] = 5
  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] = 5
  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] = 5
  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] = 5
  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] =

table 82 ver 1 n fields = 10 name = RDA-AREA-OBLGN
table 83 ver 1 n fields = 24 name = RDA-DALLOT-DTL
table 84 ver 1 n fields = 10 name = RDA-DALLOT-OBLGN
table 85 ver 1 n fields = 6 name = RDA-FD-SIDE
table 86 ver 1 n fields = 16 name = RDA-INSRNC-AUTHY
table 87 ver 1 n fields = 19 name = RDA-MALLOT
table 88 ver 1 n fields = 10 name = RDA-MALLOT-OBLGN
table 89 ver 1 n fields = 14 name = RDA-RGN-DTL
table 90 ver 1 n fields = 16 name = RDA-RGN-OBLGN
table 91 ver 1 n fields = 4 name = REJECT-TRNSCTN
table 92 ver 1 n fields = 14 name = RENTL-CNTRL
table 93 ver 1 n fields = 23 name = RENTL-DTL
table 94 ver 1 n fields = 3 name = RENTL-FY-UNIT
table 95 ver 1 n fields = 10 name = RENTL-TTL
table 96 ver 1 n fields = 4 name = RESCHEDULE
table 97 ver 1 n fields = 22 name = RH-DFRL
table 98 ver 1 n fields = 21 name = SITE-LOOKUP
table 99 ver 1 n fields = 2 name = SRCFDS
table 100 ver 1 n fields = 8 name = ST-LOOKUP
table 101 ver 1 n fields = 9 name = STAT
table 102 ver 1 n fields = 1 name = STOPP

  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] = 5
  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] = 5
  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] = 5
  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] = 5
  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] = 5
  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] = 5
  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] = 5
  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] = 5
  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] = 5
  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] = 5
  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] = 5
  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] = 5
  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] = 5
  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] = 5
  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] = 5
  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] = 5
  grp_df.loc[ non_88_leaf_node_indices, 'data_level' ] =

In [30]:
formatted_df.shape

(2331, 14)

In [31]:
formatted_df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2331 entries, (0, 0) to (112, 12)
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   step_numbers        2331 non-null   object
 1   comment_column      2331 non-null   object
 2   indent_spaces       2331 non-null   object
 3   data_level          2331 non-null   object
 4   sep                 2331 non-null   object
 5   field_name          2331 non-null   object
 6   pic_clauses         2074 non-null   object
 7   comp_clause         2331 non-null   object
 8   value_clauses       179 non-null    object
 9   occurs_clauses      144 non-null    object
 10  redefines_clauses   2331 non-null   object
 11  blank_on_clauses    113 non-null    object
 12  indexed_by_clauses  113 non-null    object
 13  olq_clauses         113 non-null    object
dtypes: object(14)
memory usage: 276.7+ KB


In [32]:
#formatted_df

In [33]:
# Left-hand portion of every copybook line: sequence number, comment marker,
# indentation, level number, separator and field name, concatenated in order.
first_part_columns = [
    'step_numbers',
    'comment_column',
    'indent_spaces',
    'data_level',
    'sep',
    'field_name',
]
first_part = formatted_df[ first_part_columns[0] ]
for column_name in first_part_columns[1:]:
    first_part = first_part + formatted_df[ column_name ]
formatted_df['first_part'] = first_part

In [34]:
#formatted_df

In [35]:
formatted_df['first_part_len'] = formatted_df['first_part'].apply( len )

In [36]:
#formatted_df['first_part_len'].hist()

In [37]:
(formatted_df['first_part_len'] >= 49).sum()

0

In [38]:
#formatted_df['first_part_len'].describe()

In [39]:
#long_rows = formatted_df[ formatted_df['first_part_len'] >= 49].index

In [40]:
#formatted_df.loc[long_rows, 'first_part_len'].describe()

In [41]:
#formatted_df['first_part'] = formatted_df['first_part'].str.ljust(50)

In [42]:
# Pad the left-hand portion out to 50 characters so the clauses line up;
# a left-hand portion of 49 or more characters gets a single space instead.
formatted_df['white_space_middle_part'] = [
    " " * ( 50 - prefix_len ) if prefix_len < 49 else " "
    for prefix_len in formatted_df['first_part_len']
]

In [43]:
formatted_df['white_space_middle_part'].values

array(['                              ',
       '                               ',
       '                               ', ..., '                     ',
       '                    ', '                               '],
      dtype=object)

# Copybook Syntax

* "OCCURS" and "REDEFINES" clauses go before PIC clause
* "VALUE" clause is either in lieu of or after PIC clause
* COMP-3 goes after PIC clause

In [44]:
formatted_df = formatted_df.fillna("")

In [45]:
formatted_df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2331 entries, (0, 0) to (112, 12)
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   step_numbers             2331 non-null   object
 1   comment_column           2331 non-null   object
 2   indent_spaces            2331 non-null   object
 3   data_level               2331 non-null   object
 4   sep                      2331 non-null   object
 5   field_name               2331 non-null   object
 6   pic_clauses              2331 non-null   object
 7   comp_clause              2331 non-null   object
 8   value_clauses            2331 non-null   object
 9   occurs_clauses           2331 non-null   object
 10  redefines_clauses        2331 non-null   object
 11  blank_on_clauses         2331 non-null   object
 12  indexed_by_clauses       2331 non-null   object
 13  olq_clauses              2331 non-null   object
 14  first_part               2331 

In [46]:
( (formatted_df['occurs_clauses'] != "") & (formatted_df['redefines_clauses'] != "") ).sum()

0

In [47]:
#formatted_df['redefines_clauses_len'] = formatted_df['redefines_clauses'].apply( len )

In [48]:
#formatted_df['redefines_clauses_len'].describe()

In [49]:
#formatted_df['occurs_clauses_len'] = formatted_df['occurs_clauses'].apply( len )

In [50]:
#formatted_df['occurs_clauses_len'].describe()

In [51]:
# Add a separating space to the pre-pic clauses when you have both
#formatted_df.loc[ (formatted_df['occurs_clauses'] != "") & (formatted_df['redefines_clauses'] != ""), 'occurs_clauses'] = \
#    formatted_df.loc[ (formatted_df['occurs_clauses'] != "") & (formatted_df['redefines_clauses'] != ""), 'occurs_clauses'].apply( lambda s: " " + s )

In [52]:
formatted_df['pre_pic_clause'] = formatted_df['redefines_clauses'] + formatted_df['occurs_clauses']

In [53]:
formatted_df[ 'pre_pic_clause' ] = formatted_df[ 'pre_pic_clause' ].str.replace( pat=r'\s+', repl=' ', regex=True )

In [54]:
formatted_df['second_part'] = formatted_df['pre_pic_clause']

# Diagnostic

In [55]:
formatted_df.loc[ (formatted_df['occurs_clauses'] != "") | (formatted_df['redefines_clauses'] != ""), 'second_part'].values[:50]

array(['OCCURS 0 TO 10 TIMES DEPENDING ON NBR-OF-OCCURS',
       'OCCURS 0 TO 10 TIMES DEPENDING ON NBR-OF-OCCURS',
       'OCCURS 2 TIMES', 'OCCURS 10 TIMES', 'OCCURS 4 TIMES',
       'OCCURS 4 TIMES', 'OCCURS 2 TIMES', 'OCCURS 16 TIMES',
       'OCCURS 4 TIMES', 'OCCURS 10 TIMES', 'OCCURS 4 TIMES',
       'OCCURS 6 TIMES', 'OCCURS 6 TIMES', 'OCCURS 3 TIMES',
       'OCCURS 3 TIMES', 'OCCURS 3 TIMES',
       'OCCURS 0 TO 1800 TIMES DEPENDING ON NBR-OF-OCCURS',
       'OCCURS 5 TIMES', 'OCCURS 6 TIMES', 'OCCURS 6 TIMES',
       'OCCURS 2 TIMES', 'OCCURS 4 TIMES', 'OCCURS 8 TIMES',
       'OCCURS 4 TIMES', 'OCCURS 10 TIMES', 'OCCURS 10 TIMES',
       'OCCURS 0 TO 1850 TIMES DEPENDING ON MULTI-CARD-DATA-LGTH',
       'OCCURS 3 TIMES', 'OCCURS 4 TIMES', 'OCCURS 200 TIMES',
       'OCCURS 2 TIMES'], dtype=object)

In [56]:
#formatted_df.loc[ (formatted_df['occurs_clauses'] != "") & (formatted_df['redefines_clauses'] != ""), 'first_and_second_part'].values[:50]

In [57]:
formatted_df['second_part_len'] = formatted_df['second_part'].apply( len )

In [58]:
formatted_df['second_part_len'].describe()

count    2331.000000
mean        0.250536
std         2.562999
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max        56.000000
Name: second_part_len, dtype: float64

In [59]:
pd.set_option( 'display.max_colwidth', None )

In [60]:
( formatted_df['pre_pic_clause'] != "" ).sum()

31

In [61]:
# Prefix the PIC clause with a space wherever a pre-PIC clause precedes it
needs_pic_separator = (formatted_df['pre_pic_clause'] != "") & (formatted_df['pic_clauses'] != "")
formatted_df.loc[ needs_pic_separator, 'pic_clauses' ] = \
    " " + formatted_df.loc[ needs_pic_separator, 'pic_clauses' ]

In [62]:
formatted_df['second_and_third_part'] = formatted_df['second_part'] + formatted_df['pic_clauses']

### Trailing clauses are COMP and VALUE

In [63]:
#formatted_df['comp_clause']

In [64]:
formatted_df['comp_clause'].value_counts()

DISPLAY           1502
COMP-3             647
                   113
CONDITION-NAME      64
COMP                 5
Name: comp_clause, dtype: int64

In [65]:
formatted_df.loc[ (formatted_df['comp_clause'] == 'DISPLAY'), 'comp_clause' ] = ''

In [66]:
formatted_df.loc[ (formatted_df['comp_clause'] == 'COND'), 'comp_clause' ] = ''

In [67]:
formatted_df.loc[ (formatted_df['comp_clause'] == 'CONDITION-NAME'), 'comp_clause' ] = ''

In [68]:
formatted_df['value_clauses'].value_counts()

               2265
VALUE 'S'        10
VALUE 'D'        10
VALUE 'C'        10
VALUE 'T'         8
VALUE 'U'         8
VALUE 'V'         8
VALUE '0'         4
VALUE '1'         4
VALUE SPACE       2
VALUE 'R'         1
VALUE 'A'         1
Name: value_clauses, dtype: int64

In [69]:
( (formatted_df['comp_clause'] != "") & (formatted_df['value_clauses'] != "") ).sum()

0

In [70]:
# Prefix the VALUE clause with a space wherever a COMP clause comes before it
needs_value_separator = (formatted_df['comp_clause'] != "") & (formatted_df['value_clauses'] != "")
formatted_df.loc[ needs_value_separator, 'value_clauses' ] = \
    " " + formatted_df.loc[ needs_value_separator, 'value_clauses' ]

In [71]:
formatted_df['post_pic_clauses'] = formatted_df['comp_clause'] + formatted_df['value_clauses']

In [72]:
# Prefix the post-PIC clauses with a space wherever a pre-PIC or PIC clause comes before them
has_leading_clause = (formatted_df['pre_pic_clause'] != "") | (formatted_df['pic_clauses'] != "")
needs_post_pic_separator = (formatted_df['post_pic_clauses'] != "") & has_leading_clause
formatted_df.loc[ needs_post_pic_separator, 'post_pic_clauses' ] = \
    " " + formatted_df.loc[ needs_post_pic_separator, 'post_pic_clauses' ]

In [73]:
#formatted_df.loc[formatted_df['post_pic_clauses'] != "", 'post_pic_clauses'].values

In [74]:
formatted_df[ 'second_and_third_part' ].values[:50]

array(['', 'PIC 9(2)', '', '', 'PIC 9(2)', 'PIC 9(1)', 'PIC 9(1)',
       'PIC 9(2)', 'PIC 9(2)V9(4)', 'PIC V9(6)', 'PIC 9(1)', 'PIC 9(1)',
       'PIC 9(06)', '', 'PIC 9(1)', 'PIC 9(1)', 'PIC 9(3)', 'PIC 9(2)',
       'PIC X(1)', 'PIC 9(06)', 'PIC 9(3)', 'PIC S9(8)V99', 'PIC X(01)',
       'PIC 9(1)', 'PIC 9(6)', 'PIC 9(01)', 'PIC 9(1)', 'PIC 9(2)V9(4)',
       'PIC V9(6)', 'PIC X(0019)', '', '', '', 'PIC 9(2)', 'PIC 9(1)',
       'PIC 9(1)', 'PIC X(07)', 'PIC X(07)', '', 'PIC 9(2)', 'PIC 9(3)',
       'PIC 9(5)', '', 'PIC X(1)', '', '', '', '', '', ''], dtype=object)

In [75]:
formatted_df[ 'second_third_and_forth_part'] = formatted_df[ 'second_and_third_part' ] + formatted_df['post_pic_clauses']

In [76]:
formatted_df[ 'second_third_and_forth_part'].sample(50).values

array(['PIC 9(6)', 'PIC S9(08)V99 COMP-3', 'PIC X(3)', 'PIC 9(06)',
       'PIC S9(10)V99 COMP-3', 'PIC S9(10)V99 COMP-3',
       'PIC S9(05)V9 COMP-3', 'PIC 9(06)', 'PIC 9(2)', 'PIC X(19)',
       'PIC 9(1)', 'PIC 9(2)', '', 'PIC S9(6)V99 COMP-3', 'PIC XX',
       'PIC 9(2)', '', "VALUE 'C'", 'PIC S9(10)V99 COMP-3', '',
       'OCCURS 4 TIMES', 'PIC X(0017)', 'PIC 99', '', 'PIC X(2)',
       'PIC 9(2)', 'PIC 9(1)', 'PIC 9(2)', "VALUE 'D'", '', 'PIC 99',
       'PIC S9(10)V99 COMP-3', 'PIC 99', 'PIC 9(6)', '', 'PIC 9(06)',
       'PIC S9(11)V99 COMP-3', 'PIC 9(1)', 'PIC X(02)', "VALUE 'D'",
       'PIC X(0036)', 'PIC 9(3)', 'PIC X(1)', 'PIC 9(01)', 'PIC X(10)',
       'OCCURS 4 TIMES PIC V9(04)', 'PIC X(0044)', 'PIC 9(1)', '',
       'PIC S9(08) COMP-3'], dtype=object)

In [77]:
formatted_df[ 'second_third_and_forth_part_len'] = formatted_df[ 'second_third_and_forth_part' ].apply( len )

In [78]:
formatted_df[ 'second_third_and_forth_part_len'].describe()

count    2331.000000
mean       10.506650
std         6.630099
min         0.000000
25%         8.000000
50%         9.000000
75%        17.000000
max        65.000000
Name: second_third_and_forth_part_len, dtype: float64

In [79]:
formatted_df['content_len'] = formatted_df['first_part_len'] + formatted_df[ 'second_third_and_forth_part_len'] + 2 # separating space + period

In [80]:
formatted_df['content_len'].sort_values().tail(30)

table_index     
26           3      60
51           10     60
77           3      60
26           2      60
41           81     60
77           6      60
             24     60
             11     60
26           1      60
41           82     61
10           5      61
48           3      61
16           3      61
11           3      61
48           2      61
83           22     62
41           165    62
77           9      62
             8      62
96           1      62
41           187    62
27           26     62
81           22     62
1            35     62
11           4      62
37           11     71
5            16     76
6            16     82
44           38     91
91           3      97
Name: content_len, dtype: int64

In [81]:
formatted_df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2331 entries, (0, 0) to (112, 12)
Data columns (total 25 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   step_numbers                     2331 non-null   object
 1   comment_column                   2331 non-null   object
 2   indent_spaces                    2331 non-null   object
 3   data_level                       2331 non-null   object
 4   sep                              2331 non-null   object
 5   field_name                       2331 non-null   object
 6   pic_clauses                      2331 non-null   object
 7   comp_clause                      2331 non-null   object
 8   value_clauses                    2331 non-null   object
 9   occurs_clauses                   2331 non-null   object
 10  redefines_clauses                2331 non-null   object
 11  blank_on_clauses                 2331 non-null   object
 12  indexed_by_clauses      

In [82]:
# Padding that would bring the whole line to exactly 72 characters; lines whose
# content is already 72 or longer get an empty string here.
formatted_df['alternative_white_space_middle_part'] = formatted_df['content_len'].map(
    lambda content_len: " " * ( 72 - content_len )
)

In [83]:
formatted_df['alternative_white_space_middle_part_len'] = formatted_df['alternative_white_space_middle_part'].apply( len )

In [84]:
formatted_df['alternative_white_space_middle_part_len'].describe()

count    2331.000000
mean       31.470184
std         9.770251
min         0.000000
25%        24.000000
50%        32.000000
75%        38.000000
max        56.000000
Name: alternative_white_space_middle_part_len, dtype: float64

In [85]:
formatted_df.loc[ formatted_df['alternative_white_space_middle_part'] == "", 'alternative_white_space_middle_part' ] = " "

In [86]:
formatted_df['alternative_white_space_middle_part_len'] = formatted_df['alternative_white_space_middle_part'].apply( len )

In [87]:
formatted_df['alternative_white_space_middle_part_len'].describe()

count    2331.000000
mean       31.471900
std         9.764807
min         1.000000
25%        24.000000
50%        32.000000
75%        38.000000
max        56.000000
Name: alternative_white_space_middle_part_len, dtype: float64

In [88]:
#formatted_df['alternative_white_space_middle_part_len']

In [89]:
formatted_df['white_space_middle_part_len'] = formatted_df['white_space_middle_part'].apply( len )

In [90]:
(~(formatted_df['white_space_middle_part_len'] < formatted_df['alternative_white_space_middle_part_len'])).sum()

363

In [91]:
# Keep the fixed-column padding only where it is strictly shorter than the
# 72-character padding; everywhere else switch to the 72-character padding.
fixed_padding_is_shorter = \
    formatted_df['white_space_middle_part_len'] < formatted_df['alternative_white_space_middle_part_len']
formatted_df['white_space_middle_part'] = formatted_df['white_space_middle_part'].mask(
    ~fixed_padding_is_shorter,
    formatted_df['alternative_white_space_middle_part'],
)

In [92]:
(formatted_df[ 'second_third_and_forth_part_len'] == 0).sum()

293

In [93]:
formatted_df[ 'line_completion' ] = '.'

In [94]:
# Lines with no clauses keep the bare '.'; all others get
# padding + clauses + terminating period.
has_clauses = formatted_df[ 'second_third_and_forth_part' ] != ""
formatted_df[ 'line_completion' ] = formatted_df[ 'line_completion' ].mask(
    has_clauses,
    formatted_df['white_space_middle_part'] + formatted_df[ 'second_third_and_forth_part' ] + '.',
)

In [95]:
formatted_df[ 'full_line'] = formatted_df['first_part'] + formatted_df[ 'line_completion' ]

In [96]:
formatted_df[ 'full_line_len' ] = formatted_df[ 'full_line'].apply(len)

In [97]:
formatted_df[ 'full_line_len' ].describe()

count    2331.000000
mean       58.208494
std        13.494025
min        15.000000
25%        59.000000
50%        60.000000
75%        68.000000
max        97.000000
Name: full_line_len, dtype: float64

In [98]:
#formatted_df[ 'full_line_len' ].sort_values()

In [99]:
#formatted_df.sort_values('full_line_len' )['full_line'].tail(50)

In [100]:
# Write the copybook lines verbatim, one per line.  DataFrame.to_csv is not
# used because its CSV quoting would wrap any line containing a comma or a
# double quote in quotes, corrupting the copybook text.
copybook_path = Path( "2023-02-21_IDMS_Copybooks_FARMS_FLAT_STRUCTURE.txt" )
copybook_path.write_text( "\n".join( formatted_df['full_line'] ) + "\n", encoding="utf-8" )

In [101]:
!head 2023-02-21_IDMS_Copybooks_FARMS_FLAT_STRUCTURE.txt

000050 01  ACCT-DATA.
000100   05  LN-NBR                               PIC 9(2).
000200*  05  FD-CDE.
000300*    10  FD-CDE-3.
000400       05  FD-CDE-2                         PIC 9(2).
000500       05  FD-CDE-3RD                       PIC 9(1).
000600     05  FD-CDE-4TH                         PIC 9(1).
000700   05  KIND-CDE-LN                          PIC 9(2).
000800*  05  INT-RATE-NOTE                        PIC 9(2)V9(4).
000900   05  INT-RATE-NOTE-1ST                    PIC V9(6).
