#### Ashish Mahabal, Caltech/EDRN, August 2024
#### Load and display metadata
Display the first 2 rows of each tab (sheet) of each Excel workbook, and summarize each sheet.

In [1]:
# basic imports
import os
import pandas as pd
from IPython.display import display

In [2]:
# A few functions
def display_dataframe(df, sheet_name):
    """Show a two-row preview of ``df`` under a heading naming the sheet.

    Args:
        df: The dataframe to preview.
        sheet_name: Name of the sheet, used only in the printed heading.
    """
    heading = f"First two rows of sheet: {sheet_name}"
    preview = df.head(2)  # only the first two rows are rendered
    print(heading)
    display(preview)  # rich (formatted) rendering via IPython
    print()  # blank separator line after the preview

def summarize_dataframe(df: pd.DataFrame, sheet_name: str) -> dict:
    """Build a per-column summary of the given dataframe.

    Args:
        df: The dataframe to summarize.
        sheet_name: Label stored under ``'Sheet Name'`` in the result.

    Returns:
        A dict with the keys ``'Sheet Name'``, ``'Number of Rows'``,
        ``'Number of Columns'`` and ``'Column Summary'``. The latter is a
        list with one dict per column holding ``'Column Name'``,
        ``'Data Type'`` and ``'Number of Unique Values'``. Numeric columns
        additionally get ``'Mean'``, ``'Standard Deviation'``, ``'Min'`` and
        ``'Max'``; categorical, object and string columns additionally get
        ``'Most Frequent Value'`` and ``'Frequency'`` (``None`` and ``0``
        when the column has no non-null values).
    """
    summary = {
        'Sheet Name': sheet_name,
        'Number of Rows': df.shape[0],
        'Number of Columns': df.shape[1],
        'Column Summary': [],
    }

    for col in df.columns:
        series = df[col]
        dtype = series.dtype
        col_summary = {
            'Column Name': col,
            'Data Type': dtype,
            'Number of Unique Values': series.nunique(),
        }

        if pd.api.types.is_numeric_dtype(series):
            col_summary['Mean'] = series.mean()
            col_summary['Standard Deviation'] = series.std()
            col_summary['Min'] = series.min()
            col_summary['Max'] = series.max()
        elif (
            # isinstance check replaces the deprecated is_categorical_dtype.
            isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_object_dtype(series)
            # Dedicated string dtypes are not "object" but are text as well.
            or pd.api.types.is_string_dtype(series)
        ):
            modes = series.mode()
            if modes.empty:
                # mode() drops nulls, so an all-null column yields nothing;
                # indexing with .iloc[0] would raise IndexError here.
                col_summary['Most Frequent Value'] = None
                col_summary['Frequency'] = 0
            else:
                # On ties mode() returns several values; keep the first.
                col_summary['Most Frequent Value'] = modes.iloc[0]
                col_summary['Frequency'] = series.value_counts().iloc[0]

        summary['Column Summary'].append(col_summary)

    return summary

def print_sheet_summary(sheet_summary):
    """Write one sheet's summary to stdout as an indented report.

    Args:
        sheet_summary: Dict holding 'Number of Rows', 'Number of Columns'
            and 'Column Summary' (a list of per-column dicts). Numeric
            statistics and most-frequent-value details are printed only
            for columns whose dict contains them.
    """
    # Sheet-level totals, indented four spaces.
    for label in ('Number of Rows', 'Number of Columns'):
        print(f"    {label}: {sheet_summary[label]}")

    for entry in sheet_summary['Column Summary']:
        print(f"      Column: {entry['Column Name']}")

        # Fields every column carries come first.
        labels = ['Data Type', 'Number of Unique Values']
        # Optional groups are detected by the presence of their first key.
        if 'Mean' in entry:
            labels += ['Mean', 'Standard Deviation', 'Min', 'Max']
        if 'Most Frequent Value' in entry:
            labels += ['Most Frequent Value', 'Frequency']

        for label in labels:
            print(f"        {label}: {entry[label]}")

    print()  # trailing blank line closes the sheet's report

def summarize_excel_files(directory_path):
    """Preview and summarize every sheet of every Excel file in a directory.

    For each ``.xlsx``/``.xls`` file directly inside ``directory_path``
    (matched case-insensitively, processed in sorted name order), every
    sheet is read, its first two rows are displayed and its summary is
    printed.

    Args:
        directory_path: Path of the directory to scan (not recursive).

    Returns:
        None. All results are printed/displayed.
    """
    excel_files = sorted(
        name
        for name in os.listdir(directory_path)
        if name.lower().endswith(('.xlsx', '.xls'))
        # "~$..." files are lock files Excel creates next to an open
        # workbook; they are not readable workbooks themselves.
        and not name.startswith('~$')
    )

    for excel_file in excel_files:
        print(f"Processing file: {excel_file}")
        file_path = os.path.join(directory_path, excel_file)

        # The context manager closes the workbook handle even if a sheet
        # fails to parse.
        with pd.ExcelFile(file_path) as xls:
            for sheet_name in xls.sheet_names:
                print(f"  Processing sheet: {sheet_name}")
                df = pd.read_excel(xls, sheet_name=sheet_name)
                display_dataframe(df, sheet_name)  # Display the first two rows
                sheet_summary = summarize_dataframe(df, sheet_name)
                print_sheet_summary(sheet_summary)  # Print the summary for the sheet

        print()  # Add a space between processing different files

#### Get the summaries for the three metadata sets

In [3]:
# Preview and summarize every sheet of each Excel file in the 'Collection1' directory.
summarize_excel_files('Collection1')

Processing file: GE Data Dictionary 447.xlsx
  Processing sheet: Study_Log
First two rows of sheet: Study_Log


Unnamed: 0,Tab,Column Name,Description,Key
0,A,Study,Patient anonymized study ID,C#### - Cases ...
1,B,Age,Age of patient at time of mammogram,



    Number of Rows: 36
    Number of Columns: 4
      Column: Tab
        Data Type: object
        Number of Unique Values: 36
        Most Frequent Value: A
        Frequency: 1
      Column: Column Name
        Data Type: object
        Number of Unique Values: 36
        Most Frequent Value: Age
        Frequency: 1
      Column: Description
        Data Type: object
        Number of Unique Values: 36
        Most Frequent Value: Age of patient at time of mammogram
        Frequency: 1
      Column: Key
        Data Type: object
        Number of Unique Values: 21
        Most Frequent Value: automatically calculated
        Frequency: 4


Processing file: An Automated System for Breast Cancer Biomarker Analysis Log 2024.xls
  Processing sheet: Definitions
First two rows of sheet: Definitions


Unnamed: 0,Tab,Column Name,Description,Key
0,A,Study,Patient anonymized study ID,C#### - Cases ...
1,B,Age,Age of patient at time of mammogram,



    Number of Rows: 36
    Number of Columns: 4
      Column: Tab
        Data Type: object
        Number of Unique Values: 36
        Most Frequent Value: A
        Frequency: 1
      Column: Column Name
        Data Type: object
        Number of Unique Values: 36
        Most Frequent Value: Age
        Frequency: 1
      Column: Description
        Data Type: object
        Number of Unique Values: 36
        Most Frequent Value: Age of patient at time of mammogram
        Frequency: 1
      Column: Key
        Data Type: object
        Number of Unique Values: 21
        Most Frequent Value: automatically calculated
        Frequency: 4

  Processing sheet: Data
First two rows of sheet: Data


Unnamed: 0,Study,Age,Side,Histology,ER,PR,HER,Scr_Hist,HRT,HRT_Duration,...,weight,height,height_in,height_m,weight_kg,BMI,Ethnicity,Race,brstsz ok/ge,other
0,C0001,48,L,1.0,1.0,1.0,-999.0,1,0,,...,127,"5' 7""",67,1.7018,57.606231,19.890808,nh,w,1.0,calcs since '04
1,N0001,48,,,,,,1,0,,...,125,"5' 7""",67,1.7018,56.699046,19.577567,nh,w,,



    Number of Rows: 360
    Number of Columns: 33
      Column: Study
        Data Type: object
        Number of Unique Values: 360
        Most Frequent Value: C0001
        Frequency: 1
      Column: Age
        Data Type: int64
        Number of Unique Values: 49
        Mean: 58.56388888888889
        Standard Deviation: 10.414064113136735
        Min: 31
        Max: 83
      Column: Side
        Data Type: object
        Number of Unique Values: 4
        Most Frequent Value: L
        Frequency: 97
      Column: Histology
        Data Type: object
        Number of Unique Values: 12
        Most Frequent Value: 1, 2
        Frequency: 64
      Column: ER
        Data Type: float64
        Number of Unique Values: 3
        Mean: -27.1
        Standard Deviation: 164.74097155685718
        Min: -999.0
        Max: 1.0
      Column: PR
        Data Type: float64
        Number of Unique Values: 3
        Mean: -32.766666666666666
        Standard Deviation: 179.92739379846014
  

In [4]:
# Preview and summarize every sheet of each Excel file in the 'Collection2' directory.
summarize_excel_files('Collection2')

Processing file: Automated Quantitative Measures of Breast Density Hormone Log.xlsx
  Processing sheet: Definitions
First two rows of sheet: Definitions


Unnamed: 0,Tab,Column,Descriptions,Key
0,A,Study,Study ID/Number,C#### - Cases ...
1,B,Grade_Flag,Indicates if there are multiple values or not ...,0 - single value 1 - mult...



    Number of Rows: 40
    Number of Columns: 4
      Column: Tab
        Data Type: object
        Number of Unique Values: 40
        Most Frequent Value: A
        Frequency: 1
      Column: Column
        Data Type: object
        Number of Unique Values: 40
        Most Frequent Value: Avg_#_CEP17_copies_per_cell
        Frequency: 1
      Column: Descriptions
        Data Type: object
        Number of Unique Values: 30
        Most Frequent Value: Found in molecular pathology report
        Frequency: 4
      Column: Key
        Data Type: object
        Number of Unique Values: 18
        Most Frequent Value: 0 - single value                      1 - multiple values
        Frequency: 7

  Processing sheet: Data
First two rows of sheet: Data


Unnamed: 0,Study,Grade_Flag,Histologic_Grade,Grade_High,Grade_Low,Oncotype,Primary_Tumor,Regional_Lymph_Nodes,Distant_Metastasis_,Pathologic_Stage,...,er/pr/her,her-2,her-2_IHC,Staining_Intensity_HER,Result_Methodology_HER,her-2_Dual_ISH_(DISH),HER2/CEP 17 ratio,Tumor_Cells_Analyzed,Avg_#_HER2/neu_copies_per_cell,Avg_#_CEP17_copies_per_cell
0,C0250,0,2,,,17,T1b,pN0(i-),M0,IA,...,+/+/-,-1,-1,1+,-999,not-amplified,1.01,30,2.04,2.01
1,C0251,0,2,,,17,T1c,N0,M0,IA,...,+/+/-,-1,-1,0+,-999,not-amplified,1.24,20,2.3,1.85



    Number of Rows: 319
    Number of Columns: 40
      Column: Study
        Data Type: object
        Number of Unique Values: 319
        Most Frequent Value: C0250
        Frequency: 1
      Column: Grade_Flag
        Data Type: int64
        Number of Unique Values: 2
        Mean: 0.034482758620689655
        Standard Deviation: 0.18275227807622937
        Min: 0
        Max: 1
      Column: Histologic_Grade
        Data Type: object
        Number of Unique Values: 6
        Most Frequent Value: 2
        Frequency: 145
      Column: Grade_High
        Data Type: float64
        Number of Unique Values: 2
        Mean: 2.727272727272727
        Standard Deviation: 0.46709936649691375
        Min: 2.0
        Max: 3.0
      Column: Grade_Low
        Data Type: float64
        Number of Unique Values: 2
        Mean: 1.7272727272727273
        Standard Deviation: 0.46709936649691375
        Min: 1.0
        Max: 2.0
      Column: Oncotype
        Data Type: object
        Number 

Unnamed: 0,Tab,Column Name,Description,Key
0,A,Study,Study ID/Number,C#### - Cases ...
1,B,Age,Age of patient at time of mammogram (cases and...,



    Number of Rows: 52
    Number of Columns: 4
      Column: Tab
        Data Type: object
        Number of Unique Values: 52
        Most Frequent Value: A
        Frequency: 1
      Column: Column Name
        Data Type: object
        Number of Unique Values: 52
        Most Frequent Value: # MM Lt Br
        Frequency: 1
      Column: Description
        Data Type: object
        Number of Unique Values: 52
        Most Frequent Value: Age of patient at time of mammogram (cases and controls matched +/- 2 years)
        Frequency: 1
      Column: Key
        Data Type: object
        Number of Unique Values: 22
        Most Frequent Value: 1 - yes                                                 0 - no
        Frequency: 10

  Processing sheet: Data
First two rows of sheet: Data


Unnamed: 0,Study,Age,Side,Histology,ER,PR,HER,Grp,HRT,HRT_Duration,...,# MM Rt Br,# MM Lt Br,PM Rt Br,PM Lt Br,# PM Rt Br,# PM Lt Br,Weight_kg,Height_in,Height_m,BMI
0,C0250,43,L,"1, 2",1.0,1.0,-1.0,1,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,78.017888,63,1.6002,30.46812
1,N0250,45,,,,,,1,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,102.058283,67,1.7018,35.23962



    Number of Rows: 638
    Number of Columns: 50
      Column: Study
        Data Type: object
        Number of Unique Values: 638
        Most Frequent Value: C0250
        Frequency: 1
      Column: Age
        Data Type: object
        Number of Unique Values: 59
        Most Frequent Value: 53
        Frequency: 28
      Column: Side
        Data Type: object
        Number of Unique Values: 2
        Most Frequent Value: R
        Frequency: 164
      Column: Histology
        Data Type: object
        Number of Unique Values: 12
        Most Frequent Value: 2
        Frequency: 131
      Column: ER
        Data Type: float64
        Number of Unique Values: 3
        Mean: -14.93730407523511
        Standard Deviation: 124.37444410395652
        Min: -999.0
        Max: 1.0
      Column: PR
        Data Type: float64
        Number of Unique Values: 3
        Mean: -15.100313479623825
        Standard Deviation: 124.35469647139458
        Min: -999.0
        Max: 1.0
      Col

Unnamed: 0,Tab,Column Name,Description,Key
0,A,Study,Study ID/Number,C#### - Cases ...
1,B,Age,Age of patient at time of mammogram (cases and...,



    Number of Rows: 49
    Number of Columns: 4
      Column: Tab
        Data Type: object
        Number of Unique Values: 49
        Most Frequent Value: A
        Frequency: 1
      Column: Column Name
        Data Type: object
        Number of Unique Values: 49
        Most Frequent Value: # MM Lt Br
        Frequency: 1
      Column: Description
        Data Type: object
        Number of Unique Values: 49
        Most Frequent Value: Age of patient at time of mammogram (cases and controls matched +/- 2 years)
        Frequency: 1
      Column: Key
        Data Type: object
        Number of Unique Values: 21
        Most Frequent Value: 1 - yes                                                 0 - no
        Frequency: 10

  Processing sheet: Birads_Log
First two rows of sheet: Birads_Log


Unnamed: 0,Tab,Column,Description,Key
0,A,Study,Study ID/Number,C##### - Case
1,B,MG_Date,Date of Study Mammogram,



    Number of Rows: 24
    Number of Columns: 4
      Column: Tab
        Data Type: object
        Number of Unique Values: 24
        Most Frequent Value: A
        Frequency: 1
      Column: Column
        Data Type: object
        Number of Unique Values: 24
        Most Frequent Value: Arch_Dist
        Frequency: 1
      Column: Description
        Data Type: object
        Number of Unique Values: 24
        Most Frequent Value: Are there any associated features?
        Frequency: 1
      Column: Key
        Data Type: object
        Number of Unique Values: 21
        Most Frequent Value: 0 - Calcification is not a benign type                                         1 - Calcification is a benign type
        Frequency: 1

  Processing sheet: Hormone_Log
First two rows of sheet: Hormone_Log


Unnamed: 0,Tab,Column,Descriptions,Key
0,A,Study,Study ID/Number,C#### - Cases ...
1,B,Grade_Flag,Indicates if there are multiple values or not ...,0 - single value 1 - mult...



    Number of Rows: 40
    Number of Columns: 4
      Column: Tab
        Data Type: object
        Number of Unique Values: 40
        Most Frequent Value: A
        Frequency: 1
      Column: Column
        Data Type: object
        Number of Unique Values: 40
        Most Frequent Value: Avg_#_CEP17_copies_per_cell
        Frequency: 1
      Column: Descriptions
        Data Type: object
        Number of Unique Values: 30
        Most Frequent Value: Found in molecular pathology report
        Frequency: 4
      Column: Key
        Data Type: object
        Number of Unique Values: 18
        Most Frequent Value: 0 - single value                      1 - multiple values
        Frequency: 7


Processing file: Automated Quantitative Measures of Breast Density Birads Log.xlsx
  Processing sheet: Definitions
First two rows of sheet: Definitions


Unnamed: 0,BIRADS Sheet,Unnamed: 1,Unnamed: 2,Unnamed: 3
0,Tab,Column,Description,Key
1,A,Study,Study ID/Number,C##### - Case



    Number of Rows: 48
    Number of Columns: 4
      Column: BIRADS Sheet
        Data Type: object
        Number of Unique Values: 26
        Most Frequent Value: A
        Frequency: 2
      Column: Unnamed: 1
        Data Type: object
        Number of Unique Values: 42
        Most Frequent Value: Column
        Frequency: 2
      Column: Unnamed: 2
        Data Type: object
        Number of Unique Values: 42
        Most Frequent Value: Date of Study Mammogram
        Frequency: 2
      Column: Unnamed: 3
        Data Type: object
        Number of Unique Values: 24
        Most Frequent Value: 1 - yes                                                             0 - no
        Frequency: 9

  Processing sheet: Data
First two rows of sheet: Data


Unnamed: 0,Study,MG_Date,Number,BR_Comp,Masses,Shape,Margin,Density,Calcifications,Typically_Benign,...,Arch_Dist,Asym,IMLN,SL,SDD,Assoc_Feats,Laterality,Quadrant,Depth,Distance
0,C0250,2013-04-18,1,C,1,3.0,5.0,-999.0,0,,...,0,0,0,0,0,0,L,1,2,4.5
1,C0250,2013-04-18,2,C,0,,,,0,,...,0,3,0,0,0,0,R,3,-999,11.0



    Number of Rows: 571
    Number of Columns: 24
      Column: Study
        Data Type: object
        Number of Unique Values: 318
        Most Frequent Value: C0292
        Frequency: 6
      Column: MG_Date
        Data Type: datetime64[ns]
        Number of Unique Values: 260
      Column: Number
        Data Type: int64
        Number of Unique Values: 6
        Mean: 1.6760070052539404
        Standard Deviation: 0.9262841214417122
        Min: 1
        Max: 6
      Column: BR_Comp
        Data Type: object
        Number of Unique Values: 5
        Most Frequent Value: B
        Frequency: 247
      Column: Masses
        Data Type: int64
        Number of Unique Values: 2
        Mean: 0.35376532399299476
        Standard Deviation: 0.4785566833892509
        Min: 0
        Max: 1
      Column: Shape
        Data Type: float64
        Number of Unique Values: 5
        Mean: -295.1930693069307
        Standard Deviation: 458.63095793199244
        Min: -999.0
        Max: 4.

In [5]:
# Preview and summarize every sheet of each Excel file in the 'Collection3' directory.
summarize_excel_files('Collection3')

Processing file: U01 Hormone Receptors.xlsx
  Processing sheet: Definitions
First two rows of sheet: Definitions


Unnamed: 0,Tab,Column,Descriptions,Key
0,A,Study,Study ID/Number,C#### - Cases ...
1,B,Loc,,



    Number of Rows: 41
    Number of Columns: 4
      Column: Tab
        Data Type: object
        Number of Unique Values: 41
        Most Frequent Value: A
        Frequency: 1
      Column: Column
        Data Type: object
        Number of Unique Values: 41
        Most Frequent Value: Avg_#_CEP17_copies_per_cell
        Frequency: 1
      Column: Descriptions
        Data Type: object
        Number of Unique Values: 30
        Most Frequent Value: Found in molecular pathology report
        Frequency: 4
      Column: Key
        Data Type: object
        Number of Unique Values: 18
        Most Frequent Value: 0 - single value                      1 - multiple values
        Frequency: 7

  Processing sheet: Study Log
First two rows of sheet: Study Log


Unnamed: 0,Study,Loc,Grade_Flag,Histologic_Grade,Grade_High,Grade_Low,Oncotype,Primary_Tumor,Regional_Lymph_Nodes,Distant_Metastasis,...,er/pr/her,her-2,her-2_IHC,Staining_Intensity_HER,Result_Methodology_HER,her2_Dual_ISH_(DISH),HER2/CEP_17_ratio,Tumor_Cells_Analyzed,Avg_#_HER2/neu_copies_per_cell,Avg_#_CEP17_copies_per_cell
0,C0627,1,0,2,,,13,T1c,pN0,M0,...,+/+/-,-1,-1,1+,-999,-999,-999.0,-999,-999.0,-999.0
1,C0628,1,0,2,,,-999,pT2,pN1,M0,...,+/+/-,-1,-1,1+,Manual morphometric analysis,not amplified,1.51,20,2.5,1.6



    Number of Rows: 394
    Number of Columns: 41
      Column: Study
        Data Type: object
        Number of Unique Values: 380
        Most Frequent Value: C0769
        Frequency: 2
      Column: Loc
        Data Type: int64
        Number of Unique Values: 2
        Mean: 1.0786802030456852
        Standard Deviation: 0.26958130624319276
        Min: 1
        Max: 2
      Column: Grade_Flag
        Data Type: int64
        Number of Unique Values: 3
        Mean: -2.3984771573604062
        Standard Deviation: 50.336992581028774
        Min: -999
        Max: 1
      Column: Histologic_Grade
        Data Type: object
        Number of Unique Values: 7
        Most Frequent Value: 2
        Frequency: 159
      Column: Grade_High
        Data Type: float64
        Number of Unique Values: 3
        Mean: -16.41509433962264
        Standard Deviation: 137.56487056633776
        Min: -999.0
        Max: 3.0
      Column: Grade_Low
        Data Type: float64
        Number of Uni

Unnamed: 0,Study,Loc,Grade_Flag,Histologic_Grade,Grade_High,Grade_Low,Oncotype,Primary_Tumor,Regional_Lymph_Nodes,Distant_Metastasis,...,er/pr/her,her-2,her-2_IHC,Staining_Intensity_HER,Result_Methodology_HER,her-2_Dual_ISH_(DISH),HER2/CEP_17_ratio,Tumor_Cells_Analyzed,Avg_#_HER2/neu_copies_per_cell,Avg_#_CEP17_copies_per_cell
0,"C0651 - OFF STUDY, BILATERAL BR CA",,,,,,,,,,...,,,,,,,,,,
1,C0718-OFF STUDY NO CONSENT FORM,,,,,,,,,,...,,,,,,,,,,



    Number of Rows: 12
    Number of Columns: 41
      Column: Study
        Data Type: object
        Number of Unique Values: 11
        Most Frequent Value: C0898
        Frequency: 2
      Column: Loc
        Data Type: float64
        Number of Unique Values: 1
        Mean: 1.0
        Standard Deviation: 0.0
        Min: 1.0
        Max: 1.0
      Column: Grade_Flag
        Data Type: float64
        Number of Unique Values: 2
        Mean: 0.25
        Standard Deviation: 0.4629100498862757
        Min: 0.0
        Max: 1.0
      Column: Histologic_Grade
        Data Type: object
        Number of Unique Values: 5
        Most Frequent Value: 2
        Frequency: 3
      Column: Grade_High
        Data Type: float64
        Number of Unique Values: 2
        Mean: 2.5
        Standard Deviation: 0.7071067811865476
        Min: 2.0
        Max: 3.0
      Column: Grade_Low
        Data Type: float64
        Number of Unique Values: 2
        Mean: 1.5
        Standard Deviation:

  warn(f"Unable to sort modes: {err}")


  Processing sheet: Definitions
First two rows of sheet: Definitions


Unnamed: 0,BIRADS Sheet,Unnamed: 1,Unnamed: 2,Unnamed: 3
0,Tab,Column,Description,Key
1,A,Study,Study ID/Number,C##### - Case



    Number of Rows: 48
    Number of Columns: 4
      Column: BIRADS Sheet
        Data Type: object
        Number of Unique Values: 26
        Most Frequent Value: A
        Frequency: 2
      Column: Unnamed: 1
        Data Type: object
        Number of Unique Values: 42
        Most Frequent Value: MG_Date
        Frequency: 2
      Column: Unnamed: 2
        Data Type: object
        Number of Unique Values: 42
        Most Frequent Value: Date of Study Mammogram
        Frequency: 2
      Column: Unnamed: 3
        Data Type: object
        Number of Unique Values: 24
        Most Frequent Value: 1 - yes                                                             0 - no
        Frequency: 9

  Processing sheet: BIRADS
First two rows of sheet: BIRADS


Unnamed: 0,Study,MG_Date,Number,BR_Comp,Masses,Shape,Margin,Density,Calcs,Typ_Benign,...,Arch_Dist,Asym,IMLN,SL,SDD,Assoc_Feats,Laterality,Quadrant,Depth,Distance
0,C0627,2016-11-22,1,C,0,,,,0,,...,1,0,0,0,0,0,L,1,-999,6
1,C0628,2016-12-06,1,B,1,-999.0,4.0,-999.0,0,,...,0,0,0,0,0,1,L,5,-999,-999



    Number of Rows: 722
    Number of Columns: 24
      Column: Study
        Data Type: object
        Number of Unique Values: 379
        Most Frequent Value: C0682
        Frequency: 6
      Column: MG_Date
        Data Type: datetime64[ns]
        Number of Unique Values: 270
      Column: Number
        Data Type: int64
        Number of Unique Values: 6
        Mean: 1.760387811634349
        Standard Deviation: 1.0277340596586464
        Min: 1
        Max: 6
      Column: BR_Comp
        Data Type: object
        Number of Unique Values: 4
        Most Frequent Value: B
        Frequency: 323
      Column: Masses
        Data Type: int64
        Number of Unique Values: 2
        Mean: 0.4487534626038781
        Standard Deviation: 0.497711654064203
        Min: 0
        Max: 1
      Column: Shape
        Data Type: object
        Number of Unique Values: 5
        Most Frequent Value: 3
        Frequency: 156
      Column: Margin
        Data Type: float64
        Number of

Unnamed: 0,Study,Initial Assessment,MG_Date,Recall,Same_Day_Img,Addl_Mammo,Addl_Mammo_Date,Addl_US,Addl_US_Date,Addl_MRI,Addl_MRI_Date,SC,MV,SM,XCC,CV,TV,Other,Final_Tech
0,C0627,2013-08-20,2016-11-22,0.0,1.0,0.0,NaT,1.0,2016-09-08,1.0,2016-09-14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,MG
1,C0628,2013-03-19,2016-12-06,0.0,0.0,0.0,NaT,1.0,2016-11-17,1.0,2016-11-29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,MG



    Number of Rows: 309
    Number of Columns: 19
      Column: Study
        Data Type: object
        Number of Unique Values: 309
        Most Frequent Value: C0627
        Frequency: 1
      Column: Initial Assessment
        Data Type: datetime64[ns]
        Number of Unique Values: 10
      Column: MG_Date
        Data Type: datetime64[ns]
        Number of Unique Values: 227
      Column: Recall
        Data Type: float64
        Number of Unique Values: 2
        Mean: 0.2
        Standard Deviation: 0.42163702135578396
        Min: 0.0
        Max: 1.0
      Column: Same_Day_Img
        Data Type: float64
        Number of Unique Values: 2
        Mean: 0.3076923076923077
        Standard Deviation: 0.48038446141526137
        Min: 0.0
        Max: 1.0
      Column: Addl_Mammo
        Data Type: float64
        Number of Unique Values: 2
        Mean: 0.1
        Standard Deviation: 0.31622776601683794
        Min: 0.0
        Max: 1.0
      Column: Addl_Mammo_Date
        Da

Unnamed: 0,Study,MG_Date,Number,BR_Comp,Masses,Shape,Margin,Density,Calcifications,Typically_Benign,...,Arch_Dist,Asym,IMLN,SL,SDD,Assoc_Feats,Laterality,Quadrant,Depth,Distance
0,C0651,2016-12-14,1,C,0,,,,1,0.0,...,0,0,0,0,0,"6, 7",R,2,-999,5
1,C0651,2016-12-14,2,C,1,2.0,-999.0,-999.0,0,,...,0,0,0,0,0,0,R,1,-999,1



    Number of Rows: 26
    Number of Columns: 24
      Column: Study
        Data Type: object
        Number of Unique Values: 9
        Most Frequent Value: C0651
        Frequency: 5
      Column: MG_Date
        Data Type: datetime64[ns]
        Number of Unique Values: 9
      Column: Number
        Data Type: int64
        Number of Unique Values: 5
        Mean: 2.230769230769231
        Standard Deviation: 1.2428255648381974
        Min: 1
        Max: 5
      Column: BR_Comp
        Data Type: object
        Number of Unique Values: 2
        Most Frequent Value: C
        Frequency: 21
      Column: Masses
        Data Type: int64
        Number of Unique Values: 2
        Mean: 0.23076923076923078
        Standard Deviation: 0.4296689244236597
        Min: 0
        Max: 1
      Column: Shape
        Data Type: float64
        Number of Unique Values: 3
        Mean: -331.3333333333333
        Standard Deviation: 517.1725695226562
        Min: -999.0
        Max: 3.0
      

Unnamed: 0,Tab,Column Name,Description,Key,Unnamed: 4
0,A,Study,Study ID/Number,C#### - Cases ...,
1,B,Age,Age of patient at time of mammogram (cases and...,,



    Number of Rows: 37
    Number of Columns: 5
      Column: Tab
        Data Type: object
        Number of Unique Values: 37
        Most Frequent Value: A
        Frequency: 1
      Column: Column Name
        Data Type: object
        Number of Unique Values: 37
        Most Frequent Value: Age
        Frequency: 1
      Column: Description
        Data Type: object
        Number of Unique Values: 37
        Most Frequent Value: Age of patient at time of mammogram (cases and controls matched +/- 2 years)
        Frequency: 1
      Column: Key
        Data Type: object
        Number of Unique Values: 20
        Most Frequent Value: 1 - yes                                                 0 - no
        Frequency: 5
      Column: Unnamed: 4
        Data Type: object
        Number of Unique Values: 1
        Most Frequent Value: *"(1) case subjects that have been in screening for at least two years before the detection of breast disease (by any means).  The two year screening crit

Unnamed: 0,Study,Age,Side,Histology,ER,PR,HER,Grp,HRT,HRT_Duration,...,weight,height,ethnicity,race,hx_other_cancer,MRI,Weight_kg,Height_in,Height_m,BMI
0,C0627,62,L,2.0,1.0,1.0,-1.0,1,0,,...,171,"5'7""",nh,w,0,1,77.5643,67,1.7018,26.7821
1,N0627,63,,,,,,1,0,,...,145,"5'6""",nh,b,0,0,65.7709,66,1.6764,23.4034



    Number of Rows: 752
    Number of Columns: 35
      Column: Study
        Data Type: object
        Number of Unique Values: 727
        Most Frequent Value: -
        Frequency: 26
      Column: Age
        Data Type: object
        Number of Unique Values: 61
        Most Frequent Value: 62
        Frequency: 31
      Column: Side
        Data Type: object
        Number of Unique Values: 3
        Most Frequent Value: L
        Frequency: 201
      Column: Histology
        Data Type: object
        Number of Unique Values: 16
        Most Frequent Value: 2
        Frequency: 162
      Column: ER
        Data Type: object
        Number of Unique Values: 5
        Most Frequent Value: 1
        Frequency: 307
      Column: PR
        Data Type: object
        Number of Unique Values: 6
        Most Frequent Value: 1
        Frequency: 280
      Column: HER
        Data Type: object
        Number of Unique Values: 7
        Most Frequent Value: -1
        Frequency: 228
      C

Unnamed: 0,Tab,Column Name,Description,Key
0,A,Study,Study ID/Number,C#### - Cases ...
1,B,Age,Age of patient at time of mammogram (cases and...,



    Number of Rows: 34
    Number of Columns: 4
      Column: Tab
        Data Type: object
        Number of Unique Values: 34
        Most Frequent Value: A
        Frequency: 1
      Column: Column Name
        Data Type: object
        Number of Unique Values: 34
        Most Frequent Value: Age
        Frequency: 1
      Column: Description
        Data Type: object
        Number of Unique Values: 34
        Most Frequent Value: Age of patient at time of mammogram (cases and controls matched +/- 2 years)
        Frequency: 1
      Column: Key
        Data Type: object
        Number of Unique Values: 19
        Most Frequent Value: 1 - yes                                                 0 - no
        Frequency: 5

  Processing sheet: Birads_Log
First two rows of sheet: Birads_Log


Unnamed: 0,Tab,Column,Description,Key
0,A,Study,Study ID/Number,C##### - Case
1,B,MG_Date,Date of Study Mammogram,



    Number of Rows: 24
    Number of Columns: 4
      Column: Tab
        Data Type: object
        Number of Unique Values: 24
        Most Frequent Value: A
        Frequency: 1
      Column: Column
        Data Type: object
        Number of Unique Values: 24
        Most Frequent Value: Arch_Dist
        Frequency: 1
      Column: Description
        Data Type: object
        Number of Unique Values: 24
        Most Frequent Value: Are there any associated features?
        Frequency: 1
      Column: Key
        Data Type: object
        Number of Unique Values: 21
        Most Frequent Value: 0 - Calcification is not a benign type                                         1 - Calcification is a benign type
        Frequency: 1

  Processing sheet: Hormone_Log
First two rows of sheet: Hormone_Log


Unnamed: 0,Tab,Column,Descriptions,Key
0,A,Study,Study ID/Number,C#### - Cases ...
1,B,Loc,,



    Number of Rows: 41
    Number of Columns: 4
      Column: Tab
        Data Type: object
        Number of Unique Values: 41
        Most Frequent Value: A
        Frequency: 1
      Column: Column
        Data Type: object
        Number of Unique Values: 41
        Most Frequent Value: Avg_#_CEP17_copies_per_cell
        Frequency: 1
      Column: Descriptions
        Data Type: object
        Number of Unique Values: 30
        Most Frequent Value: Found in molecular pathology report
        Frequency: 4
      Column: Key
        Data Type: object
        Number of Unique Values: 18
        Most Frequent Value: 0 - single value                      1 - multiple values
        Frequency: 7


