In [10]:
import os, sys
import pandas as pd
import numpy as np
from pprint import pprint
import re

# Raise the display limits so wide/long DataFrames are shown instead of being elided.
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999
# Add ../src (relative to the working directory) to the import search path;
# presumably this is where the `download` module imported below lives — confirm.
sys.path.append('../src')

In [11]:
from download import Config

In [12]:
def convert_to_camelcase(input_string):
    """Convert a CamelCase / mixedCase identifier to lower-case snake_case.

    Note that, despite the function's name, the result is snake_case
    (e.g. 'equivalentClasses' -> 'equivalent_classes').

    input_string: str, the identifier to convert
    returns str, the lower-cased identifier with '_' inserted at word boundaries
    """
    # Two passes, applied in order:
    #   1. put '_' before a capital that starts a capitalised word
    #   2. put '_' between a lower-case letter or digit and a following capital
    converted = input_string
    for boundary_pattern in ('(.)([A-Z][a-z]+)', '([a-z0-9])([A-Z])'):
        converted = re.sub(boundary_pattern, r'\1_\2', converted)
    return converted.lower()


In [13]:
def check_len_strings(df, check='max_length', print_out=False):
    """Return the longest or shortest string length of each text column.

    df: pd.DataFrame to inspect. Columns with object dtype or a pandas string
        dtype are measured; every other column is ignored.
    check: 'max_length' or 'min_length'
    print_out: bool, if True also print '<column>  :  <length>' for each
        measured column
    returns pd.Series named `check`, indexed by column name. The dtype is int
        unless some measured column has no measurable value (it is empty or
        holds only missing values), in which case the dtype is float and that
        column's entry is NaN.
    raises AssertionError if `check` is not one of the two supported values
    """

    assert check in ('max_length', 'min_length'), \
        "check must be 'max_length' or 'min_length'"
    lens_dict = {}

    for col in df.columns:
        ser = df[col]

        # `np.object` no longer exists in current NumPy, so use the pandas
        # dtype predicates; the string-dtype check also picks up columns that
        # pandas stores with a dedicated string dtype instead of object.
        is_text = (pd.api.types.is_object_dtype(ser)
                   or pd.api.types.is_string_dtype(ser))
        if not is_text:
            continue

        col_lens = ser.str.len()
        len_val = col_lens.max() if check == 'max_length' else col_lens.min()

        # Empty / all-missing columns have no extreme (NaN or pd.NA);
        # normalise to a float NaN so it can live in a float Series.
        if pd.isna(len_val):
            len_val = float('nan')

        lens_dict[col] = len_val

        if print_out:
            # int() cannot convert NaN, so only cast real lengths.
            print(col, ' : ', len_val if pd.isna(len_val) else int(len_val))

    # An integer Series cannot hold NaN; fall back to float only when needed
    # so the common case keeps the integer dtype.
    has_missing = any(pd.isna(val) for val in lens_dict.values())
    return pd.Series(lens_dict, dtype=float if has_missing else int, name=check)

In [14]:
# Make series of string length and value
def get_ser_of_extreme_vals(df, agg='max'):
    """
    Collect one extreme per column: the min/max value of each numeric column
    and the min/max string length of each column measured by check_len_strings.

    df: pandas DataFrame
    agg: 'max' or 'min', which extreme to compute

    returns pd.Series named '<agg>_vals', numeric columns first, then the
    string-length entries
    """

    # String lengths first; check_len_strings also rejects an unsupported agg.
    string_lengths = check_len_strings(df, check=f'{agg}_length')

    # Boolean mask, one entry per column, picking out the numeric columns.
    numeric_mask = [pd.api.types.is_numeric_dtype(df[name]) for name in df.columns]
    numeric_part = df.loc[:, numeric_mask]

    if agg == 'max':
        extremes = numeric_part.max()
    elif agg == 'min':
        extremes = numeric_part.min()
    else:
        extremes = numeric_part

    combined = pd.concat([extremes, string_lengths])
    combined.name = f'{agg}_vals'
    return combined

In [15]:
def show_df_dtypes_and_vals(df):
    """
    Build a per-column summary table of a DataFrame.

    df: pandas DataFrame

    returns pd.DataFrame indexed by df's column names (in df's column order)
    with columns 'datatypes', 'min_vals' and 'max_vals', the latter two as
    produced by get_ser_of_extreme_vals
    """

    # Start from the dtype of every column.
    summary = pd.DataFrame(df.dtypes.rename('datatypes'))

    # Attach the 'min_vals' column, then the 'max_vals' column.
    for which in ('min', 'max'):
        extremes = get_ser_of_extreme_vals(df, agg=which)
        summary = summary.join(extremes, how='left')

    # Re-select the rows so they follow df's original column order.
    return summary.loc[df.columns, :]

In [48]:
# Print the Google Sheets URL built from every (name, id) entry in Config.googlesheet_uids.
for k,v in Config.googlesheet_uids.items():
    url = f"https://docs.google.com/spreadsheets/d/{v}"

    print(f"{k}:"  , url)

states: https://docs.google.com/spreadsheets/d/11OkIWLwZYi9xkpuFODAKXQZHEFeMvYCQ8BTfIBKm0Z8
disorders: https://docs.google.com/spreadsheets/d/13a0w3ouXq5sFCa0fBsg9xhWx67RGJJJqLjD_Oy1c3b0
resources: https://docs.google.com/spreadsheets/d/1LeLlrsvBWMYTTIXTVtkynmBzzb0Uzi1OwpRLfyRAwzM
assessments: https://docs.google.com/spreadsheets/d/1VUf3XnieYThY8OA6JWtpNP4zI2xa9xak9LXuyH_PaoE
sensors: https://docs.google.com/spreadsheets/d/1ELaw79zmtmjmrg3J7slyoP-HXdfQRWa1Aqnbp50cmj8


In [43]:
# Build the sheet URL and its xlsx-export URL for the 'resources' entry, then display the sheet URL.
docid = Config.googlesheet_uids['resources']
url = f"https://docs.google.com/spreadsheets/d/{docid}"
# NOTE(review): export_url is not used in any cell visible here — confirm it is needed.
export_url = url + "/export?format=xlsx"
url

'https://docs.google.com/spreadsheets/d/1LeLlrsvBWMYTTIXTVtkynmBzzb0Uzi1OwpRLfyRAwzM'

In [16]:
# Folder components (a tuple) of the raw-data directory; unpacked into os.path.join below.
fldr_path = '..','data','raw'
# fname = 'assessments.xlsx'
sheet_name ='disorders'
fname = 'disorders.xlsx'

# Full relative path to the workbook to open.
fpath = os.path.join(*fldr_path, fname)

In [17]:
# Open the workbook once; later cells parse individual sheets from this handle.
# NOTE(review): no xl.close() appears in the cells visible here.
xl = pd.ExcelFile(fpath)
sheet_names = xl.sheet_names  # see all sheet names

In [18]:
# Map every worksheet name to its list of column labels.
# nrows=0 parses only the header row, so no data rows are loaded.
# A comprehension keeps the loop variable local: the original for-loop used
# `sheet_name` and `df` as loop/scratch names, which overwrote the notebook-level
# `sheet_name` setting and left `df` bound to an empty, header-only frame.
cols_dict = {
    name: xl.parse(name, nrows=0).columns.tolist()
    for name in sheet_names
}



In [19]:
# For each sheet: print a dashed rule and the sheet name, then each column name indented.
for key, col_list in cols_dict.items():
    print('-'*50,'\n', key)
    for col in col_list:
        print(' '*4, col, ' '*4, )
        

-------------------------------------------------- 
 Classes
     ClassName     
     label     
     definition     
     sameAs     
     equivalentClasses     
     equivalentClasses?     
     subClassOf     
-------------------------------------------------- 
 Properties
     property     
     label     
     definition     
     sameAs     
     equivalentProperty     
     subPropertyOf     
     propertyDomain     
     propertyRange     
-------------------------------------------------- 
 signs_symptoms
     index     
     sign_symptom     
     indices_disorder     
     index_reference     
     index_gender     
     sign_symptom_number     
     indices_sign_symptom     
     FIX indices_sign_symptom     
     indices_question     
     concept     
     underlying_behavior_index     
     underlying_behavior_ for _readability     
-------------------------------------------------- 
 examples_signs_symptoms
     index     
     example_sign_symptom     
     indices_sig

In [20]:
# Load the full 'disorders' worksheet and preview its first rows.
df = xl.parse('disorders')
df.head()

Unnamed: 0,S,index_disorder_category,index_disorder_subcategory,index_disorder_subsubcategory,index_disorder_subsubsubcategory,disorder,equivalentClasses,ICD9CM,ICD10CM,index_diagnostic_specifier,index_diagnostic_inclusion_criterion,index_diagnostic_inclusion_criterion2,index_diagnostic_exclusion_criterion,index_diagnostic_exclusion_criterion2,index_severity,note
0,1,1,1.0,,,Intellectual Disability (Intellectual Developm...,SNOMEDCT:110359009,319.0,,,,,,,,
1,2,1,1.0,,,Intellectual Disability (Intellectual Developm...,,,F70,,,,,,1.0,
2,3,1,1.0,,,Intellectual Disability (Intellectual Developm...,,,F71,,,,,,2.0,
3,4,1,1.0,,,Intellectual Disability (Intellectual Developm...,,,F72,,,,,,3.0,
4,5,1,1.0,,,Intellectual Disability (Intellectual Developm...,,,F73,,,,,,4.0,


In [21]:
# Per-column summary: dtype plus min/max value (numeric columns) or min/max string length (object columns).
show_df_dtypes_and_vals(df)

Unnamed: 0,datatypes,min_vals,max_vals
S,int64,1.0,755.0
index_disorder_category,int64,1.0,22.0
index_disorder_subcategory,float64,1.0,29.0
index_disorder_subsubcategory,float64,1.0,25.0
index_disorder_subsubsubcategory,float64,1.0,9.0
disorder,object,4.0,106.0
equivalentClasses,object,13.0,50.0
ICD9CM,object,3.0,6.0
ICD10CM,object,3.0,8.0
index_diagnostic_specifier,float64,1.0,106.0


In [42]:
# NOTE(review): the saved output lists columns (title, link, authors, PubMedID, ...) that do not
# appear in the 'disorders' preview, so `df` was presumably bound to a different sheet when this
# cell last ran — restart the kernel and run all cells to confirm.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1181 entries, 0 to 1180
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   index             1181 non-null   int64  
 1   title             1181 non-null   object 
 2   link              1178 non-null   object 
 3   authors           1166 non-null   object 
 4   entry_date        1181 non-null   object 
 5   last_modified_by  1181 non-null   object 
 6   pubdate           1166 non-null   object 
 7   PubMedID          1100 non-null   float64
 8   cogatlas_node_id  1086 non-null   float64
 9   cogatlas_prop_id  7 non-null      object 
dtypes: float64(2), int64(1), object(7)
memory usage: 92.4+ KB


In [40]:
## Describe the numerical values
# Boolean mask with one entry per column, True where the column has a numeric dtype.
fltr = [pd.api.types.is_numeric_dtype(df[col]) for col in df.columns]
# Summary statistics for the numeric columns only.
df.loc[:, fltr].describe()

Unnamed: 0,index,PubMedID,cogatlas_node_id
count,1181.0,1100.0,1086.0
mean,1436.108383,15608900.0,31427.008287
std,472.577437,6220990.0,2138.476465
min,1.0,514759.0,30412.0
25%,1201.0,11004710.0,30698.25
50%,1496.0,16681110.0,30980.5
75%,1791.0,20686780.0,31326.5
max,2092.0,30577940.0,49716.0
