# core

> These functions are of general use and not specific to any one dataset.

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export


def ensure_dir_path_exists(dir_path = '../ext_data' # Directory path to check
                          ):
    """Create `dir_path`, including any missing parent directories, if it does not already exist.

    Calling this on a directory that already exists is a no-op, as is passing an empty path.
    Relative paths without a leading `./` or `../` (e.g. `'out/plots'` or `'data'`) are supported.
    Raises `FileExistsError` if `dir_path` exists but is not a directory.
    """
    import os

    if not dir_path:
        # `os.makedirs('')` raises; an empty path means there is nothing to create.
        return

    # `exist_ok=True` keeps repeated calls from failing once the directory exists.
    os.makedirs(dir_path, exist_ok=True)

In [None]:
#| export

def find_df_shared_cols(df1,# DataFrame 1 
                        df2 # DataFrame 2
                       ):
    "List the column labels of `df1` that are also column labels of `df2`, in `df1`'s column order"
    df2_cols = list(df2)
    shared_cols = []
    for col in list(df1):
        if col in df2_cols:
            shared_cols.append(col)
    return shared_cols

In [None]:
import pandas as pd

# Both frames carry the column labels 'a' and 'b', so both are reported as shared.
df1 = pd.DataFrame({'a': [0], 'b': ['c']})
df2 = pd.DataFrame({'a': [1], 'b': [0]})

find_df_shared_cols(df1=df1, df2=df2)

['a', 'b']

In [None]:
#| export
def find_df_col_mismatches(df1, # DataFrame 1 
                           df2, # DataFrame 2 
                           showtype = True # Whether the data types should be returned
                          ):
    """Identify columns that match in two dataframes but have mismatched data types.

    Columns are reported in `df2`'s column order. With `showtype` each entry is a
    `(column, df1 dtype, df2 dtype)` tuple; otherwise only the column label is returned.
    """
    df1_cols = list(df1)  # built once instead of once per candidate column
    mismatches = []
    for col in df2:
        if col not in df1_cols:
            continue
        # Read the dtype off each column directly; `np.dtype(series)` fails for
        # pandas extension dtypes (e.g. categorical) that NumPy cannot interpret.
        dtype1 = df1[col].dtype
        dtype2 = df2[col].dtype
        if dtype1 != dtype2:
            mismatches.append((col, dtype1, dtype2) if showtype else col)
    return mismatches



In [None]:
import pandas as pd

# Column 'a' is integer in both frames; column 'b' holds text in df1 but an integer in df2.
df1 = pd.DataFrame({'a': [0], 'b': ['c']})
df2 = pd.DataFrame({'a': [1], 'b': [0]})

find_df_col_mismatches(df1=df1, df2=df2)

[('b', dtype('O'), dtype('int64'))]

In [None]:
#| export
def summarize_col_missing(df):
    """Report the number and percentage of missing values for a DataFrame.

    Returns a DataFrame with one row per column of `df`:
    `Col` (column label), `N_miss` (count of missing values) and `Pr_Comp`
    (percent of non-missing values, rounded to one decimal place).
    For a frame with zero rows `Pr_Comp` is NaN, since the percentage is undefined.
    """
    import pandas as pd

    n_rows = len(df)
    n_miss = df.isna().sum()  # one vectorized pass instead of per-column Python sums

    if n_rows and len(n_miss):
        pr_comp = (100 * (1 - n_miss / n_rows)).round(1).tolist()
    else:
        # No rows (or no columns): avoid dividing by zero and report "undefined".
        pr_comp = [float('nan')] * len(n_miss)

    return pd.DataFrame({'Col'    : list(df),
                         'N_miss' : n_miss.tolist(),
                         'Pr_Comp': pr_comp})

In [None]:
import numpy as np
import pandas as pd

# Column 'a' has one missing value out of six; column 'b' is complete.
df1 = pd.DataFrame({
    'a': [0, 1, 2, 3, 4, np.nan],
    'b': [0, 1, 2, 3, 4, 5],
})

summarize_col_missing(df=df1)

Unnamed: 0,Col,N_miss,Pr_Comp
0,a,1,83.3
1,b,0,100.0


In [None]:
#| export

def sanitize_col(df, # DataFrame
                 col, # Column to be sanitized
                 simple_renames= None, # Replace entries that match the given key with the given value. {'A':'a'}
                 split_renames= None # Split entries that should be separate rows into two. {'a_b':['a','b']}
                ):
    """Simplify renaming entries in a column for standardization.

    Particularly useful for irrigation/management entries which may be intended for humans not computers.
    Simple renames are applied first, in dictionary order, then splits. Each split replaces every
    row whose `col` equals the key with one copy of that row per listed value.
    The input DataFrame is left unmodified; when `split_renames` is given the returned frame has a fresh 0..n-1 index.
    """
    import pandas as pd

    # `None` defaults avoid sharing one mutable dict between calls.
    simple_renames = {} if simple_renames is None else simple_renames
    split_renames = {} if split_renames is None else split_renames

    # Work on a copy so the caller's DataFrame is never altered in place.
    df = df.copy()

    # simple renames
    for old_value, new_value in simple_renames.items():
        df.loc[df[col] == old_value, col] = new_value

    # splits
    # pull out the relevant multiname rows, copy, rename, append
    for old_value, new_values in split_renames.items():
        mask = (df[col] == old_value)
        pieces = [df.loc[~mask, :]]
        if mask.any():
            matched = df.loc[mask, :]
            for new_value in new_values:
                renamed = matched.copy()
                renamed[col] = new_value
                pieces.append(renamed)
        # Append rather than outer-merge: a merge on every column would collapse
        # (or multiply) rows that already equal a renamed row, and re-sort the rows.
        df = pd.concat(pieces, ignore_index=True)

    return df

In [None]:
import pandas as pd

# 'B' is renamed to 'b'; the combined entry 'cd' becomes two rows, 'c' and 'd'.
df1 = pd.DataFrame({'letters': ['a', 'B', 'cd']})

sanitize_col(
    df=df1,
    col='letters',
    simple_renames={'B': 'b'},
    split_renames={'cd': ['c', 'd']},
)

Unnamed: 0,letters
0,a
1,b
2,c
3,d


In [None]:
# utility function to update notebook names?
# rename notebooks
# notebook name, cache path, cache dir name
# also needs to search through and update cache path names for all notebooks that depend on it.

In [None]:
#| hide
# Run nbdev's export step (presumably writes the `#| export` cells to the module named by `default_exp` — confirm against the nbdev docs).
import nbdev; nbdev.nbdev_export()