In [143]:
def overview_data(df):
    '''
    Return a summary table describing every column of a dataframe.

    The result has one row per column of ``df`` (in the original column order)
    and the following columns:
        Column           : the column label
        type             : the column dtype
        n_unique         : number of distinct non-null values
        examples         : up to the first 10 unique values
        n_zeros          : number of zeros
        zero_portion     : n_zeros divided by the column length
        n_negatives      : number of negative values
        negative_portion : portion reported by summarise_negative
        abs_sum_portion  : absolute sum of negatives compared to sum of positives

    The zero/negative statistics are only computed for integer and float
    columns; for every other dtype those cells stay ``None``.

    Required functions: summarise_zero, summarise_negative
    Required packages: pandas

    Argument:
        df: a pandas dataframe
    '''
    import pandas as pd
    from pandas.api.types import is_float_dtype, is_integer_dtype

    df_summary = pd.DataFrame(data=df.columns, columns=['Column'])

    # Initialise every statistic with None so the columns are object dtype
    # and each cell can hold an arbitrary object (dtype, array, number).
    stat_columns = (
        'type', 'n_unique', 'examples',
        'n_zeros', 'zero_portion',
        'n_negatives', 'negative_portion', 'abs_sum_portion',
    )
    for name in stat_columns:
        df_summary[name] = None

    # Columns are accessed by position so duplicated labels are still handled.
    for i in range(df.shape[1]):
        column = df.iloc[:, i]
        col_type = column.dtype

        # .at writes straight into df_summary; chained assignment such as
        # df_summary.loc[:, 'type'][i] = ... may only modify a temporary copy.
        df_summary.at[i, 'type'] = col_type
        df_summary.at[i, 'n_unique'] = column.nunique()
        df_summary.at[i, 'examples'] = pd.unique(column)[:10]

        # Covers every integer/float width (and nullable variants), not only
        # the platform default int and float64; bool and object are excluded.
        if is_integer_dtype(col_type) or is_float_dtype(col_type):
            n_zeros, zero_portion = summarise_zero(column)
            df_summary.at[i, 'n_zeros'] = n_zeros
            df_summary.at[i, 'zero_portion'] = zero_portion

            n_neg, neg_portion, abs_sum_portion = summarise_negative(column)
            df_summary.at[i, 'n_negatives'] = n_neg
            df_summary.at[i, 'negative_portion'] = neg_portion
            df_summary.at[i, 'abs_sum_portion'] = abs_sum_portion

    return df_summary

In [44]:
def summarise_zero(df_column):
    '''
    Return a pandas series with the number of zeros in a chosen column
    and the portion of the column they represent.

    Argument:
        df_column: a pandas dataframe column, or a pandas series

    Return:
        pandas series indexed by 'n_zeros' and 'zero_portion'.
        'zero_portion' is n_zeros divided by the full column length
        (missing values count towards the length); it is NaN for an
        empty column.
    '''
    import pandas as pd

    length = len(df_column)
    # Vectorised count; the builtin sum() would iterate element by element.
    n_zeros = int((df_column == 0).sum())
    # Avoid ZeroDivisionError on an empty column: the portion is undefined.
    zero_portion = n_zeros / length if length else float('nan')
    return pd.Series([n_zeros, zero_portion], index=['n_zeros', 'zero_portion'])

In [134]:
def summarise_negative(df_column):
    '''
    Return a pandas series describing the negative figures in a chosen column.

    Argument:
        df_column: a pandas dataframe column, or a pandas series (numeric)

    Return:
        pandas series with:
        n_negatives      : number of values below zero
        negative_portion : n_negatives divided by the number of non-zero
                           values (negatives + positives); zeros and missing
                           values are not counted. NaN when the column has
                           no non-zero value.
        abs_sum_portion  : absolute sum of the negatives divided by the sum
                           of the positives. inf when there are negatives
                           but no positives, NaN when there are neither.
    '''
    import pandas as pd

    # Rebuild from the raw values so the caller's index plays no role.
    values = pd.Series(df_column.values)
    negatives = values[values < 0]
    positives = values[values > 0]

    n_neg = len(negatives)
    n_pos = len(positives)
    n_signed = n_neg + n_pos

    # Guard the all-zero / all-missing / empty column, which would otherwise
    # raise ZeroDivisionError.
    negative_portion = n_neg / n_signed if n_signed else float('nan')

    if n_pos:
        abs_sum_portion = abs(negatives.sum()) / positives.sum()
    elif n_neg:
        # Negatives with nothing positive to compare against.
        abs_sum_portion = float('inf')
    else:
        abs_sum_portion = float('nan')

    df_negative = pd.Series(data={
        'n_negatives': n_neg,
        'negative_portion': negative_portion,
        'abs_sum_portion': abs_sum_portion,
    })
    return df_negative

In [144]:
# import pandas as pd
# a = pd.read_csv('df_test.csv')
# overview_data(a)

Unnamed: 0,Column,type,n_unique,examples,n_zeros,zero_portion,n_negatives,negative_portion,abs_sum_portion
0,current_amount,float64,898,"[5000.0, 2500.0, 2400.0, 10000.0, 3000.0, 7000...",0.0,0.0,0.0,0.0,0.0
1,durations,object,2,"[ 36 months, 60 months, nan]",,,,,
2,interest,object,394,"[ 10.65%, 15.27%, 15.96%, 13.49%, 12.69%, ...",,,,,
3,monthly,float64,16459,"[162.87, 59.83, 84.33, 339.31, 67.79, 156.46, ...",0.0,0.0,0.0,0.0,0.0
4,rank,object,7,"[B, C, A, E, F, D, G, nan]",,,,,
5,home_ownership,object,5,"[RENT, OWN, MORTGAGE, OTHER, NONE, nan]",,,,,
6,income,float64,5597,"[24000.0, 30000.0, 12252.0, 49200.0, 80000.0, ...",0.0,0.0,0.0,0.0,0.0
7,verification_status,object,3,"[Verified, Source Verified, Not Verified, nan]",,,,,
8,day_recorded,object,55,"[Dec-2011, Nov-2011, Oct-2011, Sep-2011, Aug-2...",,,,,
9,stat,object,4,"[Fully Paid, Charged Off, nan, Does not meet t...",,,,,
