# Examples of using the functions in "data_description.py"

In [1]:
import pandas as pd
import numpy as np
from data_description import *

# Creating an example data frame

In [2]:
frame_size = 100
na_per_column = 20  # how many values are blanked out in every column

# Seed NumPy's global generator so that re-running the notebook rebuilds
# the same example frame (pandas' `sample` draws from it by default too).
np.random.seed(42)

# One column per dtype the description helpers are expected to handle.
data = pd.DataFrame({
    'dates_column' : np.datetime64('2017-01-01') + np.random.choice(np.arange(0, 60), frame_size),
    'object_column' : np.random.choice(['lev1', 'lev2', 'lev3'], frame_size),
    'category_column' : pd.Series(np.random.choice(['lev1', 'lev2', 'lev3'], frame_size), dtype = "category"),
    'bool_column' : np.random.choice([True, False], frame_size),
    'float_column' : np.random.normal(5, 1, frame_size),
    'integer_column' : np.random.normal(5, 1, frame_size).astype('int64')
})

# Blank out `na_per_column` randomly chosen rows in each column.
# `Series.mask` upcasts where a column cannot hold NaN (int64 -> float64,
# bool -> object), so this avoids both the `np.NaN` alias removed in
# NumPy 2.0 and writing NaN through `.loc` into an incompatible dtype.
for col_name in data.columns:
    na_rows = data.sample(na_per_column).index
    data[col_name] = data[col_name].mask(data.index.isin(na_rows))

# Testing of "get_col_av_values" function

In [3]:
# Apply get_col_av_values to every column of the frame; per the output below it yields
# a [min;max] range for date/numeric columns and the list of distinct levels otherwise.
data.apply(get_col_av_values)

dates_column                      [01.01.2017;26.02.2017]
object_column                       lev3, lev1, nan, lev2
category_column                     lev1, lev2, lev3, nan
bool_column                              nan, True, False
float_column       [3.1749034414791204;7.213499187316685]
integer_column                                  [2.0;7.0]
dtype: object

# Testing of "get_col_obj_count" function

In [4]:
# Apply get_col_obj_count to every column; per the output below it reports the number of
# distinct levels (NaN counted as one) for object/category/bool columns and "-" for the rest.
data.apply(get_col_obj_count)

dates_column       -
object_column      4
category_column    4
bool_column        3
float_column       -
integer_column     -
dtype: object

# Testing of "get_columns_desription" function

In [5]:
# Build the per-column summary table for the whole frame; the output below shows
# data type, value range, number of levels and NA count for each column.
get_columns_desription(data)

Unnamed: 0,Data type,Range,Levels number,NA count
dates_column,datetime64[ns],[01.01.2017;26.02.2017],-,20
object_column,object,"lev3, lev1, nan, lev2",4,20
category_column,category,"lev1, lev2, lev3, nan",4,20
bool_column,object,"nan, True, False",3,20
float_column,float64,[3.1749034414791204;7.213499187316685],-,20
integer_column,float64,[2.0;7.0],-,20


# get_most_freq_by_cond

Returns the value of the series that occurs most frequently among the rows where the condition `cond` holds.

In [6]:
def _draw_ob_class(level):
    """Draw a 0/1 class label for one `object_column` value.

    'lev2' always gets class 1 (probabilities [0, 1]); every other value,
    including a missing one, gets 0 or 1 with equal probability.
    """
    weights = [0, 1] if level == 'lev2' else [0.5, 0.5]
    return np.random.choice([0, 1], p=weights)


data["ob_class"] = data["object_column"].apply(_draw_ob_class)

pd.crosstab(data['object_column'], data['ob_class'])

ob_class,0,1
object_column,Unnamed: 1_level_1,Unnamed: 2_level_1
lev1,17,13
lev2,0,23
lev3,10,17


In [7]:
# Ask for the most frequent object_column value among the rows where ob_class == 0.
get_most_freq_by_cond(data['object_column'], data["ob_class"] == 0)

'lev1'

# get_join_repl_rule
Get a joining rule for the levels of a variable, for later use in `pandas.Series.replace`.

In [11]:
def get_join_repl_rule(joiners):
    '''
        Get a joining rule for the levels of some variable
        for further use in pandas.Series.replace
        Inputs:
            joiners - iterable of groups (lists) of levels; all levels
                      of one group are merged into a single new level
                      named "<lev_1>_<lev_2>_..._<lev_n>".
                      A group with one level maps that level to itself,
                      an empty group is ignored.
        Output dictionary with format {<old_level>:<new_level>}
        Raises TypeError if a group of two or more levels contains
        a level that is not a string.
    '''

    rule = {}

    for join_lev in joiners:
        levels = list(join_lev)
        if not levels:
            # Nothing to merge - an empty group adds no replacements.
            continue
        # Keep a lone level exactly as it is (whatever its type);
        # otherwise build the merged name from the group's levels in order.
        res_level = levels[0] if len(levels) == 1 else "_".join(levels)
        for lev in levels:
            rule[lev] = res_level

    return rule

In [12]:
# Example: 'lev1' and 'lev2' are merged into 'lev1_lev2', while the single-level group keeps 'lev3' as is.
get_join_repl_rule([['lev1', 'lev2'], ['lev3']])

{'lev1': 'lev1_lev2', 'lev2': 'lev1_lev2', 'lev3': 'lev3'}