# Examples of using fuction in "data_description.py"

In [1]:
import pandas as pd
import numpy as np
from data_description import *

# Creation an exmples data frame

In [2]:
frame_size = 100

data = pd.DataFrame({
    'dates_column' : np.datetime64('2017-01-01') + np.random.choice(np.arange(0, 60), frame_size),
    'object_column' : np.random.choice(['lev1', 'lev2', 'lev3'], frame_size),
    'category_column' : pd.Series(np.random.choice(['lev1', 'lev2', 'lev3'], frame_size), dtype = "category"),
    'bool_column' : np.random.choice([True, False], frame_size),
    'float_column' : np.random.normal(5, 1, frame_size),
    'integer_column' : np.random.normal(5, 1, frame_size).astype('int64')
})

for col_name in data.columns:
    data.loc[data.sample(20).index, col_name] = np.NaN

# get_col_av_values

In [3]:
data.apply(get_col_av_values)

dates_column                      [01.01.2017;01.03.2017]
object_column                       nan, lev1, lev3, lev2
category_column                     lev2, lev3, nan, lev1
bool_column                              nan, False, True
float_column       [2.6776571136470233;7.933870714929562]
integer_column                                  [2.0;7.0]
dtype: object

# get_col_obj_count

In [4]:
data.apply(get_col_obj_count)

dates_column       -
object_column      4
category_column    4
bool_column        3
float_column       -
integer_column     -
dtype: object

# get_columns_desription

In [5]:
get_columns_desription(data)

Unnamed: 0,Data type,Range,Levels number,NA count
dates_column,datetime64[ns],[01.01.2017;01.03.2017],-,20
object_column,object,"nan, lev1, lev3, lev2",4,20
category_column,category,"lev2, lev3, nan, lev1",4,20
bool_column,object,"nan, False, True",3,20
float_column,float64,[2.6776571136470233;7.933870714929562],-,20
integer_column,float64,[2.0;7.0],-,20


# get_most_freq

Returns value from series which has the most frequent manifestation of the trait specified in cond.

In [6]:
data["ob_class"] = data["object_column"].apply(
    lambda x: np.random.choice([0,1], p = [0, 1]) \
    if x == 'lev2' else\
    np.random.choice([0,1], p = [0.5, 0.5])
)

pd.crosstab(data['object_column'], data['ob_class'])

ob_class,0,1
object_column,Unnamed: 1_level_1,Unnamed: 2_level_1
lev1,12,16
lev2,0,30
lev3,12,10


In [7]:
get_most_freq_by_cond(data['object_column'], data["ob_class"] == 0)

'lev3'

# super_crosstab

Improvement of `pandas.crosstab` method - allows to get absolute and normalized tables with one object at once.

In [22]:
def crosstab(index, columns, **crosstab_args):
    
    crosstab_args = {'normalize' : 'index'} | crosstab_args
    
    return pd.concat(
        [
            pd.crosstab(
                index, columns, 
                **crosstab_args | {'normalize' : False}
            ),
            pd.crosstab(index,columns, **crosstab_args)
        ],
        axis = 1
    )
    

crosstab(
    np.random.choice(['1', '2'], 200),
    np.random.choice(['3', '4'], 200),
    margins = True
)

col_0,3,4,3,4
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,48.0,55.0,0.466019,0.533981
2,55.0,42.0,0.56701,0.43299
All,,,0.515,0.485


In [95]:
pd.crosstab(
    np.random.choice(['1', '2'], 200),
    [np.random.choice(['3', '4'], 200),
    np.random.choice(['7', '8'], 200)],
    normalize = 'index', margins = True,
    values = np.random.choice([1, 2], 200),
    aggfunc = np.mean
)

col_0,3,3,4,4
col_1,7,8,7,8
row_0,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,0.251871,0.249124,0.239782,0.259223
2,0.269903,0.241774,0.243783,0.24454
All,0.258049,0.246235,0.242447,0.25327


In [73]:
np.random.seed(30)

test_list = pd.crosstab(
    np.random.choice(['1', '2'], 200),
    [np.random.choice(['3', '4'], 200),
    np.random.choice(['7', '8'], 200)],
    normalize = 'index', margins = True
).columns

In [72]:
super_index = pd.MultiIndex.from_product(
    [['absolute', 'relative'] ,test_list]
)
import itertools

test_list


[('3', '7'), ('3', '8'), ('4', '7'), ('4', '8')]

In [64]:
big_table = pd.concat(
    [
        pd.crosstab(
            np.random.choice(['1', '2'], 200),
            [np.random.choice(['3', '4'], 200),
            np.random.choice(['7', '8'], 200)],
            normalize = 'index', margins = True
        ),
        pd.crosstab(
            np.random.choice(['1', '2'], 200),
            [np.random.choice(['3', '4'], 200),
            np.random.choice(['7', '8'], 200)],
            normalize = 'index', margins = True
        ),
    ],
    axis = 1
)




big_table.columns = \
pd.MultiIndex.from_product(
    [['absolute', 'relative'] ,test_list]
)

In [65]:
big_table

Unnamed: 0_level_0,absolute,absolute,absolute,absolute,relative,relative,relative,relative
Unnamed: 0_level_1,"(3, 7)","(3, 8)","(4, 7)","(4, 8)","(3, 7)","(3, 8)","(4, 7)","(4, 8)"
row_0,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1,0.237113,0.298969,0.206186,0.257732,0.257426,0.306931,0.217822,0.217822
2,0.271845,0.203883,0.291262,0.23301,0.232323,0.30303,0.212121,0.252525
All,0.255,0.25,0.25,0.245,0.245,0.305,0.215,0.235
