# Examples of using the functions in `data_description.py`

In [1]:
import pandas as pd
import numpy as np
from data_description import *

# Creating an example data frame

In [2]:
# Number of rows in the example frame, and how many values are blanked out
# in every column to simulate missing data.
frame_size = 100
na_per_column = 20

# One column per kind of data the description helpers have to deal with:
# dates, plain strings, categoricals, booleans, floats and integers.
data = pd.DataFrame({
    'dates_column' : np.datetime64('2017-01-01') + np.random.choice(np.arange(0, 60), frame_size),
    'object_column' : np.random.choice(['lev1', 'lev2', 'lev3'], frame_size),
    'category_column' : pd.Series(np.random.choice(['lev1', 'lev2', 'lev3'], frame_size), dtype = "category"),
    'bool_column' : np.random.choice([True, False], frame_size),
    'float_column' : np.random.normal(5, 1, frame_size),
    'integer_column' : np.random.normal(5, 1, frame_size).astype('int64')
})

# Replace a random subset of rows with missing values, independently for
# each column.  `Series.mask` returns a new, upcast series (bool -> object,
# int64 -> float64, datetime -> NaT), so this neither relies on the `np.NaN`
# alias removed in NumPy 2.0 nor writes NaN in place into a bool/int column,
# which recent pandas versions deprecate.
for col_name in data.columns:
    na_rows = data.sample(na_per_column).index
    data[col_name] = data[col_name].mask(data.index.isin(na_rows))

# `get_col_av_values`

In [3]:
# Summarise the available values of every column.  In the output below,
# date and numeric columns are shown as a `[low;high]` range and the other
# columns as a list of their distinct levels (NaN included).
data.apply(get_col_av_values)

dates_column                    [02.01.2017;01.03.2017]
object_column                     lev1, lev2, lev3, nan
category_column                   lev3, lev2, nan, lev1
bool_column                            nan, False, True
float_column       [2.878409569286852;7.34631318086829]
integer_column                                [2.0;6.0]
dtype: object

# `get_col_obj_count`

In [4]:
# Count the distinct levels of every column.  In the output below NaN is
# counted as a level, and date/numeric columns are reported as "-".
data.apply(get_col_obj_count)

dates_column       -
object_column      4
category_column    4
bool_column        3
float_column       -
integer_column     -
dtype: object

# `get_columns_desription`

In [5]:
# Build one summary table for the whole frame: per column its data type,
# range of values, number of levels and NA count (see the output below).
get_columns_desription(data)

Unnamed: 0,Data type,Range,Levels number,NA count
dates_column,datetime64[ns],[02.01.2017;01.03.2017],-,20
object_column,object,"lev1, lev2, lev3, nan",4,20
category_column,category,"lev3, lev2, nan, lev1",4,20
bool_column,object,"nan, False, True",3,20
float_column,float64,[2.878409569286852;7.34631318086829],-,20
integer_column,float64,[2.0;6.0],-,20


# `get_most_freq_by_cond`

Returns the value of the series that occurs most frequently among the observations that satisfy the condition passed as `cond`.

In [6]:
def _draw_ob_class(level):
    """Draw a random 0/1 class label for one `object_column` value.

    'lev2' always yields 1 (probabilities [0, 1]); any other value,
    including NaN, yields 0 or 1 with equal probability.
    """
    probabilities = [0, 1] if level == 'lev2' else [0.5, 0.5]
    return np.random.choice([0,1], p = probabilities)


# Attach the random class label to every row.
data["ob_class"] = data["object_column"].apply(_draw_ob_class)

# Contingency table of level vs. class label.
pd.crosstab(data['object_column'], data['ob_class'])

ob_class,0,1
object_column,Unnamed: 1_level_1,Unnamed: 2_level_1
lev1,20,13
lev2,0,22
lev3,10,15


In [7]:
# Level of `object_column` that occurs most often among the rows whose
# `ob_class` equals 0.
get_most_freq_by_cond(data['object_column'], data["ob_class"] == 0)

'lev1'

# `ECDFs_by_classes`

In [8]:
# Four observations 0, 1, 2, 3 ...
val = np.arange(4)
# ... whose class labels alternate: a, b, a, b.
marker = np.tile(["a", "b"], 2)

### *Default*

By default, the function takes a series of values used to build the ECDFs and a series of labels that divides the observations into classes.

In [9]:
# Default call: one entry per class label; in the output below each entry
# is an array with one ECDF value per element of `val`.
ECDFs_by_classes(val, marker)

{'a': array([0.5, 0.5, 1. , 1. ]), 'b': array([0. , 0.5, 0.5, 1. ])}

### *`side`*

The `side` argument works like the argument of the same name in <a href="https://www.statsmodels.org/dev/generated/statsmodels.distributions.empirical_distribution.ECDF.html">Statsmodels</a>. It defaults to `'right'`.

It can be given as one of the strings `'left'` or `'right'`; in that case every resulting ECDF uses that `side`.

In [10]:
# A single string applies the same `side` to the ECDF of every class.
ECDFs_by_classes(val, marker, side="left")

{'a': array([0. , 0.5, 0.5, 1. ]), 'b': array([0. , 0. , 0.5, 0.5])}

If a `list` is passed as `side`, the ECDF of each class uses the corresponding element of that list.

In [11]:
# One `side` per class: in the output below class 'a' matches the 'right'
# result and class 'b' matches the 'left' result.
ECDFs_by_classes(val, marker, side=["right", "left"])

{'a': array([0.5, 0.5, 1. , 1. ]), 'b': array([0. , 0. , 0.5, 0.5])}

### `functions`

Possible values:
- `True` — the result contains `statsmodels.distributions.empirical_distribution.ECDF` instances;
- `False` — the result contains `numpy.array` instances holding the value of the ECDF at each point of `val_col`.

By default `False`.

In [12]:
# Ask for the ECDF objects themselves rather than arrays of evaluated values.
ECDFs_by_classes(val, marker, functions=True)

{'a': <statsmodels.distributions.empirical_distribution.ECDF at 0x7fe8159606d0>,
 'b': <statsmodels.distributions.empirical_distribution.ECDF at 0x7fe8159630d0>}