# Examples of using the functions in `data_description.py`

In [3]:
import pandas as pd
import numpy as np

%load_ext autoreload
%autoreload 2

from data_description import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Creating an example data frame

In [4]:
# Build a small demo frame with one column per dtype family, then punch
# missing values into every column.
# The global NumPy RNG is not seeded here, so the concrete values differ
# between runs.
frame_size = 100

data = pd.DataFrame({
    # 2017-01-01 shifted by a random whole number of days in [0, 60)
    'dates_column' : np.datetime64('2017-01-01') + np.random.choice(np.arange(0, 60), frame_size),
    'object_column' : np.random.choice(['lev1', 'lev2', 'lev3'], frame_size),
    'category_column' : pd.Series(
        np.random.choice(['lev1', 'lev2', 'lev3'], frame_size), dtype = "category"
    ),
    'bool_column' : np.random.choice([True, False], frame_size),
    'float_column' : np.random.normal(5, 1, frame_size),
    # normal draws truncated to integers
    'integer_column' : np.random.normal(5, 1, frame_size).astype('int64')
})

# Blank out 20 randomly sampled rows in every column (a fresh sample per column).
# `Series.mask` is used instead of `data.loc[...] = np.NaN` because
#   * the `np.NaN` alias was removed in NumPy 2.0, and
#   * writing NaN into an int64/bool column through `.loc` is an
#     incompatible-dtype assignment that pandas has deprecated;
# `mask` upcasts explicitly (int64 -> float64, bool -> object) and fills
# datetime columns with NaT.
for col_name in data.columns:
    rows_to_blank = data.index.isin(data.sample(20).index)
    data[col_name] = data[col_name].mask(rows_to_blank)

# `get_col_av_values`

Gets the available values of the given column. The result depends on the dtype:
- for numeric dtypes (and, as the output below shows, datetime columns) it returns the range as "[min;max]";
- for all other dtypes it returns the distinct values that occur in the column.

In [5]:
# DataFrame.apply hands every column to the helper as a Series,
# so the result has one entry per column of `data`
data.apply(get_col_av_values)

dates_column                     [01.01.2017;27.02.2017]
object_column                      lev1, lev2, nan, lev3
category_column                    lev2, nan, lev1, lev3
bool_column                             False, True, nan
float_column       [2.6158986388590093;6.95890359809643]
integer_column                                 [3.0;7.0]
dtype: object

# `get_col_obj_unique_count`

For a non-numeric `pandas.Series`, returns the number of unique values (in the output below a missing value counts as one of them).
For numeric and datetime columns returns "-".

In [6]:
# applied column by column; the result has one entry per column of `data`
data.apply(get_col_obj_unique_count)

dates_column       -
object_column      4
category_column    4
bool_column        3
float_column       -
integer_column     -
dtype: object

# `get_Dran_col_descr`

In [7]:
# apply() yields one result column per column of `data`;
# .T transposes it so that every described column becomes a row
data.apply(get_Dran_col_descr).T

Unnamed: 0,type,unique values,unique count,nan count
dates_column,datetime64[ns],[01.01.2017;27.02.2017],-,20
object_column,object,"lev1, lev2, nan, lev3",4,20
category_column,category,"lev2, nan, lev1, lev3",4,20
bool_column,object,"False, True, nan",3,20
float_column,float64,[2.6158986388590093;6.95890359809643],-,20
integer_column,float64,[3.0;7.0],-,20


# `get_most_freq_by_cond`

Returns the value of the series that occurs most often among the observations for which the condition `cond` holds.

In [8]:
def _draw_ob_class(level):
    """Draw a random 0/1 class label for one value of `object_column`.

    'lev2' always gets 1; any other value (missing values included)
    gets 0 or 1 with equal probability.
    """
    class_probabilities = [0, 1] if level == 'lev2' else [0.5, 0.5]
    return np.random.choice([0,1], p = class_probabilities)


# Attach the synthetic class label as a new column of `data`.
data["ob_class"] = data["object_column"].apply(_draw_ob_class)

# Level-by-class counts: the 'lev2' row must have no observations of class 0.
pd.crosstab(data['object_column'], data['ob_class'])

ob_class,0,1
object_column,Unnamed: 1_level_1,Unnamed: 2_level_1
lev1,17,14
lev2,0,26
lev3,12,11


In [9]:
# the boolean Series keeps only the rows of class 0; in the crosstab above
# 'lev1' has the largest count in column 0, hence the result
get_most_freq_by_cond(data['object_column'], data["ob_class"] == 0)

'lev1'

# `ECDFs_by_classes`

In [10]:
# Four observations (0..3) and, position by position, the class of each one:
# 0 and 2 belong to "a", 1 and 3 belong to "b".
val = np.arange(4)
marker = np.array(["a", "b"] * 2)

### *Default*

By default, the function takes a sequence of values from which the ECDFs are built and a sequence of labels that assigns each observation to a class. It returns one ECDF per class.

In [11]:
# all optional arguments left at their defaults;
# the result is a dict keyed by class label
ECDFs_by_classes(val, marker)

{'a': array([0.5, 0.5, 1. , 1. ]), 'b': array([0. , 0.5, 0.5, 1. ])}

### *`side`*

`side` argument is similar to the value of this argument in <a href="https://www.statsmodels.org/dev/generated/statsmodels.distributions.empirical_distribution.ECDF.html">Statsmodels</a>. Takes `'right'` by default.

It accepts the string `'left'` or `'right'`. In this case every resulting ECDF uses that `side`.

In [12]:
# a single string is used as `side` for the ECDF of every class
ECDFs_by_classes(val, marker, side="left")

{'a': array([0. , 0.5, 0.5, 1. ]), 'b': array([0. , 0. , 0.5, 0.5])}

If a `list` is passed as `side`, the ECDF of each class uses the corresponding element of the list as its `side`.

In [13]:
# one `side` per class: here "right" ends up on class 'a' and "left" on class 'b'
ECDFs_by_classes(val, marker, side=["right", "left"])

{'a': array([0.5, 0.5, 1. , 1. ]), 'b': array([0. , 0. , 0.5, 0.5])}

### `functions`

Possible values:
- `True` — the result contains `statsmodels.distributions.empirical_distribution.ECDF` instances;
- `False` — the result contains `numpy.array` instances holding the value of the ECDF at each point of `val_col`.

By default `False`.

In [14]:
# ask for the ECDF objects themselves rather than their values at the points of `val`
ECDFs_by_classes(val, marker, functions=True)

{'a': <statsmodels.distributions.empirical_distribution.ECDF at 0x7fc2f1a83a10>,
 'b': <statsmodels.distributions.empirical_distribution.ECDF at 0x7fc2f1a98fd0>}

# `conf_table`

Returns a table with the confusion-matrix counts (TN, FP, FN, TP)
and the corresponding rates, computed for each threshold.

In [23]:
# NOTE(review): nothing visible in this cell draws random numbers, so this
# seed looks unnecessary here — confirm before removing
np.random.seed(5)
# ten predicted scores: 0.0, 0.1, ..., 0.9
y_pred = np.arange(0, 1, 0.1)
# true binary labels, five observations of each class
y_true = np.array([0,0,0,1,0,1,0,1,1,1])

# no thresholds are passed explicitly; the output has one row per distinct
# score, so presumably the thresholds default to the values of y_pred —
# confirm in data_description.py
conf_table(y_true, y_pred)

Unnamed: 0_level_0,TN,FP,FN,TP,TNR,FPR,FNR,TPR
treshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,1,4,0,5,0.2,0.8,0.0,1.0
0.1,2,3,0,5,0.4,0.6,0.0,1.0
0.2,3,2,0,5,0.6,0.4,0.0,1.0
0.3,3,2,1,4,0.6,0.4,0.2,0.8
0.4,4,1,1,4,0.8,0.2,0.2,0.8
0.5,4,1,2,3,0.8,0.2,0.4,0.6
0.6,5,0,2,3,1.0,0.0,0.4,0.6
0.7,5,0,3,2,1.0,0.0,0.6,0.4
0.8,5,0,4,1,1.0,0.0,0.8,0.2
0.9,5,0,5,0,1.0,0.0,1.0,0.0


Rename rules are dicts that map the column names of the table returned by `conf_table` to domain-specific names:
- `ru_scoring_rename_rule` - typical column names for credit scoring tasks, in Russian;

In [24]:
# rename() only relabels the columns; the numbers are the same as in the table above
conf_table(y_true, y_pred).rename(columns = ru_scoring_rename_rule)

Unnamed: 0_level_0,Верно выданные (шт.),Ошибочно удержанные (шт.),Ошибочно выданные (шт.),Верно удержанные (шт.),Верно выданные (% от хороших),Ошибочно удержанные (% от хороших),Ошибочно выданные (% от дефолта),Верно удержанные (% от дефолта)
treshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,1,4,0,5,0.2,0.8,0.0,1.0
0.1,2,3,0,5,0.4,0.6,0.0,1.0
0.2,3,2,0,5,0.6,0.4,0.0,1.0
0.3,3,2,1,4,0.6,0.4,0.2,0.8
0.4,4,1,1,4,0.8,0.2,0.2,0.8
0.5,4,1,2,3,0.8,0.2,0.4,0.6
0.6,5,0,2,3,1.0,0.0,0.4,0.6
0.7,5,0,3,2,1.0,0.0,0.6,0.4
0.8,5,0,4,1,1.0,0.0,0.8,0.2
0.9,5,0,5,0,1.0,0.0,1.0,0.0
