# Description and testing of functions for data processing

In [1]:
import pandas as pd
import numpy as np

from data_processing import *

In [2]:
# Toy frame shared by the examples: one numeric and one categorical column.
# `size=200` is required for the categorical draw; without it
# np.random.choice returns a single scalar that pandas broadcasts to every
# row, so the column would hold only one level.
test_df = pd.DataFrame({
    "num_var" : np.random.normal(3, 5, 200),
    "cut_var" : np.random.choice(['a', 'b'], size=200)
})

# `insert_next`

Quickly insert a new column immediately after the column selected by name in a `pandas.DataFrame`.

In [3]:
# Fresh toy frame for the `insert_next` examples.
# `size=200` makes the categorical column an actual random sample of both
# levels; without it np.random.choice yields one scalar that is broadcast
# to all rows.
test_df = pd.DataFrame({
    "num_var" : np.random.normal(3, 5, 200),
    "cut_var" : np.random.choice(['a', 'b'], size=200)
})

## *One level columns*

Really simple to use.

In [4]:
# Build the values for the new column first, then ask for them to be placed
# right after "num_var".
scaled_values = test_df["num_var"] / 20
insert_next(test_df, "num_var", scaled_values)

# NOTE(review): the return value of the call above is discarded; what gets
# displayed here is `test_df` as it stands after the call.
test_df

Unnamed: 0,num_var,cut_var
0,1.215260,a
1,15.127617,a
2,11.436560,a
3,-7.357038,a
4,0.395152,a
...,...,...
195,3.771804,a
196,-0.103275,a
197,5.316207,a
198,0.126943,a


In [5]:
# Same call with a scaled-up copy of the column; the cell output is whatever
# insert_next returns.
scaled_up = test_df["num_var"] * 20
insert_next(test_df, "num_var", scaled_up)

Unnamed: 0,num_var,num_var_transf,cut_var
0,1.215260,24.305205,a
1,15.127617,302.552348,a
2,11.436560,228.731194,a
3,-7.357038,-147.140758,a
4,0.395152,7.903041,a
...,...,...,...
195,3.771804,75.436074,a
196,-0.103275,-2.065505,a
197,5.316207,106.324145,a
198,0.126943,2.538851,a


## *Multi level columns*

There are some details to be aware of when using it.

In [6]:
# Two-level column header: every letter of a/b/c paired with 1 and 2.
two_level_columns = pd.MultiIndex.from_product([['a', 'b', 'c'], [1, 2]])

# 6x6 frame of uniform random numbers under that header.
test_df = pd.DataFrame(np.random.rand(6, 6), columns=two_level_columns)

test_df

Unnamed: 0_level_0,a,a,b,b,c,c
Unnamed: 0_level_1,1,2,1,2,1,2
0,0.379128,0.760526,0.256883,0.050584,0.128445,0.066869
1,0.61908,0.898181,0.090516,0.638989,0.451363,0.239268
2,0.731321,0.336435,0.482167,0.166372,0.800961,0.04119
3,0.881381,0.480961,0.879988,0.465377,0.564059,0.113484
4,0.199563,0.076012,0.017178,0.701133,0.347351,0.90553
5,0.379262,0.41696,0.00923,0.384097,0.748812,0.515283


If the columns of the `pandas.DataFrame` are a `pandas.MultiIndex`, you have to pass a `tuple` as the `col_name` argument.

In [7]:
# With MultiIndex columns the target column is addressed by its full tuple key.
target_col = ("a", 1)
new_values = np.random.rand(6)
insert_next(test_df, target_col, new_values)

Unnamed: 0_level_0,a,a _transf,a,b,b,c,c
Unnamed: 0_level_1,1,1 _transf,2,1,2,1,2
0,0.379128,0.845796,0.760526,0.256883,0.050584,0.128445,0.066869
1,0.61908,0.485844,0.898181,0.090516,0.638989,0.451363,0.239268
2,0.731321,0.474411,0.336435,0.482167,0.166372,0.800961,0.04119
3,0.881381,0.801398,0.480961,0.879988,0.465377,0.564059,0.113484
4,0.199563,0.871143,0.076012,0.017178,0.701133,0.347351,0.90553
5,0.379262,0.266653,0.41696,0.00923,0.384097,0.748812,0.515283


Otherwise you will get a warning and, most likely, an error from `pandas`.

In [8]:
# Deliberately pass a plain `str` as the column name for a frame with
# MultiIndex columns to demonstrate the failure mode.
# The resulting TypeError is caught and printed so that the demonstration
# stays visible without aborting "Restart & Run All" for the cells below.
try:
    insert_next(
        test_df, "a",
        np.random.rand(6),
        "test"
    )
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

            For tables with columns.__class__ == pd.MultiIndex, not allowed using
            col_name of type str. Try better typle.
            
  warn(


TypeError: unsupported operand type(s) for +: 'slice' and 'int'

# `get_num_cond`

Get a boolean condition for selecting the columns of a `pandas.DataFrame` that have numeric data types.

In [9]:
# Toy frame with one numeric and one categorical column.
# `size=200` makes the categorical column a real random sample; without it
# np.random.choice returns a single scalar that is broadcast to all rows.
test_df = pd.DataFrame({
    "num_var" : np.random.normal(3, 5, 200),
    "cut_var" : np.random.choice(['a', 'b'], size=200)
})

# Displays one boolean per column of the frame.
get_num_cond(test_df)

num_var     True
cut_var    False
dtype: bool

# `pd_OHE`

Perform one-hot encoding of a pandas data frame, returning the result as a `pandas.DataFrame` with human-readable column names.

In [10]:
# Two categorical columns; the first one contains a missing value.
# `np.nan` is the supported spelling: the `np.NaN` alias was removed in
# NumPy 2.0 and raises AttributeError there.
test_df = pd.DataFrame({
        'my_col' : ['a', 'b', 'b', 'a', np.nan],
        'my_col2': ['d', 'v', 't', 'g', 'q']
})

# 'drop' lists one level per input column ('a' for my_col, 'v' for my_col2);
# the output below has no my_col_a / my_col2_v columns.
pd_OHE(test_df, sk_OHE_kwarg = {'drop':['a', 'v']})

Unnamed: 0,my_col_b,my_col_nan,my_col2_d,my_col2_g,my_col2_q,my_col2_t
0,0.0,0.0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0


# `get_merge_repl_rule`
Get a joining rule for the levels of some variable for further use in `pandas.Series.replace`

In [11]:
# Each inner list is one group of levels; per the output, every level of a
# group maps to the group's joined name.
level_groups = [['lev1', 'lev2'], ['lev3']]
get_merge_repl_rule(level_groups)

{'lev1': 'lev1_lev2', 'lev2': 'lev1_lev2', 'lev3': 'lev3'}

# `np_replace`

Replace values in a `numpy.array` according to a rule defined as a dictionary:<br>
```
{
    <old val1>:<new val1>, 
    <old val2>:<new val2>,
    .....................
    <old valn>:<new valn>
}
```

An analogue of `pandas.DataFrame.replace`, but for `numpy.array`.

In [12]:
# 10x10 array of letters drawn at random from a/b/c.
X = np.random.choice(['a', 'b', 'c'], size=(10, 10))

X

array([['c', 'a', 'a', 'a', 'a', 'a', 'b', 'a', 'a', 'b'],
       ['a', 'c', 'c', 'b', 'b', 'b', 'b', 'c', 'b', 'b'],
       ['c', 'c', 'c', 'b', 'b', 'a', 'b', 'b', 'a', 'b'],
       ['c', 'b', 'c', 'c', 'a', 'a', 'b', 'a', 'a', 'b'],
       ['c', 'c', 'c', 'a', 'a', 'b', 'c', 'a', 'a', 'a'],
       ['c', 'c', 'b', 'c', 'b', 'b', 'b', 'b', 'b', 'c'],
       ['b', 'c', 'a', 'a', 'c', 'b', 'a', 'a', 'c', 'a'],
       ['a', 'c', 'b', 'b', 'c', 'a', 'c', 'c', 'c', 'a'],
       ['b', 'a', 'c', 'a', 'b', 'b', 'a', 'c', 'c', 'c'],
       ['a', 'a', 'b', 'b', 'b', 'a', 'b', 'a', 'a', 'c']], dtype='<U1')

In [13]:
# NOTE: the key 'a' appears twice in this literal. Python keeps the last
# value for a repeated key, so 'a' is mapped to "test_a2" and the 'test_a'
# entry has no effect. 'c' has no rule and stays unchanged in the output.
replace_rule = {'a':'test_a', 'b': "test_b", 'a':"test_a2"}
np_replace(X, replace_rule)

array([['c', 'test_a2', 'test_a2', 'test_a2', 'test_a2', 'test_a2',
        'test_b', 'test_a2', 'test_a2', 'test_b'],
       ['test_a2', 'c', 'c', 'test_b', 'test_b', 'test_b', 'test_b', 'c',
        'test_b', 'test_b'],
       ['c', 'c', 'c', 'test_b', 'test_b', 'test_a2', 'test_b', 'test_b',
        'test_a2', 'test_b'],
       ['c', 'test_b', 'c', 'c', 'test_a2', 'test_a2', 'test_b',
        'test_a2', 'test_a2', 'test_b'],
       ['c', 'c', 'c', 'test_a2', 'test_a2', 'test_b', 'c', 'test_a2',
        'test_a2', 'test_a2'],
       ['c', 'c', 'test_b', 'c', 'test_b', 'test_b', 'test_b', 'test_b',
        'test_b', 'c'],
       ['test_b', 'c', 'test_a2', 'test_a2', 'c', 'test_b', 'test_a2',
        'test_a2', 'c', 'test_a2'],
       ['test_a2', 'c', 'test_b', 'test_b', 'c', 'test_a2', 'c', 'c',
        'c', 'test_a2'],
       ['test_b', 'test_a2', 'c', 'test_a2', 'test_b', 'test_b',
        'test_a2', 'c', 'c', 'c'],
       ['test_a2', 'test_a2', 'test_b', 'test_b', 'test_b', 'test_a

# `fix_pd_multiIndex`

`pandas` has some problems loading multilevel column headers from Excel. When different columns have a different number of header levels, the resulting `pandas.DataFrame` gets the maximum number of levels for every column, and the missing lower levels of the columns which have fewer levels in Excel are named like `Unnamed: ...`. The goal of this function is to replace every name in the index that starts with `Unnamed: ...` with an empty string.

Suppose the input file looks like this.

<img src="pictures/multiindex_examle.png"></img>

By default pandas loads it like this.

In [14]:
# Workbook whose header spans two rows; rows 0 and 1 are read as the two
# levels of the column index.
excel_path = "test_data/fix_pd_multiIndex_df.xlsx"
df = pd.read_excel(excel_path, header=[0, 1])
df

Unnamed: 0_level_0,id,group1,group1,group2,group2
Unnamed: 0_level_1,Unnamed: 0_level_1,col1,col2,col3,col4
0,323,a,4,g,5
1,433,b,5,r,6
2,123,c,6,d,7
3,321,d,7,x,5


So the `id` column is loaded as `("id", "Unnamed: 0_level_1")`. Now let's apply the function to get a new index.

In [15]:
# Derive the cleaned header from the loaded one; `df` is not reassigned in
# this cell.
raw_columns = df.columns
new_columns = fix_pd_multiIndex(raw_columns)
new_columns

MultiIndex([(    'id',     ''),
            ('group1', 'col1'),
            ('group1', 'col2'),
            ('group2', 'col3'),
            ('group2', 'col4')],
           )

And now apply it to our dataframe.

In [16]:
# Overwrite the frame's column index with the cleaned MultiIndex
# (this modifies `df` in place).
df.columns = new_columns
# Display the frame with its new header.
df

Unnamed: 0_level_0,id,group1,group1,group2,group2
Unnamed: 0_level_1,Unnamed: 1_level_1,col1,col2,col3,col4
0,323,a,4,g,5
1,433,b,5,r,6
2,123,c,6,d,7
3,321,d,7,x,5
