# Desribes and testing of functions for data processing

In [1]:
import pandas as pd
import numpy as np

%load_ext autoreload
%autoreload 2

from data_processing import *

In [2]:
test_df = pd.DataFrame({
    "num_var" : np.random.normal(3, 5, 200),
    "cut_var" : np.random.choice(['a', 'b'])
})

# `insert_next`

Quick insertion following the column selected by name for pandas.DataFrame.

In [3]:
test_df = pd.DataFrame({
    "num_var" : np.random.normal(3, 5, 200),
    "cut_var" : np.random.choice(['a', 'b'])
})

## *One level columns*

Rely simple to use

In [4]:
insert_next(test_df, "num_var", test_df["num_var"] / 20)

test_df

Unnamed: 0,num_var,cut_var
0,-0.596950,b
1,4.186098,b
2,3.985521,b
3,-0.697587,b
4,5.153473,b
...,...,...
195,9.144420,b
196,-1.401377,b
197,7.007348,b
198,-0.190932,b


In [5]:
insert_next(test_df, "num_var", test_df["num_var"]*20)

Unnamed: 0,num_var,num_var_transf,cut_var
0,-0.596950,-11.938993,b
1,4.186098,83.721956,b
2,3.985521,79.710414,b
3,-0.697587,-13.951750,b
4,5.153473,103.069470,b
...,...,...,...
195,9.144420,182.888400,b
196,-1.401377,-28.027537,b
197,7.007348,140.146968,b
198,-0.190932,-3.818641,b


## *Multi level columns*

Have some details in using.

In [6]:
test_df = pd.DataFrame(
    np.random.rand(6,6),
    columns = pd.MultiIndex.from_product([['a', 'b', 'c'], [1,2]])
)

test_df

Unnamed: 0_level_0,a,a,b,b,c,c
Unnamed: 0_level_1,1,2,1,2,1,2
0,0.687871,0.430511,0.457409,0.626873,0.773616,0.912703
1,0.671212,0.2496,0.732682,0.09626,0.562298,0.819538
2,0.448415,0.21264,0.750602,0.121679,0.813854,0.107848
3,0.625392,0.268536,0.10569,0.34769,0.079185,0.564113
4,0.821446,0.395826,0.884554,0.309489,0.497456,0.577332
5,0.288959,0.356469,0.90207,0.961456,0.741428,0.096169


If you are using `pandas.MultiIndex` in columns of `pandas.DataFrame` you have to use `tuple` as `col_name` argument.

In [7]:
insert_next(
    test_df, ("a", 1),
    np.random.rand(6)
)

Unnamed: 0_level_0,a,a _transf,a,b,b,c,c
Unnamed: 0_level_1,1,1 _transf,2,1,2,1,2
0,0.687871,0.834373,0.430511,0.457409,0.626873,0.773616,0.912703
1,0.671212,0.313538,0.2496,0.732682,0.09626,0.562298,0.819538
2,0.448415,0.023887,0.21264,0.750602,0.121679,0.813854,0.107848
3,0.625392,0.574787,0.268536,0.10569,0.34769,0.079185,0.564113
4,0.821446,0.403611,0.395826,0.884554,0.309489,0.497456,0.577332
5,0.288959,0.996754,0.356469,0.90207,0.961456,0.741428,0.096169


Other case you will have warning, and most likely error from `pandas`.

# `get_num_cond`

Get condition for selection from pandas.DataFrame numeric data types

In [8]:
test_df = pd.DataFrame({
    "num_var" : np.random.normal(3, 5, 200),
    "cut_var" : np.random.choice(['a', 'b'])
})

get_num_cond(test_df)

num_var     True
cut_var    False
dtype: bool

# `pd_OHE`

Conduct one hot encoding for pandas data frame with the result as pandas.DataFrame and columns in reading format. 

In [9]:
test_df = pd.DataFrame({
        'my_col' : ['a', 'b', 'b', 'a', np.NaN],
        'my_col2': ['d', 'v', 't', 'g', 'q']
})

pd_OHE(test_df, sk_OHE_kwarg = {'drop':['a', 'v']})

Unnamed: 0,my_col_b,my_col_nan,my_col2_d,my_col2_g,my_col2_q,my_col2_t
0,0.0,0.0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0


# `get_merge_repl_rule`
Get an joining rule for the levels of some variable for further use in `pandas.Series.replace`

In [10]:
get_merge_repl_rule([['lev1', 'lev2'], ['lev3']])

{'lev1': 'lev1_lev2', 'lev2': 'lev1_lev2', 'lev3': 'lev3'}

# `np_replace`

For `numpy.array` replace values, by roole defined as dictionary:<br>
```
{
    <old val1>:<new val1>, 
    <old val2>:<new val2>,
    .....................
    <old valn>:<new valn>
}
```

`pandas.DataFrame.replace` synonym, but for `numpy.array`.

In [11]:
X = np.random.choice(
    ['a', 'b', 'c'], [10, 10]
)

X

array([['c', 'c', 'b', 'a', 'a', 'a', 'a', 'a', 'c', 'c'],
       ['a', 'c', 'b', 'b', 'b', 'b', 'a', 'c', 'a', 'b'],
       ['a', 'b', 'a', 'b', 'b', 'a', 'b', 'c', 'b', 'b'],
       ['c', 'a', 'c', 'a', 'a', 'b', 'b', 'c', 'a', 'c'],
       ['c', 'a', 'c', 'a', 'a', 'a', 'b', 'b', 'c', 'c'],
       ['a', 'c', 'c', 'b', 'a', 'b', 'b', 'a', 'b', 'b'],
       ['c', 'c', 'b', 'b', 'b', 'a', 'c', 'c', 'c', 'b'],
       ['b', 'b', 'b', 'a', 'c', 'b', 'c', 'b', 'b', 'c'],
       ['c', 'a', 'a', 'c', 'c', 'a', 'a', 'c', 'a', 'c'],
       ['a', 'c', 'a', 'b', 'a', 'b', 'a', 'c', 'a', 'a']], dtype='<U1')

In [12]:
np_replace(X, {'a':'test_a', 'b': "test_b", 'a':"test_a2"})

array([['c', 'c', 'test_b', 'test_a2', 'test_a2', 'test_a2', 'test_a2',
        'test_a2', 'c', 'c'],
       ['test_a2', 'c', 'test_b', 'test_b', 'test_b', 'test_b',
        'test_a2', 'c', 'test_a2', 'test_b'],
       ['test_a2', 'test_b', 'test_a2', 'test_b', 'test_b', 'test_a2',
        'test_b', 'c', 'test_b', 'test_b'],
       ['c', 'test_a2', 'c', 'test_a2', 'test_a2', 'test_b', 'test_b',
        'c', 'test_a2', 'c'],
       ['c', 'test_a2', 'c', 'test_a2', 'test_a2', 'test_a2', 'test_b',
        'test_b', 'c', 'c'],
       ['test_a2', 'c', 'c', 'test_b', 'test_a2', 'test_b', 'test_b',
        'test_a2', 'test_b', 'test_b'],
       ['c', 'c', 'test_b', 'test_b', 'test_b', 'test_a2', 'c', 'c', 'c',
        'test_b'],
       ['test_b', 'test_b', 'test_b', 'test_a2', 'c', 'test_b', 'c',
        'test_b', 'test_b', 'c'],
       ['c', 'test_a2', 'test_a2', 'c', 'c', 'test_a2', 'test_a2', 'c',
        'test_a2', 'c'],
       ['test_a2', 'c', 'test_a2', 'test_b', 'test_a2', 'test_b',
  

# `fix_pd_multiIndex`

`padnas` has some problems with loading multilevel columns headers from excel. When different columns has different count of levels, result `pandas.DataFrame` will have maximum count of levels in each columns, and lower levels of columns wich in excel has less levels will renamed like `Unnamed: ...`. This function goal is change every name of wich starts from `Unnamed: ... ` in index to empty line in index.

Let the input file looks like.

<img src="pictures/multiindex_examle.png"></img>

By defult panads loads it like.

In [13]:
df = pd.read_excel(
    "test_data/fix_pd_multiIndex_df.xlsx",
    header = [0,1]
)
df

Unnamed: 0_level_0,id,group1,group1,group2,group2
Unnamed: 0_level_1,Unnamed: 0_level_1,col1,col2,col3,col4
0,323,a,4,g,5
1,433,b,5,r,6
2,123,c,6,d,7
3,321,d,7,x,5


So `id` column, loaded as `("id", "Unnamed: 0_level_1")`. Now lets apply function for geting a new Index.

In [14]:
new_columns = fix_pd_multiIndex(df.columns)
new_columns

MultiIndex([(    'id',     ''),
            ('group1', 'col1'),
            ('group1', 'col2'),
            ('group2', 'col3'),
            ('group2', 'col4')],
           )

And now apply it to our dataframe.

In [15]:
df.columns = new_columns
df

Unnamed: 0_level_0,id,group1,group1,group2,group2
Unnamed: 0_level_1,Unnamed: 1_level_1,col1,col2,col3,col4
0,323,a,4,g,5
1,433,b,5,r,6
2,123,c,6,d,7
3,321,d,7,x,5
