# Desribes and testing of functions for data processing

In [1]:
import pandas as pd
import numpy as np

from data_processing import *

In [2]:
test_df = pd.DataFrame({
    "num_var" : np.random.normal(3, 5, 200),
    "cut_var" : np.random.choice(['a', 'b'])
})

# `insert_next`

Quick insertion following the column selected by name for pandas.DataFrame.

In [3]:
insert_next(test_df, "num_var", test_df["num_var"] / 20)

test_df

Unnamed: 0,num_var,num_var_transf,cut_var
0,4.275936,0.213797,b
1,8.059681,0.402984,b
2,-4.472518,-0.223626,b
3,12.941513,0.647076,b
4,3.399259,0.169963,b
...,...,...,...
195,6.763084,0.338154,b
196,8.509601,0.425480,b
197,-1.410680,-0.070534,b
198,1.721128,0.086056,b


In [4]:
insert_next(test_df, "num_var", test_df["num_var"]*20)

Unnamed: 0,num_var,num_var_transf,cut_var
0,4.275936,85.518721,b
1,8.059681,161.193625,b
2,-4.472518,-89.450358,b
3,12.941513,258.830257,b
4,3.399259,67.985184,b
...,...,...,...
195,6.763084,135.261676,b
196,8.509601,170.192016,b
197,-1.410680,-28.213596,b
198,1.721128,34.422562,b


# `get_num_cond`

Get condition for selection from pandas.DataFrame numeric data types

In [5]:
get_num_cond(test_df)

num_var            True
num_var_transf     True
cut_var           False
dtype: bool

# `pd_OHE`

Conduct one hot encoding for pandas data frame with the result as pandas.DataFrame and columns in reading format. 

In [6]:
test_df = pd.DataFrame({
        'my_col' : ['a', 'b', 'b', 'a', np.NaN],
        'my_col2': ['d', 'v', 't', 'g', 'q']
})

pd_OHE(test_df, sk_OHE_kwarg = {'drop':['a', 'v']})

Unnamed: 0,my_col_b,my_col_nan,my_col2_d,my_col2_g,my_col2_q,my_col2_t
0,0.0,0.0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0


# `get_join_repl_rule`
Get an joining rule for the levels of some variable for further use in `pandas.Series.replace`

In [7]:
get_merge_repl_rule([['lev1', 'lev2'], ['lev3']])

{'lev1': 'lev1_lev2', 'lev2': 'lev1_lev2', 'lev3': 'lev3'}

# `np_replace`

For `numpy.array` replace values, by roole defined as dictionary:<br>
```
{
    <old val1>:<new val1>, 
    <old val2>:<new val2>,
    .....................
    <old valn>:<new valn>
}
```

`pandas.DataFrame.replace` synonym, but for `numpy.array`.

In [8]:
X = np.random.choice(
    ['a', 'b', 'c'], [10, 10]
)

X

array([['b', 'a', 'b', 'c', 'c', 'a', 'c', 'c', 'b', 'a'],
       ['b', 'b', 'a', 'c', 'a', 'a', 'c', 'c', 'b', 'c'],
       ['b', 'b', 'c', 'c', 'c', 'b', 'b', 'a', 'b', 'b'],
       ['a', 'b', 'a', 'c', 'a', 'c', 'b', 'b', 'b', 'a'],
       ['c', 'c', 'a', 'b', 'b', 'c', 'b', 'b', 'a', 'a'],
       ['b', 'a', 'c', 'b', 'b', 'c', 'a', 'c', 'c', 'a'],
       ['b', 'c', 'b', 'c', 'b', 'a', 'a', 'a', 'b', 'a'],
       ['c', 'b', 'c', 'b', 'c', 'b', 'c', 'c', 'a', 'a'],
       ['c', 'a', 'a', 'b', 'a', 'b', 'c', 'c', 'b', 'b'],
       ['b', 'c', 'c', 'b', 'a', 'b', 'a', 'b', 'a', 'a']], dtype='<U1')

In [9]:
np_replace(X, {'a':'test_a', 'b': "test_b", 'a':"test_a2"})

array([['test_b', 'test_a2', 'test_b', 'c', 'c', 'test_a2', 'c', 'c',
        'test_b', 'test_a2'],
       ['test_b', 'test_b', 'test_a2', 'c', 'test_a2', 'test_a2', 'c',
        'c', 'test_b', 'c'],
       ['test_b', 'test_b', 'c', 'c', 'c', 'test_b', 'test_b', 'test_a2',
        'test_b', 'test_b'],
       ['test_a2', 'test_b', 'test_a2', 'c', 'test_a2', 'c', 'test_b',
        'test_b', 'test_b', 'test_a2'],
       ['c', 'c', 'test_a2', 'test_b', 'test_b', 'c', 'test_b', 'test_b',
        'test_a2', 'test_a2'],
       ['test_b', 'test_a2', 'c', 'test_b', 'test_b', 'c', 'test_a2',
        'c', 'c', 'test_a2'],
       ['test_b', 'c', 'test_b', 'c', 'test_b', 'test_a2', 'test_a2',
        'test_a2', 'test_b', 'test_a2'],
       ['c', 'test_b', 'c', 'test_b', 'c', 'test_b', 'c', 'c', 'test_a2',
        'test_a2'],
       ['c', 'test_a2', 'test_a2', 'test_b', 'test_a2', 'test_b', 'c',
        'c', 'test_b', 'test_b'],
       ['test_b', 'c', 'c', 'test_b', 'test_a2', 'test_b', 'test_a2',
 

# `fix_pd_multiIndex`

`padnas` has some problems with loading multilevel columns headers from excel. When different columns has different count of levels, result `pandas.DataFrame` will have maximum count of levels in each columns, and lower levels of columns wich in excel has less levels will renamed like `Unnamed: ...`. This function goal is change every name of wich starts from `Unnamed: ... ` in index to empty line in index.

Let the input file looks like.

<img src="pictures/multiindex_examle.png"></img>

By defult panads loads it like.

In [10]:
df = pd.read_excel(
    "test_data/fix_pd_multiIndex_df.xlsx",
    header = [0,1]
)
df

Unnamed: 0_level_0,id,group1,group1,group2,group2
Unnamed: 0_level_1,Unnamed: 0_level_1,col1,col2,col3,col4
0,323,a,4,g,5
1,433,b,5,r,6
2,123,c,6,d,7
3,321,d,7,x,5


So `id` column, loaded as `("id", "Unnamed: 0_level_1")`. Now lets apply function for geting a new Index.

In [11]:
new_columns = fix_pd_multiIndex(df.columns)
new_columns

MultiIndex([(    'id',     ''),
            ('group1', 'col1'),
            ('group1', 'col2'),
            ('group2', 'col3'),
            ('group2', 'col4')],
           )

And now apply it to our dataframe.

In [12]:
df.columns = new_columns
df

Unnamed: 0_level_0,id,group1,group1,group2,group2
Unnamed: 0_level_1,Unnamed: 1_level_1,col1,col2,col3,col4
0,323,a,4,g,5
1,433,b,5,r,6
2,123,c,6,d,7
3,321,d,7,x,5
