# Testing notebook

In [1]:
import xlsxwriter
import pandas as pd
import numpy as np

from classification_power_predictor import classification_power_predictor
from writing import *

## We need some testing data

Data generator for the study of interrelationships

In [2]:
def inter_data_gen(grop_size = 200, grops_count = 3, relation_param = 1.2):
    # lets creaate a sample wich contains columns for every situation
    Y = np.concatenate([np.zeros(grop_size) + i for i in range(grops_count)])
    related_col = np.concatenate(
        [np.random.normal(i*relation_param, 0.5, grop_size) for i in range(grops_count)]
    )
    non_related_col = np.concatenate(
        [np.random.normal(0, 0.5, grop_size) for i in range(grops_count)]
    )
    # get <grops_count>-order quantiles
    related_col_quants = np.array(
        [np.quantile(related_col,(i+1)/grops_count) for i in range(grops_count)]
    )
    non_related_col_quants = np.array(
        [np.quantile(non_related_col,(i+1)/grops_count) for i in range(grops_count)]
    )
    test_frame = pd.DataFrame({
        "Y" : Y,
        "related_col" : related_col,
        "non_related_col" : non_related_col,
        "related_col_cat" : \
            map(lambda x: str(sum(x > related_col_quants)), related_col),
        "non_related_col_cat":\
            map(
                lambda x: str(sum(x > non_related_col_quants)), 
                non_related_col
            ),
        "same_value_num": np.zeros(grop_size * grops_count),
        "same_value_nom": ["yes"] * (grop_size * grops_count)
    })
    
    test_frame["Y_names"] =\
    test_frame['Y'].replace({
        level : "name " + str(level) \
        for level in test_frame['Y'].unique()
    })
    
    
    test_frame['related_col_emp'] = test_frame['related_col']
    test_frame.loc[
        test_frame['related_col_emp'].sample(int(grop_size/2)).index,
        'related_col_emp'
    ] = np.NaN
    
    test_frame['non_related_col_cat_emp'] = test_frame['related_col_cat']
    test_frame.loc[
        test_frame['non_related_col_cat_emp'].sample(int(grop_size/2)).index,
        'non_related_col_cat_emp'
    ] = np.NaN
    
    return test_frame

temp_frame = inter_data_gen()
temp_frame

Unnamed: 0,Y,related_col,non_related_col,related_col_cat,non_related_col_cat,same_value_num,same_value_nom,Y_names,related_col_emp,non_related_col_cat_emp
0,0.0,-0.093937,-0.282317,0,0,0.0,yes,name 0.0,-0.093937,
1,0.0,-0.731781,0.144617,0,1,0.0,yes,name 0.0,-0.731781,0
2,0.0,-0.382713,0.264627,0,2,0.0,yes,name 0.0,,0
3,0.0,-0.565249,0.327062,0,2,0.0,yes,name 0.0,-0.565249,0
4,0.0,0.563397,0.671040,0,2,0.0,yes,name 0.0,0.563397,0
...,...,...,...,...,...,...,...,...,...,...
595,2.0,2.400304,-0.146089,2,1,0.0,yes,name 2.0,2.400304,2
596,2.0,3.499982,0.336043,2,2,0.0,yes,name 2.0,,2
597,2.0,2.289874,-0.478007,2,0,0.0,yes,name 2.0,2.289874,2
598,2.0,1.836556,-0.340781,2,0,0.0,yes,name 2.0,1.836556,


# Computions funcitons

In [3]:
from computions import *

## get_describe_nominal function test

Basic situation

In [4]:
get_describe_nominal(temp_frame["related_col_cat"], temp_frame.Y)

Unnamed: 0_level_0,count,0.0,0.0%,1.0,1.0%,2.0,2.0%
related_col_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,200,175.0,87.5,25,12.5,0.0,0.0
1,200,25.0,12.5,153,76.5,22.0,11.0
2,200,0.0,0.0,22,11.0,178.0,89.0


## Stats computing test

Testig settings

In [5]:
KS_conv_level = 0.05
second_kind_error_count = 0
first_kind_error_count = 0
test_count = 200

Stats numeric

In [6]:
for i in range(test_count):
    data_for_testing = inter_data_gen(relation_param = 1.2)
    
    related_stats = get_stats_numeric(
        data_for_testing['related_col'], data_for_testing['Y']
    )
    non_related_stats = get_stats_numeric(
        data_for_testing['non_related_col'], data_for_testing['Y']
    )
    
    if related_stats[2]['AUC'] < non_related_stats[2]['AUC']:  
        raise ValueError(
            'AUC of related sample is smaller than AUC of independent!'
        )
        break
    if related_stats[0]['AUC'] > 0.5:
        raise ValueError(
            'AUC for factor with smaller on average levels must be lower than 0.5'
        )
        break
    if related_stats[2]['KS'] < non_related_stats[2]['KS']:
        raise ValueError(
            'KS of related sample is smaller than KS of independent!'
        )
    
    if related_stats[0]['KS_p_val'] > KS_conv_level:
        first_kind_error_count += 1
    if non_related_stats[0]['KS_p_val'] < KS_conv_level:
        second_kind_error_count += 1
        
print("KS test results")
print("first kind error")
print(
    "count " + str(first_kind_error_count) + "; share " +\
    str(np.round(first_kind_error_count*100/test_count, 3)) + "%"
)
print("second kind error")
print(
    "count " + str(second_kind_error_count) + "; share " +\
    str(np.round(second_kind_error_count*100/test_count, 3)) + "%"
)

KS test results
first kind error
count 0; share 0.0%
second kind error
count 9; share 4.5%


Stats nominal

In [7]:
for i in range(test_count):
    data_for_testing = inter_data_gen(relation_param = 1.2)
    
    related_stats = get_stats_nominal(
        data_for_testing['related_col_cat'], data_for_testing['Y']
    )
    non_related_stats = get_stats_nominal(
        data_for_testing['non_related_col_cat'], data_for_testing['Y']
    )
    
    if related_stats[2]['AUC'] < non_related_stats[2]['AUC']:  
        raise ValueError(
            'AUC of related sample is smaller than AUC of independent!'
        )
        break
    if related_stats[2]['KS'] < non_related_stats[2]['KS']:
        raise ValueError(
            'KS of related sample is smaller than KS of independent!'
        )
        break
        
    if related_stats[0]['KS_p_val'] > KS_conv_level:
        first_kind_error_count += 1
    if non_related_stats[0]['KS_p_val'] < KS_conv_level:
        second_kind_error_count += 1
        
print("KS test results")
print("first kind error")
print(
    "count " + str(first_kind_error_count) + "; share " +\
    str(np.round(first_kind_error_count*100/test_count, 3)) + "%"
)
print("second kind error")
print(
    "count " + str(second_kind_error_count) + "; share " +\
    str(np.round(second_kind_error_count*100/test_count, 3)) + "%"
)

KS test results
first kind error
count 0; share 0.0%
second kind error
count 10; share 5.0%


Does it work with description table

In [8]:
dn_tab = get_describe_nominal(
    temp_frame["related_col_cat"], 
    temp_frame["Y_names"]
)

with_table = get_stats_nominal(
    temp_frame["related_col_cat"], 
    temp_frame["Y_names"], dn_tab
)
with_table

{'name 0.0': {'AUC': 0.9375, 'KS': 0.8125, 'KS_p_val': 3.870088828635011e-96},
 'name 1.0': {'AUC': 0.8275, 'KS': 0.6475, 'KS_p_val': 6.100660513936802e-55},
 'name 2.0': {'AUC': 0.9450000000000001,
  'KS': 0.835,
  'KS_p_val': 6.03148620091097e-104}}

## Getting full info about stats - no difference nominal or numeric

In [9]:
data_for_testing = inter_data_gen(relation_param = 1.2)
print("result for numeric column")
print(get_full_stats(
    data_for_testing["related_col"], data_for_testing.Y, "numeric"
))
print("result for nominal column")
print(get_full_stats(
    data_for_testing["related_col_cat"], data_for_testing.Y, "nominal"
))

result for numeric column
{0.0: {'AUC': 0.97285, 'KS': 0.825, 'KS_p_val': 2.2563555189899545e-100, 'rel_type': -1, 'GINI': 0.9457, 'Count': 200, 'Empty': 0, 'Empty% in level': 0.0, 'Empty% in all Empty': 0.0}, 1.0: {'AUC': 0.505425, 'KS': 0.35, 'KS_p_val': 4.5395316242411e-15, 'rel_type': -1, 'GINI': 0.010850000000000026, 'Count': 200, 'Empty': 0, 'Empty% in level': 0.0, 'Empty% in all Empty': 0.0}, 2.0: {'AUC': 0.978275, 'KS': 0.865, 'KS_p_val': 6.553529640023942e-116, 'rel_type': 1, 'GINI': 0.95655, 'Count': 200, 'Empty': 0, 'Empty% in level': 0.0, 'Empty% in all Empty': 0.0}}
result for nominal column
{0.0: {'AUC': 0.9325, 'KS': 0.7975, 'KS_p_val': 2.3581393559109397e-91, 'rel_type': 1, 'GINI': 0.865, 'Count': 200, 'Empty': 0, 'Empty% in level': 0.0, 'Empty% in all Empty': 0.0}, 1.0: {'AUC': 0.8225, 'KS': 0.6325000000000001, 'KS_p_val': 4.2317438329182994e-52, 'rel_type': 1, 'GINI': 0.645, 'Count': 200, 'Empty': 0, 'Empty% in level': 0.0, 'Empty% in all Empty': 0.0}, 2.0: {'AUC': 0.

## all computions

Basic situations

In [10]:
get_all_computions(
    temp_frame['related_col'], 
    temp_frame.Y
)

{'name': 'related_col',
 'empty count': 0,
 'empty part%': 0.0,
 'predictor_type': 'numeric',
 'describe_table':            Value:  Part %:
 count  600.000000      NaN
 mean     1.199726      NaN
 std      1.113036      NaN
 min     -1.372088      NaN
 25%      0.322414      NaN
 50%      1.186223      NaN
 75%      2.148612      NaN
 max      3.667398      NaN
 Empty    0.000000      0.0,
 'stats_result': {0.0: {'AUC': 0.97185,
   'KS': 0.8374999999999999,
   'KS_p_val': 7.22298933508805e-105,
   'rel_type': -1,
   'GINI': 0.9437,
   'Count': 200,
   'Empty': 0,
   'Empty% in level': 0.0,
   'Empty% in all Empty': 0.0},
  1.0: {'AUC': 0.5031625,
   'KS': 0.36,
   'KS_p_val': 6.080580717515716e-16,
   'rel_type': -1,
   'GINI': 0.006324999999999914,
   'Count': 200,
   'Empty': 0,
   'Empty% in level': 0.0,
   'Empty% in all Empty': 0.0},
  2.0: {'AUC': 0.9750125000000001,
   'KS': 0.8475,
   'KS_p_val': 1.1127225408894443e-108,
   'rel_type': 1,
   'GINI': 0.9500250000000001,
   'Coun

In [11]:
get_all_computions(
    temp_frame['related_col_emp'], 
    temp_frame.Y
)

{'name': 'related_col_emp',
 'empty count': 100,
 'empty part%': 16.666666666666668,
 'predictor_type': 'numeric',
 'describe_table':            Value:    Part %:
 count  500.000000        NaN
 mean     1.209745        NaN
 std      1.129782        NaN
 min     -1.208402        NaN
 25%      0.305828        NaN
 50%      1.200295        NaN
 75%      2.183520        NaN
 max      3.667398        NaN
 Empty  100.000000  16.666667,
 'stats_result': {0.0: {'AUC': 0.9715864527629233,
   'KS': 0.8360071301247771,
   'KS_p_val': 6.405329511964237e-88,
   'rel_type': -1,
   'GINI': 0.9431729055258466,
   'Count': 200,
   'Empty': 30,
   'Empty% in level': 15.0,
   'Empty% in all Empty': 30.0},
  1.0: {'AUC': 0.5129136851520573,
   'KS': 0.3690369707811568,
   'KS_p_val': 1.4032832887734825e-13,
   'rel_type': -1,
   'GINI': 0.025827370304114527,
   'Count': 200,
   'Empty': 44,
   'Empty% in level': 22.0,
   'Empty% in all Empty': 44.0},
  2.0: {'AUC': 0.9786157534729567,
   'KS': 0.862033707

In [12]:
get_all_computions(
    temp_frame['non_related_col_cat'], 
    temp_frame.Y
)

{'name': 'non_related_col_cat',
 'empty count': 0,
 'empty part%': 0.0,
 'predictor_type': 'nominal',
 'describe_table':                      count  0.0  0.0%  1.0  1.0%  2.0  2.0%
 non_related_col_cat                                        
 0                      200   70  35.0   66  33.0   64  32.0
 1                      200   66  33.0   73  36.5   61  30.5
 2                      200   64  32.0   61  30.5   75  37.5,
 'stats_result': {0.0: {'AUC': 0.515,
   'KS': 0.024999999999999967,
   'KS_p_val': 0.9999897215931545,
   'rel_type': 1,
   'GINI': 0.030000000000000027,
   'Count': 200,
   'Empty': 0,
   'Empty% in level': 0.0,
   'Empty% in all Empty': 0.0},
  1.0: {'AUC': 0.53,
   'KS': 0.04749999999999999,
   'KS_p_val': 0.910913641987852,
   'rel_type': 1,
   'GINI': 0.06000000000000005,
   'Count': 200,
   'Empty': 0,
   'Empty% in level': 0.0,
   'Empty% in all Empty': 0.0},
  2.0: {'AUC': 0.5349999999999999,
   'KS': 0.0625,
   'KS_p_val': 0.6530103012036852,
   'rel_type': 

In [13]:
get_all_computions(
    temp_frame['non_related_col_cat_emp'], 
    temp_frame.Y
)

{'name': 'non_related_col_cat_emp',
 'empty count': 100,
 'empty part%': 16.666666666666668,
 'predictor_type': 'nominal',
 'describe_table':                          count    0.0       0.0%  1.0       1.0%    2.0  \
 non_related_col_cat_emp                                                   
 1                          168   23.0  13.690476  127  75.595238   18.0   
 0                          166  144.0  86.746988   22  13.253012    0.0   
 2                          166    0.0   0.000000   18  10.843373  148.0   
 Empty                      100   33.0  33.000000   33  33.000000   34.0   
 
                               2.0%  
 non_related_col_cat_emp             
 1                        10.714286  
 0                         0.000000  
 2                        89.156627  
 Empty                    34.000000  ,
 'stats_result': {0.0: {'AUC': 0.910875,
   'KS': 0.6649999999999999,
   'KS_p_val': 2.1060927030672474e-58,
   'rel_type': 1,
   'GINI': 0.82175,
   'Count': 200,
   'Empt

## stats_info_to_DataFrame

In [14]:
data_for_testing = inter_data_gen(relation_param = 1.2)


stats_info_to_DataFrame(
    get_all_computions(
        data_for_testing['non_related_col'], 
        data_for_testing.Y
    )['stats_result']
)


stats_info_to_DataFrame(
    get_all_computions(
        data_for_testing['related_col_emp'], 
        data_for_testing.Y
    )['stats_result']
)

Unnamed: 0_level_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
Unnamed: 0_level_1,AUC,KS,KS_p_val,rel_type,GINI,Count,Empty,Empty% in level,Empty% in all Empty,AUC,...,Empty% in all Empty,AUC,KS,KS_p_val,rel_type,GINI,Count,Empty,Empty% in level,Empty% in all Empty
0,0.974079,0.830047,0.0,-1,0.948158,200,31,15.5,31.0,0.501942,...,36.0,0.974951,0.837982,0.0,1,0.949902,200,33,16.5,33.0


Getting indicators from computions uotput

In [15]:
data_for_testing = inter_data_gen(relation_param = 1.2)

comp_result =  get_all_computions(
    data_for_testing['non_related_col'], 
    data_for_testing.Y
)

print('full numeric data')
get_predictor_row(comp_result)

full numeric data


Unnamed: 0_level_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,Empty,Empty part%
Unnamed: 0_level_1,AUC,KS,KS_p_val,rel_type,GINI,Count,Empty,Empty% in level,Empty% in all Empty,AUC,...,KS,KS_p_val,rel_type,GINI,Count,Empty,Empty% in level,Empty% in all Empty,Unnamed: 20_level_1,Unnamed: 21_level_1
non_related_col,0.523775,0.0875,0.245688,1,0.04755,200,0,0.0,0.0,0.503688,...,0.07,0.509914,-1,0.040175,200,0,0.0,0.0,0,0.0


# Writing to excel

Testing data

In [16]:
# frame size
n = 1000

test_frame = pd.DataFrame({
    "numeric_variable": np.random.normal(10, 5, n), 
    "object_variable": np.round(np.random.uniform(0, 10, n)).astype('O'),
    "column with soo000oo000oo000oo000oo long name": np.random.normal(10, 5, n),
    "Num_col_with_emp" : np.random.normal(10, 5, n),
    "Nom_col_with_emp" : np.round(np.random.uniform(0, 10, n)).astype('O')
})
test_frame.loc[test_frame.sample(50).index, "Num_col_with_emp"] = np.NaN
test_frame.loc[test_frame.sample(50).index, "Nom_col_with_emp"] = np.NaN

nv = test_frame["numeric_variable"]
ov = test_frame["object_variable"]

probs1 = nv.apply(lambda x: (x/(max(nv) - min(nv))) + ((np.random.rand())/5))
probs2 = ov.apply(lambda x: (x/(max(ov) - min(ov))) + ((np.random.rand())/5))

f_probs = (probs1 + probs2)/2
f_probs[f_probs >= 1] = 1


Y = np.zeros(n)
Y[f_probs > 0.4] = 1
Y[f_probs > 0.7] = 2

Y = pd.Series(Y)
Y_strs = Y.replace({0: 'cetegory 0', 1:'category 1', 2:'category 2'})
Y.value_counts()
Y_binary = Y.replace({2:1})

na_containts_frame = test_frame.copy()

for col in test_frame.columns:
    na_containts_frame.loc[na_containts_frame.sample(int(n/50)).index, col] = np.NaN

Adding a info about different predictors

In [17]:
my_cpp = classification_power_predictor(test_frame, Y)
my_cpp.update_predictors()

Let's show final table

In [18]:
my_cpp = classification_power_predictor(test_frame, Y)
my_cpp.update_predictors()
#my_cpp.get_predictors_data()
my_cpp.result_DF

Unnamed: 0_level_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,Empty,Empty part%
Unnamed: 0_level_1,AUC,KS,KS_p_val,rel_type,GINI,Count,Empty,Empty% in level,Empty% in all Empty,AUC,...,KS,KS_p_val,rel_type,GINI,Count,Empty,Empty% in level,Empty% in all Empty,Unnamed: 20_level_1,Unnamed: 21_level_1
numeric_variable,0.689264,0.272096,0.0,-1,0.378528,299,0,0.0,0.0,0.769797,...,0.108659,0.005657,1,0.074751,571,0,0.0,0.0,0,0.0
object_variable,0.933024,0.718796,0.0,1,0.866049,299,0,0.0,0.0,0.928404,...,0.522222,0.0,1,0.679624,571,0,0.0,0.0,0,0.0
column with soo000oo000oo000oo000oo long name,0.529497,0.067057,0.28851,-1,0.058994,299,0,0.0,0.0,0.527666,...,0.03801,0.857345,1,0.024931,571,0,0.0,0.0,0,0.0
Num_col_with_emp,0.52643,0.056497,0.530387,1,0.052859,299,15,5.016722,30.0,0.536145,...,0.0343,0.938977,-1,0.012685,571,25,4.378284,50.0,50,5.0
Nom_col_with_emp,0.536279,0.055248,0.525003,1,0.072558,299,16,5.351171,32.0,0.572551,...,0.066236,0.222628,1,0.09193,571,31,5.429072,62.0,50,5.0


Writig a double header table

In [19]:
xl_writer = pd.ExcelWriter("test_result/double_header_saver.xlsx",engine='xlsxwriter')
ws = xl_writer.book.add_worksheet('test')
xl_writer.sheets['test'] = ws      

my_cpp = classification_power_predictor(test_frame, Y)
my_cpp.update_predictors()
my_cpp.my_writer = xl_writer

print_double_column_header( 
    my_cpp, ws, 
    my_cpp.result_DF.loc[['numeric_variable'],:],
    "A1", 'Hello World', 
    xl_writer.book.add_format(my_cpp.default_header_format)
)

xl_writer.close()

Writing info about some predictor

# Full test

Basic setup

In [20]:
xl_writer = pd.ExcelWriter("test_result/writings.xlsx",engine='xlsxwriter')

ws = xl_writer.book.add_worksheet('test')
xl_writer.sheets['test'] = ws                             

my_cpp = classification_power_predictor(test_frame, Y)
my_cpp.update_predictors()

xl_writer.close()

In [21]:
xl_writer = pd.ExcelWriter("test_result/result_test.xlsx",engine='xlsxwriter')
my_cpp.write_to_book(xl_writer)
xl_writer.close()