# Testing notebook

In [1]:
import xlsxwriter
import pandas as pd
import numpy as np

from classification_power_predictor import classification_power_predictor
from writing import *

## We need some testing data

Data generator for the study of interrelationships

In [2]:
def inter_data_gen(grop_size = 200, grops_count = 3, relation_param = 1.2):
    # lets creaate a sample wich contains columns for every situation
    Y = np.concatenate([np.zeros(grop_size) + i for i in range(grops_count)])
    related_col = np.concatenate(
        [np.random.normal(i*relation_param, 0.5, grop_size) for i in range(grops_count)]
    )
    non_related_col = np.concatenate(
        [np.random.normal(0, 0.5, grop_size) for i in range(grops_count)]
    )
    # get <grops_count>-order quantiles
    related_col_quants = np.array(
        [np.quantile(related_col,(i+1)/grops_count) for i in range(grops_count)]
    )
    non_related_col_quants = np.array(
        [np.quantile(non_related_col,(i+1)/grops_count) for i in range(grops_count)]
    )
    test_frame = pd.DataFrame({
        "Y" : Y,
        "related_col" : related_col,
        "non_related_col" : non_related_col,
        "related_col_cat" : \
            map(lambda x: str(sum(x > related_col_quants)), related_col),
        "non_related_col_cat":\
            map(
                lambda x: str(sum(x > non_related_col_quants)), 
                non_related_col
            ),
        "same_value_num": np.zeros(grop_size * grops_count),
        "same_value_nom": ["yes"] * (grop_size * grops_count)
    })
    
    test_frame["Y_names"] =\
    test_frame['Y'].replace({
        level : "name " + str(level) \
        for level in test_frame['Y'].unique()
    })
    
    
    test_frame['related_col_emp'] = test_frame['related_col']
    test_frame.loc[
        test_frame['related_col_emp'].sample(int(grop_size/2)).index,
        'related_col_emp'
    ] = np.NaN
    
    test_frame['non_related_col_cat_emp'] = test_frame['related_col_cat']
    test_frame.loc[
        test_frame['non_related_col_cat_emp'].sample(int(grop_size/2)).index,
        'non_related_col_cat_emp'
    ] = np.NaN
    
    return test_frame

temp_frame = inter_data_gen()
temp_frame

Unnamed: 0,Y,related_col,non_related_col,related_col_cat,non_related_col_cat,same_value_num,same_value_nom,Y_names,related_col_emp,non_related_col_cat_emp
0,0.0,0.808448,0.062530,1,1,0.0,yes,name 0.0,,1
1,0.0,-0.461488,-0.186105,0,0,0.0,yes,name 0.0,,0
2,0.0,0.912255,-0.375989,1,0,0.0,yes,name 0.0,0.912255,1
3,0.0,-0.026570,-0.260699,0,0,0.0,yes,name 0.0,-0.026570,0
4,0.0,0.218478,-1.483134,0,0,0.0,yes,name 0.0,0.218478,0
...,...,...,...,...,...,...,...,...,...,...
595,2.0,2.392919,0.259174,2,1,0.0,yes,name 2.0,2.392919,2
596,2.0,1.732847,-0.179040,1,0,0.0,yes,name 2.0,1.732847,1
597,2.0,1.789880,-0.073691,2,1,0.0,yes,name 2.0,,2
598,2.0,2.804882,0.770160,2,2,0.0,yes,name 2.0,2.804882,2


# Computions funcitons

In [3]:
from computions import *

## get_describe_nominal function test

Basic situation

In [4]:
get_describe_nominal(temp_frame["related_col_cat"], temp_frame.Y)

Unnamed: 0_level_0,count,0.0,0.0%,1.0,1.0%,2.0,2.0%
related_col_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,200,22.0,11.0,157,78.5,21.0,10.5
0,200,178.0,89.0,22,11.0,0.0,0.0
2,200,0.0,0.0,21,10.5,179.0,89.5


## Stats computing test

Testig settings

In [5]:
KS_conv_level = 0.05
second_kind_error_count = 0
first_kind_error_count = 0
test_count = 200

Stats numeric

In [6]:
for i in range(test_count):
    data_for_testing = inter_data_gen(relation_param = 1.2)
    
    related_stats = get_stats_numeric(
        data_for_testing['related_col'], data_for_testing['Y']
    )
    non_related_stats = get_stats_numeric(
        data_for_testing['non_related_col'], data_for_testing['Y']
    )
    
    if related_stats[2]['AUC'] < non_related_stats[2]['AUC']:  
        raise ValueError(
            'AUC of related sample is smaller than AUC of independent!'
        )
        break
    if related_stats[0]['AUC'] > 0.5:
        raise ValueError(
            'AUC for factor with smaller on average levels must be lower than 0.5'
        )
        break
    if related_stats[2]['KS'] < non_related_stats[2]['KS']:
        raise ValueError(
            'KS of related sample is smaller than KS of independent!'
        )
    
    if related_stats[0]['KS_p_val'] > KS_conv_level:
        first_kind_error_count += 1
    if non_related_stats[0]['KS_p_val'] < KS_conv_level:
        second_kind_error_count += 1
        
print("KS test results")
print("first kind error")
print(
    "count " + str(first_kind_error_count) + "; share " +\
    str(np.round(first_kind_error_count*100/test_count, 3)) + "%"
)
print("second kind error")
print(
    "count " + str(second_kind_error_count) + "; share " +\
    str(np.round(second_kind_error_count*100/test_count, 3)) + "%"
)

KS test results
first kind error
count 0; share 0.0%
second kind error
count 11; share 5.5%


Stats nominal

In [7]:
for i in range(test_count):
    data_for_testing = inter_data_gen(relation_param = 1.2)
    
    related_stats = get_stats_nominal(
        data_for_testing['related_col_cat'], data_for_testing['Y']
    )
    non_related_stats = get_stats_nominal(
        data_for_testing['non_related_col_cat'], data_for_testing['Y']
    )
    
    if related_stats[2]['AUC'] < non_related_stats[2]['AUC']:  
        raise ValueError(
            'AUC of related sample is smaller than AUC of independent!'
        )
        break
    if related_stats[2]['KS'] < non_related_stats[2]['KS']:
        raise ValueError(
            'KS of related sample is smaller than KS of independent!'
        )
        break
        
    if related_stats[0]['KS_p_val'] > KS_conv_level:
        first_kind_error_count += 1
    if non_related_stats[0]['KS_p_val'] < KS_conv_level:
        second_kind_error_count += 1
        
print("KS test results")
print("first kind error")
print(
    "count " + str(first_kind_error_count) + "; share " +\
    str(np.round(first_kind_error_count*100/test_count, 3)) + "%"
)
print("second kind error")
print(
    "count " + str(second_kind_error_count) + "; share " +\
    str(np.round(second_kind_error_count*100/test_count, 3)) + "%"
)

KS test results
first kind error
count 0; share 0.0%
second kind error
count 13; share 6.5%


Does it work with description table

In [8]:
dn_tab = get_describe_nominal(
    temp_frame["related_col_cat"], 
    temp_frame["Y_names"]
)

with_table = get_stats_nominal(
    temp_frame["related_col_cat"], 
    temp_frame["Y_names"], dn_tab
)
with_table

{'name 0.0': {'AUC': 0.9450000000000001,
  'KS': 0.835,
  'KS_p_val': 6.03148620091097e-104},
 'name 1.0': {'AUC': 0.8400000000000001,
  'KS': 0.6775,
  'KS_p_val': 5.595357747107431e-61},
 'name 2.0': {'AUC': 0.9475, 'KS': 0.8425, 'KS_p_val': 9.517116054304806e-107}}

## Getting full info about stats - no difference nominal or numeric

In [9]:
data_for_testing = inter_data_gen(relation_param = 1.2)
print("result for numeric column")
print(get_full_stats(
    data_for_testing["related_col"], data_for_testing.Y, "numeric"
))
print("result for nominal column")
print(get_full_stats(
    data_for_testing["related_col_cat"], data_for_testing.Y, "nominal"
))

result for numeric column
{0.0: {'AUC': 0.980225, 'KS': 0.85, 'KS_p_val': 1.1473668301474423e-109, 'rel_type': -1, 'GINI': 0.96045, 'Count': 200, 'Empty': 0, 'Empty% in level': 0.0, 'Empty% in all Empty': 0.0}, 1.0: {'AUC': 0.5089625, 'KS': 0.375, 'KS_p_val': 2.6421118749418694e-17, 'rel_type': 1, 'GINI': 0.01792499999999997, 'Count': 200, 'Empty': 0, 'Empty% in level': 0.0, 'Empty% in all Empty': 0.0}, 2.0: {'AUC': 0.9712625000000001, 'KS': 0.85, 'KS_p_val': 1.1473668301474423e-109, 'rel_type': 1, 'GINI': 0.9425250000000003, 'Count': 200, 'Empty': 0, 'Empty% in level': 0.0, 'Empty% in all Empty': 0.0}}
result for nominal column
{0.0: {'AUC': 0.9475, 'KS': 0.8425, 'KS_p_val': 9.517116054304806e-107, 'rel_type': 1, 'GINI': 0.895, 'Count': 200, 'Empty': 0, 'Empty% in level': 0.0, 'Empty% in all Empty': 0.0}, 1.0: {'AUC': 0.8475, 'KS': 0.6925, 'KS_p_val': 3.440346392592297e-64, 'rel_type': 1, 'GINI': 0.6950000000000001, 'Count': 200, 'Empty': 0, 'Empty% in level': 0.0, 'Empty% in all Empt

## all computions

Basic situations

In [10]:
get_all_computions(
    temp_frame['related_col'], 
    temp_frame.Y
)

{'name': 'related_col',
 'empty count': 0,
 'empty part': 0.0,
 'predictor_type': 'numeric',
 'describe_table':            Value:  Part %:
 count  600.000000      NaN
 mean     1.202356      NaN
 std      1.083181      NaN
 min     -1.056858      NaN
 25%      0.274717      NaN
 50%      1.248043      NaN
 75%      2.139406      NaN
 max      3.504165      NaN
 Empty    0.000000      0.0,
 'stats_result': {0.0: {'AUC': 0.9747125,
   'KS': 0.8500000000000001,
   'KS_p_val': 1.1473668301473257e-109,
   'rel_type': -1,
   'GINI': 0.949425,
   'Count': 200,
   'Empty': 0,
   'Empty% in level': 0.0,
   'Empty% in all Empty': 0.0},
  1.0: {'AUC': 0.502175,
   'KS': 0.3625,
   'KS_p_val': 3.6419462233368156e-16,
   'rel_type': -1,
   'GINI': 0.004350000000000076,
   'Count': 200,
   'Empty': 0,
   'Empty% in level': 0.0,
   'Empty% in all Empty': 0.0},
  2.0: {'AUC': 0.9768875000000001,
   'KS': 0.8475,
   'KS_p_val': 1.1127225408894443e-108,
   'rel_type': 1,
   'GINI': 0.9537750000000003,
 

In [11]:
get_all_computions(
    temp_frame['related_col_emp'], 
    temp_frame.Y
)

{'name': 'related_col_emp',
 'empty count': 100,
 'empty part': 0.16666666666666666,
 'predictor_type': 'numeric',
 'describe_table':            Value:    Part %:
 count  500.000000        NaN
 mean     1.181972        NaN
 std      1.094301        NaN
 min     -1.039768        NaN
 25%      0.269243        NaN
 50%      1.214637        NaN
 75%      2.149139        NaN
 max      3.504165        NaN
 Empty  100.000000  16.666667,
 'stats_result': {0.0: {'AUC': 0.9766173867963317,
   'KS': 0.8625288260426536,
   'KS_p_val': 8.8519437761965e-97,
   'rel_type': -1,
   'GINI': 0.9532347735926634,
   'Count': 200,
   'Empty': 31,
   'Empty% in level': 15.5,
   'Empty% in all Empty': 31.0},
  1.0: {'AUC': 0.5098589535029228,
   'KS': 0.36378912744239256,
   'KS_p_val': 8.63729407474017e-14,
   'rel_type': 1,
   'GINI': 0.019717907005845525,
   'Count': 200,
   'Empty': 31,
   'Empty% in level': 15.5,
   'Empty% in all Empty': 31.0},
  2.0: {'AUC': 0.976842720432464,
   'KS': 0.85364161005186

In [12]:
get_all_computions(
    temp_frame['non_related_col_cat'], 
    temp_frame.Y
)

{'name': 'non_related_col_cat',
 'empty count': 0,
 'empty part': 0.0,
 'predictor_type': 'nominal',
 'describe_table':                      count  0.0  0.0%  1.0  1.0%  2.0  2.0%
 non_related_col_cat                                        
 1                      200   63  31.5   67  33.5   70  35.0
 0                      200   68  34.0   67  33.5   65  32.5
 2                      200   69  34.5   66  33.0   65  32.5,
 'stats_result': {0.0: {'AUC': 0.515,
   'KS': 0.02750000000000008,
   'KS_p_val': 0.9999080320232008,
   'rel_type': 1,
   'GINI': 0.030000000000000027,
   'Count': 200,
   'Empty': 0,
   'Empty% in level': 0.0,
   'Empty% in all Empty': 0.0},
  1.0: {'AUC': 0.5025,
   'KS': 0.0050000000000000044,
   'KS_p_val': 1.0,
   'rel_type': 1,
   'GINI': 0.004999999999999893,
   'Count': 200,
   'Empty': 0,
   'Empty% in level': 0.0,
   'Empty% in all Empty': 0.0},
  2.0: {'AUC': 0.5125000000000001,
   'KS': 0.024999999999999967,
   'KS_p_val': 0.9999897215931545,
   'rel_type

In [13]:
get_all_computions(
    temp_frame['non_related_col_cat_emp'], 
    temp_frame.Y
)

{'name': 'non_related_col_cat_emp',
 'empty count': 100,
 'empty part': 0.16666666666666666,
 'predictor_type': 'nominal',
 'describe_table':                          count    0.0       0.0%  1.0       1.0%    2.0  \
 non_related_col_cat_emp                                                   
 2                          174    0.0   0.000000   17   9.770115  157.0   
 0                          168  148.0  88.095238   20  11.904762    0.0   
 1                          158   17.0  10.759494  125  79.113924   16.0   
 Empty                      100   35.0  35.000000   38  38.000000   27.0   
 
                               2.0%  
 non_related_col_cat_emp             
 2                        90.229885  
 0                         0.000000  
 1                        10.126582  
 Empty                    27.000000  ,
 'stats_result': {0.0: {'AUC': 0.9254874999999999,
   'KS': 0.7025,
   'KS_p_val': 2.0825070258460832e-66,
   'rel_type': 1,
   'GINI': 0.8509749999999998,
   'Count': 200,

## stats_info_to_DataFrame

In [14]:
data_for_testing = inter_data_gen(relation_param = 1.2)


stats_info_to_DataFrame(
    get_all_computions(
        data_for_testing['non_related_col'], 
        data_for_testing.Y
    )['stats_result']
)


stats_info_to_DataFrame(
    get_all_computions(
        data_for_testing['related_col_emp'], 
        data_for_testing.Y
    )['stats_result']
)

Unnamed: 0_level_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
Unnamed: 0_level_1,AUC,KS,KS_p_val,rel_type,GINI,Count,Empty,Empty% in level,Empty% in all Empty,AUC,...,Empty% in all Empty,AUC,KS,KS_p_val,rel_type,GINI,Count,Empty,Empty% in level,Empty% in all Empty
0,0.979536,0.859021,0.0,-1,0.959073,200,33,16.5,33.0,0.5116,...,32.0,0.970746,0.844324,0.0,1,0.941493,200,35,17.5,35.0


Getting indicators from computions uotput

In [15]:
data_for_testing = inter_data_gen(relation_param = 1.2)

comp_result =  get_all_computions(
    data_for_testing['non_related_col'], 
    data_for_testing.Y
)

print('full numeric data')
get_predictor_row(comp_result)

full numeric data


Unnamed: 0_level_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,Empty,Empty part
Unnamed: 0_level_1,AUC,KS,KS_p_val,rel_type,GINI,Count,Empty,Empty% in level,Empty% in all Empty,AUC,...,KS,KS_p_val,rel_type,GINI,Count,Empty,Empty% in level,Empty% in all Empty,Unnamed: 20_level_1,Unnamed: 21_level_1
non_related_col,0.501563,0.055,0.79504,-1,0.003125,200,0,0.0,0.0,0.527238,...,0.085,0.275855,-1,0.05135,200,0,0.0,0.0,0,0.0


# Writing to excel

Testing data

In [16]:
# frame size
n = 1000

test_frame = pd.DataFrame({
    "numeric_variable": np.random.normal(10, 5, n), 
    "object_variable": np.round(np.random.uniform(0, 10, n)).astype('O'),
    "column with soo000oo000oo000oo000oo long name": np.random.normal(10, 5, n),
    "Num_col_with_emp" : np.random.normal(10, 5, n),
    "Nom_col_with_emp" : np.round(np.random.uniform(0, 10, n)).astype('O')
})
test_frame.loc[test_frame.sample(50).index, "Num_col_with_emp"] = np.NaN
test_frame.loc[test_frame.sample(50).index, "Nom_col_with_emp"] = np.NaN

nv = test_frame["numeric_variable"]
ov = test_frame["object_variable"]

probs1 = nv.apply(lambda x: (x/(max(nv) - min(nv))) + ((np.random.rand())/5))
probs2 = ov.apply(lambda x: (x/(max(ov) - min(ov))) + ((np.random.rand())/5))

f_probs = (probs1 + probs2)/2
f_probs[f_probs >= 1] = 1


Y = np.zeros(n)
Y[f_probs > 0.4] = 1
Y[f_probs > 0.7] = 2

Y = pd.Series(Y)
Y_strs = Y.replace({0: 'cetegory 0', 1:'category 1', 2:'category 2'})
Y.value_counts()
Y_binary = Y.replace({2:1})

na_containts_frame = test_frame.copy()

for col in test_frame.columns:
    na_containts_frame.loc[na_containts_frame.sample(int(n/50)).index, col] = np.NaN

Adding a info about different predictors

In [17]:
my_cpp = classification_power_predictor(test_frame, Y)
my_cpp.update_predictors()

Let's show final table

In [18]:
my_cpp = classification_power_predictor(test_frame, Y)
my_cpp.update_predictors()
#my_cpp.get_predictors_data()
my_cpp.result_DF

Unnamed: 0_level_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,Empty,Empty part
Unnamed: 0_level_1,AUC,KS,KS_p_val,rel_type,GINI,Count,Empty,Empty% in level,Empty% in all Empty,AUC,...,KS,KS_p_val,rel_type,GINI,Count,Empty,Empty% in level,Empty% in all Empty,Unnamed: 20_level_1,Unnamed: 21_level_1
numeric_variable,0.705242,0.299232,0.0,-1,0.410484,306,0,0.0,0.0,0.564492,...,0.450255,0.0,1,0.570337,110,0,0.0,0.0,0,0.0
object_variable,0.924841,0.682865,0.0,1,0.849683,306,0,0.0,0.0,0.821693,...,0.729316,0.0,1,0.841665,110,0,0.0,0.0,0,0.0
column with soo000oo000oo000oo000oo long name,0.509865,0.052843,0.576098,-1,0.01973,306,0,0.0,0.0,0.501453,...,0.063023,0.807672,1,0.05001,110,0,0.0,0.0,0,0.0
Num_col_with_emp,0.505194,0.054201,0.574111,1,0.010388,306,14,4.575163,28.0,0.506382,...,0.064375,0.811737,1,0.009138,110,6,5.454545,12.0,50,0.05
Nom_col_with_emp,0.547233,0.067902,0.269958,1,0.094465,306,13,4.248366,26.0,0.542792,...,0.120429,0.107217,1,0.158049,110,6,5.454545,12.0,50,0.05


Writig a double header table

In [19]:
xl_writer = pd.ExcelWriter("test_result/double_header_saver.xlsx",engine='xlsxwriter')
ws = xl_writer.book.add_worksheet('test')
xl_writer.sheets['test'] = ws      

my_cpp = classification_power_predictor(test_frame, Y)
my_cpp.update_predictors()
my_cpp.my_writer = xl_writer

print_double_column_header( 
    my_cpp, ws, 
    my_cpp.result_DF.loc[['numeric_variable'],:],
    "A1", 'Hello World', 
    xl_writer.book.add_format(my_cpp.default_header_format)
)

xl_writer.close()

Writing info about some predictor

# Full test

Basic setup

In [20]:
xl_writer = pd.ExcelWriter("test_result/writings.xlsx",engine='xlsxwriter')

ws = xl_writer.book.add_worksheet('test')
xl_writer.sheets['test'] = ws                             

my_cpp = classification_power_predictor(test_frame, Y)
my_cpp.update_predictors()

xl_writer.close()

In [21]:
xl_writer = pd.ExcelWriter("test_result/result_test.xlsx",engine='xlsxwriter')
my_cpp.write_to_book(xl_writer)
xl_writer.close()