# Testing notebook

In [1]:
import xlsxwriter
import pandas as pd
import numpy as np

from classification_power_predictor import classification_power_predictor
from writing import *

## We need some testing data

Data generator for the study of interrelationships

In [2]:
def inter_data_gen(grop_size = 200, grops_count = 3, relation_param = 1.2):
    # lets creaate a sample wich contains columns for every situation
    Y = np.concatenate([np.zeros(grop_size) + i for i in range(grops_count)])
    related_col = np.concatenate(
        [np.random.normal(i*relation_param, 0.5, grop_size) for i in range(grops_count)]
    )
    non_related_col = np.concatenate(
        [np.random.normal(0, 0.5, grop_size) for i in range(grops_count)]
    )
    # get <grops_count>-order quantiles
    related_col_quants = np.array(
        [np.quantile(related_col,(i+1)/grops_count) for i in range(grops_count)]
    )
    non_related_col_quants = np.array(
        [np.quantile(non_related_col,(i+1)/grops_count) for i in range(grops_count)]
    )
    test_frame = pd.DataFrame({
        "Y" : Y,
        "related_col" : related_col,
        "non_related_col" : non_related_col,
        "related_col_cat" : \
            map(lambda x: str(sum(x > related_col_quants)), related_col),
        "non_related_col_cat":\
            map(
                lambda x: str(sum(x > non_related_col_quants)), 
                non_related_col
            ),
        "same_value_num": np.zeros(grop_size * grops_count),
        "same_value_nom": ["yes"] * (grop_size * grops_count)
    })
    
    test_frame["Y_names"] =\
    test_frame['Y'].replace({
        level : "name " + str(level) \
        for level in test_frame['Y'].unique()
    })
    
    
    test_frame['related_col_emp'] = test_frame['related_col']
    test_frame.loc[
        test_frame['related_col_emp'].sample(int(grop_size/2)).index,
        'related_col_emp'
    ] = np.NaN
    
    test_frame['non_related_col_cat_emp'] = test_frame['related_col_cat']
    test_frame.loc[
        test_frame['non_related_col_cat_emp'].sample(int(grop_size/2)).index,
        'non_related_col_cat_emp'
    ] = np.NaN
    
    return test_frame

temp_frame = inter_data_gen()
temp_frame

Unnamed: 0,Y,related_col,non_related_col,related_col_cat,non_related_col_cat,same_value_num,same_value_nom,Y_names,related_col_emp,non_related_col_cat_emp
0,0.0,-0.841267,-0.430856,0,0,0.0,yes,name 0.0,-0.841267,0
1,0.0,0.641915,-0.485366,1,0,0.0,yes,name 0.0,0.641915,1
2,0.0,0.407267,0.417193,0,2,0.0,yes,name 0.0,0.407267,0
3,0.0,0.090025,0.662345,0,2,0.0,yes,name 0.0,0.090025,0
4,0.0,0.052934,0.066626,0,1,0.0,yes,name 0.0,0.052934,0
...,...,...,...,...,...,...,...,...,...,...
595,2.0,2.574189,-0.443823,2,0,0.0,yes,name 2.0,2.574189,2
596,2.0,1.945861,-0.118554,2,1,0.0,yes,name 2.0,1.945861,2
597,2.0,1.908283,0.921075,2,2,0.0,yes,name 2.0,1.908283,2
598,2.0,2.577461,0.053144,2,1,0.0,yes,name 2.0,2.577461,


# Computions funcitons

In [3]:
from computions import *

## get_describe_nominal function test

Basic situation

In [4]:
get_describe_nominal(temp_frame["related_col_cat"], temp_frame.Y)

Unnamed: 0_level_0,count,0.0,0.0%,1.0,1.0%,2.0,2.0%
related_col_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,200,183.0,91.5,17,8.5,0.0,0.0
1,200,17.0,8.5,159,79.5,24.0,12.0
2,200,0.0,0.0,24,12.0,176.0,88.0


## Stats computing test

Testig settings

In [5]:
KS_conv_level = 0.05
second_kind_error_count = 0
first_kind_error_count = 0
test_count = 200

Stats numeric

In [6]:
for i in range(test_count):
    data_for_testing = inter_data_gen(relation_param = 1.2)
    
    related_stats = get_stats_numeric(
        data_for_testing['related_col'], data_for_testing['Y']
    )
    non_related_stats = get_stats_numeric(
        data_for_testing['non_related_col'], data_for_testing['Y']
    )
    
    if related_stats[2]['AUC'] < non_related_stats[2]['AUC']:  
        raise ValueError(
            'AUC of related sample is smaller than AUC of independent!'
        )
        break
    if related_stats[0]['AUC'] > 0.5:
        raise ValueError(
            'AUC for factor with smaller on average levels must be lower than 0.5'
        )
        break
    if related_stats[2]['KS'] < non_related_stats[2]['KS']:
        raise ValueError(
            'KS of related sample is smaller than KS of independent!'
        )
    
    if related_stats[0]['KS_p_val'] > KS_conv_level:
        first_kind_error_count += 1
    if non_related_stats[0]['KS_p_val'] < KS_conv_level:
        second_kind_error_count += 1
        
print("KS test results")
print("first kind error")
print(
    "count " + str(first_kind_error_count) + "; share " +\
    str(np.round(first_kind_error_count*100/test_count, 3)) + "%"
)
print("second kind error")
print(
    "count " + str(second_kind_error_count) + "; share " +\
    str(np.round(second_kind_error_count*100/test_count, 3)) + "%"
)

KS test results
first kind error
count 0; share 0.0%
second kind error
count 12; share 6.0%


Stats nominal

In [7]:
for i in range(test_count):
    data_for_testing = inter_data_gen(relation_param = 1.2)
    
    related_stats = get_stats_nominal(
        data_for_testing['related_col_cat'], data_for_testing['Y']
    )
    non_related_stats = get_stats_nominal(
        data_for_testing['non_related_col_cat'], data_for_testing['Y']
    )
    
    if related_stats[2]['AUC'] < non_related_stats[2]['AUC']:  
        raise ValueError(
            'AUC of related sample is smaller than AUC of independent!'
        )
        break
    if related_stats[2]['KS'] < non_related_stats[2]['KS']:
        raise ValueError(
            'KS of related sample is smaller than KS of independent!'
        )
        break
        
    if related_stats[0]['KS_p_val'] > KS_conv_level:
        first_kind_error_count += 1
    if non_related_stats[0]['KS_p_val'] < KS_conv_level:
        second_kind_error_count += 1
        
print("KS test results")
print("first kind error")
print(
    "count " + str(first_kind_error_count) + "; share " +\
    str(np.round(first_kind_error_count*100/test_count, 3)) + "%"
)
print("second kind error")
print(
    "count " + str(second_kind_error_count) + "; share " +\
    str(np.round(second_kind_error_count*100/test_count, 3)) + "%"
)

KS test results
first kind error
count 0; share 0.0%
second kind error
count 15; share 7.5%


Does it work with description table

In [8]:
dn_tab = get_describe_nominal(
    temp_frame["related_col_cat"], 
    temp_frame["Y_names"]
)

with_table = get_stats_nominal(
    temp_frame["related_col_cat"], 
    temp_frame["Y_names"], dn_tab
)
with_table

{'name 0.0': {'AUC': 0.9575, 'KS': 0.8725, 'KS_p_val': 2.868994402148923e-119},
 'name 1.0': {'AUC': 0.855, 'KS': 0.6925, 'KS_p_val': 3.440346392592297e-64},
 'name 2.0': {'AUC': 0.94,
  'KS': 0.8200000000000001,
  'KS_p_val': 1.192022874466095e-98}}

## Getting full info about stats - no difference nominal or numeric

In [9]:
data_for_testing = inter_data_gen(relation_param = 1.2)
print("result for numeric column")
print(get_full_stats(
    data_for_testing["related_col"], data_for_testing.Y, "numeric"
))
print("result for nominal column")
print(get_full_stats(
    data_for_testing["related_col_cat"], data_for_testing.Y, "nominal"
))

result for numeric column
{0.0: {'AUC': 0.9842625, 'KS': 0.9025, 'KS_p_val': 7.111292868302766e-135, 'rel_type': -1, 'GINI': 0.9685250000000001, 'Count': 200, 'Empty': 0, 'Empty% in level': 0.0, 'Empty% in all Empty': 0.0}, 1.0: {'AUC': 0.5131625, 'KS': 0.39, 'KS_p_val': 9.909944208347607e-19, 'rel_type': 1, 'GINI': 0.026324999999999932, 'Count': 200, 'Empty': 0, 'Empty% in level': 0.0, 'Empty% in all Empty': 0.0}, 2.0: {'AUC': 0.9711000000000001, 'KS': 0.8374999999999999, 'KS_p_val': 7.22298933508805e-105, 'rel_type': 1, 'GINI': 0.9422000000000001, 'Count': 200, 'Empty': 0, 'Empty% in level': 0.0, 'Empty% in all Empty': 0.0}}
result for nominal column
{0.0: {'AUC': 0.9625, 'KS': 0.8875000000000001, 'KS_p_val': 1.4208747055147733e-126, 'rel_type': 1, 'GINI': 0.925, 'Count': 200, 'Empty': 0, 'Empty% in level': 0.0, 'Empty% in all Empty': 0.0}, 1.0: {'AUC': 0.8475, 'KS': 0.655, 'KS_p_val': 2.097350950346832e-56, 'rel_type': 1, 'GINI': 0.6950000000000001, 'Count': 200, 'Empty': 0, 'Empty%

## all computions

Basic situations

In [10]:
def get_all_computions(column, y_col, fillna_nominal = 'Empty'):
    '''Realise all computions for each column'''
    # inputs:
    # column - pandas.Series predictors column
    # y_col - pandas.Series predicted column
    # fillna_nominal -  optional, the value vich will replace na-values
    #                   "Empty" by default
    
    if len(column[np.invert(column.isna())].unique()) <= 1:
        raise ValueError(
            "getted column with all same values - " +
            column.name
        )
    
    new_column_data = {}
    new_column_data['name'] = column.name
    new_column_data['empty count'] = sum(column.isna())
    new_column_data['empty part'] = (new_column_data['empty count'] / 
                                      column.shape[0])

    is_numeric = np.isin(
        column.dtype, [np.int64, np.float64, np.int32, np.float64]
    )

    if is_numeric:
        new_column_data['predictor_type'] = 'numeric'
        new_column_data['describe_table'] = get_describe_numeric(column)
                
    else:
        new_column_data['predictor_type'] = 'nominal'
        new_column_data['describe_table'] = \
        get_describe_nominal(column.fillna(fillna_nominal), y_col)

    new_column_data['stats_result'] = get_full_stats( 
        column, y_col, 
        new_column_data['predictor_type'],
        descr_table = new_column_data['describe_table'],
        fillna_nominal = fillna_nominal
    )
            
    return new_column_data

In [11]:
get_all_computions(
    temp_frame['related_col'], 
    temp_frame.Y
)

{'name': 'related_col',
 'empty count': 0,
 'empty part': 0.0,
 'predictor_type': 'numeric',
 'describe_table':            Value:  Part %:
 count  600.000000      NaN
 mean     1.180370      NaN
 std      1.085079      NaN
 min     -1.374252      NaN
 25%      0.279912      NaN
 50%      1.183217      NaN
 75%      2.017473      NaN
 max      3.736054      NaN
 Empty    0.000000      0.0,
 'stats_result': {0.0: {'AUC': 0.985075,
   'KS': 0.885,
   'KS_p_val': 2.7004783689497414e-125,
   'rel_type': -1,
   'GINI': 0.9701500000000001,
   'Count': 200,
   'Empty': 0,
   'Empty% in level': 0.0,
   'Empty% in all Empty': 0.0},
  1.0: {'AUC': 0.514775,
   'KS': 0.3949999999999999,
   'KS_p_val': 3.2090210216417983e-19,
   'rel_type': 1,
   'GINI': 0.029549999999999965,
   'Count': 200,
   'Empty': 0,
   'Empty% in level': 0.0,
   'Empty% in all Empty': 0.0},
  2.0: {'AUC': 0.9702999999999999,
   'KS': 0.8325,
   'KS_p_val': 4.902183744688514e-103,
   'rel_type': 1,
   'GINI': 0.9405999999999

In [12]:
get_all_computions(
    temp_frame['related_col_emp'], 
    temp_frame.Y
)

{'name': 'related_col_emp',
 'empty count': 100,
 'empty part': 0.16666666666666666,
 'predictor_type': 'numeric',
 'describe_table':            Value:    Part %:
 count  500.000000        NaN
 mean     1.165168        NaN
 std      1.086926        NaN
 min     -1.374252        NaN
 25%      0.232285        NaN
 50%      1.202541        NaN
 75%      2.017473        NaN
 max      3.736054        NaN
 Empty  100.000000  16.666667,
 'stats_result': {0.0: {'AUC': 0.9836470609146981,
   'KS': 0.8802680460015286,
   'KS_p_val': 1.596497478655724e-104,
   'rel_type': -1,
   'GINI': 0.9672941218293962,
   'Count': 200,
   'Empty': 29,
   'Empty% in level': 14.5,
   'Empty% in all Empty': 29.0},
  1.0: {'AUC': 0.5298232364100628,
   'KS': 0.3995792199385013,
   'KS_p_val': 1.664084732260213e-16,
   'rel_type': 1,
   'GINI': 0.059646472820125584,
   'Count': 200,
   'Empty': 33,
   'Empty% in level': 16.5,
   'Empty% in all Empty': 33.0},
  2.0: {'AUC': 0.9666337935568705,
   'KS': 0.8149243918

In [13]:
get_all_computions(
    temp_frame['non_related_col_cat'], 
    temp_frame.Y
)

{'name': 'non_related_col_cat',
 'empty count': 0,
 'empty part': 0.0,
 'predictor_type': 'nominal',
 'describe_table':                      count  0.0  0.0%  1.0  1.0%  2.0  2.0%
 non_related_col_cat                                        
 0                      200   65  32.5   61  30.5   74  37.0
 2                      200   73  36.5   61  30.5   66  33.0
 1                      200   62  31.0   78  39.0   60  30.0,
 'stats_result': {0.0: {'AUC': 0.5275,
   'KS': 0.04749999999999999,
   'KS_p_val': 0.910913641987852,
   'rel_type': 1,
   'GINI': 0.05499999999999994,
   'Count': 200,
   'Empty': 0,
   'Empty% in level': 0.0,
   'Empty% in all Empty': 0.0},
  1.0: {'AUC': 0.5425000000000001,
   'KS': 0.08500000000000002,
   'KS_p_val': 0.2758551010228671,
   'rel_type': 1,
   'GINI': 0.08500000000000019,
   'Count': 200,
   'Empty': 0,
   'Empty% in level': 0.0,
   'Empty% in all Empty': 0.0},
  2.0: {'AUC': 0.5349999999999999,
   'KS': 0.05499999999999999,
   'KS_p_val': 0.79503984

In [14]:
get_all_computions(
    temp_frame['non_related_col_cat_emp'], 
    temp_frame.Y
)

{'name': 'non_related_col_cat_emp',
 'empty count': 100,
 'empty part': 0.16666666666666666,
 'predictor_type': 'nominal',
 'describe_table':                          count    0.0       0.0%  1.0       1.0%    2.0  \
 non_related_col_cat_emp                                                   
 0                          170  153.0  90.000000   17  10.000000    0.0   
 1                          169   13.0   7.692308  134  79.289941   22.0   
 2                          161    0.0   0.000000   20  12.422360  141.0   
 Empty                      100   34.0  34.000000   29  29.000000   37.0   
 
                               2.0%  
 non_related_col_cat_emp             
 0                         0.000000  
 1                        13.017751  
 2                        87.577640  
 Empty                    37.000000  ,
 'stats_result': {0.0: {'AUC': 0.93633125,
   'KS': 0.7275,
   'KS_p_val': 3.022160932114918e-72,
   'rel_type': 1,
   'GINI': 0.8726624999999999,
   'Count': 200,
   'Empt

## stats_info_to_DataFrame

In [18]:
data_for_testing = inter_data_gen(relation_param = 1.2)


stats_info_to_DataFrame(
    get_all_computions(
        data_for_testing['non_related_col'], 
        data_for_testing.Y
    )['stats_result']
)


stats_info_to_DataFrame(
    get_all_computions(
        data_for_testing['related_col_emp'], 
        data_for_testing.Y
    )['stats_result']
)

      0.0                                                                 \
      AUC      KS  KS_p_val rel_type    GINI Count Empty Empty% in level   
0  0.5296  0.0875  0.245688       -1  0.0592   200     0             0.0   

                           1.0  ...                          2.0          \
  Empty% in all Empty      AUC  ... Empty% in all Empty      AUC      KS   
0                 0.0  0.50865  ...                 0.0  0.53825  0.0875   

                                                                              
   KS_p_val rel_type    GINI Count Empty Empty% in level Empty% in all Empty  
0  0.245688        1  0.0765   200     0             0.0                 0.0  

[1 rows x 27 columns]


Unnamed: 0_level_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
Unnamed: 0_level_1,AUC,KS,KS_p_val,rel_type,GINI,Count,Empty,Empty% in level,Empty% in all Empty,AUC,...,Empty% in all Empty,AUC,KS,KS_p_val,rel_type,GINI,Count,Empty,Empty% in level,Empty% in all Empty
0,0.977422,0.828675,0.0,-1,0.954844,200,35,17.5,35.0,0.512611,...,29.0,0.966028,0.826074,0.0,1,0.932056,200,36,18.0,36.0


Getting indicators from computions uotput

In [17]:
data_for_testing = inter_data_gen(relation_param = 1.2)

comp_result =  get_all_computions(
    data_for_testing['non_related_col'], 
    data_for_testing.Y
)

print('full numeric data')
get_predictor_row(comp_result)

full numeric data


Unnamed: 0_level_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,Empty,Empty part
Unnamed: 0_level_1,AUC,KS,KS_p_val,rel_type,GINI,Count,Empty,Empty% in level,Empty% in all Empty,AUC,...,KS,KS_p_val,rel_type,GINI,Count,Empty,Empty% in level,Empty% in all Empty,Unnamed: 20_level_1,Unnamed: 21_level_1
non_related_col,0.500675,0.06,0.701569,1,0.00135,200,0,0.0,0.0,0.51125,...,0.0625,0.65301,-1,0.02385,200,0,0.0,0.0,0,0.0


# Writing to excel

Testing data

In [None]:
# frame size
n = 1000

test_frame = pd.DataFrame({
    "numeric_variable": np.random.normal(10, 5, n), 
    "object_variable": np.round(np.random.uniform(0, 10, n)).astype('O'),
    "column with soo000oo000oo000oo000oo long name": np.random.normal(10, 5, n),
    "Num_col_with_emp" : np.random.normal(10, 5, n),
    "Nom_col_with_emp" : np.round(np.random.uniform(0, 10, n)).astype('O')
})
test_frame.loc[test_frame.sample(50).index, "Num_col_with_emp"] = np.NaN
test_frame.loc[test_frame.sample(50).index, "Nom_col_with_emp"] = np.NaN

nv = test_frame["numeric_variable"]
ov = test_frame["object_variable"]

probs1 = nv.apply(lambda x: (x/(max(nv) - min(nv))) + ((np.random.rand())/5))
probs2 = ov.apply(lambda x: (x/(max(ov) - min(ov))) + ((np.random.rand())/5))

f_probs = (probs1 + probs2)/2
f_probs[f_probs >= 1] = 1


Y = np.zeros(n)
Y[f_probs > 0.4] = 1
Y[f_probs > 0.7] = 2

Y = pd.Series(Y)
Y_strs = Y.replace({0: 'cetegory 0', 1:'category 1', 2:'category 2'})
Y.value_counts()
Y_binary = Y.replace({2:1})

na_containts_frame = test_frame.copy()

for col in test_frame.columns:
    na_containts_frame.loc[na_containts_frame.sample(int(n/50)).index, col] = np.NaN

Adding a info about different predictors

In [15]:
my_cpp = classification_power_predictor(test_frame, Y)
my_cpp.update_predictors()

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

Let's show final table

In [None]:
my_cpp = classification_power_predictor(test_frame, Y)
my_cpp.update_predictors()
#my_cpp.get_predictors_data()
my_cpp.result_DF

Writig a double header table

In [25]:
xl_writer = pd.ExcelWriter("test_result/double_header_saver.xlsx",engine='xlsxwriter')
ws = xl_writer.book.add_worksheet('test')
xl_writer.sheets['test'] = ws      

my_cpp = classification_power_predictor(test_frame, Y)
my_cpp.update_predictors()
my_cpp.my_writer = xl_writer

print_double_column_header( 
    my_cpp, ws, 
    my_cpp.result_DF.loc[['numeric_variable'],:],
    "A1", 'Hello World', 
    xl_writer.book.add_format(my_cpp.default_header_format)
)

xl_writer.close()

Writing info about some predictor

# Full test

Basic setup

In [32]:
xl_writer = pd.ExcelWriter("test_result/writings.xlsx",engine='xlsxwriter')

ws = xl_writer.book.add_worksheet('test')
xl_writer.sheets['test'] = ws                             

my_cpp = classification_power_predictor(test_frame, Y)
my_cpp.update_predictors()

xl_writer.close()

In [33]:
xl_writer = pd.ExcelWriter("test_result/result_test.xlsx",engine='xlsxwriter')
my_cpp.write_to_book(xl_writer)
xl_writer.close()