# Testing notebook

In [1]:
import xlsxwriter
import pandas as pd
import numpy as np

from classification_power_predictor import classification_power_predictor
from writing import *

## We need some testing data

Data generator for the study of interrelationships

In [2]:
def inter_data_gen(grop_size = 200, grops_count = 3, relation_param = 1.2):
    # lets creaate a sample wich contains columns for every situation
    Y = np.concatenate([np.zeros(grop_size) + i for i in range(grops_count)])
    related_col = np.concatenate(
        [np.random.normal(i*relation_param, 0.5, grop_size) for i in range(grops_count)]
    )
    non_related_col = np.concatenate(
        [np.random.normal(0, 0.5, grop_size) for i in range(grops_count)]
    )
    # get <grops_count>-order quantiles
    related_col_quants = np.array(
        [np.quantile(related_col,(i+1)/grops_count) for i in range(grops_count)]
    )
    non_related_col_quants = np.array(
        [np.quantile(non_related_col,(i+1)/grops_count) for i in range(grops_count)]
    )
    test_frame = pd.DataFrame({
        "Y" : Y,
        "related_col" : related_col,
        "non_related_col" : non_related_col,
        "related_col_cat" : \
            map(lambda x: sum(x > related_col_quants), related_col),
        "non_related_col_cat":\
            map(lambda x: sum(x > non_related_col_quants), non_related_col)
    })
    
    test_frame["Y_names"] =\
    test_frame['Y'].replace({
        level : "name " + str(level) \
        for level in test_frame['Y'].unique()
    })
    
    return test_frame

temp_frame = inter_data_gen()
temp_frame

Unnamed: 0,Y,related_col,non_related_col,related_col_cat,non_related_col_cat,Y_names
0,0.0,0.774005,0.004957,1,1,name 0.0
1,0.0,0.101674,0.021940,0,1,name 0.0
2,0.0,-0.287045,-0.551596,0,0,name 0.0
3,0.0,-0.427489,0.865847,0,2,name 0.0
4,0.0,0.289909,1.075807,0,2,name 0.0
...,...,...,...,...,...,...
595,2.0,2.276009,0.231230,2,2,name 2.0
596,2.0,2.447916,-0.191387,2,1,name 2.0
597,2.0,2.483454,0.012697,2,1,name 2.0
598,2.0,2.006605,-0.920000,2,0,name 2.0


# Computions funcitons

In [3]:
from computions import *

## get_describe_nominal function test

Basic situation

In [4]:
get_describe_nominal(temp_frame["related_col_cat"], temp_frame.Y)

Unnamed: 0_level_0,count,0.0,0.0%,1.0,1.0%,2.0,2.0%
related_col_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,200,22.0,11.0,159,79.5,19.0,9.5
0,200,178.0,89.0,22,11.0,0.0,0.0
2,200,0.0,0.0,19,9.5,181.0,90.5


## Stats computing test

Testig settings

In [5]:
def stats_error_mesage(error_type):
    print("ERROR:")
    print(error_type)
    print("may be try again random is random")

KS_conv_level = 0.05
second_kind_error_count = 0
first_kind_error_count = 0
test_count = 200

Stats numeric

In [6]:
for i in range(test_count):
    data_for_testing = inter_data_gen(relation_param = 1.2)
    
    related_stats = get_stats_numeric(
        data_for_testing['related_col'], data_for_testing['Y']
    )
    non_related_stats = get_stats_numeric(
        data_for_testing['non_related_col'], data_for_testing['Y']
    )
    
    if related_stats[2]['AUC'] < non_related_stats[2]['AUC']:  
        stats_error_mesage(
            'AUC of related sample is smaller than AUC of independent!'
        )
        break
    if related_stats[0]['AUC'] > 0.5:
        stats_error_mesage(
            'AUC for factor with smaller on average levels must be lower than 0.5'
        )
        break
    if related_stats[2]['KS'] < non_related_stats[2]['KS']:
        stats_error_mesage(
            'KS of related sample is smaller than KS of independent!'
        )
    
    if related_stats[0]['KS_p_val'] > KS_conv_level:
        first_kind_error_count += 1
    if non_related_stats[0]['KS_p_val'] < KS_conv_level:
        second_kind_error_count += 1
        
print("KS test results")
print("first kind error")
print(
    "count " + str(first_kind_error_count) + "; share " +\
    str(np.round(first_kind_error_count*100/test_count, 3)) + "%"
)
print("second kind error")
print(
    "count " + str(second_kind_error_count) + "; share " +\
    str(np.round(second_kind_error_count*100/test_count, 3)) + "%"
)

KS test results
first kind error
count 0; share 0.0%
second kind error
count 15; share 7.5%


Stats nominal

In [7]:
for i in range(test_count):
    data_for_testing = inter_data_gen(relation_param = 1.2)
    
    related_stats = get_stats_numeric(
        data_for_testing['related_col_cat'], data_for_testing['Y']
    )
    non_related_stats = get_stats_numeric(
        data_for_testing['non_related_col_cat'], data_for_testing['Y']
    )
    
    if related_stats[2]['AUC'] < non_related_stats[2]['AUC']:  
        stats_error_mesage(
            'AUC of related sample is smaller than AUC of independent!'
        )
        break
    if related_stats[0]['AUC'] > 0.5:
        stats_error_mesage(
            'AUC for factor with smaller on average levels must be lower than 0.5'
        )
        break
    if related_stats[2]['KS'] < non_related_stats[2]['KS']:
        stats_error_mesage(
            'KS of related sample is smaller than KS of independent!'
        )
        break
        
    if related_stats[0]['KS_p_val'] > KS_conv_level:
        first_kind_error_count += 1
    if non_related_stats[0]['KS_p_val'] < KS_conv_level:
        second_kind_error_count += 1
        
print("KS test results")
print("first kind error")
print(
    "count " + str(first_kind_error_count) + "; share " +\
    str(np.round(first_kind_error_count*100/test_count, 3)) + "%"
)
print("second kind error")
print(
    "count " + str(second_kind_error_count) + "; share " +\
    str(np.round(second_kind_error_count*100/test_count, 3)) + "%"
)

KS test results
first kind error
count 0; share 0.0%
second kind error
count 19; share 9.5%


Does it work with description table

In [8]:
#Y_strs = ["name" + str(i) for i in range(3)]


dn_tab = get_describe_nominal(
    temp_frame["related_col_cat"], 
    temp_frame["Y_names"]
)

with_table = get_stats_nominal(
    temp_frame["related_col_cat"], 
    temp_frame["Y_names"], dn_tab
)
with_table

{'name 0.0': {'AUC': 0.9450000000000001,
  'KS': 0.835,
  'KS_p_val': 6.03148620091097e-104},
 'name 1.0': {'AUC': 0.8500000000000001,
  'KS': 0.6925,
  'KS_p_val': 3.440346392592297e-64},
 'name 2.0': {'AUC': 0.9525,
  'KS': 0.8575,
  'KS_p_val': 1.0254123654727213e-112}}

Getting full info about stats - no difference nominal or numeric

In [9]:
data_for_testing = inter_data_gen(relation_param = 1.2)
print("result for numeric column")
print(get_full_stats(
    data_for_testing["related_col"], data_for_testing.Y, "numeric"
))
print("result for nominal column")
print(get_full_stats(
    data_for_testing["related_col_cat"], data_for_testing.Y, "nominal"
))

result for numeric column
{0.0: {'AUC': 0.9805875000000001, 'KS': 0.8525, 'KS_p_val': 1.1444416430251821e-110, 'rel_type': -1, 'GINI': 0.9611750000000001, 'Count': 200, 'Empty': 0, 'Empty% in level': 0.0, 'Empty% in all Empty': 0.0}, 1.0: {'AUC': 0.5035625, 'KS': 0.375, 'KS_p_val': 2.6421118749418694e-17, 'rel_type': 1, 'GINI': 0.007125000000000048, 'Count': 200, 'Empty': 0, 'Empty% in level': 0.0, 'Empty% in all Empty': 0.0}, 2.0: {'AUC': 0.9770249999999999, 'KS': 0.8325, 'KS_p_val': 4.902183744688514e-103, 'rel_type': 1, 'GINI': 0.9540499999999998, 'Count': 200, 'Empty': 0, 'Empty% in level': 0.0, 'Empty% in all Empty': 0.0}}
result for nominal column
{0.0: {'AUC': 0.94, 'KS': 0.8200000000000001, 'KS_p_val': 1.192022874466095e-98, 'rel_type': 1, 'GINI': 0.8799999999999999, 'Count': 200, 'Empty': 0, 'Empty% in level': 0.0, 'Empty% in all Empty': 0.0}, 1.0: {'AUC': 0.82, 'KS': 0.64, 'KS_p_val': 1.6598382707722782e-53, 'rel_type': 1, 'GINI': 0.6399999999999999, 'Count': 200, 'Empty': 0,

**And finally all computions**

In [10]:
data_for_testing = inter_data_gen(relation_param = 1.2)

print('describe_table with no nas')
print(get_all_comuptions(
    data_for_testing['related_col'], 
    data_for_testing.Y
)['describe_table'])

data_for_testing.loc[data_for_testing.sample(20).index] = np.NaN
print('describe_table with nas')
print(get_all_comuptions(
    data_for_testing['related_col'], 
    data_for_testing.Y
))

describe_table with no nas
           Value:  Part %:
count  600.000000      NaN
mean     1.183592      NaN
std      1.090629      NaN
min     -1.423055      NaN
25%      0.295307      NaN
50%      1.118480      NaN
75%      2.123768      NaN
max      3.612188      NaN
Empty    0.000000      0.0
describe_table with nas
{'name': 'related_col', 'empty count': 20, 'empty part': 0.03333333333333333, 'predictor_type': 'numeric', 'describe_table':            Value:   Part %:
count  580.000000       NaN
mean     1.182237       NaN
std      1.089016       NaN
min     -1.423055       NaN
25%      0.300463       NaN
50%      1.106985       NaN
75%      2.114439       NaN
max      3.612188       NaN
Empty   20.000000  3.333333, 'stats_result': {0.0: {'AUC': 0.9746295103092784, 'KS': 0.8391859965635738, 'KS_p_val': 1.5085323414505867e-101, 'rel_type': -1, 'GINI': 0.9492590206185567, 'Count': 192, 'Empty': 0, 'Empty% in level': 0.0, 'Empty% in all Empty': 0.0}, 1.0: {'AUC': 0.5083558094029299, 'KS'

In [11]:
data_for_testing = inter_data_gen(relation_param = 1.2)

stats_info_to_DataFrame(
    get_all_comuptions(
        data_for_testing['non_related_col'], 
        data_for_testing.Y
    )['stats_result']
)

Unnamed: 0_level_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
Unnamed: 0_level_1,AUC,KS,KS_p_val,rel_type,GINI,Count,Empty,Empty% in level,Empty% in all Empty,AUC,...,Empty% in all Empty,AUC,KS,KS_p_val,rel_type,GINI,Count,Empty,Empty% in level,Empty% in all Empty
0,0.514138,0.0575,0.749236,1,0.028275,200,0,0.0,0.0,0.510138,...,0.0,0.524275,0.065,0.604419,-1,0.04855,200,0,0.0,0.0


Getting indicators from computions uotput

In [12]:
data_for_testing = inter_data_gen(relation_param = 1.2)

comp_result =  get_all_comuptions(
    data_for_testing['non_related_col'], 
    data_for_testing.Y
)

print('full numeric data')
get_predictor_row(comp_result)

full numeric data


Unnamed: 0_level_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,Empty,Empty part
Unnamed: 0_level_1,AUC,KS,KS_p_val,rel_type,GINI,Count,Empty,Empty% in level,Empty% in all Empty,AUC,...,KS,KS_p_val,rel_type,GINI,Count,Empty,Empty% in level,Empty% in all Empty,Unnamed: 20_level_1,Unnamed: 21_level_1
non_related_col,0.530875,0.0725,0.465057,1,0.06175,200,0,0.0,0.0,0.502812,...,0.0825,0.308601,-1,0.067375,200,0,0.0,0.0,0,0.0


# Writing to excel

Testing data

In [13]:
# frame size
n = 1000

test_frame = pd.DataFrame({
    "numeric_variable": np.random.normal(10, 5, n), 
    "object_variable": np.round(np.random.uniform(0, 10, n)).astype('O'),
    "column with soo000oo000oo000oo000oo long name": np.random.normal(10, 5, n),
    "col_with_emp" : np.random.normal(10, 5, n)
})
test_frame.loc[test_frame.sample(50).index, "col_with_emp"] = np.NaN

nv = test_frame["numeric_variable"]
ov = test_frame["object_variable"]

probs1 = nv.apply(lambda x: (x/(max(nv) - min(nv))) + ((np.random.rand())/5))
probs2 = ov.apply(lambda x: (x/(max(ov) - min(ov))) + ((np.random.rand())/5))

f_probs = (probs1 + probs2)/2
f_probs[f_probs >= 1] = 1


Y = np.zeros(n)
Y[f_probs > 0.4] = 1
Y[f_probs > 0.7] = 2

Y = pd.Series(Y)
Y_strs = Y.replace({0: 'cetegory 0', 1:'category 1', 2:'category 2'})
Y.value_counts()
Y_binary = Y.replace({2:1})

na_containts_frame = test_frame.copy()

for col in test_frame.columns:
    na_containts_frame.loc[na_containts_frame.sample(int(n/50)).index, col] = np.NaN

Adding a info about different predictors

In [14]:
my_cpp = classification_power_predictor(test_frame, Y)
my_cpp.update_predictors()

Let's show final table

In [15]:
my_cpp = classification_power_predictor(test_frame, Y)
my_cpp.update_predictors()
#my_cpp.get_predictors_data()
my_cpp.result_DF

Unnamed: 0_level_0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,Empty,Empty part
Unnamed: 0_level_1,AUC,KS,KS_p_val,rel_type,GINI,Count,Empty,Empty% in level,Empty% in all Empty,AUC,...,KS,KS_p_val,rel_type,GINI,Count,Empty,Empty% in level,Empty% in all Empty,Unnamed: 20_level_1,Unnamed: 21_level_1
numeric_variable,0.515189,0.060248,0.323139,1,0.030377,574,0,0.0,0.0,0.691697,...,0.451532,0.0,1,0.555821,147,0,0.0,0.0,0,0.0
object_variable,0.819114,0.487191,0.0,1,0.638228,574,0,0.0,0.0,0.932921,...,0.691134,0.0,1,0.83376,147,0,0.0,0.0,0,0.0
column with soo000oo000oo000oo000oo long name,0.50348,0.030974,0.967091,-1,0.00696,574,0,0.0,0.0,0.511198,...,0.070188,0.545545,-1,0.022354,147,0,0.0,0.0,0,0.0
col_with_emp,0.516536,0.067294,0.23139,-1,0.033072,574,31,5.400697,62.0,0.505055,...,0.108437,0.113025,1,0.048444,147,8,5.442177,16.0,50,0.05


Writig a double header table

In [16]:
xl_writer = pd.ExcelWriter("test_result/double_header_saver.xlsx",engine='xlsxwriter')
ws = xl_writer.book.add_worksheet('test')
xl_writer.sheets['test'] = ws      

my_cpp = classification_power_predictor(test_frame, Y)
my_cpp.update_predictors()
my_cpp.my_writer = xl_writer

print_double_column_header( 
    my_cpp, ws, 
    my_cpp.result_DF.loc[['numeric_variable'],:],
    "A1", 'Hello World', 
    xl_writer.book.add_format(my_cpp.default_header_format)
)

xl_writer.close()

Writing info about some predictor

In [18]:
xl_writer = pd.ExcelWriter("test_result/writings.xlsx",engine='xlsxwriter')

ws = xl_writer.book.add_worksheet('test')
xl_writer.sheets['test'] = ws                             

my_cpp = classification_power_predictor(test_frame, Y)
my_cpp.update_predictors()

xl_writer.close()

In [None]:
xl_writer = pd.ExcelWriter("test_result/result_test.xlsx",engine='xlsxwriter')
my_cpp.write_to_book(xl_writer)
xl_writer.close()