In [1]:
import pandas as pd
import numpy as np
from scipy.stats import ks_2samp
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the dataset into a DataFrame; 'sample.csv' is a relative path, so it is
# resolved against the notebook's current working directory.
df = pd.read_csv('sample.csv')

In [3]:
# Preview the first rows of the loaded frame as a quick sanity check.
df.head()

Unnamed: 0,record,fraud_label,dow_risk,ssn_day_since,ssn_count_0,ssn_count_1,ssn_count_3,ssn_count_7,ssn_count_14,ssn_count_30,...,ssn_homephone_count_0_by_3,ssn_homephone_count_0_by_7,ssn_homephone_count_0_by_14,ssn_homephone_count_0_by_30,ssn_homephone_count_1_by_3,ssn_homephone_count_1_by_7,ssn_homephone_count_1_by_14,ssn_homephone_count_1_by_30,ssn_name_count_0_by_3,ssn_name_count_0_by_7
0,1,0,0.014499,0.0,1,1,1,1,1,1,...,3.0,7.0,14.0,30.0,3.0,7.0,14.0,30.0,3.0,7.0
1,2,1,0.014499,0.0,1,1,1,1,1,1,...,3.0,7.0,14.0,30.0,3.0,7.0,14.0,30.0,3.0,7.0
2,3,0,0.014499,0.0,1,1,1,1,1,1,...,3.0,7.0,14.0,30.0,3.0,7.0,14.0,30.0,3.0,7.0
3,4,0,0.014499,0.0,1,1,1,1,1,1,...,3.0,7.0,14.0,30.0,3.0,7.0,14.0,30.0,3.0,7.0
4,5,0,0.014499,0.0,1,1,1,1,1,1,...,3.0,7.0,14.0,30.0,3.0,7.0,14.0,30.0,3.0,7.0


In [4]:
# Show the column index of the loaded frame (names and total count).
df.columns

Index(['record', 'fraud_label', 'dow_risk', 'ssn_day_since', 'ssn_count_0',
       'ssn_count_1', 'ssn_count_3', 'ssn_count_7', 'ssn_count_14',
       'ssn_count_30',
       ...
       'ssn_homephone_count_0_by_3', 'ssn_homephone_count_0_by_7',
       'ssn_homephone_count_0_by_14', 'ssn_homephone_count_0_by_30',
       'ssn_homephone_count_1_by_3', 'ssn_homephone_count_1_by_7',
       'ssn_homephone_count_1_by_14', 'ssn_homephone_count_1_by_30',
       'ssn_name_count_0_by_3', 'ssn_name_count_0_by_7'],
      dtype='object', length=311)

In [5]:
# Label vector: the 'fraud_label' column. Later cells compare it to 1 / 0 to
# split the rows and pass it as the reference labelling for the MI scores.
y = df['fraud_label']

In [6]:
# Partition the rows by label value: label 1 -> df_f, label 0 -> df_nf
# (presumably fraud / non-fraud, going by the column name).
df_f, df_nf = (df[y == label] for label in (1, 0))

In [7]:
# Repeat every label-1 row 100 times (each row duplicated consecutively, original
# row order kept) and rebuild a DataFrame carrying the original column names.
#
# - Built from `df[y == 1]` rather than from the current `df_f`, so re-running this
#   cell does not compound the repetition (100x, then 10,000x, ...).
# - Column names are passed to the constructor instead of calling
#   `set_axis(..., inplace=True)`, which pandas >= 2.0 rejects with a TypeError.
#
# NOTE(review): duplicating rows leaves the empirical CDF, and therefore the KS
# statistic, unchanged; it only inflates the sample size the p-value is based on.
# Confirm that this is the intended effect.
df_f = pd.DataFrame(np.repeat(df[y == 1].values, 100, axis=0), columns=df.columns)

In [8]:
# Two-sample Kolmogorov-Smirnov test for every column of `df`, comparing the
# values in df_f against those in df_nf. Results are stored by column name
# (in column order) and then echoed one line per column.
ks_stat = {col: ks_2samp(df_f[col], df_nf[col]) for col in df.columns}
for col, ks in ks_stat.items():
    print(f'KS for {col}: {ks.statistic:.3f}, with p: {ks.pvalue:.3f}')

KS for record: 0.097, with p: 1.000
KS for fraud_label: 1.000, with p: 1.000
KS for dow_risk: 0.066, with p: 1.000
KS for ssn_day_since: 0.117, with p: 1.000
KS for ssn_count_0: 0.084, with p: 1.000
KS for ssn_count_1: 0.135, with p: 1.000
KS for ssn_count_3: 0.135, with p: 1.000
KS for ssn_count_7: 0.135, with p: 1.000
KS for ssn_count_14: 0.135, with p: 1.000
KS for ssn_count_30: 0.135, with p: 1.000
KS for address_day_since: 0.043, with p: 1.000
KS for address_count_0: 0.041, with p: 1.000
KS for address_count_1: 0.051, with p: 1.000
KS for address_count_3: 0.060, with p: 1.000
KS for address_count_7: 0.060, with p: 1.000
KS for address_count_14: 0.060, with p: 1.000
KS for address_count_30: 0.060, with p: 1.000
KS for dob_day_since: 0.116, with p: 1.000
KS for dob_count_0: 0.060, with p: 1.000
KS for dob_count_1: 0.120, with p: 1.000
KS for dob_count_3: 0.115, with p: 1.000
KS for dob_count_7: 0.115, with p: 1.000
KS for dob_count_14: 0.115, with p: 1.000
KS for dob_count_30: 0.115

In [18]:
# Rank the (column, KS result) pairs from highest to lowest value, once by
# p-value and once by the KS statistic. Ties keep their `ks_stat` order.
sorted_ks_pvalue, sorted_ks_stat = (
    sorted(ks_stat.items(), key=lambda pair: getattr(pair[1], field), reverse=True)
    for field in ('pvalue', 'statistic')
)

In [19]:
# Write the statistic-ranked KS results to 'ks_stat.csv' as
# "column,statistic,pvalue" lines (no header row, no trailing newline).
with open('ks_stat.csv', 'w') as f:
    f.write('\n'.join(
        f'{name},{result.statistic},{result.pvalue}'
        for name, result in sorted_ks_stat
    ))

In [11]:
from sklearn.metrics import normalized_mutual_info_score

In [12]:
# Normalized mutual information between the label vector `y` and each column
# of `df`, keyed by column name in column order.
mi = {col: normalized_mutual_info_score(y, df[col]) for col in df.columns}

In [13]:
# (column, score) pairs ordered from highest to lowest mutual-information score;
# equal scores keep their `mi` insertion order.
sorted_mi = list(mi.items())
sorted_mi.sort(key=lambda pair: pair[1], reverse=True)

In [17]:
# Write the ranked mutual-information scores to 'mi_stat.csv' as
# "column,score" lines (no header row, no trailing newline).
with open('mi_stat.csv', 'w') as f:
    f.write('\n'.join(f'{name},{score}' for name, score in sorted_mi))

In [15]:
# Shortlist the top half of the candidate columns under each ranking
# (mutual information and KS statistic).
#
# The target column is removed before slicing: both rankings were computed over
# every column of `df`, including 'fraud_label' itself, which scores the maximum
# against itself (its KS statistic prints as 1.000). Left in, it would always
# occupy a slot in both sets and leak the label into the selected features.
#
# NOTE(review): 'record' looks like a row identifier in the head() preview and
# is still ranked here. Confirm whether it should be excluded as well.
target = 'fraud_label'
mi_ranked = [name for name, _score in sorted_mi if name != target]
ks_ranked = [name for name, _result in sorted_ks_stat if name != target]

num = len(mi_ranked) // 2
mi_set = set(mi_ranked[:num])
ks_set = set(ks_ranked[:num])