In [4]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import random
from sklearn.preprocessing import StandardScaler
from scipy.stats import skew, boxcox

In [5]:
import features

In [6]:

def readDataSetLexical(nrows = None):
    """Read the train/test CSV files and build the preprocessed feature frames.

    Steps, in order:
      1. read 'data/train.csv' and 'data/test.csv';
      2. split the feature columns into continuous ('cont' in the name) and
         categorical ('cat' in the name) ones;
      3. Box-Cox transform the skewed continuous columns via ``mungeskewed``;
      4. replace every categorical value by its integer code via ``encode``;
      5. standardize the continuous columns.

    Parameters
    ----------
    nrows : int or None
        Maximum number of rows to read from *each* CSV file. ``None`` (the
        default) reads the files completely.

    Returns
    -------
    tuple
        ``(train_features, test_features, train_ids, test_ids, train_loss)``.
        Both feature frames hold the continuous columns followed by the
        categorical columns.
    """
    ## read data
    # Forward nrows to pandas; it used to be accepted but silently ignored.
    train = pd.read_csv('data/train.csv', nrows=nrows)
    test = pd.read_csv('data/test.csv', nrows=nrows)

    # The first and last train columns are excluded from the feature scan
    # (presumably 'id' and 'loss', which are accessed by name below).
    feature_cols = train.columns[1:-1]
    numeric_feats = [x for x in feature_cols if 'cont' in x]
    cats = [x for x in feature_cols if 'cat' in x]

    train_test, ntrain = mungeskewed(train, test, numeric_feats)
    for col in cats:
        train_test[col] = train_test[col].apply(encode)

    # NOTE(review): the scaler is fit on train and test rows together.
    ss = StandardScaler()
    train_test[numeric_feats] = \
        ss.fit_transform(train_test[numeric_feats].values)

    # The first ntrain rows of the stacked frame are the training rows.
    train = train_test.iloc[:ntrain, :].copy()
    test = train_test.iloc[ntrain:, :].copy()
    # 'loss' on the test rows is only the placeholder added while stacking.
    test.drop('loss', inplace=True, axis=1)
    feats = numeric_feats + cats

    return train[feats], test[feats], train['id'], test['id'], train["loss"]

def mungeskewed(train, test, numeric_feats):
    """Stack train and test rows and Box-Cox transform the skewed numeric columns.

    Skewness is measured on the training rows only (NaNs dropped). Every
    column whose skew exceeds 0.25 is shifted by +1 and Box-Cox transformed
    on the stacked train+test rows.

    Parameters
    ----------
    train : pandas.DataFrame
        Training rows; must contain the columns in ``numeric_feats``.
    test : pandas.DataFrame
        Test rows. A placeholder ``loss`` column of zeros is added to a
        copy so the frames can be stacked; the caller's frame is left
        untouched.
    numeric_feats : list of str
        Names of the numeric columns to inspect.

    Returns
    -------
    (pandas.DataFrame, int)
        The stacked frame (train rows first, index reset) and the number of
        training rows.
    """
    ntrain = train.shape[0]
    # assign() returns a new frame, so the caller's ``test`` is not mutated.
    test_with_loss = test.assign(loss=0)
    train_test = pd.concat((train, test_with_loss)).reset_index(drop=True)

    # compute skew and do Box-Cox transformation (Tilli)
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))
    print("\nSkew in numeric features:")
    print(skewed_feats)
    skewed_cols = skewed_feats[skewed_feats > 0.25].index

    for col in skewed_cols:
        # The +1 shift presumably keeps the values positive for Box-Cox
        # -- TODO confirm the value range of the inputs.
        shifted = train_test[col] + 1
        # boxcox also returns the fitted lambda, which is not needed here.
        transformed, _lam = boxcox(shifted)
        train_test[col] = transformed
    return train_test, ntrain

def encode(charcode):
    """Convert a one- or two-letter category code into an integer.

    The code is read as a bijective base-26 number with 'A' == 1, so
    'A' -> 1, 'Z' -> 26, 'AA' -> 27, 'ZZ' -> 702. An empty string maps to 0.

    Parameters
    ----------
    charcode : str
        Category label of at most two characters.

    Returns
    -------
    int
        The integer code.

    Raises
    ------
    ValueError
        If ``charcode`` has more than two characters. (This used to print
        a message and call ``exit(0)``, which ended the whole process with
        a success status.)
    """
    if len(charcode) > 2:
        raise ValueError(
            "Error: Expected Maximum of Two Characters! Got %r" % (charcode,))
    r = 0
    for ch in charcode:
        # Horner form of sum(digit * 26**position).
        r = r * 26 + (ord(ch) - ord('A') + 1)
    return r

In [7]:
# Load the preprocessed feature frames, the id columns and the training loss.
# Called without nrows, so the CSV files are read completely.
train, test, trainid, testid, trainloss = readDataSetLexical()


Skew in numeric features:
cont1     0.516420
cont2    -0.310939
cont3    -0.010002
cont4     0.416093
cont5     0.681617
cont6     0.461211
cont7     0.826046
cont8     0.676629
cont9     1.072420
cont10    0.354998
cont11    0.280819
cont12    0.291990
cont13    0.380739
cont14    0.248672
dtype: float64


In [8]:
# Scaler for the full feature matrix; it is fit when trainscaled is built.
scaler = StandardScaler()

In [44]:
# Column labels of the training frame, used to turn column positions back into names.
colnames = train.columns.values

In [9]:
# Standardize every feature column, including the integer-encoded categorical ones.
# The continuous columns were already standardized inside readDataSetLexical.
trainscaled = np.array(scaler.fit_transform(train) )

In [10]:
# print() call form: same output on Python 2, and valid on Python 3.
print(trainscaled)

[[ 1.21847312 -1.26093561 -1.54047095 ..., -0.50931326  0.89154504
   1.57778495]
 [-0.86890602  1.10945138  0.46393202 ..., -0.50931326  0.89154504
  -0.90754745]
 [-1.33403589 -0.71847733 -0.07284589 ..., -0.50931326 -1.54631176
   0.02765495]
 ..., 
 [ 0.04762216  1.34456586  1.45202196 ..., -0.50931326 -0.73369283
  -0.9844134 ]
 [-0.21036669 -0.41018759 -0.98929947 ..., -0.50931326  0.89154504
   1.93649272]
 [ 1.93708147  0.54833973 -0.28834872 ..., -0.50931326  0.89154504
   2.01335867]]


In [18]:
# Pearson correlation of each scaled feature column with the training loss.
featurenum = trainscaled.shape[1]
feature_corr_list = []
for i in range(featurenum):
    corr = stats.pearsonr(trainscaled[:, i], trainloss)
    # Keep only features with |r| above 0.01; a NaN coefficient is skipped too.
    if not abs(corr[0]) > 0.01:
        continue
    feature_corr = (i, abs(corr[0]))
    feature_corr_list.append(feature_corr)

In [21]:
# Rank features from strongest to weakest absolute correlation.
feature_corr_list = sorted(feature_corr_list, key=lambda pair: pair[1], reverse=True)

In [46]:
# Swap each column position for its column name so the ranking is readable.
feature_corr_list_ = [(colnames[t[0]], t[1]) for t in feature_corr_list]
# print() call form: same output on Python 2, and valid on Python 3.
print(feature_corr_list_)

[('cat80', 0.47325728726873184), ('cat79', 0.44117227537890097), ('cat101', 0.35369826793236925), ('cat87', 0.34631097830426832), ('cat57', 0.31673566668075703), ('cat12', 0.31107953218713497), ('cat10', 0.28216214205277251), ('cat7', 0.28021582959902225), ('cat89', 0.26614735278662655), ('cat81', 0.25919122210717938), ('cat2', 0.22966531297670281), ('cat72', 0.22822718052548988), ('cat11', 0.22534650529726155), ('cat1', 0.22210474388940241), ('cat13', 0.22197014571139215), ('cat9', 0.22187012692636437), ('cat3', 0.19317951947013218), ('cat16', 0.19107617368746471), ('cat90', 0.18978919971172484), ('cat23', 0.16026437327846893), ('cat36', 0.15652937504919062), ('cat111', 0.15502121755135873), ('cat73', 0.15445688773095853), ('cat103', 0.15386752655923827), ('cont2', 0.1415279588790582), ('cat40', 0.12518099623227541), ('cat28', 0.12300409296035833), ('cat6', 0.11688331272652899), ('cat105', 0.11527333216932195), ('cont3', 0.11105330817721698), ('cat50', 0.11030016238477215), ('cat5', 0

In [28]:
# Correlate the difference of every unordered feature pair (i < j) with the loss.
feature_corr_list_minus = []
for i in range(featurenum-1):
    if i%7 == 0:
        # Coarse progress indicator; print() call form works on Python 2 and 3.
        print(i)
    for j in range(i+1, featurenum):
        corr = stats.pearsonr(trainscaled[:,i]-trainscaled[:,j], trainloss)
        # Keep only pairs whose |r| exceeds 0.1.
        if abs(corr[0]) > 0.1:
            feature_corr = (i, j,  abs(corr[0]))
            feature_corr_list_minus.append(feature_corr)

# Strongest absolute correlation first.
feature_corr_list_minus = sorted(feature_corr_list_minus, key=lambda tup: -tup[2])

0
7
14
21
28
35
42
49
56
63
70
77
84
91
98
105
112
119
126


In [47]:
# Swap both column positions for column names so the pair ranking is readable.
feature_corr_list_minus_ = [(colnames[t[0]], colnames[t[1]], t[2]) for t in feature_corr_list_minus]
# print() call form: same output on Python 2, and valid on Python 3.
print(feature_corr_list_minus_)

[('cat79', 'cat80', 0.51642763471610775), ('cat80', 'cat101', 0.51393498042626329), ('cat57', 'cat80', 0.51299098235271412), ('cat12', 'cat80', 0.49825427112262732), ('cat7', 'cat80', 0.48806085496571633), ('cat10', 'cat80', 0.48126509825660163), ('cat80', 'cat89', 0.48096369130339722), ('cat80', 'cat87', 0.47987062899139993), ('cat79', 'cat81', 0.45871610163362347), ('cat2', 'cat80', 0.45396790277577048), ('cat72', 'cat80', 0.45211734084855926), ('cat13', 'cat80', 0.45134804474065676), ('cat11', 'cat80', 0.45113164816530527), ('cat9', 'cat80', 0.44988434593241827), ('cat1', 'cat79', 0.43567247985697405), ('cont2', 'cat80', 0.43264140948385715), ('cat80', 'cat103', 0.42221481723386689), ('cat81', 'cat101', 0.42137801108322909), ('cat36', 'cat80', 0.41980084376293125), ('cont7', 'cat80', 0.41959172838821635), ('cat16', 'cat80', 0.41751928055852022), ('cat80', 'cat105', 0.41605658967009473), ('cat23', 'cat80', 0.41587141801328675), ('cat3', 'cat80', 0.41464154951603044), ('cat80', 'cat90

In [32]:
# Correlate the sum of every unordered feature pair (i < j) with the loss.
feature_corr_list_plus = []
for i in range(featurenum-1):
    if i%7 == 0:
        # Coarse progress indicator; print() call form works on Python 2 and 3.
        print(i)
    for j in range(i+1, featurenum):
        corr = stats.pearsonr(trainscaled[:,i]+trainscaled[:,j], trainloss)
        # Keep only pairs whose |r| exceeds 0.1.
        if abs(corr[0]) > 0.1:
            feature_corr = (i, j,  abs(corr[0]))
            feature_corr_list_plus.append(feature_corr)

# Strongest absolute correlation first.
feature_corr_list_plus = sorted(feature_corr_list_plus, key=lambda tup: -tup[2])

0
7
14
21
28
35
42
49
56
63
70
77
84
91
98
105
112
119
126


In [33]:
# Entries are (column position i, column position j, |r|).
# print() call form: same output on Python 2, and valid on Python 3.
print(feature_corr_list_plus)

[(92, 114, 0.50321053088422096), (70, 92, 0.49166537439772073), (25, 92, 0.48623674217667906), (20, 92, 0.47132943278778938), (23, 92, 0.46630726094733316), (92, 100, 0.46528069831210961), (92, 102, 0.46426433809655582), (93, 94, 0.45257293535894222), (14, 93, 0.4471705695427487), (70, 114, 0.44389562900430884), (70, 100, 0.43943865485936512), (15, 92, 0.43844367471301743), (24, 92, 0.43697331125831346), (26, 92, 0.43660476100390194), (22, 92, 0.43390586188605734), (85, 92, 0.43153639235995667), (100, 114, 0.42897350301547776), (20, 114, 0.4226666882065217), (25, 70, 0.42042505769795696), (20, 100, 0.41655126893389205), (102, 114, 0.41445214209986397), (86, 93, 0.41403945365887185), (25, 100, 0.4118346759614378), (100, 102, 0.40855314927400516), (16, 92, 0.40451086618940235), (29, 92, 0.40417590805480863), (1, 92, 0.40384871912893333), (92, 103, 0.40280629781277483), (92, 116, 0.40158024550820892), (23, 70, 0.40108761594890957), (36, 92, 0.40108609791800504), (49, 92, 0.400807905570361

In [38]:
# Correlate the element-wise product of every unordered feature pair (i < j) with the loss.
feature_corr_list_multi = []
for i in range(featurenum-1):
    if i%7 == 0:
        # Coarse progress indicator; print() call form works on Python 2 and 3.
        print(i)
    for j in range(i+1, featurenum):
        corr = stats.pearsonr(np.multiply(trainscaled[:,i], trainscaled[:,j] ) , trainloss)
        # Keep only pairs whose |r| exceeds 0.1.
        if abs(corr[0]) > 0.1:
            feature_corr = (i, j,  abs(corr[0]))
            feature_corr_list_multi.append(feature_corr)

# Strongest absolute correlation first.
feature_corr_list_multi = sorted(feature_corr_list_multi, key=lambda tup: -tup[2])

0
7
14
21
28
35
42
49
56
63
70
77
84
91
98
105
112
119
126


In [39]:
# Display the product-pair ranking; entries are (column position i, column position j, |r|).
feature_corr_list_multi

[(92, 93, 0.38121683759931713),
 (15, 114, 0.3258422378803803),
 (25, 114, 0.3209654588497276),
 (22, 114, 0.31965482213335439),
 (20, 70, 0.31622438342130149),
 (23, 114, 0.30404924585598164),
 (70, 102, 0.29659854465878166),
 (92, 100, 0.27751649398708667),
 (93, 100, 0.27004983964903706),
 (70, 93, 0.26742758346990742),
 (20, 102, 0.26548759356141127),
 (15, 25, 0.25626065345853133),
 (23, 25, 0.25617930673657835),
 (100, 114, 0.25189444149679974),
 (70, 92, 0.25165932253022999),
 (24, 114, 0.25158749326090912),
 (22, 25, 0.25083875398269012),
 (26, 114, 0.2496046085379087),
 (92, 114, 0.2435834042105991),
 (93, 114, 0.2401082707550358),
 (20, 93, 0.23778728926748449),
 (20, 92, 0.23283697668229042),
 (85, 92, 0.23055801219098332),
 (1, 70, 0.22950088852675676),
 (22, 23, 0.22564455181197254),
 (93, 102, 0.22505183010622515),
 (15, 23, 0.22377929579123601),
 (92, 102, 0.22362850402109433),
 (25, 100, 0.21952198165547995),
 (15, 100, 0.21549028362579647),
 (23, 100, 0.215352545593506

In [40]:
# Correlate the ratio x_i / (|x_j| + 1) of every ordered feature pair with the loss.
# Division is not symmetric, so both loops run over all features. The outer
# loop used to stop at featurenum-1 (copied from the symmetric pair cells),
# so the last feature was never tried as a numerator.
feature_corr_list_divide = []
for i in range(featurenum):
    if i%7 == 0:
        # Coarse progress indicator; print() call form works on Python 2 and 3.
        print(i)
    for j in range(featurenum):
        # j == i is kept on purpose: x / (|x| + 1) is a squashed version of a single feature.
        # The +1 keeps the denominator at least 1, so there is no division by zero.
        corr = stats.pearsonr(np.divide(trainscaled[:,i], np.absolute(trainscaled[:,j]) +1 ) , trainloss)
        # Keep only pairs whose |r| exceeds 0.1.
        if abs(corr[0]) > 0.1:
            feature_corr = (i, j,  abs(corr[0]))
            feature_corr_list_divide.append(feature_corr)

# Strongest absolute correlation first.
feature_corr_list_divide = sorted(feature_corr_list_divide, key=lambda tup: -tup[2])

0
7
14
21
28
35
42
49
56
63
70
77
84
91
98
105
112
119
126


In [41]:
# Display the ratio-pair ranking; entries are (numerator position i, denominator position j, |r|).
feature_corr_list_divide

[(93, 90, 0.48259984805506123),
 (93, 107, 0.48171657092652004),
 (93, 122, 0.48158488810725919),
 (93, 88, 0.48000823305667556),
 (93, 91, 0.47883366802225608),
 (93, 105, 0.47803979602051372),
 (93, 93, 0.47715820367467399),
 (93, 98, 0.47574316278299378),
 (93, 79, 0.47374938062700533),
 (93, 34, 0.47359246137287953),
 (93, 83, 0.47327458888839408),
 (93, 28, 0.47321425821266039),
 (93, 35, 0.47314331035913326),
 (93, 76, 0.47313892908374922),
 (93, 75, 0.47308973150743211),
 (93, 69, 0.47306202447840368),
 (93, 61, 0.47298049567846118),
 (93, 68, 0.47295947013323342),
 (93, 77, 0.47285413584991909),
 (93, 73, 0.47279021691697759),
 (93, 48, 0.47275761206219774),
 (93, 81, 0.47272630514751052),
 (93, 82, 0.47263350249370117),
 (93, 74, 0.47226554211736321),
 (93, 33, 0.47225691887668242),
 (93, 71, 0.47196094144054296),
 (93, 72, 0.47171582706596077),
 (93, 45, 0.47158519733675303),
 (93, 47, 0.47151446106319167),
 (93, 32, 0.47150605260572503),
 (93, 31, 0.4714865548977934),
 (93, 

In [52]:
# Summary statistics of the integer-encoded cat79 column.
# print() call form: same output on Python 2, and valid on Python 3.
print(train['cat79'].describe())

count    188318.000000
mean          2.254453
std           0.740161
min           1.000000
25%           2.000000
50%           2.000000
75%           2.000000
max           4.000000
Name: cat79, dtype: float64
