In [1]:
# In[2]:

import pandas as pd
import numpy as np
import scipy.stats as stats
import random
from sklearn.preprocessing import StandardScaler
from scipy.stats import skew, boxcox


def readDataSetLexical(nrows = None):
    ## read data
    train = pd.read_csv('data/train.csv', nrows = None)
    test = pd.read_csv('data/test.csv', nrows = None)

    numeric_feats = [x for x in train.columns[1:-1] if 'cont' in x]
    cats = [x for x in train.columns[1:-1] if 'cat' in x]
    train_test, ntrain = mungeskewed(train, test, numeric_feats)
    for col in cats:
        train_test[col] = train_test[col].apply(encode)

    ss = StandardScaler()
    train_test[numeric_feats] =         ss.fit_transform(train_test[numeric_feats].values)
    train = train_test.iloc[:ntrain, :].copy()
    test = train_test.iloc[ntrain:, :].copy()
    test.drop('loss', inplace=True, axis=1)
    feats = numeric_feats+ cats

    return train[feats], test[feats], train['id'], test['id'], train["loss"]

def mungeskewed(train, test, numeric_feats):
    ntrain = train.shape[0]
    test['loss'] = 0
    train_test = pd.concat((train, test)).reset_index(drop=True)
    # compute skew and do Box-Cox transformation (Tilli)
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))
    print("\nSkew in numeric features:")
    print(skewed_feats)
    skewed_feats = skewed_feats[skewed_feats > 0.25]
    skewed_feats = skewed_feats.index

    for feats in skewed_feats:
        train_test[feats] = train_test[feats] + 1
        train_test[feats], lam = boxcox(train_test[feats])
    return train_test, ntrain

def encode(charcode):
    r = 0
    ln = len(charcode)
    if(ln > 2):
        print("Error: Expected Maximum of Two Characters!")
        exit(0)
    for i in range(ln):
        r += (ord(charcode[i])-ord('A')+1)*26**(ln-i-1)
    return r


In [18]:
train, test, trainid, testid, trainloss = readDataSetLexical()
shift = 200
trainloss = np.log(trainloss + shift)
scaler = StandardScaler()
colnames = train.columns.values
trainscaled = np.array(scaler.fit_transform(train) )


Skew in numeric features:
cont1     0.516420
cont2    -0.310939
cont3    -0.010002
cont4     0.416093
cont5     0.681617
cont6     0.461211
cont7     0.826046
cont8     0.676629
cont9     1.072420
cont10    0.354998
cont11    0.280819
cont12    0.291990
cont13    0.380739
cont14    0.248672
dtype: float64


In [19]:
featurenum = trainscaled.shape[1]
feature_corr_list = []
for i in range(featurenum):
    corr = stats.pearsonr(trainscaled[:,i], trainloss)
    if abs(corr[0]) > 0.01:
        feature_corr = (i, abs(corr[0]))
        feature_corr_list.append(feature_corr)


feature_corr_list = sorted(feature_corr_list, key=lambda tup: -tup[1])

In [20]:
feature_corr_list_ = [(colnames[t[0]], t[1]) for t in feature_corr_list]

In [21]:
feature_corr_list_minus = []
for i in range(featurenum-1):
    if i%7 == 0:
        print i
    for j in range(i+1, featurenum):
        
        corr = stats.pearsonr(trainscaled[:,i]-trainscaled[:,j], trainloss)
        if abs(corr[0]) > 0.1:
            feature_corr = (i, j,  abs(corr[0]))
            feature_corr_list_minus.append(feature_corr)
            
feature_corr_list_minus = sorted(feature_corr_list_minus, key=lambda tup: -tup[2])
feature_corr_list_minus_ = [(colnames[t[0]], colnames[t[1]], t[2]) for t in feature_corr_list_minus]

0
7
14
21
28
35
42
49
56
63
70
77
84
91
98
105
112
119
126


In [22]:

feature_corr_list_plus = []
for i in range(featurenum-1):
    if i%7 == 0:
        print i
    for j in range(i+1, featurenum):
        
        corr = stats.pearsonr(trainscaled[:,i]+trainscaled[:,j], trainloss)
        if abs(corr[0]) > 0.1:
            feature_corr = (i, j,  abs(corr[0]))
            feature_corr_list_plus.append(feature_corr)
            
feature_corr_list_plus = sorted(feature_corr_list_plus, key=lambda tup: -tup[2])
feature_corr_list_plus_ = [(colnames[t[0]], colnames[t[1]], t[2]) for t in feature_corr_list_plus]

0
7
14
21
28
35
42
49
56
63
70
77
84
91
98
105
112
119
126


In [23]:

feature_corr_list_multi = []
for i in range(featurenum-1):
    if i%7 == 0:
        print i
    for j in range(i+1, featurenum):
        
        corr = stats.pearsonr(np.multiply(trainscaled[:,i], trainscaled[:,j] ) , trainloss)
        if abs(corr[0]) > 0.1:
            feature_corr = (i, j,  abs(corr[0]))
            feature_corr_list_multi.append(feature_corr)
            
feature_corr_list_multi = sorted(feature_corr_list_multi, key=lambda tup: -tup[2])
feature_corr_list_multi_ = [(colnames[t[0]], colnames[t[1]], t[2]) for t in feature_corr_list_multi]

0
7
14
21
28
35
42
49
56
63
70
77
84
91
98
105
112
119
126


In [24]:

feature_corr_list_divide = []
for i in range(featurenum-1):
    if i%7 == 0:
        print i
    for j in range(featurenum):
        
        corr = stats.pearsonr(np.divide(trainscaled[:,i], np.absolute(trainscaled[:,j]) +1 ) , trainloss)
        if abs(corr[0]) > 0.1:
            feature_corr = (i, j,  abs(corr[0]))
            feature_corr_list_divide.append(feature_corr)
            
feature_corr_list_divide = sorted(feature_corr_list_divide, key=lambda tup: -tup[2])
feature_corr_list_divide_ = [(colnames[t[0]], colnames[t[1]], t[2]) for t in feature_corr_list_divide]

0
7
14
21
28
35
42
49
56
63
70
77
84
91
98
105
112
119
126


In [30]:

feature_corr_list_3 = []
for i in range(featurenum-1):
    if i%7 == 0:
        print i
    for j in range(i+1, featurenum):
        for k in range(featurenum):
            if k!=i and k!=j:
                corr = stats.pearsonr(np.multiply(trainscaled[:,i]-trainscaled[:,j], trainscaled[:,k] ) , trainloss)
                if abs(corr[0]) > 0.1:
                    feature_corr = (i, j, k,  abs(corr[0]))
                    feature_corr_list_3.append(feature_corr)
            
feature_corr_list_3 = sorted(feature_corr_list_3, key=lambda tup: -tup[3])
feature_corr_list_3_ = [(colnames[t[0]], colnames[t[1]],  colnames[t[2]], t[3]) for t in feature_corr_list_3]

0
7


KeyboardInterrupt: 

In [25]:
final_list = []
for tup in feature_corr_list_minus_:
    final_list.append(tup+("-",))
for tup in feature_corr_list_plus_:
    final_list.append(tup+("+",))
for tup in feature_corr_list_multi_:
    final_list.append(tup+("*",))
for tup in feature_corr_list_divide_:
    final_list.append(tup+("/",))

In [26]:
final_list = sorted(final_list, key=lambda tup: -tup[2])

In [27]:
print final_list

[('cat80', 'cat101', 0.53907180825224987, '-'), ('cat12', 'cat80', 0.5244145983395182, '-'), ('cat79', 'cat80', 0.50649610441783344, '-'), ('cat79', 'cat101', 0.50374293055346264, '+'), ('cat10', 'cat80', 0.50148023762980576, '-'), ('cat80', 'cat77', 0.49555278684874948, '/'), ('cat80', 'cat78', 0.4924250081510429, '/'), ('cat80', 'cat85', 0.48989767108473703, '/'), ('cat80', 'cat80', 0.48983444401936038, '/'), ('cat80', 'cat92', 0.48974281863880154, '/'), ('cat12', 'cat79', 0.48725474852416256, '+'), ('cat80', 'cat94', 0.48691372219905266, '/'), ('cat80', 'cat76', 0.48575843147205339, '/'), ('cat80', 'cat21', 0.48460250736445493, '/'), ('cat80', 'cat70', 0.48429352572412526, '/'), ('cat80', 'cat15', 0.48423074553828777, '/'), ('cat80', 'cat75', 0.4842189402533193, '/'), ('cat80', 'cat63', 0.48420738952777959, '/'), ('cat80', 'cat22', 0.48420469229290169, '/'), ('cat80', 'cat48', 0.48417706680931089, '/'), ('cat80', 'cat56', 0.48417050575194304, '/'), ('cat80', 'cat62', 0.4841448480176

In [28]:

dictlim = {}
lim = 2
result = []
for tup in final_list:
    
    if tup[0] not in dictlim:
        dictlim[tup[0]] = 0
    if tup[1] not in dictlim:
        dictlim[tup[1]] = 0

    if dictlim[tup[1]] < lim and dictlim[tup[0]] < lim:
        result.append(tup)

        dictlim[tup[0]] += 1

        dictlim[tup[1]] += 1

        
print result

[('cat80', 'cat101', 0.53907180825224987, '-'), ('cat12', 'cat80', 0.5244145983395182, '-'), ('cat79', 'cat101', 0.50374293055346264, '+'), ('cat12', 'cat79', 0.48725474852416256, '+'), ('cat81', 'cat87', 0.41124259427488874, '-'), ('cat10', 'cat81', 0.40216609930497876, '-'), ('cat1', 'cat87', 0.39796687934844266, '-'), ('cat1', 'cat10', 0.37682725921935278, '-'), ('cat2', 'cat57', 0.33760439710228435, '+'), ('cat2', 'cat72', 0.33609668480943178, '+'), ('cat9', 'cat57', 0.32829734766307028, '+'), ('cat9', 'cat72', 0.32745999355935101, '+'), ('cat11', 'cat103', 0.31971760507461972, '+'), ('cat7', 'cat11', 0.31074403922891847, '+'), ('cat13', 'cat111', 0.3088851702393845, '+'), ('cat7', 'cat13', 0.30659326029319017, '+'), ('cat103', 'cat111', 0.2867545818618395, '+'), ('cat3', 'cat89', 0.27279044359043342, '+'), ('cat16', 'cat89', 0.2717765899443455, '+'), ('cat3', 'cat23', 0.26426274934931032, '+'), ('cat23', 'cat90', 0.26005007308112243, '+'), ('cat16', 'cat73', 0.25840263818584408, '

In [29]:
len(result)

51

In [31]:
for tup in result:
    if tup[3] == "+":
        resultvector = train[tup[0]] + train[tup[1]]
    if tup[3] == "-":
        resultvector = train[tup[0]] - train[tup[1]]
    if tup[3] == "*":
        resultvector = np.multiply(train[tup[0]], train[tup[1]] ) 
    if tup[3] == "/":
        resultvector = np.divide(train[tup[0]], np.absolute(train[tup[1]]) +1 )
    train[tup[0]+tup[3]+tup[1]] = resultvector

In [32]:
train.columns.values

array(['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7',
       'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13', 'cont14',
       'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8',
       'cat9', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15',
       'cat16', 'cat17', 'cat18', 'cat19', 'cat20', 'cat21', 'cat22',
       'cat23', 'cat24', 'cat25', 'cat26', 'cat27', 'cat28', 'cat29',
       'cat30', 'cat31', 'cat32', 'cat33', 'cat34', 'cat35', 'cat36',
       'cat37', 'cat38', 'cat39', 'cat40', 'cat41', 'cat42', 'cat43',
       'cat44', 'cat45', 'cat46', 'cat47', 'cat48', 'cat49', 'cat50',
       'cat51', 'cat52', 'cat53', 'cat54', 'cat55', 'cat56', 'cat57',
       'cat58', 'cat59', 'cat60', 'cat61', 'cat62', 'cat63', 'cat64',
       'cat65', 'cat66', 'cat67', 'cat68', 'cat69', 'cat70', 'cat71',
       'cat72', 'cat73', 'cat74', 'cat75', 'cat76', 'cat77', 'cat78',
       'cat79', 'cat80', 'cat81', 'cat82', 'cat83', 'cat84', 'cat85',
       'cat86',

In [33]:
feature_corr_list

[(93, 0.48426741685278224),
 (92, 0.41257655529471199),
 (114, 0.38313496567576022),
 (25, 0.34125019930551953),
 (100, 0.3325707057262417),
 (23, 0.30288277604222846),
 (94, 0.27612298113105888),
 (14, 0.2666377239033475),
 (15, 0.2648029265637345),
 (22, 0.25182291799726714),
 (24, 0.24230924100333726),
 (85, 0.24208305262289656),
 (26, 0.23622435752797977),
 (70, 0.23069597110520951),
 (20, 0.21369007213971092),
 (102, 0.20298238197040516),
 (16, 0.19833738421048028),
 (29, 0.19569414399549337),
 (103, 0.19276206571627735),
 (116, 0.18447375543698122),
 (36, 0.18207668813924963),
 (86, 0.18143244531656993),
 (124, 0.17924946622543414),
 (49, 0.17713459302649201),
 (19, 0.16547835561215551),
 (63, 0.15971306893594131),
 (41, 0.13162972115908977),
 (53, 0.13008473579225774),
 (18, 0.12966470056240842),
 (17, 0.12418616591361825),
 (51, 0.12332704311216408),
 (38, 0.11318685336140366),
 (95, 0.10914126416290154),
 (1, 0.10866864538478962),
 (37, 0.10106860236457152),
 (54, 0.0959454096