In [266]:
import numpy as np
import pandas as pd
import scipy as sp
from sklearn.metrics import log_loss
from scipy.optimize import minimize

In [267]:
def get_minmax(df):
    a=np.where(np.sum((df.iloc[:,:7]>.5).astype(int),axis=1)>3,
                                    df['is_iceberg_max'],
                                    df['is_iceberg_min'])

    return a

def get_max(df):
    a=df.iloc[:,:7].max(axis=1)
    return a

def get_mean(df):
    a=df.iloc[:,:7].mean(axis=1)
    return a


def get_median(df):
    a=df.iloc[:,:7].median(axis=1)
    return a

def mae_func(weights,Y_values,predictions):
    ''' scipy minimize will pass the weights as a numpy array '''
    final_prediction = 0
    for weight, prediction in zip(weights, predictions):
            final_prediction += weight*prediction
    #print(final_prediction.shape)
    #print(prediction.shape)
    #print(Y_values.shape)
    return log_loss(Y_values, final_prediction)

def get_mini(df):
    predictions=[]
    Y_values = df['is_iceberg'].values
    #print(Y_values.shape)
    lls = []
    wghts = []
    for i in range(7):
        predictions.append(np.array(df.iloc[:,i]))

    for i in range(100):
        starting_values = np.random.uniform(size=7)
        cons = ({'type':'eq','fun':lambda w: 1-sum(w)})
        bounds = [(0,1)]*len(predictions)

        res = minimize(fun=mae_func, x0=starting_values,args=(Y_values, predictions), method='L-BFGS-B', bounds=bounds, options={'disp': False, 'maxiter': 100000})

        lls.append(res['fun'])
        wghts.append(res['x'])
    # Uncomment the next line if you want to see the weights and scores calculated in real time
    #    print('Weights: {weights}  Score: {score}'.format(weights=res['x'], score=res['fun']))

    bestSC = np.min(lls)
    bestWght = wghts[np.argmin(lls)]
    #print(bestSC)
    print(bestWght)
    a = 0.0*Y_values
    for weight, prediction in zip(bestWght, predictions):
        a += weight*prediction
    return a

In [268]:
def gen_type(df):
    hig = .8
    mid = .5
    low = .2

    if df.lr9>.8 and df.is_iceberg<.5:
        return 1
    if df.lr9<.2 and df.is_iceberg>.5:
        return 1
    return 0

In [269]:
def gen_test(df,m,n):
    df['new_pred'] = df['lr9']
    #a = df#[df.type==3]
    a = df[df[m]>=n]
    index = a.index
    print(a.shape)
    print('orig logloss is: ',log_loss(df['is_iceberg'], df['lr9']))
    pred = get_minmax(a)
    df.loc[index,'new_pred'] = pred.clip(0.001,0.999)
    print('minmax logloss is: ',log_loss(df['is_iceberg'],df['new_pred']))
    df['new_pred'] = df['lr9']
    #print('minmax logloss is: ',log_loss(df['is_iceberg'],df['new_pred']))

    pred = get_mean(a)
    df.loc[index,'new_pred'] = pred.clip(0.001,0.999)
    print('mean logloss is: ',log_loss(df['is_iceberg'],df['new_pred']))
    df['new_pred'] = df['lr9']
    #print('mean logloss is: ',log_loss(df['is_iceberg'],df['new_pred']))

    pred = get_median(a)
    df.loc[index,'new_pred'] = pred.clip(0.001,0.999)
    print('median logloss is: ',log_loss(df['is_iceberg'],df['new_pred']))
    df['new_pred'] = df['lr9']
    #print('median logloss is: ',log_loss(df['is_iceberg'],df['new_pred']))

    pred = get_mini(a)
    df.loc[index,'new_pred'] = pred.clip(0.001,0.999)
    print('mini logloss is: ',log_loss(df['is_iceberg'],df['new_pred']))
    df['new_pred'] = df['lr9']
    #print('mini logloss is: ',log_loss(df['is_iceberg'],df['new_pred']))


In [270]:
train = pd.read_csv('myvalid.csv')
test = pd.read_csv('mytest.csv')
train['is_iceberg_max'] = train.iloc[:, :7].max(axis=1)
train['is_iceberg_min'] = train.iloc[:, :7].min(axis=1)
train['is_iceberg_mean'] = train.iloc[:, :7].mean(axis=1)
train['is_iceberg_median'] = train.iloc[:, :7].median(axis=1)
train['is_iceberg_std'] = train.iloc[:, :7].std(axis=1)
test['is_iceberg_max'] = test.iloc[:, :7].max(axis=1)
test['is_iceberg_min'] = test.iloc[:, :7].min(axis=1)
test['is_iceberg_mean'] = test.iloc[:, :7].mean(axis=1)
test['is_iceberg_median'] = test.iloc[:, :7].median(axis=1)
test['is_iceberg_std'] = test.iloc[:, :7].std(axis=1)

train['type'] = train.apply(gen_type,1)
#test['type'] = test.apply(gen_type,1)

In [271]:
gen_test(train,'type',.9)

(34, 16)
orig logloss is:  0.125017134156
minmax logloss is:  0.140457222874
mean logloss is:  0.102154960145
median logloss is:  0.109711374969
[ 1.  1.  1.  1.  1.  1.  1.]
mini logloss is:  0.147782394677


In [272]:
train[train.type==1]

Unnamed: 0,orig1+TLres50,trans1+TLvgg16+Adam,normf1+TLvgg16,mix+lr,mix+lgb,mix+svm,lr9,id,is_iceberg,is_iceberg_max,is_iceberg_min,is_iceberg_mean,is_iceberg_median,is_iceberg_std,type,new_pred
26,0.163231,0.131698,0.054178,0.980769,0.402647,0.777715,0.103521,958d42a8,1,0.980769,0.054178,0.373394,0.163231,0.367548,1,0.103521
161,0.992831,0.964832,0.971465,0.998016,0.98488,0.997456,0.988721,2f881d78,0,0.998016,0.964832,0.985457,0.988721,0.012834,1,0.988721
178,0.155006,0.049144,0.122662,0.953943,0.229853,0.09616,0.037888,a9ab128c,1,0.953943,0.037888,0.234951,0.122662,0.323676,1,0.037888
276,0.668055,0.584428,0.431836,0.101356,0.114832,0.317042,0.180001,e0323d3d,1,0.668055,0.101356,0.342507,0.317042,0.227132,1,0.180001
292,0.39507,0.110221,0.538535,0.038943,0.122276,0.033113,0.042504,a996191d,1,0.538535,0.033113,0.182952,0.110221,0.201379,1,0.042504
308,0.037274,0.036769,0.020798,0.032937,0.030173,0.01502,0.008056,87319e1c,1,0.037274,0.008056,0.025861,0.030173,0.011388,1,0.008056
333,0.937305,0.925763,0.921871,0.770765,0.678706,0.644751,0.948894,74c92938,0,0.948894,0.644751,0.832579,0.921871,0.131612,1,0.948894
350,0.978347,0.934119,0.903692,0.734713,0.917766,0.585872,0.965975,1303a495,0,0.978347,0.585872,0.860069,0.917766,0.145384,1,0.965975
364,0.92816,0.829317,0.961569,0.693024,0.932323,0.752311,0.965776,5a501d33,0,0.965776,0.693024,0.866069,0.92816,0.109196,1,0.965776
370,0.337595,0.636648,0.629657,0.169332,0.113514,0.095298,0.198777,2d348f42,1,0.636648,0.095298,0.311546,0.198777,0.233269,1,0.198777


In [318]:
train['lrdis'] = np.abs(train['lr9']-train['is_iceberg_std'])
train['dis'] = np.abs(train['is_iceberg_max']-train['is_iceberg_min'])
train['lrstd'] = np.sqrt((np.square(train['orig1+TLres50']-train['lr9'])+
                        np.square(train['trans1+TLvgg16+Adam']-train['lr9'])+
                        np.square(train['normf1+TLvgg16']-train['lr9'])+
                        np.square(train['mix+lr']-train['lr9'])+
                        np.square(train['mix+lgb']-train['lr9'])+
                        np.square(train['mix+svm']-train['lr9']))/6)
y = train.loc[:,'type']
y.drop([161,308,753,892,905],inplace=True)
X = train.loc[:,['lrstd','is_iceberg_std','is_iceberg_mean','lr9']]#
X.drop([161,308,753,892,905],inplace=True)
from sklearn.preprocessing import MinMaxScaler
scaling = MinMaxScaler(feature_range=(-1,1)).fit(X)
X = pd.DataFrame(scaling.transform(X),columns=X.columns)

In [319]:
log = 'knn'
if log == 'lr':
    from sklearn.linear_model import LogisticRegression
    C_regu = 1
    lr_model = LogisticRegression(penalty='l1',C=C_regu)
    log_model = lr_model
if log == 'nb':
    from sklearn.naive_bayes import GaussianNB
    lr_model = GaussianNB()
    log_model = lr_model
if log == 'knn':
    from sklearn.neighbors import KNeighborsClassifier
    lr_model = KNeighborsClassifier(n_neighbors=25, weights='distance')
    log_model = lr_model
if log == 'lda':
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    lr_model = LinearDiscriminantAnalysis()
    log_model = lr_model
if log == 'qda':
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
    lr_model = QuadraticDiscriminantAnalysis()
    log_model = lr_model
if log == 'nc':
    from sklearn.neighbors.nearest_centroid import NearestCentroid
    lr_model = NearestCentroid()
    log_model = lr_model

In [321]:
from sklearn.model_selection import KFold
K = 1599
kf = KFold(n_splits=K, random_state = 1108, shuffle = False)
a = 0
b = 0
c = []
d = []
for i, (train_index, test_index) in enumerate(kf.split(X)):
    y_train, y_valid = y.iloc[train_index].copy(), y.iloc[test_index]
    X_train, X_valid = X.iloc[train_index,:].copy(), X.iloc[test_index,:].copy()
    model = log_model
    model.fit(X_train,y_train)
    tmp=model.score(X_valid,y_valid)
    a+=tmp
    tmp = model.predict(X_valid)[0]
    b+=tmp
    
    tmp = model.predict_proba(X_valid)[0,1]
    #print(tmp)
    c.append(tmp)
    #print(y_valid.values)
    d.append(y_valid.values[0])
print(a)
print(b)
#print(d,c)
print(log_loss(y.values,d))
print(log_loss(y.values,c))
print(log_loss(d,c))
a

1572.0
2
9.99200722163e-16
0.124621701456
0.124621701456


1572.0

In [322]:
train['new_type'] = c

ValueError: Length of values does not match length of index

In [310]:
train[train.new_type>=.1].shape

(143, 20)

In [340]:
gen_test(train,'dis',.91)

(6, 20)
orig logloss is:  0.125017134156
minmax logloss is:  0.128905111591
mean logloss is:  0.122795802362
median logloss is:  0.124017380665
[ 0.          0.          0.          0.76558854  0.          0.          0.        ]
mini logloss is:  0.121212121116


In [317]:
train[train.new_type>=.45]

Unnamed: 0,orig1+TLres50,trans1+TLvgg16+Adam,normf1+TLvgg16,mix+lr,mix+lgb,mix+svm,lr9,id,is_iceberg,is_iceberg_max,is_iceberg_min,is_iceberg_mean,is_iceberg_median,is_iceberg_std,type,new_pred,lrdis,dis,lrstd,new_type
21,0.084159,0.601634,0.789117,0.143754,0.871721,0.664435,0.769775,112a6cfa,1,0.871721,0.084159,0.560656,0.664435,0.317803,0,0.769775,0.451972,0.787562,0.389895,0.60355
198,0.119601,0.596353,0.817323,0.456248,0.709919,0.679267,0.781428,1c536d78,0,0.817323,0.119601,0.594306,0.679267,0.241435,0,0.781428,0.539993,0.697722,0.314867,0.556454
229,0.751123,0.71659,0.705483,0.655559,0.901599,0.697929,0.900649,72da83eb,1,0.901599,0.655559,0.761276,0.71659,0.099597,0,0.900649,0.801052,0.24604,0.180504,0.547117
325,0.502183,0.668711,0.743911,0.48093,0.833167,0.555934,0.812832,c0ce2db6,1,0.833167,0.48093,0.65681,0.668711,0.146169,0,0.812832,0.666663,0.352237,0.223082,0.526078
378,0.474965,0.165493,0.26087,0.550372,0.285898,0.568161,0.116082,430f9c05,0,0.568161,0.116082,0.345977,0.285898,0.184418,0,0.116082,0.068336,0.452079,0.309306,0.786857
538,0.640261,0.876056,0.673962,0.590024,0.903027,0.517185,0.901911,29ad22d1,1,0.903027,0.517185,0.728918,0.673962,0.161706,0,0.901911,0.740205,0.385842,0.24711,0.689708
545,0.542528,0.852167,0.871039,0.639971,0.798476,0.612177,0.926169,b5168440,1,0.926169,0.542528,0.748932,0.798476,0.148668,0,0.926169,0.777501,0.383641,0.242385,0.798374
550,0.195511,0.185765,0.452224,0.504511,0.247234,0.613155,0.107299,246a9cf9,0,0.613155,0.107299,0.329386,0.247234,0.191878,0,0.107299,0.084579,0.505856,0.307181,0.666231
710,0.557716,0.166456,0.272904,0.542138,0.439602,0.296192,0.144492,49f4d394,0,0.557716,0.144492,0.345643,0.296192,0.169666,0,0.144492,0.025174,0.413224,0.275666,0.532064
808,0.679462,0.598211,0.907389,0.604448,0.954651,0.713161,0.924682,1b059732,1,0.954651,0.598211,0.768858,0.713161,0.155583,0,0.924682,0.769099,0.35644,0.229203,0.468464


In [83]:
mytest = np.array([[0,1],[1,1],[2,1],[3,1],[4,1],[5,1],[6,1],[7,1],[8,1],[9,1]])
mykf = KFold(n_splits=10, random_state = 1108, shuffle = False)
for i, (train_index, test_index) in enumerate(mykf.split(mytest)):
    print(train_index,test_index)

[1 2 3 4 5 6 7 8 9] [0]
[0 2 3 4 5 6 7 8 9] [1]
[0 1 3 4 5 6 7 8 9] [2]
[0 1 2 4 5 6 7 8 9] [3]
[0 1 2 3 5 6 7 8 9] [4]
[0 1 2 3 4 6 7 8 9] [5]
[0 1 2 3 4 5 7 8 9] [6]
[0 1 2 3 4 5 6 8 9] [7]
[0 1 2 3 4 5 6 7 9] [8]
[0 1 2 3 4 5 6 7 8] [9]


In [40]:
1571/1604

0.979426433915212

In [69]:
    model = log_model
    model.fit(X,y)
    model.score(X,y)

0.97880299251870329

In [342]:
train[train.dis>.89]


Unnamed: 0,orig1+TLres50,trans1+TLvgg16+Adam,normf1+TLvgg16,mix+lr,mix+lgb,mix+svm,lr9,id,is_iceberg,is_iceberg_max,is_iceberg_min,is_iceberg_mean,is_iceberg_median,is_iceberg_std,type,new_pred,lrdis,dis,lrstd,new_type
26,0.163231,0.131698,0.054178,0.980769,0.402647,0.777715,0.103521,958d42a8,1,0.980769,0.054178,0.373394,0.163231,0.367548,1,0.103521,0.264027,0.926591,0.469108,0.282667
178,0.155006,0.049144,0.122662,0.953943,0.229853,0.09616,0.037888,a9ab128c,1,0.953943,0.037888,0.234951,0.122662,0.323676,1,0.037888,0.285788,0.916055,0.387391,0.054569
329,0.259603,0.555264,0.958281,0.047329,0.086697,0.559568,0.431034,0d42919f,0,0.958281,0.047329,0.413968,0.431034,0.317041,0,0.431034,0.113993,0.910952,0.317577,0.004307
644,0.400437,0.395778,0.034612,0.877413,0.981761,0.846545,0.554242,86ff8b5e,0,0.981761,0.034612,0.584398,0.554242,0.337903,0,0.554242,0.216339,0.947149,0.339469,0.003547
679,0.008681,0.13103,0.467136,0.978729,0.849243,0.216368,0.201254,606412bc,1,0.978729,0.008681,0.407492,0.216368,0.374067,0,0.201254,0.172813,0.970048,0.435372,0.071163
700,0.450949,0.92085,0.960072,0.289282,0.065857,0.408385,0.581704,e870fc10,1,0.960072,0.065857,0.5253,0.450949,0.325233,0,0.581704,0.256471,0.894215,0.33089,0.007536
865,0.969784,0.973139,0.728381,0.801494,0.567602,0.08117,0.878411,dedf2035,0,0.973139,0.08117,0.714283,0.801494,0.313416,1,0.878411,0.564995,0.891969,0.360079,0.112931
904,0.22126,0.114681,0.118583,0.957846,0.051696,0.637126,0.077182,5922e495,0,0.957846,0.051696,0.311196,0.118583,0.348878,0,0.077182,0.271696,0.90615,0.43082,0.201165
998,0.082144,0.000185,0.000327,0.902701,0.145994,0.570641,0.034986,308fc1fe,0,0.902701,0.000185,0.24814,0.082144,0.350897,0,0.034986,0.315911,0.902516,0.419685,0.106494
1062,0.977321,0.981095,0.507612,0.864349,0.084257,0.234959,0.611265,be24e1d3,1,0.981095,0.084257,0.608694,0.611265,0.35704,0,0.611265,0.254225,0.896838,0.35705,0.00397
