In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

try:
    # Legacy location; this module was removed in scikit-learn 0.20.
    from sklearn.cross_validation import StratifiedKFold
except ImportError:
    # Current location (note: different constructor / split API).
    from sklearn.model_selection import StratifiedKFold


# Read both engineered feature tables and replace missing values with 0.
train, test = [
    pd.read_csv(csv_path).fillna(0)
    for csv_path in ('./train_feature.csv', './test_feature.csv')
]
uid = test['uid']

# Target vector and feature matrix (every column except the id and the label).
target = train['label'].values
data = train.drop(['uid', 'label'], axis=1).values



In [6]:
def logloss(attempt, actual, epsilon=1.0e-15):
    """Binary log loss, i.e. the score of the bioresponse competition.

    Parameters
    ----------
    attempt : array-like of float
        Predicted probabilities of the positive class.
    actual : array-like of {0, 1}
        True labels, same length as ``attempt``.
    epsilon : float, optional
        Predictions are clipped to ``[epsilon, 1 - epsilon]`` so that
        ``log(0)`` is never evaluated.

    Returns
    -------
    float
        Mean negative log-likelihood of ``actual`` under ``attempt``.
    """
    # Coerce both inputs so plain Python lists work too; without this,
    # ``1.0 - actual`` raises TypeError when ``actual`` is a list.
    attempt = np.clip(np.asarray(attempt, dtype=float), epsilon, 1.0 - epsilon)
    actual = np.asarray(actual, dtype=float)
    return - np.mean(actual * np.log(attempt) +
                     (1.0 - actual) * np.log(1.0 - attempt))

In [9]:
def load_data():
    """Load the train/test feature CSVs as plain NumPy arrays.

    Reads ``./train_feature.csv`` and ``./test_feature.csv`` from the current
    working directory. Missing values are replaced with 0, matching how the
    notebook's first cell loads the same files.

    Returns
    -------
    train_feature : numpy.ndarray
        Training rows with the ``uid`` and ``label`` columns removed.
    label : numpy.ndarray
        The ``label`` column of the training file.
    test_feature : numpy.ndarray
        Test rows with the ``uid`` column removed.
    """
    train = pd.read_csv('./train_feature.csv').fillna(0)
    test = pd.read_csv('./test_feature.csv').fillna(0)
    label = train.label.values
    train_feature = train.drop(['uid', 'label'], axis=1).values
    test_feature = test.drop(['uid'], axis=1).values
    return train_feature, label, test_feature


In [10]:
# Imported in this cell on purpose: the splitter below uses the
# ``sklearn.model_selection`` API (``n_splits`` + ``.split(X, y)``), which is
# not interchangeable with the removed ``sklearn.cross_validation`` class.
from sklearn.model_selection import StratifiedKFold

np.random.seed(0)  # seed to shuffle the train set

n_folds = 10
verbose = True  # not read anywhere in this cell
shuffle = False

X, y, X_submission = load_data()

if shuffle:
    idx = np.random.permutation(y.size)
    X = X[idx]
    y = y[idx]

# Materialise the (fit indices, held-out indices) pairs once so that every
# base model is trained and scored on exactly the same folds.
skf = list(StratifiedKFold(n_splits=n_folds).split(X, y))

clfs = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
        RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
        ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
        ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
        GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50)]

print("Creating train and test sets for blending.")

# One column per base model: out-of-fold predictions for the training rows,
# and fold-averaged predictions for the submission rows.
dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))

for j, clf in enumerate(clfs):
    dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
    # The index arrays are NOT called `train`/`test`: those names hold the
    # DataFrames loaded in the first cell and must not be overwritten.
    for i, (fit_idx, holdout_idx) in enumerate(skf):
        clf.fit(X[fit_idx], y[fit_idx])
        dataset_blend_train[holdout_idx, j] = clf.predict_proba(X[holdout_idx])[:, 1]
        dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1]
    dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)

print("Blending.")
clf = LogisticRegression()
clf.fit(dataset_blend_train, y)
y_submission = clf.predict_proba(dataset_blend_test)[:, 1]

print("Linear stretch of predictions to [0,1]")
pred_min = y_submission.min()
pred_range = y_submission.max() - pred_min
# Skip the rescale when every prediction is identical; dividing by a zero
# range would turn the whole column into NaN.
if pred_range > 0:
    y_submission = (y_submission - pred_min) / pred_range

print("Saving Results.")
# Rows are numbered 1..n; the header names are read back later via
# `temp.PredictedProbability`, so they must stay as they are.
tmp = np.vstack([range(1, len(y_submission)+1), y_submission]).T
np.savetxt(fname='submission.csv', X=tmp, fmt='%d,%0.9f',
            header='MoleculeId,PredictedProbability', comments='')

Creating train and test sets for blending.
Blending.
Linear stretch of predictions to [0,1]
Saving Results.


In [11]:
def evalMetric(preds,dtrain):
    """Custom evaluation score: 0.6 * AUC + 0.4 * F1 (at a 0.5 threshold).

    Parameters
    ----------
    preds : array-like of float
        Predicted scores for the rows of ``dtrain``.
    dtrain : object exposing ``get_label()``
        Dataset whose true labels are compared against ``preds``.

    Returns
    -------
    tuple
        ``('res', score, True)``; the notebook passes this function as the
        ``feval`` callback of LightGBM, where the last element presumably
        marks the score as higher-is-better.
    """
    scored = pd.DataFrame({'preds': preds, 'label': dtrain.get_label()})
    scored = scored.sort_values(by='preds', ascending=False)

    # AUC is computed on the raw scores, F1 on the thresholded decisions.
    auc = metrics.roc_auc_score(scored.label, scored.preds)
    scored['preds'] = (scored['preds'] >= 0.5).astype(int)
    f1 = metrics.f1_score(scored.label, scored.preds)

    return 'res', 0.6*auc + 0.4*f1, True

In [13]:
import lightgbm as lgb
from sklearn import metrics
from multiprocessing import cpu_count

# Second-stage model: a LightGBM binary classifier trained on the blended
# out-of-fold predictions of the base models.
lgb_params =  {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    # Track only the custom `evalMetric` feval. The previous value named
    # multiclass metrics ('multi_logloss', 'multi_error'), which do not apply
    # to a binary objective; "None" (the string) registers no built-in metric.
    'metric': 'None',
    #'metric_freq': 100,
    'is_training_metric': False,
    'min_data_in_leaf': 12,
    'num_leaves': 24,  # 21 24
    'learning_rate': 0.08,  # 0.08
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'verbosity':-1,
    'num_threads': cpu_count() - 1
#    'gpu_device_id':2,
#    'device':'gpu'
#    'lambda_l1': 0.001,
#    'skip_drop': 0.95,
#    'max_drop' : 10
    #'lambda_l2': 0.005
    #'num_threads': 18
}

dtrain = lgb.Dataset(dataset_blend_train,label=y)

# 5-fold CV with early stopping driven by the custom score. The former
# `metrics=['evalMetric']` argument is dropped: `metrics` takes built-in
# metric names, and the custom function is already supplied through `feval`.
lgb.cv(lgb_params,dtrain,feval=evalMetric,early_stopping_rounds=100,verbose_eval=5,num_boost_round=10000,nfold=5)

[5]	cv_agg's res: 0.819918 + 0.0266649
[10]	cv_agg's res: 0.822026 + 0.0260273
[15]	cv_agg's res: 0.821383 + 0.0226619
[20]	cv_agg's res: 0.821111 + 0.0189412
[25]	cv_agg's res: 0.822947 + 0.0243362
[30]	cv_agg's res: 0.822803 + 0.0234218
[35]	cv_agg's res: 0.822131 + 0.0214244
[40]	cv_agg's res: 0.823137 + 0.0201675
[45]	cv_agg's res: 0.823252 + 0.0198352
[50]	cv_agg's res: 0.823602 + 0.0192888
[55]	cv_agg's res: 0.821506 + 0.0200921
[60]	cv_agg's res: 0.820721 + 0.0215351
[65]	cv_agg's res: 0.820602 + 0.0215233
[70]	cv_agg's res: 0.818748 + 0.0242002
[75]	cv_agg's res: 0.818883 + 0.0222163
[80]	cv_agg's res: 0.819635 + 0.0198917
[85]	cv_agg's res: 0.817528 + 0.0209676
[90]	cv_agg's res: 0.816444 + 0.0212905
[95]	cv_agg's res: 0.816253 + 0.0211429
[100]	cv_agg's res: 0.815566 + 0.0209671
[105]	cv_agg's res: 0.816249 + 0.0219976
[110]	cv_agg's res: 0.814894 + 0.0230418
[115]	cv_agg's res: 0.813916 + 0.0223752
[120]	cv_agg's res: 0.814419 + 0.0229705
[125]	cv_agg's res: 0.81558 + 0.0234

{'res-mean': [0.8123633633399192,
  0.8165813523955062,
  0.8184929052416144,
  0.8197105924813869,
  0.8199179814676439,
  0.821533932408174,
  0.8218886112276111,
  0.8215955070921425,
  0.8225145158143873,
  0.8220263247750796,
  0.8226421937168225,
  0.822927609226543,
  0.823075591082782,
  0.8231860500270992,
  0.8213825251992762,
  0.822408635338495,
  0.8213222937077702,
  0.8209279440555785,
  0.8206427435191552,
  0.8211106330864875,
  0.822014686881111,
  0.8226028130091816,
  0.8220771499347839,
  0.82315430845357,
  0.8229469535571592,
  0.8234346918220995,
  0.8233573129051157,
  0.8235226644093008,
  0.8224746235390226,
  0.8228033247955745,
  0.8228091363870528,
  0.8225867568599344,
  0.8224216715799862,
  0.8222430725322483,
  0.8221305334268431,
  0.8220926689469076,
  0.8221648259886969,
  0.8230334278201485,
  0.8229796736395649,
  0.8231367808871835,
  0.8234442908789312,
  0.8232855415477651,
  0.8230457961864552,
  0.8227289079965056,
  0.8232518602729936,
  0.8

In [14]:
# Fit the final second-stage booster on the full blended training matrix.
# The only validation set is `dtrain` itself, so the logged `res` values are
# training scores, not held-out scores.
# NOTE(review): 300 rounds is well past the point where the recorded CV score
# stopped improving -- confirm this round count is intended.
model = lgb.train(
    lgb_params,
    dtrain,
    num_boost_round=300,
    valid_sets=[dtrain],
    feval=evalMetric,
    verbose_eval=5,
)

[5]	training's res: 0.851164
[10]	training's res: 0.856637
[15]	training's res: 0.860279
[20]	training's res: 0.862803
[25]	training's res: 0.868212
[30]	training's res: 0.873046
[35]	training's res: 0.87424
[40]	training's res: 0.877448
[45]	training's res: 0.878544
[50]	training's res: 0.880604
[55]	training's res: 0.882248
[60]	training's res: 0.885483
[65]	training's res: 0.886853
[70]	training's res: 0.889819
[75]	training's res: 0.89224
[80]	training's res: 0.894978
[85]	training's res: 0.899907
[90]	training's res: 0.903344
[95]	training's res: 0.906501
[100]	training's res: 0.910029
[105]	training's res: 0.911485
[110]	training's res: 0.913773
[115]	training's res: 0.916049
[120]	training's res: 0.919757
[125]	training's res: 0.922685
[130]	training's res: 0.924781
[135]	training's res: 0.927344
[140]	training's res: 0.929237
[145]	training's res: 0.932897
[150]	training's res: 0.935934
[155]	training's res: 0.937553
[160]	training's res: 0.93997
[165]	training's res: 0.941644


In [15]:
# Reload the test table to recover the `uid` column for the output file.
test = pd.read_csv('./test_feature.csv')

# Score the blended test features with the second-stage LightGBM model.
pred = model.predict(dataset_blend_test)

# Order rows by predicted score (highest first), then turn scores into
# hard 0/1 labels with a 0.5 cut-off.
res = pd.DataFrame({'uid': test.uid, 'label': pred})
res = res.sort_values(by='label', ascending=False)
res['label'] = (res['label'] >= 0.5).astype(int)

res.to_csv(
    './result_stacking.csv',
    columns=['uid', 'label'],
    sep=',',
    header=False,
    index=False,
)

In [19]:
# Convert the logistic-regression blend written to submission.csv into the
# same uid/label layout as the other result files.
temp = pd.read_csv('./submission.csv')

res = pd.DataFrame({'uid': test.uid, 'label': temp.PredictedProbability})
res = res.sort_values(by='label', ascending=False)
# Hard 0/1 labels with a 0.5 cut-off.
res['label'] = (res['label'] >= 0.5).astype(int)

res.to_csv(
    './sumission_stacking.csv',
    columns=['uid', 'label'],
    sep=',',
    header=False,
    index=False,
)

In [32]:
#voting
# Each file is a headerless uid/label table; the label column gets a distinct
# name per file so the tables can be merged side by side.
a, b, c, d = [
    pd.read_csv(csv_path, names=['uid', label_column])
    for csv_path, label_column in (
        ('./0.859888.csv', 'label1'),
        ('./0.853034.csv', 'label2'),
        ('./sumission_stacking.csv', 'label3'),
        ('./result_stacking.csv', 'label4'),
    )
]

In [33]:
# Left-join the four prediction tables on uid, keeping every row of `a`.
ab = a.merge(b, how='left', on='uid')
abc = ab.merge(c, how='left', on='uid')
abcd = abc.merge(d, how='left', on='uid')

In [35]:
# Sum of the four label columns per uid.
votes = abcd['label1']
for k in range(2, 5):
    votes = votes + abcd['label%d' % k]
abcd['cnt'] = votes

In [45]:
# uids where the label sum is at least 3 but label1 is not 1
# (14 uids in the recorded run).
abcd.loc[(abcd['cnt'] >= 3) & (abcd['label1'] != 1), 'uid'].values

array(['u8631', 'u8742', 'u7902', 'u7819', 'u7007', 'u9257', 'u7266',
       'u7032', 'u8369', 'u7111', 'u8882', 'u9705', 'u7477', 'u8114'],
      dtype=object)

In [69]:
# uids where the label sum is at most 1 but label1 is not 0
# (20 uids in the recorded run).
abcd.loc[(abcd['cnt'] <= 1) & (abcd['label1'] != 0), 'uid'].values

array(['u8764', 'u9683', 'u9908', 'u8736', 'u7742', 'u8454', 'u7879',
       'u7391', 'u8665', 'u7358', 'u9777', 'u8567', 'u8597', 'u7364',
       'u9088', 'u9128', 'u7213', 'u7765', 'u8779', 'u8836'], dtype=object)

In [56]:
special = pd.read_csv('./special.csv', names=['uid','label'])

# uids whose label sum disagrees with label1: sum >= 3 while label1 is not 1,
# and sum <= 1 while label1 is not 0. The two selections cannot overlap.
flip_to_one = abcd.loc[(abcd.cnt >= 3) & (abcd.label1 != 1), 'uid']
flip_to_zero = abcd.loc[(abcd.cnt <= 1) & (abcd.label1 != 0), 'uid']

# One vectorised assignment per direction instead of rebuilding a full
# boolean mask over `special` for every single uid.
special.loc[special.uid.isin(flip_to_one), 'label'] = 1
special.loc[special.uid.isin(flip_to_zero), 'label'] = 0

In [57]:
# Put the rows labelled 1 ahead of the rows labelled 0.
special = special.sort_values(
    by='label',
    ascending=False,
)

In [58]:
# Write uid,label rows with no header line and no index column.
special.to_csv(
    './special_sorted.csv',
    columns=['uid', 'label'],
    sep=',',
    header=False,
    index=False,
)

In [59]:
#testing
# Five more headerless uid/label tables, each with its own label column name.
e, f, g, h, i = [
    pd.read_csv(csv_path, names=['uid', label_column])
    for csv_path, label_column in (
        ('./0.864.csv', 'label5'),
        ('./0.866.csv', 'label6'),
        ('./0.867.csv', 'label7'),
        ('./0.868.csv', 'label8'),
        ('./0.869.csv', 'label9'),
    )
]

In [60]:
# Extend the four-way table with the five extra prediction files, again
# left-joining on uid so the row set of `abcd` is preserved.
abcde = abcd.merge(e, how='left', on='uid')
abcdef = abcde.merge(f, how='left', on='uid')
abcdefg = abcdef.merge(g, how='left', on='uid')
abcdefgh = abcdefg.merge(h, how='left', on='uid')
abcdefghi = abcdefgh.merge(i, how='left', on='uid')

In [63]:
# Sum of all nine label columns per uid; this replaces the four-way `cnt`
# column carried over from `abcd`.
votes = abcdefghi['label1']
for k in range(2, 10):
    votes = votes + abcdefghi['label%d' % k]
abcdefghi['cnt'] = votes