In [394]:
import numpy as np
import pandas as pd

import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [395]:
def cal_ks(label,score):
    """Return the KS statistic: the largest gap between TPR and FPR.

    `label` and `score` are forwarded unchanged to `roc_curve`; the
    thresholds it returns are not needed here.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(label, score)
    gaps = true_pos_rate - false_pos_rate
    return max(gaps)
    
def cal_auc_ks(y_true, y_pred, name = None, save=False):
    """Print, and optionally append to a file, the sample size, AUC and KS.

    Parameters
    ----------
    y_true : sequence
        Ground-truth labels (passed as the first argument to
        `roc_auc_score` and `cal_ks`).
    y_pred : sequence
        Predicted scores or labels (passed as the second argument).
    name : str, optional
        Prefix for every report line and for the output file
        `<name>_auc&ks.txt`. `None` (the default) is treated as ''.
    save : bool
        When true, the three report lines are appended to the output file.

    Returns
    -------
    tuple
        `(auc, ks)`.
    """
    # Normalise the prefix before it is concatenated: the default
    # name=None would otherwise raise TypeError on the first `+`.
    prefix = name if name else ''

    # Compute each metric once and reuse it for the report and the return.
    auc_value = roc_auc_score(y_true, y_pred)
    ks_value = cal_ks(y_true, y_pred)

    sample = prefix + " Sample : %s" % len(y_true)
    auc = prefix + ' test_set auc : %0.3f' % auc_value
    ks = prefix + ' test_set ks  : %0.3f' % ks_value
    print (sample)
    print (auc)
    print (ks)
    print ('----------------cal_auc_ks process successfully!----------------')
    if save:
        # 'a+' appends, so repeated calls accumulate in the same file.
        with open(prefix + '_auc&ks.txt', 'a+') as f:
            f.write(sample + '\n' + auc + '\n' + ks + '\n' + '------------------------------------' + '\n' )
            print ('----------------cal_auc_ks save successfully!----------------')
    return auc_value, ks_value

In [396]:
def dimensions_of_data(file_name):
    """Return the largest and smallest feature index used in a sparse data file.

    The file is expected to have one header line followed by rows of the form
    ``idx:val idx:val ...<TAB>label``.

    Parameters
    ----------
    file_name : str
        Path of the file to scan.

    Returns
    -------
    tuple
        ``(max_dimension, min_dimension)``. If the file holds no feature
        pairs the initial sentinels ``(-1, 1e6)`` are returned.
    """
    max_dimension = -1
    min_dimension = 1e6
    with open(file_name, 'r') as f:
        next(f, None)  # skip the header line
        # Scan every data row: looking at a single row only yields the right
        # answer when that row happens to contain the extreme indices.
        for line in f:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            data, _label = line.split('\t')

            # split() without arguments also copes with repeated spaces.
            for pair in data.split():
                index = int(pair.split(':')[0])
                max_dimension = max(index, max_dimension)
                min_dimension = min(index, min_dimension)

    return max_dimension, min_dimension
    

In [397]:
# Scan the training file first, then the test file, and size the feature
# matrix by the larger of the two maximum indices.
(train_dims, train_min_dims), (test_dims, test_min_dims) = (
    dimensions_of_data(data_file) for data_file in ('train.data', 'test.data')
)
dims = max(train_dims, test_dims)

In [398]:
def read_data(file_name, dims):
    """Load a sparse ``idx:val ...<TAB>label`` file into a dense DataFrame.

    Parameters
    ----------
    file_name : str
        Path of the file; its first line is skipped as a header.
    dims : int
        Largest feature index to make room for.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank data line and ``dims + 2`` columns: column
        ``i`` holds the value of feature index ``i`` (NaN when the row does
        not mention it) and the last column holds the label.
    """
    with open(file_name, 'r') as f:
        next(f, None)  # skip the header line
        # Drop blank lines (e.g. a trailing newline) so they neither count
        # as rows nor break the tab split below.
        lines = [line for line in f if line.strip()]

    rows, cols = len(lines), dims + 2
    # Allocate the NaN matrix directly instead of via a rows x cols list of
    # Python floats, which is far slower and heavier for wide data.
    all_data = np.full((rows, cols), np.nan)

    for idx, line in enumerate(lines):
        data, label = line.split('\t')

        all_data[idx, -1] = float(label)

        for pair in data.split():
            parts = pair.split(':')
            all_data[idx, int(parts[0])] = float(parts[1])

    return pd.DataFrame(all_data)

In [399]:
# Densify both data sets with the shared width so their columns line up.
train = read_data('train.data', dims)
test = read_data('test.data', dims)
# header=None is the supported spelling of "no header row"; the legacy
# header=-1 is rejected with a ValueError by current pandas releases.
feature_importance = pd.read_csv('feature_importance_lgb.txt', header=None)

In [400]:
# A feature is kept only when its value in the second column of the
# importance table is strictly greater than this threshold.
feature_importance_threshold = 1

# First-column entries of the rows that pass the threshold; they are used
# further down as column labels when selecting features.
feature_importance_index = (
    feature_importance
    .loc[feature_importance.iloc[:, 1].gt(feature_importance_threshold)]
    .iloc[:, 0]
)

def feature_select(data, idx):
    """Keep only the feature columns listed in `idx`, plus the label column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose last column is the label; all other columns are features.
    idx : list-like
        Column labels of the features to keep, in the desired order.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns in `idx` followed by the label column
        under its original name. `data` itself is not modified.
    """
    y_name = data.columns[-1]
    features = data.iloc[:, :-1]

    # Take an explicit copy before adding the label: assigning into the
    # bare result of `features[idx]` is a chained assignment and can raise
    # SettingWithCopyWarning.
    selected = features[idx].copy()
    selected[y_name] = data.iloc[:, -1]

    return selected

# Reduce both frames to the same selected features (train first, then test).
train, test = (
    feature_select(frame, feature_importance_index) for frame in (train, test)
)

In [401]:
# Preview the first rows of the training frame after feature selection.
train.head()

Unnamed: 0,2725,1048,2724,2728,2806,2730,2737,1588,1419,2787,...,2516,2386,1562,2225,1608,1483,2488,1378,2529,2840
0,-1.0,1.58489,-1.0,1.1447,0.0,0.55302,3.03143,,1.0,0.0,...,0.0,4.10574,3.57428,0.0,0.0,0.0,4.10574,5.87516,5.87516,1.0
1,-1.0,1.28474,-1.0,0.96253,0.0,0.5885,2.50176,1.1487,1.1487,1.0,...,3.98107,5.4928,0.0,1.55185,1.55185,1.31951,5.4928,3.19577,3.98107,1.0
2,0.0,1.20112,5.44866,0.0,0.0,0.0,2.75946,,1.0,1.0,...,0.0,4.95934,4.00094,0.0,0.0,0.0,4.95934,4.00094,4.31736,1.0
3,-1.0,1.1487,-1.0,0.96253,0.0,0.47131,2.56947,,1.0,0.0,...,0.0,4.95934,4.95934,0.0,0.0,0.0,4.95934,4.95934,4.95934,1.0
4,-1.0,1.35096,-1.0,0.0,0.0,0.40221,2.61807,,0.87055,1.0,...,3.98107,5.4928,0.0,1.24573,1.24573,1.0,5.4928,4.57305,4.84389,0.0


In [402]:
# Hold out 30% of the training rows for validation. The label is the last
# column and is also used for stratification; the seed keeps the split fixed.
(
    train_train_X,
    train_validation_X,
    train_train_y,
    train_validation_y,
) = train_test_split(
    train.iloc[:, :-1],
    train.iloc[:, -1],
    test_size=0.3,
    stratify=train.iloc[:, -1],
    random_state=4000,
)

# cv_res = lgb.cv(param, train_ds, verbose_eval=True, shuffle=True)

In [403]:
# train_ds = lgb.Dataset(data=train_train_X, label=train_train_y)
# train_validation_ds = lgb.Dataset(data=train_validation_X, label=train_validation_y)

# param = {'num_leaves': 31,
#          'min_data_in_leaf': 30, 
#          'objective':'binary',
#          'max_depth': -1,
#          'learning_rate': 0.01,
#          "min_child_samples": 20,
#          "boosting": "gbdt",
#          "feature_fraction": 0.9,
#          "bagging_freq": 1,
#          "bagging_fraction": 0.9 ,
#          "bagging_seed": 11,
# #          "metric": 'binary_logloss',
#          "metric": 'binary_error',
#          "lambda_l1": 0.1,
#          "verbosity": -1,
#          "nthread": 4,
#          "random_state": 4590}


# clf = lgb.train(param, train_ds, num_boost_round=1000, verbose_eval=True, valid_sets=train_validation_ds, early_stopping_rounds=500)
# test_prob = clf.predict(test.iloc[:, :-1])

# def prop_to_label(data, threshold):
#     return np.array([1 if val > threshold else 0 for val in data])


# threshold = 0.5
# test_label = prop_to_label(test_prob, threshold)

# cal_auc_ks(test.iloc[:, -1].values.tolist(), test_label.tolist(), name='results')
# print('confusion matrix', confusion_matrix(test.iloc[:, -1].values.tolist(), test_label.tolist()))
# print('f1', f1_score(test.iloc[:, -1].values.tolist(), test_label.tolist()))

In [404]:
# Preview the training frame again before missing values are filled in.
train.head()

Unnamed: 0,2725,1048,2724,2728,2806,2730,2737,1588,1419,2787,...,2516,2386,1562,2225,1608,1483,2488,1378,2529,2840
0,-1.0,1.58489,-1.0,1.1447,0.0,0.55302,3.03143,,1.0,0.0,...,0.0,4.10574,3.57428,0.0,0.0,0.0,4.10574,5.87516,5.87516,1.0
1,-1.0,1.28474,-1.0,0.96253,0.0,0.5885,2.50176,1.1487,1.1487,1.0,...,3.98107,5.4928,0.0,1.55185,1.55185,1.31951,5.4928,3.19577,3.98107,1.0
2,0.0,1.20112,5.44866,0.0,0.0,0.0,2.75946,,1.0,1.0,...,0.0,4.95934,4.00094,0.0,0.0,0.0,4.95934,4.00094,4.31736,1.0
3,-1.0,1.1487,-1.0,0.96253,0.0,0.47131,2.56947,,1.0,0.0,...,0.0,4.95934,4.95934,0.0,0.0,0.0,4.95934,4.95934,4.95934,1.0
4,-1.0,1.35096,-1.0,0.0,0.0,0.40221,2.61807,,0.87055,1.0,...,3.98107,5.4928,0.0,1.24573,1.24573,1.0,5.4928,4.57305,4.84389,0.0


In [405]:
# Columns with fewer than 10 distinct values (NaN counts as a value) are
# treated as categorical, all others as numerical. Counts come from train only.
uniq_counts = train.nunique(dropna=False).values
categorical_cols = train.columns[uniq_counts < 10]
numerical_cols = train.columns[uniq_counts >= 10]

# Fill values are learned from the training data only and reused for test,
# so no test-set statistics leak into preprocessing.
fill_values = {}
for cat_col in categorical_cols:
    fill_values[cat_col] = train[cat_col].mode()[0]
for num_col in numerical_cols:
    fill_values[num_col] = train[num_col].mean()

# Fill on the whole frame and rebind. `frame[col].fillna(..., inplace=True)`
# is an in-place chained assignment: deprecated in pandas, and a silent
# no-op under Copy-on-Write, which would leave the NaNs in place.
train = train.fillna(value=fill_values)
test = test.fillna(value=fill_values)

categorical_cols, numerical_cols

(Int64Index([2787, 2759, 2804,   45,  122,  253,   47, 2720, 2763, 2757, 2762,
             2758,  251, 2755,    8, 2840],
            dtype='int64'),
 Int64Index([2725, 1048, 2724, 2728, 2806, 2730, 2737, 1588, 1419, 2726,
             ...
             2397, 2516, 2386, 1562, 2225, 1608, 1483, 2488, 1378, 2529],
            dtype='int64', length=292))

In [406]:
# params = [{"C": np.logspace(-2, 1, 5)}]
# svc = SVC(gamma='auto')

# train_X = train.iloc[:, :-1]
# train_y = train.iloc[:, -1].astype('int')
# # train_y = train.iloc[:, -1]
# # print(train_y)

# test_X = test.iloc[:, :-1]
# test_y = test.iloc[:, -1].astype('int')

# grid = GridSearchCV(svc, cv=5, param_grid=params, scoring='f1')
# grid.fit(train_X, train_y)

# print('best parameter', grid.best_params_)
# print('train f1: ', np.mean(cross_val_score(grid.best_estimator_, train_X, train_y, cv=5, scoring='f1')))
# # print('test f1: ', np.mean(cross_val_score(grid.best_estimator_, test_X, test_y, cv=5, scoring='f1')))


In [None]:
# predictions = grid.best_estimator_.predict(test_X)

# # cal_auc_ks(predictions, test_y, name='results')
# print('confusion matrix:', confusion_matrix(predictions, test_y))
# print('f1 score:', f1_score(predictions, test_y))

In [None]:
# Grid-search a logistic regression over regularisation strength and penalty.
params = [{"C": np.logspace(-2, 1, 5), "penalty": ['l1', 'l2']}]
# liblinear supports both 'l1' and 'l2'. The default lbfgs solver rejects
# 'l1', so half of the grid would fail to fit without an explicit solver.
clf = LogisticRegression(solver='liblinear')

# The label is the last column; every other column is a feature.
train_X = train.iloc[:, :-1]
train_y = train.iloc[:, -1].astype('int')

test_X = test.iloc[:, :-1]
test_y = test.iloc[:, -1].astype('int')

grid = GridSearchCV(clf, cv=5, param_grid=params, scoring='f1')
grid.fit(train_X, train_y)

print('best parameter', grid.best_params_)
# Cross-validated F1 of the refit best estimator on the training data.
print('train f1: ', np.mean(cross_val_score(grid.best_estimator_, train_X, train_y, cv=5, scoring='f1')))

predictions = grid.best_estimator_.predict(test_X)
# AUC and KS are ranking metrics, so score them on the probability of the
# second class (label 1 when labels are 0/1) rather than on hard 0/1 labels.
test_scores = grid.best_estimator_.predict_proba(test_X)[:, 1]

# All three helpers take (y_true, y_pred) in that order.
cal_auc_ks(test_y, test_scores, name='results')
print('confusion matrix:', confusion_matrix(test_y, predictions))
print('f1 score:', f1_score(test_y, predictions))



In [None]:
# GaussianNB is searched with an empty grid (one candidate, default
# settings), so GridSearchCV only runs the 5-fold CV and refits the model.
params = [{}]
clf = GaussianNB()

# The label is the last column; every other column is a feature.
train_X = train.iloc[:, :-1]
train_y = train.iloc[:, -1].astype('int')

test_X = test.iloc[:, :-1]
test_y = test.iloc[:, -1].astype('int')

grid = GridSearchCV(clf, cv=5, param_grid=params, scoring='f1')
grid.fit(train_X, train_y)

print('best parameter', grid.best_params_)
# Cross-validated F1 of the refit estimator on the training data.
print('train f1: ', np.mean(cross_val_score(grid.best_estimator_, train_X, train_y, cv=5, scoring='f1')))

predictions = grid.best_estimator_.predict(test_X)
# AUC and KS are ranking metrics, so score them on the probability of the
# second class (label 1 when labels are 0/1) rather than on hard 0/1 labels.
test_scores = grid.best_estimator_.predict_proba(test_X)[:, 1]

# All three helpers take (y_true, y_pred) in that order.
cal_auc_ks(test_y, test_scores, name='results')
print('confusion matrix:', confusion_matrix(test_y, predictions))
print('f1 score:', f1_score(test_y, predictions))