In [1]:
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,GridSearchCV
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
from sklearn import metrics
%matplotlib inline
plt.rcParams['figure.figsize']=12,4



In [2]:
train_df=pd.read_csv('/home/yw/study/Competition/Safe_Driver_Prediction_20171014/train.csv',na_values=-1)
test_df=pd.read_csv('/home/yw/study/Competition/Safe_Driver_Prediction_20171014/test.csv',na_values=-1)
sample_df=pd.read_csv('/home/yw/study/Competition/Safe_Driver_Prediction_20171014/sample_submission.csv')

In [3]:
train_df.fillna(99999,inplace=True)
test_df.fillna(99999,inplace=True)

In [4]:
#所有二元变量
bin_features=[feat for feat in train_df.columns if 'bin' in feat]
#所有的分类变量
cat_features=[feat for feat in train_df.columns if 'cat' in feat]

In [5]:
def transform_dtype_float(df,columns):
    for column in columns:
        df[column]=df[column].astype(np.float32)
def transform_dtype_int(df,columns):
    for column in columns:
        df[column]=df[column].astype(np.int32)

In [6]:
float_features=[feat for feat in train_df.columns if feat not in bin_features+cat_features+['id','target']]
transform_dtype_float(train_df,float_features)
transform_dtype_float(test_df,float_features)

In [7]:
transform_dtype_int(train_df,bin_features+cat_features+['target'])
transform_dtype_int(test_df,bin_features+cat_features)

In [8]:
train_df.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2.0,2,5.0,1,0,0,1,0,...,9.0,1.0,5.0,8.0,0,1,1,0,0,1
1,9,0,1.0,1,7.0,0,0,0,0,1,...,3.0,1.0,1.0,9.0,0,1,1,0,1,0
2,13,0,5.0,4,9.0,1,0,0,0,1,...,4.0,2.0,7.0,7.0,0,1,1,0,1,0
3,16,0,0.0,1,2.0,0,0,1,0,0,...,2.0,2.0,4.0,9.0,0,0,0,0,0,0
4,17,0,0.0,2,0.0,1,0,1,0,0,...,3.0,1.0,1.0,3.0,0,0,0,1,1,0


# 1 Data Analysis

## 1.1 对分类变量哑编码

In [9]:
cat_features

['ps_ind_02_cat',
 'ps_ind_04_cat',
 'ps_ind_05_cat',
 'ps_car_01_cat',
 'ps_car_02_cat',
 'ps_car_03_cat',
 'ps_car_04_cat',
 'ps_car_05_cat',
 'ps_car_06_cat',
 'ps_car_07_cat',
 'ps_car_08_cat',
 'ps_car_09_cat',
 'ps_car_10_cat',
 'ps_car_11_cat']

In [10]:
feat_dummies=pd.get_dummies(train_df[cat_features].append(test_df[cat_features]),columns=cat_features,dummy_na=False)

In [11]:
train_df=train_df.T.append((feat_dummies.iloc[:train_df.shape[0]]).T).T
test_df=test_df.T.append((feat_dummies.iloc[train_df.shape[0]:]).T).T

In [12]:
use_features=[x for x in train_df.columns if x not in ['id','target']+cat_features]

In [13]:
X=train_df[use_features]
y=train_df['target']

X_predict=test_df[use_features]

In [14]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,stratify=y,random_state=9)

# 2 Base line

In [27]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
 
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)

def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum() 

def gini_normalized(preds,dtrain):
    y=dtrain.get_label()
    preds=softmax(preds)
    return 'NormGini',-gini(y, preds) / gini(y, y)

def gini_normalized_ordinary(y,preds):
    return gini(y, preds) / gini(y, y)

def modelfit(alg,X,y,X_test,y_test,useTrainCV=True,cv_folds=5,early_stopping_rounds=30):
    if useTrainCV==True:
        xgb_param=alg.get_xgb_params()
        xgtrain=xgb.DMatrix(X,label=y,missing=99999)
        cvresult=xgb.cv(xgb_param,xgtrain,num_boost_round=50000,nfold=cv_folds,feval=gini_normalized,
                        early_stopping_rounds=early_stopping_rounds,verbose_eval=20)
        alg.set_params(n_estimators=cvresult.shape[0])
        print('Best Iteration:',cvresult.shape[0])
        
    alg.fit(X,y,eval_metric=gini_normalized)
    
    #Predict training set:
    dtrain_predictions=alg.predict(X)
    dtrain_predprob=alg.predict_proba(X)[:,1]
    
    #pPredict test set:
    dtest_predictions=alg.predict(X_test)
    dtest_predprob=alg.predict_proba(X_test)[:,1]
    
    #Print Mode report:
    print('Model report on trian:')
    print('Train Accuracy:{0:.4f}'.format(metrics.accuracy_score(y,dtrain_predictions)))
    print('Train AUC:{0:.4f}'.format(metrics.roc_auc_score(y,dtrain_predprob)))
    print('Train Normalized GINI：{0:.6f}'.format(gini_normalized_ordinary(y,dtrain_predprob)))
    print('Model report on test:')
    print('Test Accuracy:{0:.4f}'.format(metrics.accuracy_score(y_test,dtest_predictions)))
    print('Test AUC:{0:.4f}'.format(metrics.roc_auc_score(y_test,dtest_predprob)))
    print('Test Normalized GINI：{0：.6f}'.format(gini_normalized_ordinary(y,dtest_predprob)))
    print('Test classifiction report:')
    print(metrics.classification_report(y_test,dtest_predictions))
    print('Test confusion matrix:')
    plt.figure()
    with sns.axes_style(style='dark'):
        plot_confusion_matrix(metrics.confusion_matrix(y_test,dtest_predictions),classes=['0','1'],
                             title='Confusion Matrix')
    plt.figure()
    feat_imp=pd.Series(alg.booster().get_fscore().sort_values(ascending=True))
    feat_imp.plot(kind='barh',title='Feature Importances',color='green')
    plt.ylabel('Feature Importance Score')
    return feat_imp

def turnParams(model,X,y,searchParam,modelparams,scoring='roc_auc',cv=5):
    gsc=GridSearchCV(estimator=model.__class__(**modelparams),param_grid=searchParam,scoring=scoring,
                    iid=False,cv=cv)
    gsc.fit(X,y)
    cv_scores=pd.DataFrame([gsc.cv_results_['mean_test_score'],
                            gsc.cv_results_['mean_train_score'],
                            gsc.cv_results_['std_test_score'],
                            gsc.cv_results_['std_train_score']],
                            index=['mean_test_score','mean_train_score','std_test_score','std_train_score']).T
    for param in gsc.best_params_.keys():
        modelparams[param]=gsc.best_params_[param]
    print('Best params:')
    for param in gsc.best_params_.keys():
        print(param,':',gsc.best_params_[param])
    print('Best score:',gsc.best_score_)
    print(cv_scores)

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [29]:
params={'base_score': 0.5,
 'colsample_bylevel': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'learning_rate': 0.01,
 'max_delta_step': 0,
 'max_depth': 3,
 'min_child_weight': 1,
 'missing': 99999,
 'n_estimators': 100,
 'nthread': -1,
 'objective': 'binary:logistic',
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 573518/21694,
 'seed': 9,
 'silent': True,
 'subsample': 1}

In [30]:
xgbc=XGBClassifier(**params)

In [31]:
modelfit(xgbc,X_train,y_train,X_test,y_test,useTrainCV=True,cv_folds=5,early_stopping_rounds=30)

[0]	train-NormGini:-0.187016+0.00357569	test-NormGini:-0.178447+0.0119071
[20]	train-NormGini:-0.216435+0.00281043	test-NormGini:-0.208185+0.0104587
[40]	train-NormGini:-0.228304+0.0035174	test-NormGini:-0.218731+0.0110757
[60]	train-NormGini:-0.238277+0.00253051	test-NormGini:-0.228027+0.0100179
[80]	train-NormGini:-0.24536+0.00218448	test-NormGini:-0.233988+0.00974585
[100]	train-NormGini:-0.250542+0.00212108	test-NormGini:-0.238359+0.0093527
[120]	train-NormGini:-0.255023+0.00182415	test-NormGini:-0.24195+0.00906924
[140]	train-NormGini:-0.25887+0.00199319	test-NormGini:-0.245007+0.00855294
[160]	train-NormGini:-0.262163+0.00172858	test-NormGini:-0.247743+0.00845473
[180]	train-NormGini:-0.265028+0.00178974	test-NormGini:-0.249892+0.0084719
[200]	train-NormGini:-0.267509+0.00181225	test-NormGini:-0.251788+0.00843153
[220]	train-NormGini:-0.269778+0.00172088	test-NormGini:-0.253658+0.00837805
[240]	train-NormGini:-0.271856+0.0016164	test-NormGini:-0.25514+0.0084865
[260]	train-NormGi

KeyError: '0：'

In [165]:
params['n_estimators']=141

In [166]:
xgb_opt=XGBClassifier(**params)

In [167]:
xgb_opt.fit(X,y)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=99999, n_estimators=141, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=26.43671061122891, seed=9, silent=True,
       subsample=1)

In [168]:
pred_prob=xgb_opt.predict_proba(X_predict)[:,1]

In [185]:
predict_result=pd.DataFrame({'id':test_df['id'].astype(int),'target':pd.Series(pred_prob,index=test_df['id'].index)})

In [186]:
predict_result.head()

Unnamed: 0,id,target
0,0,0.430969
1,1,0.431302
2,2,0.436761
3,3,0.281363
4,4,0.492159


In [188]:
predict_result.to_csv('/home/yw/study/Competition/Safe_Driver_Prediction_20171014/20171014_predict_result.csv',index=False)