In [1]:
'''Import basic modules'''
import pandas as pd
import numpy as np
import string
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score as auc
from sklearn.linear_model import LogisticRegression

'''import visualization'''
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("ticks")
%matplotlib inline

'''Plotly visualization .'''
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
py.init_notebook_mode(connected=True)

'''Display markdown formatted output like bold, italic bold etc.'''
from IPython.display import Markdown
def bold(string):
    """Render ``string`` as Markdown in the current cell's output.

    The text is wrapped in an IPython ``Markdown`` object and handed to
    ``display``, so Markdown markup such as ``**bold**`` is rendered instead
    of being printed literally. Returns ``None``.

    Note: inside this function the parameter name shadows the ``string``
    module imported at the top of the notebook.
    """
    rendered = Markdown(string)
    display(rendered)

'''Ignore deprecation and future, and user warnings.'''
import warnings as wrn
# Silence the three noisy warning families for the rest of the session.
for _ignored_category in (DeprecationWarning, FutureWarning, UserWarning):
    wrn.filterwarnings('ignore', category=_ignored_category)

## 参考：

https://www.kaggle.com/vikassingh1996/don-t-underestimate-the-power-of-a-logistic-reg

In [2]:
%%time

# Read the train and test CSV files from the local ./input directory.
train = pd.read_csv('./input/train.csv')
test = pd.read_csv('./input/test.csv')

# Print the (rows, columns) shape of each frame, one per line.
print(train.shape, test.shape, sep='\n')

(600000, 25)
(400000, 24)
Wall time: 3.45 s


In [3]:
'''Variable Description'''
def description(df):
    """Print the shape of ``df`` and return a per-column summary table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns:

        * ``Name``    - column label
        * ``dtypes``  - column dtype
        * ``Missing`` - number of null entries (``isnull().sum()``)
        * ``Uniques`` - number of distinct non-null values (``nunique()``)
        * ``First Value`` / ``Second Value`` / ``Third Value`` - the values
          found in the first three rows of ``df``.

    Frames with fewer than three rows are supported: sample-value columns
    for rows that do not exist are filled with NaN instead of raising
    ``IndexError``.
    """
    print(f"Dataset Shape: {df.shape}")
    summary = pd.DataFrame({
        'Name': list(df.columns),
        'dtypes': df.dtypes.values,
        'Missing': df.isnull().sum().values,
        'Uniques': df.nunique().values,
    })
    sample_labels = ('First Value', 'Second Value', 'Third Value')
    for position, label in enumerate(sample_labels):
        # Guard the positional lookup so short frames do not crash the summary.
        summary[label] = df.iloc[position].values if position < len(df) else np.nan
    return summary
# Show a Markdown heading, then the per-column summary of the training frame
# (the summary is the cell's last expression, so it is displayed as a table).
bold('**Variable Description of  train Data:**')
description(train)

**Variable Description of  train Data:**

Dataset Shape: (600000, 25)


Unnamed: 0,Name,dtypes,Missing,Uniques,First Value,Second Value,Third Value
0,id,int64,0,600000,0,1,2
1,bin_0,float64,17894,2,0,1,0
2,bin_1,float64,18003,2,0,1,1
3,bin_2,float64,17930,2,0,0,0
4,bin_3,object,18014,2,F,F,F
5,bin_4,object,18047,2,N,Y,N
6,nom_0,object,18252,3,Red,Red,Red
7,nom_1,object,18156,6,Trapezoid,Star,
8,nom_2,object,18035,6,Hamster,Axolotl,Hamster
9,nom_3,object,18121,6,Russia,,Canada


In [4]:
def replace_nan(data):
    """Fill missing values with each column's mode, modifying ``data`` in place.

    For every column that contains at least one missing value, the missing
    entries are replaced by the first value returned by ``Series.mode()``
    for that column.

    Columns that consist entirely of missing values have no mode; they are
    left unchanged instead of raising an error.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. It is mutated; nothing is returned.
    """
    for column in data.columns:
        values = data[column]
        if not values.isna().any():
            continue
        modes = values.mode()
        if modes.empty:
            # All-NaN column: there is no mode to impute with.
            continue
        data[column] = values.fillna(modes.iloc[0])


# Impute both frames in place; each call computes the modes from the frame
# it is given, so train and test are filled independently.
replace_nan(train)
replace_nan(test)

In [5]:
# Helper column for exploratory statistics only; it is dropped again before
# modelling. Keep the 25 most frequent `ord_5` values and label every other
# value 'Others'.
top_ord_5_values = train['ord_5'].value_counts().iloc[:25].index
in_top_25 = train['ord_5'].isin(top_ord_5_values)
train['ord_5_ot'] = 'Others'
train.loc[in_top_25, 'ord_5_ot'] = train['ord_5']

In [6]:
train

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target,ord_5_ot
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,...,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0,Others
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,India,...,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0,Others
2,2,0.0,1.0,0.0,F,N,Red,Triangle,Hamster,Canada,...,3.0,Novice,Freezing,n,P,eN,5.0,9.0,0,Others
3,3,0.0,0.0,0.0,F,N,Red,Circle,Hamster,Finland,...,1.0,Novice,Lava Hot,a,C,Fl,3.0,3.0,0,Fl
4,4,0.0,0.0,0.0,T,N,Red,Triangle,Hamster,Costa Rica,...,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0,Others
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599995,599995,0.0,1.0,0.0,T,N,Red,Polygon,Axolotl,India,...,3.0,Novice,Freezing,a,R,GZ,5.0,8.0,0,GZ
599996,599996,1.0,0.0,0.0,T,Y,Blue,Polygon,Dog,Costa Rica,...,2.0,Novice,Boiling Hot,n,N,sf,3.0,3.0,0,Others
599997,599997,0.0,0.0,0.0,F,Y,Red,Circle,Axolotl,Russia,...,2.0,Contributor,Freezing,n,H,MV,7.0,5.0,0,Others
599998,599998,1.0,1.0,0.0,F,Y,Red,Polygon,Axolotl,India,...,1.0,Master,Warm,m,X,Ey,1.0,5.0,0,Others


In [7]:
%%time
# Feature engineering, step 1: pull the label and the row identifiers out of
# the frames so that only feature columns remain.
target = train['target']
train_id, test_id = train['id'], test['id']

# Drop the non-feature columns in place (the helper column 'ord_5_ot' too).
train.drop(columns=['target', 'id', 'ord_5_ot'], inplace=True)
test.drop(columns='id', inplace=True)

# Print the three shapes, one per line.
print(train.shape, test.shape, target.shape, sep='\n')

(600000, 23)
(400000, 23)
(600000,)
Wall time: 147 ms


In [8]:
%%time

# One-hot encode train and test together so both halves end up with the
# same dummy columns. Every column is encoded, including the numeric ones.
n_train_rows = len(train)
traintest = pd.concat([train, test])
dummies = pd.get_dummies(
    traintest,
    columns=traintest.columns,
    drop_first=True,
    sparse=True,
)

# Split the encoded frame back by position: train rows first, test rows after.
train_ohe = dummies.iloc[:n_train_rows]
test_ohe = dummies.iloc[n_train_rows:]

print(train_ohe.shape, test_ohe.shape, sep='\n')

(600000, 5678)
(400000, 5678)
Wall time: 6min 1s


In [9]:
%%time
# Convert the sparse-backed DataFrames to SciPy CSR matrices (via COO).
# The names are rebound, so this cell cannot be re-run without first
# re-running the one-hot encoding cell.
train_ohe, test_ohe = (
    frame.sparse.to_coo().tocsr() for frame in (train_ohe, test_ohe)
)
type(train_ohe)

Wall time: 1.16 s


scipy.sparse.csr.csr_matrix

In [10]:
%%time

# Model
def _take_rows(values, row_index):
    """Select rows of ``values`` by integer position.

    pandas objects are indexed through ``.iloc`` so the selection stays
    positional even when their index is not the default 0..n-1 range;
    anything else (NumPy arrays, SciPy sparse matrices) is indexed with ``[]``.
    """
    if hasattr(values, 'iloc'):
        return values.iloc[row_index]
    return values[row_index]


def run_cv_model(train, test, target, model_fn, params=None, eval_fn=None, label='model', n_splits=10):
    """Run K-fold cross-validation of ``model_fn`` and average its test predictions.

    Parameters
    ----------
    train : array-like or sparse matrix with ``.shape``
        Training features; rows are selected by integer position.
    test : array-like or sparse matrix
        Test features, passed unchanged to ``model_fn`` in every fold.
    target : array-like or pandas.Series
        Training labels, aligned row-by-row with ``train``.
    model_fn : callable
        Called as ``model_fn(dev_X, dev_y, val_X, val_y, test, params)`` and
        expected to return ``(pred_val_y, pred_test_y)``.
    params : dict, optional
        Keyword settings for ``model_fn``; a fresh copy is passed per fold.
    eval_fn : callable, optional
        Called as ``eval_fn(val_y, pred_val_y)`` to score each fold.
    label : str
        Name used in the progress messages and stored in the result.
    n_splits : int
        Number of folds (default 10).

    Returns
    -------
    dict
        ``'label'``, ``'train'`` (out-of-fold predictions), ``'test'``
        (test predictions averaged over all folds) and ``'cv'`` (list of
        per-fold scores; empty when ``eval_fn`` is None).
    """
    params = {} if params is None else params

    kf = KFold(n_splits=n_splits)
    cv_scores = []
    pred_full_test = 0
    pred_train = np.zeros(train.shape[0])

    for fold, (dev_index, val_index) in enumerate(kf.split(train, target), start=1):
        print(f'Started {label} fold {fold}/{n_splits}')
        dev_X, val_X = _take_rows(train, dev_index), _take_rows(train, val_index)
        dev_y, val_y = _take_rows(target, dev_index), _take_rows(target, val_index)
        # Copy so a model_fn that mutates its params cannot affect later folds.
        pred_val_y, pred_test_y = model_fn(dev_X, dev_y, val_X, val_y, test, params.copy())
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index] = pred_val_y
        if eval_fn is not None:
            cv_score = eval_fn(val_y, pred_val_y)
            cv_scores.append(cv_score)
            print(label + ' cv score {}: {}'.format(fold, cv_score))

    # Without an eval_fn there are no scores; report NaN without triggering
    # NumPy's empty-slice warning.
    mean_score = np.mean(cv_scores) if cv_scores else float('nan')
    std_score = np.std(cv_scores) if cv_scores else float('nan')
    print('{} cv scores : {}'.format(label, cv_scores))
    print('{} cv mean score : {}'.format(label, mean_score))
    print('{} cv std score : {}'.format(label, std_score))

    # Average over the number of folds actually run (was a hard-coded 5.0,
    # which doubled the test predictions for 10 folds).
    pred_full_test = pred_full_test / n_splits
    results = {'label': label,
               'train': pred_train, 'test': pred_full_test,
               'cv': cv_scores}
    return results

Wall time: 0 ns


In [11]:
def runLR(train_X, train_y, test_X, test_y, test_X2, params):
    """Fit a logistic regression and predict on two feature sets.

    A ``LogisticRegression(**params)`` model is fitted on
    ``(train_X, train_y)``. The function then returns, for ``test_X`` and for
    ``test_X2`` respectively, column 1 of ``predict_proba`` (the probability
    of the model's second class).

    ``test_y`` is accepted for signature compatibility with ``run_cv_model``
    but is not used here.
    """
    print('Train LR')
    classifier = LogisticRegression(**params).fit(train_X, train_y)

    print('Predict 1/2')
    proba_first = classifier.predict_proba(test_X)[:, 1]

    print('Predict 2/2')
    proba_second = classifier.predict_proba(test_X2)[:, 1]

    return proba_first, proba_second


# Keyword arguments that runLR forwards to LogisticRegression(**params).
lr_params = {'solver': 'lbfgs', 'C':  0.1}
# Cross-validate runLR on the one-hot encoded matrices; each fold is scored
# with `auc`, the roc_auc_score alias imported at the top of the notebook.
results = run_cv_model(train_ohe, test_ohe, target, runLR, lr_params, auc, 'lr')

Started lr fold 1/10
Train LR
Predict 1/2
Predict 2/2
lr cv score 1: 0.7836629559640287
Started lr fold 2/10
Train LR
Predict 1/2
Predict 2/2
lr cv score 2: 0.7850321198397145
Started lr fold 3/10
Train LR
Predict 1/2
Predict 2/2
lr cv score 3: 0.7861692364239734
Started lr fold 4/10
Train LR
Predict 1/2
Predict 2/2
lr cv score 4: 0.7842516907579922
Started lr fold 5/10
Train LR
Predict 1/2
Predict 2/2
lr cv score 5: 0.7869788330491584
Started lr fold 6/10
Train LR
Predict 1/2
Predict 2/2
lr cv score 6: 0.7840372764784854
Started lr fold 7/10
Train LR
Predict 1/2
Predict 2/2
lr cv score 7: 0.7848248643680715
Started lr fold 8/10
Train LR
Predict 1/2
Predict 2/2
lr cv score 8: 0.7862742992423981
Started lr fold 9/10
Train LR
Predict 1/2
Predict 2/2
lr cv score 9: 0.7861102672491634
Started lr fold 10/10
Train LR
Predict 1/2
Predict 2/2
lr cv score 10: 0.7832954853920074
lr cv scores : [0.7836629559640287, 0.7850321198397145, 0.7861692364239734, 0.7842516907579922, 0.7869788330491584, 0.