In [None]:
%%time
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_style('whitegrid')
import os
import sys
import time
import datetime
from tqdm import tqdm
import lightgbm as lgb
import operator
import xgboost as xgb
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from imblearn.under_sampling import RandomUnderSampler
import warnings
from imblearn.over_sampling import SMOTE
from scipy.stats import ks_2samp
from sklearn import manifold
warnings.filterwarnings("ignore")

# Show which files are present in the input directory.
input_listing = os.listdir("../input")
print(input_listing)

# Load the train and test splits into the notebook-level `train` / `test` frames.
train_csv, test_csv = '../input/train.csv', '../input/test.csv'
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

### 1. 查看数据大小

In [None]:
print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
# Dataset dimensions and a preview of the first rows, train first and then test.
for frame in (train, test):
    row_count, column_count = frame.shape
    print('Rows: ', row_count, 'Columns: ', column_count)
    print(frame.head())

### 2. 查看label分布

In [None]:
print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
# Distribution of the label: counts per class, followed by a bar chart of the same.
print(train['target'].value_counts())
# Apply the style before drawing so that it is in effect when the axes are created.
sns.set_style('whitegrid')
# Pass the column by keyword: recent seaborn releases treat the first positional
# argument as `data` rather than as the x variable.
sns.countplot(x=train['target'])
plt.show()

### 3. 查看特征缺失值

 ```
简单数据预处理

数据缺失
 1.如果缺值的样本占总数比例极高，直接舍弃；
 2.非连续值特征属性，把NAN作为一个新的类别加入；
 3.连续值特征属性，通过一个step离散化数据，之后把NAN作为一个type加到属性中；

使用scikit-learn 中的RandomForest来拟合缺失的年龄数据
RandomForest是一种机器学习算法：对原始数据做不同的采样，建立多棵DecisionTree，
再对各棵树的结果进行average等处理，以降低过拟合现象、提高预测结果
 ```

In [None]:
# Missing values per column: absolute count and percentage of rows, largest first.
null_counts = train.isnull().sum()
total = null_counts.sort_values(ascending=False)
# Every column has len(train) entries, so that is the denominator for the percentage.
percent = (null_counts / len(train) * 100).sort_values(ascending=False)
missing_train_data = pd.concat(
    [total, percent],
    axis=1,
    keys=['Total', 'Percent'],
)
print(missing_train_data)

### 4. 查看特征取值个数

In [None]:
print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
# Number of distinct values in each column from the third one onwards
# (presumably the first two columns are the row id and the label; verify against the CSV).
feature_names = train.columns[2:]
for feature in feature_names:
    distinct_count = train[feature].nunique()
    print("Number of unique values of {} : {}".format(feature, distinct_count))

# Take a closer look at var_68: its value frequencies in train, then in test.
for frame in (train, test):
    print('-------------------------------')
    print(frame['var_68'].value_counts())

### 5. 查看每个特征与label之间的相关系数

In [None]:
print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
# Pairwise correlation matrix of the numeric columns.
# Non-numeric columns are excluded explicitly: pandas >= 2.0 raises on them
# instead of silently dropping them, while older versions dropped them anyway,
# so the result is the same on every version.
corr = train.select_dtypes(include=['number', 'bool']).corr()
# Rank the columns by the absolute strength of their correlation with the label.
print(corr['target'].abs().sort_values(ascending=False))

In [None]:
print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
# For every feature, compare its distribution between the two label classes:
# a two-sample Kolmogorov-Smirnov test plus overlaid density estimates.
target_mask = train['target'] == 1
non_target_mask = train['target'] == 0
# KS statistic of each feature, in column order.
statistics_array = []
# Columns from the third one onwards (presumably the first two are the row id
# and the label; verify against the CSV).
for col in train.columns[2:]:
    # Slice each class once and reuse the result for both the test and the plot.
    negatives = train.loc[non_target_mask, col]
    positives = train.loc[target_mask, col]

    statistic, pvalue = ks_2samp(negatives, positives)
    statistics_array.append(statistic)

    fig, ax = plt.subplots(1, 1, figsize=(10, 4))
    sns.kdeplot(negatives, ax=ax, label='Target == 0')
    sns.kdeplot(positives, ax=ax, label='Target == 1')
    # Draw the legend explicitly; otherwise the labels above may not be shown.
    ax.legend()

    # Both numbers use five decimals (the original `{:5f}` set a field width, not a precision).
    ax.set_title('name: {}, statistics: {:.5f}, pvalue: {:.5f}'.format(col, statistic, pvalue))
    plt.show()
    # Release the figure so that one open figure per feature does not accumulate.
    plt.close(fig)