In [1]:
from sklearn.model_selection import KFold,StratifiedKFold
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

In [2]:
train = pd.read_csv('./titanic/train.csv')
test = pd.read_csv('./titanic/test.csv')

In [3]:
train['Age'] = train['Age'].fillna(train['Age'].mode())
test['Age'] = test['Age'].fillna(test['Age'].mode())

train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode())

train.rename(columns={'Survived':'label'},inplace=True)
test.rename(columns={'Survived':'label'},inplace=True)

train = train.drop(['Name','Ticket'],axis=1)
test = test.drop(['Name','Ticket'],axis=1)

In [4]:
train.head()

Unnamed: 0,PassengerId,label,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [5]:
train.columns

Index(['PassengerId', 'label', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
       'Fare', 'Embarked'],
      dtype='object')

In [None]:
#疑问：分组统计特征，要在train和test中单独求吗？

In [6]:
# 目标编码
# https://mp.weixin.qq.com/s/taMj-x-qLz5sw-7zld5BmA
# target encoding，可以理解为转化率，就是聚合id后对label求mean、sum、count(lable=1)/count(all)，
# 这里需要考虑到数据穿越造成线下严重过拟合的情况，一般需要通过交叉平滑处理，如划分为5折，用其中4折去统计剩下的一折，
# 然后对过大或过小的进行平滑，对测试集直接用训练集来做统计，若数据集有时间因素存在，则需要根据时间顺序来处理；
def kfold_mean(df_train, df_test, target, target_mean_list):
    folds = StratifiedKFold(n_splits=5)

    mean_of_target = df_train[target].mean()

    for fold_, (trn_idx, val_idx) in tqdm(enumerate(folds.split(df_train, y=df_train[target]))):
        tr_x = df_train.iloc[trn_idx, :]
        vl_x = df_train.iloc[val_idx, :]

        for col in target_mean_list:
            df_train.loc[vl_x.index, f'{col}_target_enc'] = vl_x[col].map(tr_x.groupby(col)[target].mean())

    for col in target_mean_list:
        df_train[f'{col}_target_enc'].fillna(mean_of_target, inplace=True)

        df_test[f'{col}_target_enc'] = df_test[col].map(df_train.groupby(col)[f'{col}_target_enc'].mean())

        df_test[f'{col}_target_enc'].fillna(mean_of_target, inplace=True)
    return pd.concat([df_train, df_test], ignore_index=True)

feature_list =  ['Sex','Embarked']
data = pd.concat([train, test], ignore_index=True)
data = kfold_mean(data[~data['label'].isna()], data[data['label'].isna()],'label',feature_list)

data.head()

5it [00:00, 14.34it/s]


Unnamed: 0,Age,Embarked,Fare,Parch,PassengerId,Pclass,Sex,SibSp,label,Sex_target_enc,Embarked_target_enc
0,22.0,S,7.25,0,1,3,male,1,0.0,0.195279,0.335271
1,38.0,C,71.2833,0,2,1,female,1,1.0,0.739837,0.589552
2,26.0,S,7.925,0,3,3,female,0,1.0,0.739837,0.335271
3,35.0,S,53.1,0,4,1,female,1,1.0,0.739837,0.335271
4,35.0,S,8.05,0,5,3,male,0,0.0,0.195279,0.335271
5,,Q,8.4583,0,6,3,male,0,0.0,0.195279,0.327869
6,54.0,S,51.8625,0,7,1,male,0,0.0,0.195279,0.335271
7,2.0,S,21.075,1,8,3,male,3,0.0,0.195279,0.335271
8,27.0,S,11.1333,2,9,3,female,0,1.0,0.739837,0.335271
9,14.0,C,30.0708,0,10,2,female,1,1.0,0.739837,0.589552


In [7]:
#数值交叉特征
countfea = ['Fare','Pclass','Age']
#数值特征与数值特征之间做加减乘除
for f1 in tqdm(countfea, desc="count_feas 基本交叉特征"):
    for f2 in countfea:
        if f1 != f2:
            data['{}_{}_ratio'.format(f1, f2)] = data[f1].values / data[f2].values
            data['{}_{}_multi'.format(f1, f2)] = data[f1].values * data[f2].values
            data['{}_{}_add'.format(f1, f2)] = data[f1].values + data[f2].values
            data['{}_{}_diff'.format(f1, f2)] = data[f1].values - data[f2].values
data.head()

count_feas 基本交叉特征: 100%|████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 558.76it/s]


Unnamed: 0,Age,Embarked,Fare,Parch,PassengerId,Pclass,Sex,SibSp,label,Sex_target_enc,Embarked_target_enc,Fare_SibSp_ratio,Fare_SibSp_multi,Fare_SibSp_add,Fare_SibSp_diff,SibSp_Fare_ratio,SibSp_Fare_multi,SibSp_Fare_add,SibSp_Fare_diff
0,22.0,S,7.25,0,1,3,male,1,0.0,0.195279,0.335271,7.25,7.25,8.25,6.25,0.137931,7.25,8.25,-6.25
1,38.0,C,71.2833,0,2,1,female,1,1.0,0.739837,0.589552,71.2833,71.2833,72.2833,70.2833,0.014029,71.2833,72.2833,-70.2833
2,26.0,S,7.925,0,3,3,female,0,1.0,0.739837,0.335271,inf,0.0,7.925,7.925,0.0,0.0,7.925,-7.925
3,35.0,S,53.1,0,4,1,female,1,1.0,0.739837,0.335271,53.1,53.1,54.1,52.1,0.018832,53.1,54.1,-52.1
4,35.0,S,8.05,0,5,3,male,0,0.0,0.195279,0.335271,inf,0.0,8.05,8.05,0.0,0.0,8.05,-8.05
5,,Q,8.4583,0,6,3,male,0,0.0,0.195279,0.327869,inf,0.0,8.4583,8.4583,0.0,0.0,8.4583,-8.4583
6,54.0,S,51.8625,0,7,1,male,0,0.0,0.195279,0.335271,inf,0.0,51.8625,51.8625,0.0,0.0,51.8625,-51.8625
7,2.0,S,21.075,1,8,3,male,3,0.0,0.195279,0.335271,7.025,63.225,24.075,18.075,0.142349,63.225,24.075,-18.075
8,27.0,S,11.1333,2,9,3,female,0,1.0,0.739837,0.335271,inf,0.0,11.1333,11.1333,0.0,0.0,11.1333,-11.1333
9,14.0,C,30.0708,0,10,2,female,1,1.0,0.739837,0.589552,30.0708,30.0708,31.0708,29.0708,0.033255,30.0708,31.0708,-29.0708


In [9]:
#类别特征与数值特征之间
catefea = ['Sex','Embarked']
countfea = ['Fare','Age']

for i in tqdm(countfea,desc="count_feas 基本聚合特征"):
    for j in catefea:
        data['{}_{}_mean'.format(i,j)] = data.groupby(j)[i].transform('mean')
        data['{}_{}_median'.format(i,j)] = data.groupby(j)[i].transform('median')
        data['{}_{}_max'.format(i,j)] = data.groupby(j)[i].transform('max')
        data['{}_{}_min'.format(i,j)] = data.groupby(j)[i].transform('min')
        data['{}_{}_std'.format(i,j)] = data.groupby(j)[i].transform('std')
data.head()

count_feas 基本聚合特征: 100%|█████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 44.52it/s]


Unnamed: 0,Age,Embarked,Fare,Parch,PassengerId,Pclass,Sex,SibSp,label,Sex_target_enc,...,Age_Sex_mean,Age_Sex_median,Age_Sex_max,Age_Sex_min,Age_Sex_std,Age_Embarked_mean,Age_Embarked_median,Age_Embarked_max,Age_Embarked_min,Age_Embarked_std
0,22.0,S,7.25,0,1,3,male,1,0.0,0.195279,...,30.585228,28.0,80.0,0.33,14.280581,29.245205,28.0,80.0,0.17,14.047507
1,38.0,C,71.2833,0,2,1,female,1,1.0,0.739837,...,28.687088,27.0,76.0,0.17,14.576962,32.33217,30.0,71.0,0.42,15.258092
2,26.0,S,7.925,0,3,3,female,0,1.0,0.739837,...,28.687088,27.0,76.0,0.17,14.576962,29.245205,28.0,80.0,0.17,14.047507
3,35.0,S,53.1,0,4,1,female,1,1.0,0.739837,...,28.687088,27.0,76.0,0.17,14.576962,29.245205,28.0,80.0,0.17,14.047507
4,35.0,S,8.05,0,5,3,male,0,0.0,0.195279,...,30.585228,28.0,80.0,0.33,14.280581,29.245205,28.0,80.0,0.17,14.047507


In [None]:
#类别与类别特征逻辑上的交叉
def cross_two(name_1,name_2):
    new_col=[]
    encode=0
    dic={}
    val_1=base_info[name_1]
    val_2=base_info[name_2]
    for i in tqdm(range(len(val_1))):
        tmp=str(val_1[i])+'_'+str(val_2[i])
        if tmp in dic:
            new_col.append(dic[tmp])
        else:
            dic[tmp]=encode
            new_col.append(encode)
            encode+=1
    return new_col

new_col=cross_two('enttypegb','enttypeitem')#作企业类型-小类的交叉特征
base_info['enttypegb_enttypeitem']=new_col

In [None]:
#偏离值特征
group_list = ['HYZK', 'ZHIYE', 'ZHICHEN', 'ZHIWU', 'XUELI', 'DWJJLX', 'DWSSHY', 'GRZHZT']
num_feature_list = ['GRYJCE', 'DKFFE', 'DKLL', 'DKYE', 'GRJCJS', 'GRZHSNJZYE', 'GRZHDNGJYE']                   
for group in group_list:
    for feature in num_feature_list:
        tmp = data.groupby(group)[feature].agg([sum, min, max, np.mean]).reset_index()
        tmp = pd.merge(data, tmp, on=group, how='left')
        data['{}-mean_gb_{}'.format(feature, group)] = data[feature] - tmp['mean']
        data['{}-min_gb_{}'.format(feature, group)] = data[feature] - tmp['min']
        data['{}-max_gb_{}'.format(feature, group)] = data[feature] - tmp['max']
        data['{}/sum_gb_{}'.format(feature, group)] = data[feature] / tmp['sum']

In [None]:
# 其他衍生变量 mean 和 std
for df in [data_train, data_test_a]:
    for item in ['n0','n1','n2','n2.1','n4','n5','n6','n7','n8','n9','n10','n11','n12','n13','n14']:
        df['grade_to_mean_' + item] = df['grade'] / df.groupby([item])['grade'].transform('mean')
        df['grade_to_std_' + item] = df['grade'] / df.groupby([item])['grade'].transform('std')


In [None]:
# 频数统计
cat_col = ['HYZK', 'ZHIYE', 'ZHICHEN', 'ZHIWU', 'XUELI', 'DWJJLX', 'DWSSHY', 'GRZHZT']
for col in cat_col:
    data[col + '_COUNT'] = data[col].map(data[col].value_counts())
    col_idx = data[col].value_counts()
    for idx in col_idx[col_idx < 10].index:
        data[col] = data[col].replace(idx, -1)  

In [None]:
#分箱
for fea in tqdm(count_feas, desc="分箱特征"):
    # 通过除法映射到间隔均匀的分箱中，每个分箱的取值范围都是loanAmnt/1000
    data['{}_bin1'.format(fea)] = np.floor_divide(data[fea], 1000)
    ## 通过对数函数映射到指数宽度分箱
    data['{}_bin2'.format(fea)] = np.floor(np.log10(data[fea]))

In [None]:
#分桶
def bucket(name,bucket_len):
    gap_list=[base_info[name].quantile(i/bucket_len) for i in range(bucket_len+1)]
    len_data=len(base_info[name])
    new_col=[]
    for i in base_info[name].values:
        for j in range(len(gap_list)):
            if gap_list[j]>=i:
                encode=j
                break
        new_col.append(encode)
    return new_col

In [None]:
# 二阶统计特征
us_feature = []
# count 编码 ，和lablending编码一个意思，一种编码方式
for col in ['gender', 'age', 'province', 'model','make']:
    data['{}_count'.format(col)] = data.groupby(col)[col].transform('count')
    data['{}_category'.format(col)] = data[col].astype('category')
    data['{}_category'.format(col)] = data['{}_category'.format(col)].cat.codes
    us_feature.append('{}_count'.format(col))
    us_feature.append('{}_category'.format(col))

In [None]:
# 交叉组合统计，就是组合特征的共现频次，例如是“gender_make_count男性且用华为手机”
corss_feature = ['gender', 'age', 'province', 'model','make']
while len(corss_feature) != 0:
    f = corss_feature.pop()
    for col in corss_feature:
        data['{}_{}_count'.format(f, col)] = data.groupby([f, col])[col].transform('count')
        data['{}_{}_category'.format(f, col)] = data[f] + '_' + data[col]
        data['{}_{}_category'.format(f, col)] = data['{}_{}_category'.format(f, col)].astype('category')
        data['{}_{}_category'.format(f, col)] = data['{}_{}_category'.format(f, col)].cat.codes
        us_feature.append('{}_{}_count'.format(f, col))
        us_feature.append('{}_{}_category'.format(f, col))

In [None]:
# 特征unique count特征
for index, col1 in enumerate(['age', 'province', 'city', 'model']):
    for col2 in ['age', 'province', 'city', 'model'][index:]:
        data['{}_in_{}_count'.format(col1, col2)] = data.groupby(col1)[col2].transform('count')
        data['{}_in_{}_nunique'.format(col1, col2)] = data.groupby(col1)[col2].transform('nunique')
        data['{}_in_{}_nunique/{}_in_{}_count'.format(col1, col2, col1, col2)] = data['{}_in_{}_nunique'.format(col1,col2)] /data['{}_in_{}_count'.format(col1,col2)]

        data['{}_in_{}_count'.format(col2, col1)] = data.groupby(col2)[col1].transform('count')
        data['{}_in_{}_nunique'.format(col2, col1)] = data.groupby(col2)[col1].transform('nunique')
        data['{}_in_{}_nunique/{}_in_{}_count'.format(col2, col1, col2, col1)] = data['{}_in_{}_nunique'.format(col2,col1)] / data['{}_in_{}_count'.format(col2, col1)]

        us_feature.append('{}_in_{}_count'.format(col1, col2))
        us_feature.append('{}_in_{}_nunique'.format(col1, col2))
        us_feature.append('{}_in_{}_nunique/{}_in_{}_count'.format(col1, col2, col1, col2))

        us_feature.append('{}_in_{}_count'.format(col2, col1))
        us_feature.append('{}_in_{}_nunique'.format(col2, col1))
        us_feature.append('{}_in_{}_nunique/{}_in_{}_count'.format(col2, col1, col2, col1))

In [None]:
#label-encode:subGrade,postCode,title
# 高维类别特征需要进行转换
for col in tqdm(['employmentTitle', 'postCode', 'title','subGrade']):
    le = LabelEncoder()
    le.fit(list(data_train[col].astype(str).values) + list(data_test_a[col].astype(str).values))
    data_train[col] = le.transform(list(data_train[col].astype(str).values))
    data_test_a[col] = le.transform(list(data_test_a[col].astype(str).values))
print('Label Encoding 完成')


In [None]:
#构建序列特征，例如构建每个用户的登录行为序列
#构建序列
launch_grp = pd.DataFrame()

user_id = []
launch_date_str = []
for i in launch.groupby('user_id'):
    launch_date = []
    user_id.append(i[0])
    for j in i[1]['date']:
        launch_date.append(j)
    launch_date_str.append(str(launch_date))
launch_grp['user_id'] = list(user_id)
launch_grp['launch_date_str'] = list(launch_date_str)
launch_grp.head()

In [None]:
#想象力特征